diff -urN oldtree/arch/um/drivers/ubd_kern.c newtree/arch/um/drivers/ubd_kern.c --- oldtree/arch/um/drivers/ubd_kern.c 2006-07-15 14:53:08.000000000 -0700 +++ newtree/arch/um/drivers/ubd_kern.c 2006-08-03 13:27:09.000000000 -0700 @@ -986,8 +986,6 @@ __u64 offset; int len; - if(req->rq_status == RQ_INACTIVE) return(1); - /* This should be impossible now */ if((rq_data_dir(req) == WRITE) && !dev->openflags.w){ printk("Write attempted on readonly ubd device %s\n", diff -urN oldtree/block/as-iosched.c newtree/block/as-iosched.c --- oldtree/block/as-iosched.c 2006-07-15 14:53:08.000000000 -0700 +++ newtree/block/as-iosched.c 2006-08-03 13:27:09.000000000 -0700 @@ -14,7 +14,6 @@ #include #include #include -#include #include #include @@ -93,9 +92,8 @@ struct rb_root sort_list[2]; struct list_head fifo_list[2]; - struct as_rq *next_arq[2]; /* next in sort order */ + struct request *next_rq[2]; /* next in sort order */ sector_t last_sector[2]; /* last REQ_SYNC & REQ_ASYNC sectors */ - struct hlist_head *hash; /* request hash */ unsigned long exit_prob; /* probability a task will exit while being waited on */ @@ -115,7 +113,6 @@ int write_batch_count; /* max # of reqs in a write batch */ int current_write_count; /* how many requests left this batch */ int write_batch_idled; /* has the write batch gone idle? */ - mempool_t *arq_pool; enum anticipation_status antic_status; unsigned long antic_start; /* jiffies: when it started */ @@ -133,8 +130,6 @@ unsigned long antic_expire; }; -#define list_entry_fifo(ptr) list_entry((ptr), struct as_rq, fifo) - /* * per-request data. 
*/ @@ -150,40 +145,14 @@ AS_RQ_POSTSCHED, /* when they shouldn't be */ }; -struct as_rq { - /* - * rbtree index, key is the starting offset - */ - struct rb_node rb_node; - sector_t rb_key; - - struct request *request; - - struct io_context *io_context; /* The submitting task */ - - /* - * request hash, key is the ending offset (for back merge lookup) - */ - struct hlist_node hash; - - /* - * expire fifo - */ - struct list_head fifo; - unsigned long expires; +#define RQ_IOC(rq) ((struct io_context *) (rq)->elevator_private) +#define RQ_STATE(rq) ((enum arq_state)(rq)->elevator_private2) +#define RQ_SET_STATE(rq, state) ((rq)->elevator_private2 = (void *) state) - unsigned int is_sync; - enum arq_state state; -}; - -#define RQ_DATA(rq) ((struct as_rq *) (rq)->elevator_private) - -static kmem_cache_t *arq_pool; - -static atomic_t ioc_count = ATOMIC_INIT(0); +static DEFINE_PER_CPU(unsigned long, ioc_count); static struct completion *ioc_gone; -static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq); +static void as_move_to_dispatch(struct as_data *ad, struct request *rq); static void as_antic_stop(struct as_data *ad); /* @@ -194,7 +163,8 @@ static void free_as_io_context(struct as_io_context *aic) { kfree(aic); - if (atomic_dec_and_test(&ioc_count) && ioc_gone) + elv_ioc_count_dec(ioc_count); + if (ioc_gone && !elv_ioc_count_read(ioc_count)) complete(ioc_gone); } @@ -230,7 +200,7 @@ ret->seek_total = 0; ret->seek_samples = 0; ret->seek_mean = 0; - atomic_inc(&ioc_count); + elv_ioc_count_inc(ioc_count); } return ret; @@ -240,9 +210,9 @@ * If the current task has no AS IO context then create one and initialise it. * Then take a ref on the task's io context and return it. 
*/ -static struct io_context *as_get_io_context(void) +static struct io_context *as_get_io_context(int node) { - struct io_context *ioc = get_io_context(GFP_ATOMIC); + struct io_context *ioc = get_io_context(GFP_ATOMIC, node); if (ioc && !ioc->aic) { ioc->aic = alloc_as_io_context(); if (!ioc->aic) { @@ -253,194 +223,43 @@ return ioc; } -static void as_put_io_context(struct as_rq *arq) +static void as_put_io_context(struct request *rq) { struct as_io_context *aic; - if (unlikely(!arq->io_context)) + if (unlikely(!RQ_IOC(rq))) return; - aic = arq->io_context->aic; + aic = RQ_IOC(rq)->aic; - if (arq->is_sync == REQ_SYNC && aic) { + if (rq_is_sync(rq) && aic) { spin_lock(&aic->lock); set_bit(AS_TASK_IORUNNING, &aic->state); aic->last_end_request = jiffies; spin_unlock(&aic->lock); } - put_io_context(arq->io_context); -} - -/* - * the back merge hash support functions - */ -static const int as_hash_shift = 6; -#define AS_HASH_BLOCK(sec) ((sec) >> 3) -#define AS_HASH_FN(sec) (hash_long(AS_HASH_BLOCK((sec)), as_hash_shift)) -#define AS_HASH_ENTRIES (1 << as_hash_shift) -#define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) - -static inline void __as_del_arq_hash(struct as_rq *arq) -{ - hlist_del_init(&arq->hash); -} - -static inline void as_del_arq_hash(struct as_rq *arq) -{ - if (!hlist_unhashed(&arq->hash)) - __as_del_arq_hash(arq); -} - -static void as_add_arq_hash(struct as_data *ad, struct as_rq *arq) -{ - struct request *rq = arq->request; - - BUG_ON(!hlist_unhashed(&arq->hash)); - - hlist_add_head(&arq->hash, &ad->hash[AS_HASH_FN(rq_hash_key(rq))]); -} - -/* - * move hot entry to front of chain - */ -static inline void as_hot_arq_hash(struct as_data *ad, struct as_rq *arq) -{ - struct request *rq = arq->request; - struct hlist_head *head = &ad->hash[AS_HASH_FN(rq_hash_key(rq))]; - - if (hlist_unhashed(&arq->hash)) { - WARN_ON(1); - return; - } - - if (&arq->hash != head->first) { - hlist_del(&arq->hash); - hlist_add_head(&arq->hash, head); - } -} - -static 
struct request *as_find_arq_hash(struct as_data *ad, sector_t offset) -{ - struct hlist_head *hash_list = &ad->hash[AS_HASH_FN(offset)]; - struct hlist_node *entry, *next; - struct as_rq *arq; - - hlist_for_each_entry_safe(arq, entry, next, hash_list, hash) { - struct request *__rq = arq->request; - - BUG_ON(hlist_unhashed(&arq->hash)); - - if (!rq_mergeable(__rq)) { - as_del_arq_hash(arq); - continue; - } - - if (rq_hash_key(__rq) == offset) - return __rq; - } - - return NULL; + put_io_context(RQ_IOC(rq)); } /* * rb tree support functions */ -#define rb_entry_arq(node) rb_entry((node), struct as_rq, rb_node) -#define ARQ_RB_ROOT(ad, arq) (&(ad)->sort_list[(arq)->is_sync]) -#define rq_rb_key(rq) (rq)->sector - -/* - * as_find_first_arq finds the first (lowest sector numbered) request - * for the specified data_dir. Used to sweep back to the start of the disk - * (1-way elevator) after we process the last (highest sector) request. - */ -static struct as_rq *as_find_first_arq(struct as_data *ad, int data_dir) -{ - struct rb_node *n = ad->sort_list[data_dir].rb_node; - - if (n == NULL) - return NULL; - - for (;;) { - if (n->rb_left == NULL) - return rb_entry_arq(n); - - n = n->rb_left; - } -} - -/* - * Add the request to the rb tree if it is unique. If there is an alias (an - * existing request against the same sector), which can happen when using - * direct IO, then return the alias. 
- */ -static struct as_rq *__as_add_arq_rb(struct as_data *ad, struct as_rq *arq) -{ - struct rb_node **p = &ARQ_RB_ROOT(ad, arq)->rb_node; - struct rb_node *parent = NULL; - struct as_rq *__arq; - struct request *rq = arq->request; - - arq->rb_key = rq_rb_key(rq); - - while (*p) { - parent = *p; - __arq = rb_entry_arq(parent); - - if (arq->rb_key < __arq->rb_key) - p = &(*p)->rb_left; - else if (arq->rb_key > __arq->rb_key) - p = &(*p)->rb_right; - else - return __arq; - } - - rb_link_node(&arq->rb_node, parent, p); - rb_insert_color(&arq->rb_node, ARQ_RB_ROOT(ad, arq)); - - return NULL; -} +#define RQ_RB_ROOT(ad, rq) (&(ad)->sort_list[rq_is_sync((rq))]) -static void as_add_arq_rb(struct as_data *ad, struct as_rq *arq) +static void as_add_rq_rb(struct as_data *ad, struct request *rq) { - struct as_rq *alias; + struct request *alias; - while ((unlikely(alias = __as_add_arq_rb(ad, arq)))) { + while ((unlikely(alias = elv_rb_add(RQ_RB_ROOT(ad, rq), rq)))) { as_move_to_dispatch(ad, alias); as_antic_stop(ad); } } -static inline void as_del_arq_rb(struct as_data *ad, struct as_rq *arq) +static inline void as_del_rq_rb(struct as_data *ad, struct request *rq) { - if (!RB_EMPTY_NODE(&arq->rb_node)) { - WARN_ON(1); - return; - } - - rb_erase(&arq->rb_node, ARQ_RB_ROOT(ad, arq)); - RB_CLEAR_NODE(&arq->rb_node); -} - -static struct request * -as_find_arq_rb(struct as_data *ad, sector_t sector, int data_dir) -{ - struct rb_node *n = ad->sort_list[data_dir].rb_node; - struct as_rq *arq; - - while (n) { - arq = rb_entry_arq(n); - - if (sector < arq->rb_key) - n = n->rb_left; - else if (sector > arq->rb_key) - n = n->rb_right; - else - return arq->request; - } - - return NULL; + elv_rb_del(RQ_RB_ROOT(ad, rq), rq); } /* @@ -458,26 +277,26 @@ * as_choose_req selects the preferred one of two requests of the same data_dir * ignoring time - eg. 
timeouts, which is the job of as_dispatch_request */ -static struct as_rq * -as_choose_req(struct as_data *ad, struct as_rq *arq1, struct as_rq *arq2) +static struct request * +as_choose_req(struct as_data *ad, struct request *rq1, struct request *rq2) { int data_dir; sector_t last, s1, s2, d1, d2; int r1_wrap=0, r2_wrap=0; /* requests are behind the disk head */ const sector_t maxback = MAXBACK; - if (arq1 == NULL || arq1 == arq2) - return arq2; - if (arq2 == NULL) - return arq1; + if (rq1 == NULL || rq1 == rq2) + return rq2; + if (rq2 == NULL) + return rq1; - data_dir = arq1->is_sync; + data_dir = rq_is_sync(rq1); last = ad->last_sector[data_dir]; - s1 = arq1->request->sector; - s2 = arq2->request->sector; + s1 = rq1->sector; + s2 = rq2->sector; - BUG_ON(data_dir != arq2->is_sync); + BUG_ON(data_dir != rq_is_sync(rq2)); /* * Strict one way elevator _except_ in the case where we allow @@ -504,61 +323,58 @@ /* Found required data */ if (!r1_wrap && r2_wrap) - return arq1; + return rq1; else if (!r2_wrap && r1_wrap) - return arq2; + return rq2; else if (r1_wrap && r2_wrap) { /* both behind the head */ if (s1 <= s2) - return arq1; + return rq1; else - return arq2; + return rq2; } /* Both requests in front of the head */ if (d1 < d2) - return arq1; + return rq1; else if (d2 < d1) - return arq2; + return rq2; else { if (s1 >= s2) - return arq1; + return rq1; else - return arq2; + return rq2; } } /* - * as_find_next_arq finds the next request after @prev in elevator order. + * as_find_next_rq finds the next request after @prev in elevator order. * this with as_choose_req form the basis for how the scheduler chooses * what request to process next. Anticipation works on top of this. 
*/ -static struct as_rq *as_find_next_arq(struct as_data *ad, struct as_rq *last) +static struct request * +as_find_next_rq(struct as_data *ad, struct request *last) { - const int data_dir = last->is_sync; - struct as_rq *ret; struct rb_node *rbnext = rb_next(&last->rb_node); struct rb_node *rbprev = rb_prev(&last->rb_node); - struct as_rq *arq_next, *arq_prev; + struct request *next = NULL, *prev = NULL; - BUG_ON(!RB_EMPTY_NODE(&last->rb_node)); + BUG_ON(RB_EMPTY_NODE(&last->rb_node)); if (rbprev) - arq_prev = rb_entry_arq(rbprev); - else - arq_prev = NULL; + prev = rb_entry_rq(rbprev); if (rbnext) - arq_next = rb_entry_arq(rbnext); + next = rb_entry_rq(rbnext); else { - arq_next = as_find_first_arq(ad, data_dir); - if (arq_next == last) - arq_next = NULL; - } + const int data_dir = rq_is_sync(last); - ret = as_choose_req(ad, arq_next, arq_prev); + rbnext = rb_first(&ad->sort_list[data_dir]); + if (rbnext && rbnext != &last->rb_node) + next = rb_entry_rq(rbnext); + } - return ret; + return as_choose_req(ad, next, prev); } /* @@ -712,8 +528,7 @@ static void as_update_iohist(struct as_data *ad, struct as_io_context *aic, struct request *rq) { - struct as_rq *arq = RQ_DATA(rq); - int data_dir = arq->is_sync; + int data_dir = rq_is_sync(rq); unsigned long thinktime = 0; sector_t seek_dist; @@ -752,11 +567,11 @@ * previous one issued. */ static int as_close_req(struct as_data *ad, struct as_io_context *aic, - struct as_rq *arq) + struct request *rq) { unsigned long delay; /* milliseconds */ sector_t last = ad->last_sector[ad->batch_data_dir]; - sector_t next = arq->request->sector; + sector_t next = rq->sector; sector_t delta; /* acceptable close offset (in sectors) */ sector_t s; @@ -813,7 +628,7 @@ * * If this task has queued some other IO, do not enter enticipation. 
*/ -static int as_can_break_anticipation(struct as_data *ad, struct as_rq *arq) +static int as_can_break_anticipation(struct as_data *ad, struct request *rq) { struct io_context *ioc; struct as_io_context *aic; @@ -821,7 +636,7 @@ ioc = ad->io_context; BUG_ON(!ioc); - if (arq && ioc == arq->io_context) { + if (rq && ioc == RQ_IOC(rq)) { /* request from same process */ return 1; } @@ -848,7 +663,7 @@ return 1; } - if (arq && arq->is_sync == REQ_SYNC && as_close_req(ad, aic, arq)) { + if (rq && rq_is_sync(rq) && as_close_req(ad, aic, rq)) { /* * Found a close request that is not one of ours. * @@ -864,7 +679,7 @@ ad->exit_no_coop = (7*ad->exit_no_coop)/8; } - as_update_iohist(ad, aic, arq->request); + as_update_iohist(ad, aic, rq); return 1; } @@ -891,10 +706,10 @@ } /* - * as_can_anticipate indicates whether we should either run arq + * as_can_anticipate indicates whether we should either run rq * or keep anticipating a better request. */ -static int as_can_anticipate(struct as_data *ad, struct as_rq *arq) +static int as_can_anticipate(struct as_data *ad, struct request *rq) { if (!ad->io_context) /* @@ -908,7 +723,7 @@ */ return 0; - if (as_can_break_anticipation(ad, arq)) + if (as_can_break_anticipation(ad, rq)) /* * This request is a good candidate. Don't keep anticipating, * run it. @@ -926,16 +741,16 @@ } /* - * as_update_arq must be called whenever a request (arq) is added to + * as_update_rq must be called whenever a request (rq) is added to * the sort_list. 
This function keeps caches up to date, and checks if the * request might be one we are "anticipating" */ -static void as_update_arq(struct as_data *ad, struct as_rq *arq) +static void as_update_rq(struct as_data *ad, struct request *rq) { - const int data_dir = arq->is_sync; + const int data_dir = rq_is_sync(rq); - /* keep the next_arq cache up to date */ - ad->next_arq[data_dir] = as_choose_req(ad, arq, ad->next_arq[data_dir]); + /* keep the next_rq cache up to date */ + ad->next_rq[data_dir] = as_choose_req(ad, rq, ad->next_rq[data_dir]); /* * have we been anticipating this request? @@ -944,7 +759,7 @@ */ if (ad->antic_status == ANTIC_WAIT_REQ || ad->antic_status == ANTIC_WAIT_NEXT) { - if (as_can_break_anticipation(ad, arq)) + if (as_can_break_anticipation(ad, rq)) as_antic_stop(ad); } } @@ -984,12 +799,11 @@ static void as_completed_request(request_queue_t *q, struct request *rq) { struct as_data *ad = q->elevator->elevator_data; - struct as_rq *arq = RQ_DATA(rq); WARN_ON(!list_empty(&rq->queuelist)); - if (arq->state != AS_RQ_REMOVED) { - printk("arq->state %d\n", arq->state); + if (RQ_STATE(rq) != AS_RQ_REMOVED) { + printk("rq->state %d\n", RQ_STATE(rq)); WARN_ON(1); goto out; } @@ -1009,14 +823,14 @@ * actually serviced. 
This should help devices with big TCQ windows * and writeback caches */ - if (ad->new_batch && ad->batch_data_dir == arq->is_sync) { + if (ad->new_batch && ad->batch_data_dir == rq_is_sync(rq)) { update_write_batch(ad); ad->current_batch_expires = jiffies + ad->batch_expire[REQ_SYNC]; ad->new_batch = 0; } - if (ad->io_context == arq->io_context && ad->io_context) { + if (ad->io_context == RQ_IOC(rq) && ad->io_context) { ad->antic_start = jiffies; ad->ioc_finished = 1; if (ad->antic_status == ANTIC_WAIT_REQ) { @@ -1028,9 +842,9 @@ } } - as_put_io_context(arq); + as_put_io_context(rq); out: - arq->state = AS_RQ_POSTSCHED; + RQ_SET_STATE(rq, AS_RQ_POSTSCHED); } /* @@ -1041,27 +855,27 @@ */ static void as_remove_queued_request(request_queue_t *q, struct request *rq) { - struct as_rq *arq = RQ_DATA(rq); - const int data_dir = arq->is_sync; + const int data_dir = rq_is_sync(rq); struct as_data *ad = q->elevator->elevator_data; + struct io_context *ioc; - WARN_ON(arq->state != AS_RQ_QUEUED); + WARN_ON(RQ_STATE(rq) != AS_RQ_QUEUED); - if (arq->io_context && arq->io_context->aic) { - BUG_ON(!atomic_read(&arq->io_context->aic->nr_queued)); - atomic_dec(&arq->io_context->aic->nr_queued); + ioc = RQ_IOC(rq); + if (ioc && ioc->aic) { + BUG_ON(!atomic_read(&ioc->aic->nr_queued)); + atomic_dec(&ioc->aic->nr_queued); } /* - * Update the "next_arq" cache if we are about to remove its + * Update the "next_rq" cache if we are about to remove its * entry */ - if (ad->next_arq[data_dir] == arq) - ad->next_arq[data_dir] = as_find_next_arq(ad, arq); + if (ad->next_rq[data_dir] == rq) + ad->next_rq[data_dir] = as_find_next_rq(ad, rq); - list_del_init(&arq->fifo); - as_del_arq_hash(arq); - as_del_arq_rb(ad, arq); + rq_fifo_clear(rq); + as_del_rq_rb(ad, rq); } /* @@ -1074,7 +888,7 @@ */ static int as_fifo_expired(struct as_data *ad, int adir) { - struct as_rq *arq; + struct request *rq; long delta_jif; delta_jif = jiffies - ad->last_check_fifo[adir]; @@ -1088,9 +902,9 @@ if 
(list_empty(&ad->fifo_list[adir])) return 0; - arq = list_entry_fifo(ad->fifo_list[adir].next); + rq = rq_entry_fifo(ad->fifo_list[adir].next); - return time_after(jiffies, arq->expires); + return time_after(jiffies, rq_fifo_time(rq)); } /* @@ -1113,25 +927,25 @@ /* * move an entry to dispatch queue */ -static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq) +static void as_move_to_dispatch(struct as_data *ad, struct request *rq) { - struct request *rq = arq->request; - const int data_dir = arq->is_sync; + const int data_dir = rq_is_sync(rq); - BUG_ON(!RB_EMPTY_NODE(&arq->rb_node)); + BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); as_antic_stop(ad); ad->antic_status = ANTIC_OFF; /* * This has to be set in order to be correctly updated by - * as_find_next_arq + * as_find_next_rq */ ad->last_sector[data_dir] = rq->sector + rq->nr_sectors; if (data_dir == REQ_SYNC) { + struct io_context *ioc = RQ_IOC(rq); /* In case we have to anticipate after this */ - copy_io_context(&ad->io_context, &arq->io_context); + copy_io_context(&ad->io_context, &ioc); } else { if (ad->io_context) { put_io_context(ad->io_context); @@ -1143,19 +957,19 @@ } ad->ioc_finished = 0; - ad->next_arq[data_dir] = as_find_next_arq(ad, arq); + ad->next_rq[data_dir] = as_find_next_rq(ad, rq); /* * take it off the sort and fifo list, add to dispatch queue */ as_remove_queued_request(ad->q, rq); - WARN_ON(arq->state != AS_RQ_QUEUED); + WARN_ON(RQ_STATE(rq) != AS_RQ_QUEUED); elv_dispatch_sort(ad->q, rq); - arq->state = AS_RQ_DISPATCHED; - if (arq->io_context && arq->io_context->aic) - atomic_inc(&arq->io_context->aic->nr_dispatched); + RQ_SET_STATE(rq, AS_RQ_DISPATCHED); + if (RQ_IOC(rq) && RQ_IOC(rq)->aic) + atomic_inc(&RQ_IOC(rq)->aic->nr_dispatched); ad->nr_dispatched++; } @@ -1167,9 +981,9 @@ static int as_dispatch_request(request_queue_t *q, int force) { struct as_data *ad = q->elevator->elevator_data; - struct as_rq *arq; const int reads = !list_empty(&ad->fifo_list[REQ_SYNC]); const int writes = 
!list_empty(&ad->fifo_list[REQ_ASYNC]); + struct request *rq; if (unlikely(force)) { /* @@ -1185,14 +999,14 @@ ad->changed_batch = 0; ad->new_batch = 0; - while (ad->next_arq[REQ_SYNC]) { - as_move_to_dispatch(ad, ad->next_arq[REQ_SYNC]); + while (ad->next_rq[REQ_SYNC]) { + as_move_to_dispatch(ad, ad->next_rq[REQ_SYNC]); dispatched++; } ad->last_check_fifo[REQ_SYNC] = jiffies; - while (ad->next_arq[REQ_ASYNC]) { - as_move_to_dispatch(ad, ad->next_arq[REQ_ASYNC]); + while (ad->next_rq[REQ_ASYNC]) { + as_move_to_dispatch(ad, ad->next_rq[REQ_ASYNC]); dispatched++; } ad->last_check_fifo[REQ_ASYNC] = jiffies; @@ -1216,19 +1030,19 @@ /* * batch is still running or no reads or no writes */ - arq = ad->next_arq[ad->batch_data_dir]; + rq = ad->next_rq[ad->batch_data_dir]; if (ad->batch_data_dir == REQ_SYNC && ad->antic_expire) { if (as_fifo_expired(ad, REQ_SYNC)) goto fifo_expired; - if (as_can_anticipate(ad, arq)) { + if (as_can_anticipate(ad, rq)) { as_antic_waitreq(ad); return 0; } } - if (arq) { + if (rq) { /* we have a "next request" */ if (reads && !writes) ad->current_batch_expires = @@ -1256,7 +1070,7 @@ ad->changed_batch = 1; } ad->batch_data_dir = REQ_SYNC; - arq = list_entry_fifo(ad->fifo_list[ad->batch_data_dir].next); + rq = rq_entry_fifo(ad->fifo_list[REQ_SYNC].next); ad->last_check_fifo[ad->batch_data_dir] = jiffies; goto dispatch_request; } @@ -1282,7 +1096,7 @@ ad->batch_data_dir = REQ_ASYNC; ad->current_write_count = ad->write_batch_count; ad->write_batch_idled = 0; - arq = ad->next_arq[ad->batch_data_dir]; + rq = ad->next_rq[ad->batch_data_dir]; goto dispatch_request; } @@ -1296,8 +1110,7 @@ if (as_fifo_expired(ad, ad->batch_data_dir)) { fifo_expired: - arq = list_entry_fifo(ad->fifo_list[ad->batch_data_dir].next); - BUG_ON(arq == NULL); + rq = rq_entry_fifo(ad->fifo_list[ad->batch_data_dir].next); } if (ad->changed_batch) { @@ -1316,70 +1129,58 @@ } /* - * arq is the selected appropriate request. + * rq is the selected appropriate request. 
*/ - as_move_to_dispatch(ad, arq); + as_move_to_dispatch(ad, rq); return 1; } /* - * add arq to rbtree and fifo + * add rq to rbtree and fifo */ static void as_add_request(request_queue_t *q, struct request *rq) { struct as_data *ad = q->elevator->elevator_data; - struct as_rq *arq = RQ_DATA(rq); int data_dir; - arq->state = AS_RQ_NEW; + RQ_SET_STATE(rq, AS_RQ_NEW); - if (rq_data_dir(arq->request) == READ - || (arq->request->flags & REQ_RW_SYNC)) - arq->is_sync = 1; - else - arq->is_sync = 0; - data_dir = arq->is_sync; + data_dir = rq_is_sync(rq); - arq->io_context = as_get_io_context(); + rq->elevator_private = as_get_io_context(q->node); - if (arq->io_context) { - as_update_iohist(ad, arq->io_context->aic, arq->request); - atomic_inc(&arq->io_context->aic->nr_queued); + if (RQ_IOC(rq)) { + as_update_iohist(ad, RQ_IOC(rq)->aic, rq); + atomic_inc(&RQ_IOC(rq)->aic->nr_queued); } - as_add_arq_rb(ad, arq); - if (rq_mergeable(arq->request)) - as_add_arq_hash(ad, arq); + as_add_rq_rb(ad, rq); /* * set expire time (only used for reads) and add to fifo list */ - arq->expires = jiffies + ad->fifo_expire[data_dir]; - list_add_tail(&arq->fifo, &ad->fifo_list[data_dir]); + rq_set_fifo_time(rq, jiffies + ad->fifo_expire[data_dir]); + list_add_tail(&rq->queuelist, &ad->fifo_list[data_dir]); - as_update_arq(ad, arq); /* keep state machine up to date */ - arq->state = AS_RQ_QUEUED; + as_update_rq(ad, rq); /* keep state machine up to date */ + RQ_SET_STATE(rq, AS_RQ_QUEUED); } static void as_activate_request(request_queue_t *q, struct request *rq) { - struct as_rq *arq = RQ_DATA(rq); - - WARN_ON(arq->state != AS_RQ_DISPATCHED); - arq->state = AS_RQ_REMOVED; - if (arq->io_context && arq->io_context->aic) - atomic_dec(&arq->io_context->aic->nr_dispatched); + WARN_ON(RQ_STATE(rq) != AS_RQ_DISPATCHED); + RQ_SET_STATE(rq, AS_RQ_REMOVED); + if (RQ_IOC(rq) && RQ_IOC(rq)->aic) + atomic_dec(&RQ_IOC(rq)->aic->nr_dispatched); } static void as_deactivate_request(request_queue_t *q, struct 
request *rq) { - struct as_rq *arq = RQ_DATA(rq); - - WARN_ON(arq->state != AS_RQ_REMOVED); - arq->state = AS_RQ_DISPATCHED; - if (arq->io_context && arq->io_context->aic) - atomic_inc(&arq->io_context->aic->nr_dispatched); + WARN_ON(RQ_STATE(rq) != AS_RQ_REMOVED); + RQ_SET_STATE(rq, AS_RQ_DISPATCHED); + if (RQ_IOC(rq) && RQ_IOC(rq)->aic) + atomic_inc(&RQ_IOC(rq)->aic->nr_dispatched); } /* @@ -1396,93 +1197,35 @@ && list_empty(&ad->fifo_list[REQ_SYNC]); } -static struct request *as_former_request(request_queue_t *q, - struct request *rq) -{ - struct as_rq *arq = RQ_DATA(rq); - struct rb_node *rbprev = rb_prev(&arq->rb_node); - struct request *ret = NULL; - - if (rbprev) - ret = rb_entry_arq(rbprev)->request; - - return ret; -} - -static struct request *as_latter_request(request_queue_t *q, - struct request *rq) -{ - struct as_rq *arq = RQ_DATA(rq); - struct rb_node *rbnext = rb_next(&arq->rb_node); - struct request *ret = NULL; - - if (rbnext) - ret = rb_entry_arq(rbnext)->request; - - return ret; -} - static int as_merge(request_queue_t *q, struct request **req, struct bio *bio) { struct as_data *ad = q->elevator->elevator_data; sector_t rb_key = bio->bi_sector + bio_sectors(bio); struct request *__rq; - int ret; - - /* - * see if the merge hash can satisfy a back merge - */ - __rq = as_find_arq_hash(ad, bio->bi_sector); - if (__rq) { - BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector); - - if (elv_rq_merge_ok(__rq, bio)) { - ret = ELEVATOR_BACK_MERGE; - goto out; - } - } /* * check for front merge */ - __rq = as_find_arq_rb(ad, rb_key, bio_data_dir(bio)); - if (__rq) { - BUG_ON(rb_key != rq_rb_key(__rq)); - - if (elv_rq_merge_ok(__rq, bio)) { - ret = ELEVATOR_FRONT_MERGE; - goto out; - } + __rq = elv_rb_find(&ad->sort_list[bio_data_dir(bio)], rb_key); + if (__rq && elv_rq_merge_ok(__rq, bio)) { + *req = __rq; + return ELEVATOR_FRONT_MERGE; } return ELEVATOR_NO_MERGE; -out: - if (ret) { - if (rq_mergeable(__rq)) - as_hot_arq_hash(ad, RQ_DATA(__rq)); - } - 
*req = __rq; - return ret; } -static void as_merged_request(request_queue_t *q, struct request *req) +static void as_merged_request(request_queue_t *q, struct request *req, int type) { struct as_data *ad = q->elevator->elevator_data; - struct as_rq *arq = RQ_DATA(req); - - /* - * hash always needs to be repositioned, key is end sector - */ - as_del_arq_hash(arq); - as_add_arq_hash(ad, arq); /* * if the merge was a front merge, we need to reposition request */ - if (rq_rb_key(req) != arq->rb_key) { - as_del_arq_rb(ad, arq); - as_add_arq_rb(ad, arq); + if (type == ELEVATOR_FRONT_MERGE) { + as_del_rq_rb(ad, req); + as_add_rq_rb(ad, req); /* * Note! At this stage of this and the next function, our next * request may not be optimal - eg the request may have "grown" @@ -1494,38 +1237,22 @@ static void as_merged_requests(request_queue_t *q, struct request *req, struct request *next) { - struct as_data *ad = q->elevator->elevator_data; - struct as_rq *arq = RQ_DATA(req); - struct as_rq *anext = RQ_DATA(next); - - BUG_ON(!arq); - BUG_ON(!anext); - /* - * reposition arq (this is the merged request) in hash, and in rbtree - * in case of a front merge + * if next expires before rq, assign its expire time to arq + * and move into next position (next will be deleted) in fifo */ - as_del_arq_hash(arq); - as_add_arq_hash(ad, arq); + if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) { + if (time_before(rq_fifo_time(next), rq_fifo_time(req))) { + struct io_context *rioc = RQ_IOC(req); + struct io_context *nioc = RQ_IOC(next); - if (rq_rb_key(req) != arq->rb_key) { - as_del_arq_rb(ad, arq); - as_add_arq_rb(ad, arq); - } - - /* - * if anext expires before arq, assign its expire time to arq - * and move into anext position (anext will be deleted) in fifo - */ - if (!list_empty(&arq->fifo) && !list_empty(&anext->fifo)) { - if (time_before(anext->expires, arq->expires)) { - list_move(&arq->fifo, &anext->fifo); - arq->expires = anext->expires; + list_move(&req->queuelist, 
&next->queuelist); + rq_set_fifo_time(req, rq_fifo_time(next)); /* * Don't copy here but swap, because when anext is * removed below, it must contain the unused context */ - swap_io_context(&arq->io_context, &anext->io_context); + swap_io_context(&rioc, &nioc); } } @@ -1533,9 +1260,9 @@ * kill knowledge of next, this one is a goner */ as_remove_queued_request(q, next); - as_put_io_context(anext); + as_put_io_context(next); - anext->state = AS_RQ_MERGED; + RQ_SET_STATE(next, AS_RQ_MERGED); } /* @@ -1553,61 +1280,18 @@ unsigned long flags; spin_lock_irqsave(q->queue_lock, flags); - if (!as_queue_empty(q)) - q->request_fn(q); + blk_start_queueing(q); spin_unlock_irqrestore(q->queue_lock, flags); } -static void as_put_request(request_queue_t *q, struct request *rq) -{ - struct as_data *ad = q->elevator->elevator_data; - struct as_rq *arq = RQ_DATA(rq); - - if (!arq) { - WARN_ON(1); - return; - } - - if (unlikely(arq->state != AS_RQ_POSTSCHED && - arq->state != AS_RQ_PRESCHED && - arq->state != AS_RQ_MERGED)) { - printk("arq->state %d\n", arq->state); - WARN_ON(1); - } - - mempool_free(arq, ad->arq_pool); - rq->elevator_private = NULL; -} - -static int as_set_request(request_queue_t *q, struct request *rq, - struct bio *bio, gfp_t gfp_mask) -{ - struct as_data *ad = q->elevator->elevator_data; - struct as_rq *arq = mempool_alloc(ad->arq_pool, gfp_mask); - - if (arq) { - memset(arq, 0, sizeof(*arq)); - RB_CLEAR_NODE(&arq->rb_node); - arq->request = rq; - arq->state = AS_RQ_PRESCHED; - arq->io_context = NULL; - INIT_HLIST_NODE(&arq->hash); - INIT_LIST_HEAD(&arq->fifo); - rq->elevator_private = arq; - return 0; - } - - return 1; -} - -static int as_may_queue(request_queue_t *q, int rw, struct bio *bio) +static int as_may_queue(request_queue_t *q, int rw) { int ret = ELV_MQUEUE_MAY; struct as_data *ad = q->elevator->elevator_data; struct io_context *ioc; if (ad->antic_status == ANTIC_WAIT_REQ || ad->antic_status == ANTIC_WAIT_NEXT) { - ioc = as_get_io_context(); + ioc = 
as_get_io_context(q->node); if (ad->io_context == ioc) ret = ELV_MQUEUE_MUST; put_io_context(ioc); @@ -1626,23 +1310,16 @@ BUG_ON(!list_empty(&ad->fifo_list[REQ_SYNC])); BUG_ON(!list_empty(&ad->fifo_list[REQ_ASYNC])); - mempool_destroy(ad->arq_pool); put_io_context(ad->io_context); - kfree(ad->hash); kfree(ad); } /* - * initialize elevator private data (as_data), and alloc a arq for - * each request on the free lists + * initialize elevator private data (as_data). */ static void *as_init_queue(request_queue_t *q, elevator_t *e) { struct as_data *ad; - int i; - - if (!arq_pool) - return NULL; ad = kmalloc_node(sizeof(*ad), GFP_KERNEL, q->node); if (!ad) @@ -1651,30 +1328,12 @@ ad->q = q; /* Identify what queue the data belongs to */ - ad->hash = kmalloc_node(sizeof(struct hlist_head)*AS_HASH_ENTRIES, - GFP_KERNEL, q->node); - if (!ad->hash) { - kfree(ad); - return NULL; - } - - ad->arq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, - mempool_free_slab, arq_pool, q->node); - if (!ad->arq_pool) { - kfree(ad->hash); - kfree(ad); - return NULL; - } - /* anticipatory scheduling helpers */ ad->antic_timer.function = as_antic_timeout; ad->antic_timer.data = (unsigned long)q; init_timer(&ad->antic_timer); INIT_WORK(&ad->antic_work, as_work_handler, q); - for (i = 0; i < AS_HASH_ENTRIES; i++) - INIT_HLIST_HEAD(&ad->hash[i]); - INIT_LIST_HEAD(&ad->fifo_list[REQ_SYNC]); INIT_LIST_HEAD(&ad->fifo_list[REQ_ASYNC]); ad->sort_list[REQ_SYNC] = RB_ROOT; @@ -1787,10 +1446,8 @@ .elevator_deactivate_req_fn = as_deactivate_request, .elevator_queue_empty_fn = as_queue_empty, .elevator_completed_req_fn = as_completed_request, - .elevator_former_req_fn = as_former_request, - .elevator_latter_req_fn = as_latter_request, - .elevator_set_req_fn = as_set_request, - .elevator_put_req_fn = as_put_request, + .elevator_former_req_fn = elv_rb_former_request, + .elevator_latter_req_fn = elv_rb_latter_request, .elevator_may_queue_fn = as_may_queue, .elevator_init_fn = as_init_queue, 
.elevator_exit_fn = as_exit_queue, @@ -1806,11 +1463,6 @@ { int ret; - arq_pool = kmem_cache_create("as_arq", sizeof(struct as_rq), - 0, 0, NULL, NULL); - if (!arq_pool) - return -ENOMEM; - ret = elv_register(&iosched_as); if (!ret) { /* @@ -1822,7 +1474,6 @@ return 0; } - kmem_cache_destroy(arq_pool); return ret; } @@ -1833,10 +1484,9 @@ ioc_gone = &all_gone; /* ioc_gone's update must be visible before reading ioc_count */ smp_wmb(); - if (atomic_read(&ioc_count)) + if (elv_ioc_count_read(ioc_count)) wait_for_completion(ioc_gone); synchronize_rcu(); - kmem_cache_destroy(arq_pool); } module_init(as_init); diff -urN oldtree/block/blktrace.c newtree/block/blktrace.c --- oldtree/block/blktrace.c 2006-08-02 07:14:14.000000000 -0700 +++ newtree/block/blktrace.c 2006-08-03 13:28:27.000000000 -0700 @@ -69,7 +69,7 @@ /* * Bio action bits of interest */ -static u32 bio_act[5] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC), 0, BLK_TC_ACT(BLK_TC_AHEAD) }; +static u32 bio_act[9] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC), 0, BLK_TC_ACT(BLK_TC_AHEAD), 0, 0, 0, BLK_TC_ACT(BLK_TC_META) }; /* * More could be added as needed, taking care to increment the decrementer @@ -80,7 +80,9 @@ #define trace_sync_bit(rw) \ (((rw) & (1 << BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1)) #define trace_ahead_bit(rw) \ - (((rw) & (1 << BIO_RW_AHEAD)) << (2 - BIO_RW_AHEAD)) + (((rw) & (1 << BIO_RW_AHEAD)) << (2 - BIO_RW_AHEAD)) +#define trace_meta_bit(rw) \ + (((rw) & (1 << BIO_RW_META)) >> (BIO_RW_META - 3)) /* * The worker for the various blk_add_trace*() types. 
Fills out a @@ -103,6 +105,7 @@ what |= bio_act[trace_barrier_bit(rw)]; what |= bio_act[trace_sync_bit(rw)]; what |= bio_act[trace_ahead_bit(rw)]; + what |= bio_act[trace_meta_bit(rw)]; pid = tsk->pid; if (unlikely(act_log_check(bt, what, sector, pid))) diff -urN oldtree/block/cfq-iosched.c newtree/block/cfq-iosched.c --- oldtree/block/cfq-iosched.c 2006-08-02 07:14:14.000000000 -0700 +++ newtree/block/cfq-iosched.c 2006-08-03 13:30:15.000000000 -0700 @@ -17,7 +17,6 @@ * tunables */ static const int cfq_quantum = 4; /* max queue in one round of service */ -static const int cfq_queued = 8; /* minimum rq allocate limit per-queue*/ static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; static const int cfq_back_max = 16 * 1024; /* maximum backwards seek, in KiB */ static const int cfq_back_penalty = 2; /* penalty of a backwards seek */ @@ -32,8 +31,6 @@ #define CFQ_KEY_ASYNC (0) -static DEFINE_SPINLOCK(cfq_exit_lock); - /* * for the hash of cfqq inside the cfqd */ @@ -41,37 +38,19 @@ #define CFQ_QHASH_ENTRIES (1 << CFQ_QHASH_SHIFT) #define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash) -/* - * for the hash of crq inside the cfqq - */ -#define CFQ_MHASH_SHIFT 6 -#define CFQ_MHASH_BLOCK(sec) ((sec) >> 3) -#define CFQ_MHASH_ENTRIES (1 << CFQ_MHASH_SHIFT) -#define CFQ_MHASH_FN(sec) hash_long(CFQ_MHASH_BLOCK(sec), CFQ_MHASH_SHIFT) -#define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) -#define list_entry_hash(ptr) hlist_entry((ptr), struct cfq_rq, hash) - #define list_entry_cfqq(ptr) list_entry((ptr), struct cfq_queue, cfq_list) -#define list_entry_fifo(ptr) list_entry((ptr), struct request, queuelist) -#define RQ_DATA(rq) (rq)->elevator_private +#define RQ_CIC(rq) ((struct cfq_io_context*)(rq)->elevator_private) +#define RQ_CFQQ(rq) ((rq)->elevator_private2) -/* - * rb-tree defines - */ -#define rb_entry_crq(node) rb_entry((node), struct cfq_rq, rb_node) -#define rq_rb_key(rq) (rq)->sector - -static kmem_cache_t *crq_pool; static 
kmem_cache_t *cfq_pool; static kmem_cache_t *cfq_ioc_pool; -static atomic_t ioc_count = ATOMIC_INIT(0); +static DEFINE_PER_CPU(unsigned long, ioc_count); static struct completion *ioc_gone; #define CFQ_PRIO_LISTS IOPRIO_BE_NR #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) -#define cfq_class_be(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_BE) #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) #define ASYNC (0) @@ -103,29 +82,14 @@ unsigned int busy_queues; /* - * non-ordered list of empty cfqq's - */ - struct list_head empty_list; - - /* * cfqq lookup hash */ struct hlist_head *cfq_hash; - /* - * global crq hash for all queues - */ - struct hlist_head *crq_hash; - - mempool_t *crq_pool; - int rq_in_driver; int hw_tag; /* - * schedule slice state info - */ - /* * idle window management */ struct timer_list idle_slice_timer; @@ -141,13 +105,10 @@ sector_t last_sector; unsigned long last_end_request; - unsigned int rq_starved; - /* * tunables, see top of file */ unsigned int cfq_quantum; - unsigned int cfq_queued; unsigned int cfq_fifo_expire[2]; unsigned int cfq_back_penalty; unsigned int cfq_back_max; @@ -170,23 +131,24 @@ struct hlist_node cfq_hash; /* hash key */ unsigned int key; - /* on either rr or empty list of cfqd */ + /* member of the rr/busy/cur/idle cfqd list */ struct list_head cfq_list; /* sorted list of pending requests */ struct rb_root sort_list; /* if fifo isn't expired, next request to serve */ - struct cfq_rq *next_crq; + struct request *next_rq; /* requests queued in sort_list */ int queued[2]; /* currently allocated requests */ int allocated[2]; + /* pending metadata requests */ + int meta_pending; /* fifo list of requests in sort_list */ struct list_head fifo; unsigned long slice_start; unsigned long slice_end; unsigned long slice_left; - unsigned long service_last; /* number of requests that are on the dispatch list */ int on_dispatch[2]; @@ -199,18 +161,6 @@ unsigned int flags; }; -struct cfq_rq { - 
struct rb_node rb_node; - sector_t rb_key; - struct request *request; - struct hlist_node hash; - - struct cfq_queue *cfq_queue; - struct cfq_io_context *io_context; - - unsigned int crq_flags; -}; - enum cfqq_state_flags { CFQ_CFQQ_FLAG_on_rr = 0, CFQ_CFQQ_FLAG_wait_request, @@ -220,6 +170,7 @@ CFQ_CFQQ_FLAG_fifo_expire, CFQ_CFQQ_FLAG_idle_window, CFQ_CFQQ_FLAG_prio_changed, + CFQ_CFQQ_FLAG_queue_new, }; #define CFQ_CFQQ_FNS(name) \ @@ -244,70 +195,14 @@ CFQ_CFQQ_FNS(fifo_expire); CFQ_CFQQ_FNS(idle_window); CFQ_CFQQ_FNS(prio_changed); +CFQ_CFQQ_FNS(queue_new); #undef CFQ_CFQQ_FNS -enum cfq_rq_state_flags { - CFQ_CRQ_FLAG_is_sync = 0, -}; - -#define CFQ_CRQ_FNS(name) \ -static inline void cfq_mark_crq_##name(struct cfq_rq *crq) \ -{ \ - crq->crq_flags |= (1 << CFQ_CRQ_FLAG_##name); \ -} \ -static inline void cfq_clear_crq_##name(struct cfq_rq *crq) \ -{ \ - crq->crq_flags &= ~(1 << CFQ_CRQ_FLAG_##name); \ -} \ -static inline int cfq_crq_##name(const struct cfq_rq *crq) \ -{ \ - return (crq->crq_flags & (1 << CFQ_CRQ_FLAG_##name)) != 0; \ -} - -CFQ_CRQ_FNS(is_sync); -#undef CFQ_CRQ_FNS - static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short); -static void cfq_dispatch_insert(request_queue_t *, struct cfq_rq *); +static void cfq_dispatch_insert(request_queue_t *, struct request *); static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask); /* - * lots of deadline iosched dupes, can be abstracted later... 
- */ -static inline void cfq_del_crq_hash(struct cfq_rq *crq) -{ - hlist_del_init(&crq->hash); -} - -static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq) -{ - const int hash_idx = CFQ_MHASH_FN(rq_hash_key(crq->request)); - - hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]); -} - -static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset) -{ - struct hlist_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)]; - struct hlist_node *entry, *next; - - hlist_for_each_safe(entry, next, hash_list) { - struct cfq_rq *crq = list_entry_hash(entry); - struct request *__rq = crq->request; - - if (!rq_mergeable(__rq)) { - cfq_del_crq_hash(crq); - continue; - } - - if (rq_hash_key(__rq) == offset) - return __rq; - } - - return NULL; -} - -/* * scheduler run of queue, if there are requests pending and no one in the * driver that will restart queueing */ @@ -333,12 +228,12 @@ } /* - * Lifted from AS - choose which of crq1 and crq2 that is best served now. + * Lifted from AS - choose which of rq1 and rq2 that is best served now. * We choose the request that is closest to the head right now. Distance * behind the head is penalized and only allowed to a certain extent. */ -static struct cfq_rq * -cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2) +static struct request * +cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2) { sector_t last, s1, s2, d1 = 0, d2 = 0; unsigned long back_max; @@ -346,18 +241,22 @@ #define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ unsigned wrap = 0; /* bit mask: requests behind the disk head? 
*/ - if (crq1 == NULL || crq1 == crq2) - return crq2; - if (crq2 == NULL) - return crq1; - - if (cfq_crq_is_sync(crq1) && !cfq_crq_is_sync(crq2)) - return crq1; - else if (cfq_crq_is_sync(crq2) && !cfq_crq_is_sync(crq1)) - return crq2; + if (rq1 == NULL || rq1 == rq2) + return rq2; + if (rq2 == NULL) + return rq1; + + if (rq_is_sync(rq1) && !rq_is_sync(rq2)) + return rq1; + else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) + return rq2; + if (rq_is_meta(rq1) && !rq_is_meta(rq2)) + return rq1; + else if (rq_is_meta(rq2) && !rq_is_meta(rq1)) + return rq2; - s1 = crq1->request->sector; - s2 = crq2->request->sector; + s1 = rq1->sector; + s2 = rq2->sector; last = cfqd->last_sector; @@ -392,23 +291,23 @@ * check two variables for all permutations: --> faster! */ switch (wrap) { - case 0: /* common case for CFQ: crq1 and crq2 not wrapped */ + case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ if (d1 < d2) - return crq1; + return rq1; else if (d2 < d1) - return crq2; + return rq2; else { if (s1 >= s2) - return crq1; + return rq1; else - return crq2; + return rq2; } case CFQ_RQ2_WRAP: - return crq1; + return rq1; case CFQ_RQ1_WRAP: - return crq2; - case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both crqs wrapped */ + return rq2; + case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */ default: /* * Since both rqs are wrapped, @@ -417,50 +316,43 @@ * since back seek takes more time than forward. 
*/ if (s1 <= s2) - return crq1; + return rq1; else - return crq2; + return rq2; } } /* * would be nice to take fifo expire time into account as well */ -static struct cfq_rq * -cfq_find_next_crq(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct cfq_rq *last) +static struct request * +cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, + struct request *last) { - struct cfq_rq *crq_next = NULL, *crq_prev = NULL; - struct rb_node *rbnext, *rbprev; + struct rb_node *rbnext = rb_next(&last->rb_node); + struct rb_node *rbprev = rb_prev(&last->rb_node); + struct request *next = NULL, *prev = NULL; - if (!(rbnext = rb_next(&last->rb_node))) { - rbnext = rb_first(&cfqq->sort_list); - if (rbnext == &last->rb_node) - rbnext = NULL; - } - - rbprev = rb_prev(&last->rb_node); + BUG_ON(RB_EMPTY_NODE(&last->rb_node)); if (rbprev) - crq_prev = rb_entry_crq(rbprev); - if (rbnext) - crq_next = rb_entry_crq(rbnext); + prev = rb_entry_rq(rbprev); - return cfq_choose_req(cfqd, crq_next, crq_prev); -} - -static void cfq_update_next_crq(struct cfq_rq *crq) -{ - struct cfq_queue *cfqq = crq->cfq_queue; + if (rbnext) + next = rb_entry_rq(rbnext); + else { + rbnext = rb_first(&cfqq->sort_list); + if (rbnext && rbnext != &last->rb_node) + next = rb_entry_rq(rbnext); + } - if (cfqq->next_crq == crq) - cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq); + return cfq_choose_req(cfqd, next, prev); } static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) { struct cfq_data *cfqd = cfqq->cfqd; - struct list_head *list, *entry; + struct list_head *list; BUG_ON(!cfq_cfqq_on_rr(cfqq)); @@ -485,31 +377,14 @@ } /* - * if queue was preempted, just add to front to be fair. busy_rr - * isn't sorted, but insert at the back for fairness. + * If this queue was preempted or is new (never been serviced), let + * it be added first for fairness. Otherwise, just add to the back + * of the list. 
*/ - if (preempted || list == &cfqd->busy_rr) { - if (preempted) - list = list->prev; - - list_add_tail(&cfqq->cfq_list, list); - return; - } + if (preempted || cfq_cfqq_queue_new(cfqq)) + list = list->prev; - /* - * sort by when queue was last serviced - */ - entry = list; - while ((entry = entry->prev) != list) { - struct cfq_queue *__cfqq = list_entry_cfqq(entry); - - if (!__cfqq->service_last) - break; - if (time_before(__cfqq->service_last, cfqq->service_last)) - break; - } - - list_add(&cfqq->cfq_list, entry); + list_add_tail(&cfqq->cfq_list, list); } /* @@ -531,7 +406,7 @@ { BUG_ON(!cfq_cfqq_on_rr(cfqq)); cfq_clear_cfqq_on_rr(cfqq); - list_move(&cfqq->cfq_list, &cfqd->empty_list); + list_del_init(&cfqq->cfq_list); BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; @@ -540,81 +415,43 @@ /* * rb tree support functions */ -static inline void cfq_del_crq_rb(struct cfq_rq *crq) +static inline void cfq_del_rq_rb(struct request *rq) { - struct cfq_queue *cfqq = crq->cfq_queue; + struct cfq_queue *cfqq = RQ_CFQQ(rq); struct cfq_data *cfqd = cfqq->cfqd; - const int sync = cfq_crq_is_sync(crq); + const int sync = rq_is_sync(rq); BUG_ON(!cfqq->queued[sync]); cfqq->queued[sync]--; - cfq_update_next_crq(crq); - - rb_erase(&crq->rb_node, &cfqq->sort_list); + elv_rb_del(&cfqq->sort_list, rq); if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) cfq_del_cfqq_rr(cfqd, cfqq); } -static struct cfq_rq * -__cfq_add_crq_rb(struct cfq_rq *crq) -{ - struct rb_node **p = &crq->cfq_queue->sort_list.rb_node; - struct rb_node *parent = NULL; - struct cfq_rq *__crq; - - while (*p) { - parent = *p; - __crq = rb_entry_crq(parent); - - if (crq->rb_key < __crq->rb_key) - p = &(*p)->rb_left; - else if (crq->rb_key > __crq->rb_key) - p = &(*p)->rb_right; - else - return __crq; - } - - rb_link_node(&crq->rb_node, parent, p); - return NULL; -} - -static void cfq_add_crq_rb(struct cfq_rq *crq) +static void cfq_add_rq_rb(struct request *rq) { - struct cfq_queue *cfqq = crq->cfq_queue; + 
struct cfq_queue *cfqq = RQ_CFQQ(rq); struct cfq_data *cfqd = cfqq->cfqd; - struct request *rq = crq->request; - struct cfq_rq *__alias; + struct request *__alias; - crq->rb_key = rq_rb_key(rq); - cfqq->queued[cfq_crq_is_sync(crq)]++; + cfqq->queued[rq_is_sync(rq)]++; /* * looks a little odd, but the first insert might return an alias. * if that happens, put the alias on the dispatch list */ - while ((__alias = __cfq_add_crq_rb(crq)) != NULL) + while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL) cfq_dispatch_insert(cfqd->queue, __alias); - - rb_insert_color(&crq->rb_node, &cfqq->sort_list); - - if (!cfq_cfqq_on_rr(cfqq)) - cfq_add_cfqq_rr(cfqd, cfqq); - - /* - * check if this request is a better next-serve candidate - */ - cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq); } static inline void -cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) +cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) { - rb_erase(&crq->rb_node, &cfqq->sort_list); - cfqq->queued[cfq_crq_is_sync(crq)]--; - - cfq_add_crq_rb(crq); + elv_rb_del(&cfqq->sort_list, rq); + cfqq->queued[rq_is_sync(rq)]--; + cfq_add_rq_rb(rq); } static struct request * @@ -623,27 +460,14 @@ struct task_struct *tsk = current; pid_t key = cfq_queue_pid(tsk, bio_data_dir(bio)); struct cfq_queue *cfqq; - struct rb_node *n; - sector_t sector; cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio); - if (!cfqq) - goto out; - - sector = bio->bi_sector + bio_sectors(bio); - n = cfqq->sort_list.rb_node; - while (n) { - struct cfq_rq *crq = rb_entry_crq(n); + if (cfqq) { + sector_t sector = bio->bi_sector + bio_sectors(bio); - if (sector < crq->rb_key) - n = n->rb_left; - else if (sector > crq->rb_key) - n = n->rb_right; - else - return crq->request; + return elv_rb_find(&cfqq->sort_list, sector); } -out: return NULL; } @@ -673,11 +497,18 @@ static void cfq_remove_request(struct request *rq) { - struct cfq_rq *crq = RQ_DATA(rq); + struct cfq_queue *cfqq = RQ_CFQQ(rq); + + if 
(cfqq->next_rq == rq) + cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq); list_del_init(&rq->queuelist); - cfq_del_crq_rb(crq); - cfq_del_crq_hash(crq); + cfq_del_rq_rb(rq); + + if (rq_is_meta(rq)) { + WARN_ON(!cfqq->meta_pending); + cfqq->meta_pending--; + } } static int @@ -685,39 +516,23 @@ { struct cfq_data *cfqd = q->elevator->elevator_data; struct request *__rq; - int ret; - - __rq = cfq_find_rq_hash(cfqd, bio->bi_sector); - if (__rq && elv_rq_merge_ok(__rq, bio)) { - ret = ELEVATOR_BACK_MERGE; - goto out; - } __rq = cfq_find_rq_fmerge(cfqd, bio); if (__rq && elv_rq_merge_ok(__rq, bio)) { - ret = ELEVATOR_FRONT_MERGE; - goto out; + *req = __rq; + return ELEVATOR_FRONT_MERGE; } return ELEVATOR_NO_MERGE; -out: - *req = __rq; - return ret; } -static void cfq_merged_request(request_queue_t *q, struct request *req) +static void cfq_merged_request(request_queue_t *q, struct request *req, + int type) { - struct cfq_data *cfqd = q->elevator->elevator_data; - struct cfq_rq *crq = RQ_DATA(req); - - cfq_del_crq_hash(crq); - cfq_add_crq_hash(cfqd, crq); - - if (rq_rb_key(req) != crq->rb_key) { - struct cfq_queue *cfqq = crq->cfq_queue; + if (type == ELEVATOR_FRONT_MERGE) { + struct cfq_queue *cfqq = RQ_CFQQ(req); - cfq_update_next_crq(crq); - cfq_reposition_crq_rb(cfqq, crq); + cfq_reposition_rq_rb(cfqq, req); } } @@ -725,8 +540,6 @@ cfq_merged_requests(request_queue_t *q, struct request *rq, struct request *next) { - cfq_merged_request(q, rq); - /* * reposition in fifo if next is older than rq */ @@ -768,13 +581,12 @@ if (cfq_cfqq_wait_request(cfqq)) del_timer(&cfqd->idle_slice_timer); - if (!preempted && !cfq_cfqq_dispatched(cfqq)) { - cfqq->service_last = now; + if (!preempted && !cfq_cfqq_dispatched(cfqq)) cfq_schedule_dispatch(cfqd); - } cfq_clear_cfqq_must_dispatch(cfqq); cfq_clear_cfqq_wait_request(cfqq); + cfq_clear_cfqq_queue_new(cfqq); /* * store what was left of this slice, if the queue idled out @@ -868,26 +680,25 @@ { struct cfq_queue *cfqq = NULL; - 
/* - * if current list is non-empty, grab first entry. if it is empty, - * get next prio level and grab first entry then if any are spliced - */ - if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1) + if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1) { + /* + * if current list is non-empty, grab first entry. if it is + * empty, get next prio level and grab first entry then if any + * are spliced + */ cfqq = list_entry_cfqq(cfqd->cur_rr.next); - - /* - * If no new queues are available, check if the busy list has some - * before falling back to idle io. - */ - if (!cfqq && !list_empty(&cfqd->busy_rr)) + } else if (!list_empty(&cfqd->busy_rr)) { + /* + * If no new queues are available, check if the busy list has + * some before falling back to idle io. + */ cfqq = list_entry_cfqq(cfqd->busy_rr.next); - - /* - * if we have idle queues and no rt or be queues had pending - * requests, either allow immediate service if the grace period - * has passed or arm the idle grace timer - */ - if (!cfqq && !list_empty(&cfqd->idle_rr)) { + } else if (!list_empty(&cfqd->idle_rr)) { + /* + * if we have idle queues and no rt or be queues had pending + * requests, either allow immediate service if the grace period + * has passed or arm the idle grace timer + */ unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE; if (time_after_eq(jiffies, end)) @@ -942,16 +753,14 @@ return 1; } -static void cfq_dispatch_insert(request_queue_t *q, struct cfq_rq *crq) +static void cfq_dispatch_insert(request_queue_t *q, struct request *rq) { struct cfq_data *cfqd = q->elevator->elevator_data; - struct cfq_queue *cfqq = crq->cfq_queue; - struct request *rq; + struct cfq_queue *cfqq = RQ_CFQQ(rq); - cfqq->next_crq = cfq_find_next_crq(cfqd, cfqq, crq); - cfq_remove_request(crq->request); - cfqq->on_dispatch[cfq_crq_is_sync(crq)]++; - elv_dispatch_sort(q, crq->request); + cfq_remove_request(rq); + cfqq->on_dispatch[rq_is_sync(rq)]++; + elv_dispatch_sort(q, 
rq); rq = list_entry(q->queue_head.prev, struct request, queuelist); cfqd->last_sector = rq->sector + rq->nr_sectors; @@ -960,24 +769,23 @@ /* * return expired entry, or NULL to just start from scratch in rbtree */ -static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq) +static inline struct request *cfq_check_fifo(struct cfq_queue *cfqq) { struct cfq_data *cfqd = cfqq->cfqd; struct request *rq; - struct cfq_rq *crq; + int fifo; if (cfq_cfqq_fifo_expire(cfqq)) return NULL; + if (list_empty(&cfqq->fifo)) + return NULL; - if (!list_empty(&cfqq->fifo)) { - int fifo = cfq_cfqq_class_sync(cfqq); + fifo = cfq_cfqq_class_sync(cfqq); + rq = rq_entry_fifo(cfqq->fifo.next); - crq = RQ_DATA(list_entry_fifo(cfqq->fifo.next)); - rq = crq->request; - if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) { - cfq_mark_cfqq_fifo_expire(cfqq); - return crq; - } + if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) { + cfq_mark_cfqq_fifo_expire(cfqq); + return rq; } return NULL; @@ -1063,25 +871,25 @@ BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); do { - struct cfq_rq *crq; + struct request *rq; /* * follow expired path, else get first next available */ - if ((crq = cfq_check_fifo(cfqq)) == NULL) - crq = cfqq->next_crq; + if ((rq = cfq_check_fifo(cfqq)) == NULL) + rq = cfqq->next_rq; /* * finally, insert request into driver dispatch list */ - cfq_dispatch_insert(cfqd->queue, crq); + cfq_dispatch_insert(cfqd->queue, rq); cfqd->dispatch_slice++; dispatched++; if (!cfqd->active_cic) { - atomic_inc(&crq->io_context->ioc->refcount); - cfqd->active_cic = crq->io_context; + atomic_inc(&RQ_CIC(rq)->ioc->refcount); + cfqd->active_cic = RQ_CIC(rq); } if (RB_EMPTY_ROOT(&cfqq->sort_list)) @@ -1112,13 +920,12 @@ cfq_forced_dispatch_cfqqs(struct list_head *list) { struct cfq_queue *cfqq, *next; - struct cfq_rq *crq; int dispatched; dispatched = 0; list_for_each_entry_safe(cfqq, next, list, cfq_list) { - while ((crq = cfqq->next_crq)) { - 
cfq_dispatch_insert(cfqq->cfqd->queue, crq); + while (cfqq->next_rq) { + cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq); dispatched++; } BUG_ON(!list_empty(&cfqq->fifo)); @@ -1194,8 +1001,8 @@ } /* - * task holds one reference to the queue, dropped when task exits. each crq - * in-flight on this queue also holds a reference, dropped when crq is freed. + * task holds one reference to the queue, dropped when task exits. each rq + * in-flight on this queue also holds a reference, dropped when rq is freed. * * queue lock must be held here. */ @@ -1223,7 +1030,7 @@ kmem_cache_free(cfq_pool, cfqq); } -static inline struct cfq_queue * +static struct cfq_queue * __cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio, const int hashval) { @@ -1260,62 +1067,63 @@ freed++; } - if (atomic_sub_and_test(freed, &ioc_count) && ioc_gone) + elv_ioc_count_mod(ioc_count, -freed); + + if (ioc_gone && !elv_ioc_count_read(ioc_count)) complete(ioc_gone); } -static void cfq_trim(struct io_context *ioc) +static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - ioc->set_ioprio = NULL; - cfq_free_io_context(ioc); + if (unlikely(cfqq == cfqd->active_queue)) + __cfq_slice_expired(cfqd, cfqq, 0); + + cfq_put_queue(cfqq); } -/* - * Called with interrupts disabled - */ -static void cfq_exit_single_io_context(struct cfq_io_context *cic) +static void __cfq_exit_single_io_context(struct cfq_data *cfqd, + struct cfq_io_context *cic) { - struct cfq_data *cfqd = cic->key; - request_queue_t *q; - - if (!cfqd) - return; - - q = cfqd->queue; - - WARN_ON(!irqs_disabled()); - - spin_lock(q->queue_lock); + list_del_init(&cic->queue_list); + smp_wmb(); + cic->key = NULL; if (cic->cfqq[ASYNC]) { - if (unlikely(cic->cfqq[ASYNC] == cfqd->active_queue)) - __cfq_slice_expired(cfqd, cic->cfqq[ASYNC], 0); - cfq_put_queue(cic->cfqq[ASYNC]); + cfq_exit_cfqq(cfqd, cic->cfqq[ASYNC]); cic->cfqq[ASYNC] = NULL; } if (cic->cfqq[SYNC]) { - if (unlikely(cic->cfqq[SYNC] == 
cfqd->active_queue)) - __cfq_slice_expired(cfqd, cic->cfqq[SYNC], 0); - cfq_put_queue(cic->cfqq[SYNC]); + cfq_exit_cfqq(cfqd, cic->cfqq[SYNC]); cic->cfqq[SYNC] = NULL; } +} - cic->key = NULL; - list_del_init(&cic->queue_list); - spin_unlock(q->queue_lock); + +/* + * Called with interrupts disabled + */ +static void cfq_exit_single_io_context(struct cfq_io_context *cic) +{ + struct cfq_data *cfqd = cic->key; + + if (cfqd) { + request_queue_t *q = cfqd->queue; + + spin_lock_irq(q->queue_lock); + __cfq_exit_single_io_context(cfqd, cic); + spin_unlock_irq(q->queue_lock); + } } static void cfq_exit_io_context(struct io_context *ioc) { struct cfq_io_context *__cic; - unsigned long flags; struct rb_node *n; /* * put the reference this task is holding to the various queues */ - spin_lock_irqsave(&cfq_exit_lock, flags); n = rb_first(&ioc->cic_root); while (n != NULL) { @@ -1324,22 +1132,21 @@ cfq_exit_single_io_context(__cic); n = rb_next(n); } - - spin_unlock_irqrestore(&cfq_exit_lock, flags); } static struct cfq_io_context * cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) { - struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_mask); + struct cfq_io_context *cic; + cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask, cfqd->queue->node); if (cic) { memset(cic, 0, sizeof(*cic)); cic->last_end_request = jiffies; INIT_LIST_HEAD(&cic->queue_list); cic->dtor = cfq_free_io_context; cic->exit = cfq_exit_io_context; - atomic_inc(&ioc_count); + elv_ioc_count_inc(ioc_count); } return cic; @@ -1420,15 +1227,12 @@ spin_unlock(cfqd->queue->queue_lock); } -/* - * callback from sys_ioprio_set, irqs are disabled - */ -static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio) +static void cfq_ioc_set_ioprio(struct io_context *ioc) { struct cfq_io_context *cic; struct rb_node *n; - spin_lock(&cfq_exit_lock); + ioc->ioprio_changed = 0; n = rb_first(&ioc->cic_root); while (n != NULL) { @@ -1437,10 +1241,6 @@ changed_ioprio(cic); n = rb_next(n); } - - 
spin_unlock(&cfq_exit_lock); - - return 0; } static struct cfq_queue * @@ -1460,12 +1260,18 @@ cfqq = new_cfqq; new_cfqq = NULL; } else if (gfp_mask & __GFP_WAIT) { + /* + * Inform the allocator of the fact that we will + * just repeat this allocation if it fails, to allow + * the allocator to do whatever it needs to attempt to + * free memory. + */ spin_unlock_irq(cfqd->queue->queue_lock); - new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask); + new_cfqq = kmem_cache_alloc_node(cfq_pool, gfp_mask|__GFP_NOFAIL, cfqd->queue->node); spin_lock_irq(cfqd->queue->queue_lock); goto retry; } else { - cfqq = kmem_cache_alloc(cfq_pool, gfp_mask); + cfqq = kmem_cache_alloc_node(cfq_pool, gfp_mask, cfqd->queue->node); if (!cfqq) goto out; } @@ -1480,13 +1286,13 @@ hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); atomic_set(&cfqq->ref, 0); cfqq->cfqd = cfqd; - cfqq->service_last = 0; /* * set ->slice_left to allow preemption for a new process */ cfqq->slice_left = 2 * cfqd->cfq_slice_idle; cfq_mark_cfqq_idle_window(cfqq); cfq_mark_cfqq_prio_changed(cfqq); + cfq_mark_cfqq_queue_new(cfqq); cfq_init_prio_data(cfqq); } @@ -1502,12 +1308,10 @@ static void cfq_drop_dead_cic(struct io_context *ioc, struct cfq_io_context *cic) { - spin_lock(&cfq_exit_lock); + WARN_ON(!list_empty(&cic->queue_list)); rb_erase(&cic->rb_node, &ioc->cic_root); - list_del_init(&cic->queue_list); - spin_unlock(&cfq_exit_lock); kmem_cache_free(cfq_ioc_pool, cic); - atomic_dec(&ioc_count); + elv_ioc_count_dec(ioc_count); } static struct cfq_io_context * @@ -1551,7 +1355,6 @@ cic->ioc = ioc; cic->key = cfqd; - ioc->set_ioprio = cfq_ioc_set_ioprio; restart: parent = NULL; p = &ioc->cic_root.rb_node; @@ -1573,11 +1376,12 @@ BUG(); } - spin_lock(&cfq_exit_lock); rb_link_node(&cic->rb_node, parent, p); rb_insert_color(&cic->rb_node, &ioc->cic_root); + + spin_lock_irq(cfqd->queue->queue_lock); list_add(&cic->queue_list, &cfqd->cic_list); - spin_unlock(&cfq_exit_lock); + spin_unlock_irq(cfqd->queue->queue_lock); 
} /* @@ -1593,7 +1397,7 @@ might_sleep_if(gfp_mask & __GFP_WAIT); - ioc = get_io_context(gfp_mask); + ioc = get_io_context(gfp_mask, cfqd->queue->node); if (!ioc) return NULL; @@ -1607,6 +1411,10 @@ cfq_cic_link(cfqd, ioc, cic); out: + smp_read_barrier_depends(); + if (unlikely(ioc->ioprio_changed)) + cfq_ioc_set_ioprio(ioc); + return cic; err: put_io_context(ioc); @@ -1640,15 +1448,15 @@ static void cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic, - struct cfq_rq *crq) + struct request *rq) { sector_t sdist; u64 total; - if (cic->last_request_pos < crq->request->sector) - sdist = crq->request->sector - cic->last_request_pos; + if (cic->last_request_pos < rq->sector) + sdist = rq->sector - cic->last_request_pos; else - sdist = cic->last_request_pos - crq->request->sector; + sdist = cic->last_request_pos - rq->sector; /* * Don't allow the seek distance to get too large from the @@ -1699,7 +1507,7 @@ */ static int cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, - struct cfq_rq *crq) + struct request *rq) { struct cfq_queue *cfqq = cfqd->active_queue; @@ -1718,7 +1526,17 @@ */ if (new_cfqq->slice_left < cfqd->cfq_slice_idle) return 0; - if (cfq_crq_is_sync(crq) && !cfq_cfqq_sync(cfqq)) + /* + * if the new request is sync, but the currently running queue is + * not, let the sync request have priority. + */ + if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) + return 1; + /* + * So both queues are sync. Let the new request get disk time if + * it's a metadata request and the current queue is doing regular IO. 
+ */ + if (rq_is_meta(rq) && !cfqq->meta_pending) return 1; return 0; @@ -1730,47 +1548,45 @@ */ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - struct cfq_queue *__cfqq, *next; - - list_for_each_entry_safe(__cfqq, next, &cfqd->cur_rr, cfq_list) - cfq_resort_rr_list(__cfqq, 1); + cfq_slice_expired(cfqd, 1); if (!cfqq->slice_left) cfqq->slice_left = cfq_prio_to_slice(cfqd, cfqq) / 2; - cfqq->slice_end = cfqq->slice_left + jiffies; - cfq_slice_expired(cfqd, 1); - __cfq_set_active_queue(cfqd, cfqq); -} - -/* - * should really be a ll_rw_blk.c helper - */ -static void cfq_start_queueing(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - request_queue_t *q = cfqd->queue; + /* + * Put the new queue at the front of the current list, + * so we know that it will be selected next. + */ + BUG_ON(!cfq_cfqq_on_rr(cfqq)); + list_move(&cfqq->cfq_list, &cfqd->cur_rr); - if (!blk_queue_plugged(q)) - q->request_fn(q); - else - __generic_unplug_device(q); + cfqq->slice_end = cfqq->slice_left + jiffies; } /* - * Called when a new fs request (crq) is added (to cfqq). Check if there's + * Called when a new fs request (rq) is added (to cfqq). Check if there's * something we should do about it */ static void -cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct cfq_rq *crq) +cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, + struct request *rq) { - struct cfq_io_context *cic = crq->io_context; + struct cfq_io_context *cic = RQ_CIC(rq); + + if (rq_is_meta(rq)) + cfqq->meta_pending++; + + /* + * check if this request is a better next-serve candidate + */ + cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq); + BUG_ON(!cfqq->next_rq); /* * we never wait for an async request and we don't allow preemption * of an async request. so just return early */ - if (!cfq_crq_is_sync(crq)) { + if (!rq_is_sync(rq)) { /* * sync process issued an async request, if it's waiting * then expire it and kick rq handling.
@@ -1778,17 +1594,17 @@ if (cic == cfqd->active_cic && del_timer(&cfqd->idle_slice_timer)) { cfq_slice_expired(cfqd, 0); - cfq_start_queueing(cfqd, cfqq); + blk_start_queueing(cfqd->queue); } return; } cfq_update_io_thinktime(cfqd, cic); - cfq_update_io_seektime(cfqd, cic, crq); + cfq_update_io_seektime(cfqd, cic, rq); cfq_update_idle_window(cfqd, cfqq, cic); cic->last_queue = jiffies; - cic->last_request_pos = crq->request->sector + crq->request->nr_sectors; + cic->last_request_pos = rq->sector + rq->nr_sectors; if (cfqq == cfqd->active_queue) { /* @@ -1799,9 +1615,9 @@ if (cfq_cfqq_wait_request(cfqq)) { cfq_mark_cfqq_must_dispatch(cfqq); del_timer(&cfqd->idle_slice_timer); - cfq_start_queueing(cfqd, cfqq); + blk_start_queueing(cfqd->queue); } - } else if (cfq_should_preempt(cfqd, cfqq, crq)) { + } else if (cfq_should_preempt(cfqd, cfqq, rq)) { /* * not the active queue - expire current slice if it is * idle and has expired it's mean thinktime or this new queue @@ -1809,34 +1625,32 @@ */ cfq_preempt_queue(cfqd, cfqq); cfq_mark_cfqq_must_dispatch(cfqq); - cfq_start_queueing(cfqd, cfqq); + blk_start_queueing(cfqd->queue); } } static void cfq_insert_request(request_queue_t *q, struct request *rq) { struct cfq_data *cfqd = q->elevator->elevator_data; - struct cfq_rq *crq = RQ_DATA(rq); - struct cfq_queue *cfqq = crq->cfq_queue; + struct cfq_queue *cfqq = RQ_CFQQ(rq); cfq_init_prio_data(cfqq); - cfq_add_crq_rb(crq); + cfq_add_rq_rb(rq); - list_add_tail(&rq->queuelist, &cfqq->fifo); + if (!cfq_cfqq_on_rr(cfqq)) + cfq_add_cfqq_rr(cfqd, cfqq); - if (rq_mergeable(rq)) - cfq_add_crq_hash(cfqd, crq); + list_add_tail(&rq->queuelist, &cfqq->fifo); - cfq_crq_enqueued(cfqd, cfqq, crq); + cfq_rq_enqueued(cfqd, cfqq, rq); } static void cfq_completed_request(request_queue_t *q, struct request *rq) { - struct cfq_rq *crq = RQ_DATA(rq); - struct cfq_queue *cfqq = crq->cfq_queue; + struct cfq_queue *cfqq = RQ_CFQQ(rq); struct cfq_data *cfqd = cfqq->cfqd; - const int sync = 
cfq_crq_is_sync(crq); + const int sync = rq_is_sync(rq); unsigned long now; now = jiffies; @@ -1849,15 +1663,11 @@ if (!cfq_class_idle(cfqq)) cfqd->last_end_request = now; - if (!cfq_cfqq_dispatched(cfqq)) { - if (cfq_cfqq_on_rr(cfqq)) { - cfqq->service_last = now; - cfq_resort_rr_list(cfqq, 0); - } - } + if (!cfq_cfqq_dispatched(cfqq) && cfq_cfqq_on_rr(cfqq)) + cfq_resort_rr_list(cfqq, 0); if (sync) - crq->io_context->last_end_request = now; + RQ_CIC(rq)->last_end_request = now; /* * If this is the active queue, check if it needs to be expired, @@ -1873,30 +1683,6 @@ } } -static struct request * -cfq_former_request(request_queue_t *q, struct request *rq) -{ - struct cfq_rq *crq = RQ_DATA(rq); - struct rb_node *rbprev = rb_prev(&crq->rb_node); - - if (rbprev) - return rb_entry_crq(rbprev)->request; - - return NULL; -} - -static struct request * -cfq_latter_request(request_queue_t *q, struct request *rq) -{ - struct cfq_rq *crq = RQ_DATA(rq); - struct rb_node *rbnext = rb_next(&crq->rb_node); - - if (rbnext) - return rb_entry_crq(rbnext)->request; - - return NULL; -} - /* * we temporarily boost lower priority queues if they are holding fs exclusive * resources. 
they are boosted to normal prio (CLASS_BE/4) @@ -1933,9 +1719,7 @@ cfq_resort_rr_list(cfqq, 0); } -static inline int -__cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct task_struct *task, int rw) +static inline int __cfq_may_queue(struct cfq_queue *cfqq) { if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) && !cfq_cfqq_must_alloc_slice(cfqq)) { @@ -1946,7 +1730,7 @@ return ELV_MQUEUE_MAY; } -static int cfq_may_queue(request_queue_t *q, int rw, struct bio *bio) +static int cfq_may_queue(request_queue_t *q, int rw) { struct cfq_data *cfqd = q->elevator->elevator_data; struct task_struct *tsk = current; @@ -1963,48 +1747,30 @@ cfq_init_prio_data(cfqq); cfq_prio_boost(cfqq); - return __cfq_may_queue(cfqd, cfqq, tsk, rw); + return __cfq_may_queue(cfqq); } return ELV_MQUEUE_MAY; } -static void cfq_check_waiters(request_queue_t *q, struct cfq_queue *cfqq) -{ - struct cfq_data *cfqd = q->elevator->elevator_data; - - if (unlikely(cfqd->rq_starved)) { - struct request_list *rl = &q->rq; - - smp_mb(); - if (waitqueue_active(&rl->wait[READ])) - wake_up(&rl->wait[READ]); - if (waitqueue_active(&rl->wait[WRITE])) - wake_up(&rl->wait[WRITE]); - } -} - /* * queue lock held here */ static void cfq_put_request(request_queue_t *q, struct request *rq) { - struct cfq_data *cfqd = q->elevator->elevator_data; - struct cfq_rq *crq = RQ_DATA(rq); + struct cfq_queue *cfqq = RQ_CFQQ(rq); - if (crq) { - struct cfq_queue *cfqq = crq->cfq_queue; + if (cfqq) { const int rw = rq_data_dir(rq); BUG_ON(!cfqq->allocated[rw]); cfqq->allocated[rw]--; - put_io_context(crq->io_context->ioc); + put_io_context(RQ_CIC(rq)->ioc); - mempool_free(crq, cfqd->crq_pool); rq->elevator_private = NULL; + rq->elevator_private2 = NULL; - cfq_check_waiters(q, cfqq); cfq_put_queue(cfqq); } } @@ -2013,8 +1779,7 @@ * Allocate cfq data structures associated with this request. 
*/ static int -cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio, - gfp_t gfp_mask) +cfq_set_request(request_queue_t *q, struct request *rq, gfp_t gfp_mask) { struct cfq_data *cfqd = q->elevator->elevator_data; struct task_struct *tsk = current; @@ -2022,7 +1787,6 @@ const int rw = rq_data_dir(rq); pid_t key = cfq_queue_pid(tsk, rw); struct cfq_queue *cfqq; - struct cfq_rq *crq; unsigned long flags; int is_sync = key != CFQ_KEY_ASYNC; @@ -2046,42 +1810,18 @@ cfqq->allocated[rw]++; cfq_clear_cfqq_must_alloc(cfqq); - cfqd->rq_starved = 0; atomic_inc(&cfqq->ref); - spin_unlock_irqrestore(q->queue_lock, flags); - - crq = mempool_alloc(cfqd->crq_pool, gfp_mask); - if (crq) { - RB_CLEAR_NODE(&crq->rb_node); - crq->rb_key = 0; - crq->request = rq; - INIT_HLIST_NODE(&crq->hash); - crq->cfq_queue = cfqq; - crq->io_context = cic; - if (is_sync) - cfq_mark_crq_is_sync(crq); - else - cfq_clear_crq_is_sync(crq); + spin_unlock_irqrestore(q->queue_lock, flags); - rq->elevator_private = crq; - return 0; - } + rq->elevator_private = cic; + rq->elevator_private2 = cfqq; + return 0; - spin_lock_irqsave(q->queue_lock, flags); - cfqq->allocated[rw]--; - if (!(cfqq->allocated[0] + cfqq->allocated[1])) - cfq_mark_cfqq_must_alloc(cfqq); - cfq_put_queue(cfqq); queue_fail: if (cic) put_io_context(cic->ioc); - /* - * mark us rq allocation starved. we need to kickstart the process - * ourselves if there are no pending requests that can do it for us. 
- * that would be an extremely rare OOM situation - */ - cfqd->rq_starved = 1; + cfq_schedule_dispatch(cfqd); spin_unlock_irqrestore(q->queue_lock, flags); return 1; @@ -2090,27 +1830,10 @@ static void cfq_kick_queue(void *data) { request_queue_t *q = data; - struct cfq_data *cfqd = q->elevator->elevator_data; unsigned long flags; spin_lock_irqsave(q->queue_lock, flags); - - if (cfqd->rq_starved) { - struct request_list *rl = &q->rq; - - /* - * we aren't guaranteed to get a request after this, but we - * have to be opportunistic - */ - smp_mb(); - if (waitqueue_active(&rl->wait[READ])) - wake_up(&rl->wait[READ]); - if (waitqueue_active(&rl->wait[WRITE])) - wake_up(&rl->wait[WRITE]); - } - - blk_remove_plug(q); - q->request_fn(q); + blk_start_queueing(q); spin_unlock_irqrestore(q->queue_lock, flags); } @@ -2193,7 +1916,6 @@ cfq_shutdown_timer_wq(cfqd); - spin_lock(&cfq_exit_lock); spin_lock_irq(q->queue_lock); if (cfqd->active_queue) @@ -2203,25 +1925,14 @@ struct cfq_io_context *cic = list_entry(cfqd->cic_list.next, struct cfq_io_context, queue_list); - if (cic->cfqq[ASYNC]) { - cfq_put_queue(cic->cfqq[ASYNC]); - cic->cfqq[ASYNC] = NULL; - } - if (cic->cfqq[SYNC]) { - cfq_put_queue(cic->cfqq[SYNC]); - cic->cfqq[SYNC] = NULL; - } - cic->key = NULL; - list_del_init(&cic->queue_list); + + __cfq_exit_single_io_context(cfqd, cic); } spin_unlock_irq(q->queue_lock); - spin_unlock(&cfq_exit_lock); cfq_shutdown_timer_wq(cfqd); - mempool_destroy(cfqd->crq_pool); - kfree(cfqd->crq_hash); kfree(cfqd->cfq_hash); kfree(cfqd); } @@ -2231,7 +1942,7 @@ struct cfq_data *cfqd; int i; - cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL); + cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node); if (!cfqd) return NULL; @@ -2243,23 +1954,12 @@ INIT_LIST_HEAD(&cfqd->busy_rr); INIT_LIST_HEAD(&cfqd->cur_rr); INIT_LIST_HEAD(&cfqd->idle_rr); - INIT_LIST_HEAD(&cfqd->empty_list); INIT_LIST_HEAD(&cfqd->cic_list); - cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL); - 
if (!cfqd->crq_hash) - goto out_crqhash; - - cfqd->cfq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL); + cfqd->cfq_hash = kmalloc_node(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL, q->node); if (!cfqd->cfq_hash) - goto out_cfqhash; - - cfqd->crq_pool = mempool_create_slab_pool(BLKDEV_MIN_RQ, crq_pool); - if (!cfqd->crq_pool) - goto out_crqpool; + goto out_free; - for (i = 0; i < CFQ_MHASH_ENTRIES; i++) - INIT_HLIST_HEAD(&cfqd->crq_hash[i]); for (i = 0; i < CFQ_QHASH_ENTRIES; i++) INIT_HLIST_HEAD(&cfqd->cfq_hash[i]); @@ -2275,7 +1975,6 @@ INIT_WORK(&cfqd->unplug_work, cfq_kick_queue, q); - cfqd->cfq_queued = cfq_queued; cfqd->cfq_quantum = cfq_quantum; cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0]; cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1]; @@ -2287,19 +1986,13 @@ cfqd->cfq_slice_idle = cfq_slice_idle; return cfqd; -out_crqpool: - kfree(cfqd->cfq_hash); -out_cfqhash: - kfree(cfqd->crq_hash); -out_crqhash: +out_free: kfree(cfqd); return NULL; } static void cfq_slab_kill(void) { - if (crq_pool) - kmem_cache_destroy(crq_pool); if (cfq_pool) kmem_cache_destroy(cfq_pool); if (cfq_ioc_pool) @@ -2308,11 +2001,6 @@ static int __init cfq_slab_setup(void) { - crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0, - NULL, NULL); - if (!crq_pool) - goto fail; - cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0, NULL, NULL); if (!cfq_pool) @@ -2358,7 +2046,6 @@ return cfq_var_show(__data, (page)); \ } SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0); -SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0); SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1); SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0); @@ -2386,7 +2073,6 @@ return ret; \ } STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0); -STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0); 
STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); @@ -2402,7 +2088,6 @@ static struct elv_fs_entry cfq_attrs[] = { CFQ_ATTR(quantum), - CFQ_ATTR(queued), CFQ_ATTR(fifo_expire_sync), CFQ_ATTR(fifo_expire_async), CFQ_ATTR(back_seek_max), @@ -2425,14 +2110,14 @@ .elevator_deactivate_req_fn = cfq_deactivate_request, .elevator_queue_empty_fn = cfq_queue_empty, .elevator_completed_req_fn = cfq_completed_request, - .elevator_former_req_fn = cfq_former_request, - .elevator_latter_req_fn = cfq_latter_request, + .elevator_former_req_fn = elv_rb_former_request, + .elevator_latter_req_fn = elv_rb_latter_request, .elevator_set_req_fn = cfq_set_request, .elevator_put_req_fn = cfq_put_request, .elevator_may_queue_fn = cfq_may_queue, .elevator_init_fn = cfq_init_queue, .elevator_exit_fn = cfq_exit_queue, - .trim = cfq_trim, + .trim = cfq_free_io_context, }, .elevator_attrs = cfq_attrs, .elevator_name = "cfq", @@ -2468,7 +2153,7 @@ ioc_gone = &all_gone; /* ioc_gone's update must be visible before reading ioc_count */ smp_wmb(); - if (atomic_read(&ioc_count)) + if (elv_ioc_count_read(ioc_count)) wait_for_completion(ioc_gone); synchronize_rcu(); cfq_slab_kill(); diff -urN oldtree/block/deadline-iosched.c newtree/block/deadline-iosched.c --- oldtree/block/deadline-iosched.c 2006-07-15 14:53:08.000000000 -0700 +++ newtree/block/deadline-iosched.c 2006-08-03 13:27:10.000000000 -0700 @@ -12,7 +12,6 @@ #include #include #include -#include #include /* @@ -24,13 +23,6 @@ static const int fifo_batch = 16; /* # of sequential requests treated as one by the above parameters. For throughput. 
*/ -static const int deadline_hash_shift = 5; -#define DL_HASH_BLOCK(sec) ((sec) >> 3) -#define DL_HASH_FN(sec) (hash_long(DL_HASH_BLOCK((sec)), deadline_hash_shift)) -#define DL_HASH_ENTRIES (1 << deadline_hash_shift) -#define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) -#define ON_HASH(drq) (!hlist_unhashed(&(drq)->hash)) - struct deadline_data { /* * run time data @@ -45,8 +37,7 @@ /* * next in sort order. read, write or both are NULL */ - struct deadline_rq *next_drq[2]; - struct hlist_head *hash; /* request hash */ + struct request *next_rq[2]; unsigned int batching; /* number of sequential requests made */ sector_t last_sector; /* head position */ unsigned int starved; /* times reads have starved writes */ @@ -58,240 +49,69 @@ int fifo_batch; int writes_starved; int front_merges; - - mempool_t *drq_pool; -}; - -/* - * pre-request data. - */ -struct deadline_rq { - /* - * rbtree index, key is the starting offset - */ - struct rb_node rb_node; - sector_t rb_key; - - struct request *request; - - /* - * request hash, key is the ending offset (for back merge lookup) - */ - struct hlist_node hash; - - /* - * expire fifo - */ - struct list_head fifo; - unsigned long expires; }; -static void deadline_move_request(struct deadline_data *dd, struct deadline_rq *drq); - -static kmem_cache_t *drq_pool; - -#define RQ_DATA(rq) ((struct deadline_rq *) (rq)->elevator_private) - -/* - * the back merge hash support functions - */ -static inline void __deadline_del_drq_hash(struct deadline_rq *drq) -{ - hlist_del_init(&drq->hash); -} - -static inline void deadline_del_drq_hash(struct deadline_rq *drq) -{ - if (ON_HASH(drq)) - __deadline_del_drq_hash(drq); -} - -static inline void -deadline_add_drq_hash(struct deadline_data *dd, struct deadline_rq *drq) -{ - struct request *rq = drq->request; - - BUG_ON(ON_HASH(drq)); - - hlist_add_head(&drq->hash, &dd->hash[DL_HASH_FN(rq_hash_key(rq))]); -} - -/* - * move hot entry to front of chain - */ -static inline void 
-deadline_hot_drq_hash(struct deadline_data *dd, struct deadline_rq *drq) -{ - struct request *rq = drq->request; - struct hlist_head *head = &dd->hash[DL_HASH_FN(rq_hash_key(rq))]; - - if (ON_HASH(drq) && &drq->hash != head->first) { - hlist_del(&drq->hash); - hlist_add_head(&drq->hash, head); - } -} - -static struct request * -deadline_find_drq_hash(struct deadline_data *dd, sector_t offset) -{ - struct hlist_head *hash_list = &dd->hash[DL_HASH_FN(offset)]; - struct hlist_node *entry, *next; - struct deadline_rq *drq; - - hlist_for_each_entry_safe(drq, entry, next, hash_list, hash) { - struct request *__rq = drq->request; - - BUG_ON(!ON_HASH(drq)); - - if (!rq_mergeable(__rq)) { - __deadline_del_drq_hash(drq); - continue; - } - - if (rq_hash_key(__rq) == offset) - return __rq; - } - - return NULL; -} - -/* - * rb tree support functions - */ -#define rb_entry_drq(node) rb_entry((node), struct deadline_rq, rb_node) -#define DRQ_RB_ROOT(dd, drq) (&(dd)->sort_list[rq_data_dir((drq)->request)]) -#define rq_rb_key(rq) (rq)->sector - -static struct deadline_rq * -__deadline_add_drq_rb(struct deadline_data *dd, struct deadline_rq *drq) -{ - struct rb_node **p = &DRQ_RB_ROOT(dd, drq)->rb_node; - struct rb_node *parent = NULL; - struct deadline_rq *__drq; - - while (*p) { - parent = *p; - __drq = rb_entry_drq(parent); - - if (drq->rb_key < __drq->rb_key) - p = &(*p)->rb_left; - else if (drq->rb_key > __drq->rb_key) - p = &(*p)->rb_right; - else - return __drq; - } +static void deadline_move_request(struct deadline_data *, struct request *); - rb_link_node(&drq->rb_node, parent, p); - return NULL; -} +#define RQ_RB_ROOT(dd, rq) (&(dd)->sort_list[rq_data_dir((rq))]) static void -deadline_add_drq_rb(struct deadline_data *dd, struct deadline_rq *drq) +deadline_add_rq_rb(struct deadline_data *dd, struct request *rq) { - struct deadline_rq *__alias; - - drq->rb_key = rq_rb_key(drq->request); + struct rb_root *root = RQ_RB_ROOT(dd, rq); + struct request *__alias; retry: - __alias 
= __deadline_add_drq_rb(dd, drq); - if (!__alias) { - rb_insert_color(&drq->rb_node, DRQ_RB_ROOT(dd, drq)); - return; + __alias = elv_rb_add(root, rq); + if (unlikely(__alias)) { + deadline_move_request(dd, __alias); + goto retry; } - - deadline_move_request(dd, __alias); - goto retry; } static inline void -deadline_del_drq_rb(struct deadline_data *dd, struct deadline_rq *drq) +deadline_del_rq_rb(struct deadline_data *dd, struct request *rq) { - const int data_dir = rq_data_dir(drq->request); + const int data_dir = rq_data_dir(rq); - if (dd->next_drq[data_dir] == drq) { - struct rb_node *rbnext = rb_next(&drq->rb_node); + if (dd->next_rq[data_dir] == rq) { + struct rb_node *rbnext = rb_next(&rq->rb_node); - dd->next_drq[data_dir] = NULL; + dd->next_rq[data_dir] = NULL; if (rbnext) - dd->next_drq[data_dir] = rb_entry_drq(rbnext); + dd->next_rq[data_dir] = rb_entry_rq(rbnext); } - BUG_ON(!RB_EMPTY_NODE(&drq->rb_node)); - rb_erase(&drq->rb_node, DRQ_RB_ROOT(dd, drq)); - RB_CLEAR_NODE(&drq->rb_node); -} - -static struct request * -deadline_find_drq_rb(struct deadline_data *dd, sector_t sector, int data_dir) -{ - struct rb_node *n = dd->sort_list[data_dir].rb_node; - struct deadline_rq *drq; - - while (n) { - drq = rb_entry_drq(n); - - if (sector < drq->rb_key) - n = n->rb_left; - else if (sector > drq->rb_key) - n = n->rb_right; - else - return drq->request; - } - - return NULL; + elv_rb_del(RQ_RB_ROOT(dd, rq), rq); } /* - * deadline_find_first_drq finds the first (lowest sector numbered) request - * for the specified data_dir. Used to sweep back to the start of the disk - * (1-way elevator) after we process the last (highest sector) request. 
- */ -static struct deadline_rq * -deadline_find_first_drq(struct deadline_data *dd, int data_dir) -{ - struct rb_node *n = dd->sort_list[data_dir].rb_node; - - for (;;) { - if (n->rb_left == NULL) - return rb_entry_drq(n); - - n = n->rb_left; - } -} - -/* - * add drq to rbtree and fifo + * add rq to rbtree and fifo */ static void deadline_add_request(struct request_queue *q, struct request *rq) { struct deadline_data *dd = q->elevator->elevator_data; - struct deadline_rq *drq = RQ_DATA(rq); + const int data_dir = rq_data_dir(rq); - const int data_dir = rq_data_dir(drq->request); + deadline_add_rq_rb(dd, rq); - deadline_add_drq_rb(dd, drq); /* * set expire time (only used for reads) and add to fifo list */ - drq->expires = jiffies + dd->fifo_expire[data_dir]; - list_add_tail(&drq->fifo, &dd->fifo_list[data_dir]); - - if (rq_mergeable(rq)) - deadline_add_drq_hash(dd, drq); + rq_set_fifo_time(rq, jiffies + dd->fifo_expire[data_dir]); + list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]); } /* - * remove rq from rbtree, fifo, and hash + * remove rq from rbtree and fifo. 
*/ static void deadline_remove_request(request_queue_t *q, struct request *rq) { - struct deadline_rq *drq = RQ_DATA(rq); struct deadline_data *dd = q->elevator->elevator_data; - list_del_init(&drq->fifo); - deadline_del_drq_rb(dd, drq); - deadline_del_drq_hash(drq); + rq_fifo_clear(rq); + deadline_del_rq_rb(dd, rq); } static int @@ -302,27 +122,14 @@ int ret; /* - * see if the merge hash can satisfy a back merge - */ - __rq = deadline_find_drq_hash(dd, bio->bi_sector); - if (__rq) { - BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector); - - if (elv_rq_merge_ok(__rq, bio)) { - ret = ELEVATOR_BACK_MERGE; - goto out; - } - } - - /* * check for front merge */ if (dd->front_merges) { - sector_t rb_key = bio->bi_sector + bio_sectors(bio); + sector_t sector = bio->bi_sector + bio_sectors(bio); - __rq = deadline_find_drq_rb(dd, rb_key, bio_data_dir(bio)); + __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector); if (__rq) { - BUG_ON(rb_key != rq_rb_key(__rq)); + BUG_ON(sector != __rq->sector); if (elv_rq_merge_ok(__rq, bio)) { ret = ELEVATOR_FRONT_MERGE; @@ -333,29 +140,21 @@ return ELEVATOR_NO_MERGE; out: - if (ret) - deadline_hot_drq_hash(dd, RQ_DATA(__rq)); *req = __rq; return ret; } -static void deadline_merged_request(request_queue_t *q, struct request *req) +static void deadline_merged_request(request_queue_t *q, struct request *req, + int type) { struct deadline_data *dd = q->elevator->elevator_data; - struct deadline_rq *drq = RQ_DATA(req); - - /* - * hash always needs to be repositioned, key is end sector - */ - deadline_del_drq_hash(drq); - deadline_add_drq_hash(dd, drq); /* * if the merge was a front merge, we need to reposition request */ - if (rq_rb_key(req) != drq->rb_key) { - deadline_del_drq_rb(dd, drq); - deadline_add_drq_rb(dd, drq); + if (type == ELEVATOR_FRONT_MERGE) { + elv_rb_del(RQ_RB_ROOT(dd, req), req); + deadline_add_rq_rb(dd, req); } } @@ -363,33 +162,14 @@ deadline_merged_requests(request_queue_t *q, struct request *req, struct 
request *next) { - struct deadline_data *dd = q->elevator->elevator_data; - struct deadline_rq *drq = RQ_DATA(req); - struct deadline_rq *dnext = RQ_DATA(next); - - BUG_ON(!drq); - BUG_ON(!dnext); - - /* - * reposition drq (this is the merged request) in hash, and in rbtree - * in case of a front merge - */ - deadline_del_drq_hash(drq); - deadline_add_drq_hash(dd, drq); - - if (rq_rb_key(req) != drq->rb_key) { - deadline_del_drq_rb(dd, drq); - deadline_add_drq_rb(dd, drq); - } - /* - * if dnext expires before drq, assign its expire time to drq - * and move into dnext position (dnext will be deleted) in fifo + * if next expires before rq, assign its expire time to rq + * and move into next position (next will be deleted) in fifo */ - if (!list_empty(&drq->fifo) && !list_empty(&dnext->fifo)) { - if (time_before(dnext->expires, drq->expires)) { - list_move(&drq->fifo, &dnext->fifo); - drq->expires = dnext->expires; + if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) { + if (time_before(rq_fifo_time(next), rq_fifo_time(req))) { + list_move(&req->queuelist, &next->queuelist); + rq_set_fifo_time(req, rq_fifo_time(next)); } } @@ -403,52 +183,50 @@ * move request from sort list to dispatch queue. 
*/ static inline void -deadline_move_to_dispatch(struct deadline_data *dd, struct deadline_rq *drq) +deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq) { - request_queue_t *q = drq->request->q; + request_queue_t *q = rq->q; - deadline_remove_request(q, drq->request); - elv_dispatch_add_tail(q, drq->request); + deadline_remove_request(q, rq); + elv_dispatch_add_tail(q, rq); } /* * move an entry to dispatch queue */ static void -deadline_move_request(struct deadline_data *dd, struct deadline_rq *drq) +deadline_move_request(struct deadline_data *dd, struct request *rq) { - const int data_dir = rq_data_dir(drq->request); - struct rb_node *rbnext = rb_next(&drq->rb_node); + const int data_dir = rq_data_dir(rq); + struct rb_node *rbnext = rb_next(&rq->rb_node); - dd->next_drq[READ] = NULL; - dd->next_drq[WRITE] = NULL; + dd->next_rq[READ] = NULL; + dd->next_rq[WRITE] = NULL; if (rbnext) - dd->next_drq[data_dir] = rb_entry_drq(rbnext); + dd->next_rq[data_dir] = rb_entry_rq(rbnext); - dd->last_sector = drq->request->sector + drq->request->nr_sectors; + dd->last_sector = rq->sector + rq->nr_sectors; /* * take it off the sort and fifo list, move * to dispatch queue */ - deadline_move_to_dispatch(dd, drq); + deadline_move_to_dispatch(dd, rq); } -#define list_entry_fifo(ptr) list_entry((ptr), struct deadline_rq, fifo) - /* * deadline_check_fifo returns 0 if there are no expired reads on the fifo, * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) */ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) { - struct deadline_rq *drq = list_entry_fifo(dd->fifo_list[ddir].next); + struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next); /* - * drq is expired! + * rq is expired! 
*/ - if (time_after(jiffies, drq->expires)) + if (time_after(jiffies, rq_fifo_time(rq))) return 1; return 0; @@ -463,21 +241,21 @@ struct deadline_data *dd = q->elevator->elevator_data; const int reads = !list_empty(&dd->fifo_list[READ]); const int writes = !list_empty(&dd->fifo_list[WRITE]); - struct deadline_rq *drq; + struct request *rq; int data_dir; /* * batches are currently reads XOR writes */ - if (dd->next_drq[WRITE]) - drq = dd->next_drq[WRITE]; + if (dd->next_rq[WRITE]) + rq = dd->next_rq[WRITE]; else - drq = dd->next_drq[READ]; + rq = dd->next_rq[READ]; - if (drq) { + if (rq) { /* we have a "next request" */ - if (dd->last_sector != drq->request->sector) + if (dd->last_sector != rq->sector) /* end the batch on a non sequential request */ dd->batching += dd->fifo_batch; @@ -526,30 +304,33 @@ if (deadline_check_fifo(dd, data_dir)) { /* An expired request exists - satisfy it */ dd->batching = 0; - drq = list_entry_fifo(dd->fifo_list[data_dir].next); + rq = rq_entry_fifo(dd->fifo_list[data_dir].next); - } else if (dd->next_drq[data_dir]) { + } else if (dd->next_rq[data_dir]) { /* * The last req was the same dir and we have a next request in * sort order. No expired requests so continue on from here. */ - drq = dd->next_drq[data_dir]; + rq = dd->next_rq[data_dir]; } else { + struct rb_node *node; /* * The last req was the other direction or we have run out of * higher-sectored requests. Go back to the lowest sectored * request (1 way elevator) and start a new batch. */ dd->batching = 0; - drq = deadline_find_first_drq(dd, data_dir); + node = rb_first(&dd->sort_list[data_dir]); + if (node) + rq = rb_entry_rq(node); } dispatch_request: /* - * drq is the selected appropriate request. + * rq is the selected appropriate request. 
*/ dd->batching++; - deadline_move_request(dd, drq); + deadline_move_request(dd, rq); return 1; } @@ -562,30 +343,6 @@ && list_empty(&dd->fifo_list[READ]); } -static struct request * -deadline_former_request(request_queue_t *q, struct request *rq) -{ - struct deadline_rq *drq = RQ_DATA(rq); - struct rb_node *rbprev = rb_prev(&drq->rb_node); - - if (rbprev) - return rb_entry_drq(rbprev)->request; - - return NULL; -} - -static struct request * -deadline_latter_request(request_queue_t *q, struct request *rq) -{ - struct deadline_rq *drq = RQ_DATA(rq); - struct rb_node *rbnext = rb_next(&drq->rb_node); - - if (rbnext) - return rb_entry_drq(rbnext)->request; - - return NULL; -} - static void deadline_exit_queue(elevator_t *e) { struct deadline_data *dd = e->elevator_data; @@ -593,46 +350,21 @@ BUG_ON(!list_empty(&dd->fifo_list[READ])); BUG_ON(!list_empty(&dd->fifo_list[WRITE])); - mempool_destroy(dd->drq_pool); - kfree(dd->hash); kfree(dd); } /* - * initialize elevator private data (deadline_data), and alloc a drq for - * each request on the free lists + * initialize elevator private data (deadline_data). 
*/ static void *deadline_init_queue(request_queue_t *q, elevator_t *e) { struct deadline_data *dd; - int i; - - if (!drq_pool) - return NULL; dd = kmalloc_node(sizeof(*dd), GFP_KERNEL, q->node); if (!dd) return NULL; memset(dd, 0, sizeof(*dd)); - dd->hash = kmalloc_node(sizeof(struct hlist_head)*DL_HASH_ENTRIES, - GFP_KERNEL, q->node); - if (!dd->hash) { - kfree(dd); - return NULL; - } - - dd->drq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, - mempool_free_slab, drq_pool, q->node); - if (!dd->drq_pool) { - kfree(dd->hash); - kfree(dd); - return NULL; - } - - for (i = 0; i < DL_HASH_ENTRIES; i++) - INIT_HLIST_HEAD(&dd->hash[i]); - INIT_LIST_HEAD(&dd->fifo_list[READ]); INIT_LIST_HEAD(&dd->fifo_list[WRITE]); dd->sort_list[READ] = RB_ROOT; @@ -645,39 +377,6 @@ return dd; } -static void deadline_put_request(request_queue_t *q, struct request *rq) -{ - struct deadline_data *dd = q->elevator->elevator_data; - struct deadline_rq *drq = RQ_DATA(rq); - - mempool_free(drq, dd->drq_pool); - rq->elevator_private = NULL; -} - -static int -deadline_set_request(request_queue_t *q, struct request *rq, struct bio *bio, - gfp_t gfp_mask) -{ - struct deadline_data *dd = q->elevator->elevator_data; - struct deadline_rq *drq; - - drq = mempool_alloc(dd->drq_pool, gfp_mask); - if (drq) { - memset(drq, 0, sizeof(*drq)); - RB_CLEAR_NODE(&drq->rb_node); - drq->request = rq; - - INIT_HLIST_NODE(&drq->hash); - - INIT_LIST_HEAD(&drq->fifo); - - rq->elevator_private = drq; - return 0; - } - - return 1; -} - /* * sysfs parts below */ @@ -757,10 +456,8 @@ .elevator_dispatch_fn = deadline_dispatch_requests, .elevator_add_req_fn = deadline_add_request, .elevator_queue_empty_fn = deadline_queue_empty, - .elevator_former_req_fn = deadline_former_request, - .elevator_latter_req_fn = deadline_latter_request, - .elevator_set_req_fn = deadline_set_request, - .elevator_put_req_fn = deadline_put_request, + .elevator_former_req_fn = elv_rb_former_request, + .elevator_latter_req_fn = 
elv_rb_latter_request, .elevator_init_fn = deadline_init_queue, .elevator_exit_fn = deadline_exit_queue, }, @@ -772,24 +469,11 @@ static int __init deadline_init(void) { - int ret; - - drq_pool = kmem_cache_create("deadline_drq", sizeof(struct deadline_rq), - 0, 0, NULL, NULL); - - if (!drq_pool) - return -ENOMEM; - - ret = elv_register(&iosched_deadline); - if (ret) - kmem_cache_destroy(drq_pool); - - return ret; + return elv_register(&iosched_deadline); } static void __exit deadline_exit(void) { - kmem_cache_destroy(drq_pool); elv_unregister(&iosched_deadline); } diff -urN oldtree/block/elevator.c newtree/block/elevator.c --- oldtree/block/elevator.c 2006-07-15 14:53:08.000000000 -0700 +++ newtree/block/elevator.c 2006-08-03 13:27:10.000000000 -0700 @@ -33,6 +33,7 @@ #include #include #include +#include #include @@ -40,6 +41,16 @@ static LIST_HEAD(elv_list); /* + * Merge hash stuff. + */ +static const int elv_hash_shift = 6; +#define ELV_HASH_BLOCK(sec) ((sec) >> 3) +#define ELV_HASH_FN(sec) (hash_long(ELV_HASH_BLOCK((sec)), elv_hash_shift)) +#define ELV_HASH_ENTRIES (1 << elv_hash_shift) +#define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) +#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) + +/* * can we safely merge with this request? 
*/ inline int elv_rq_merge_ok(struct request *rq, struct bio *bio) @@ -56,8 +67,7 @@ /* * same device and no special stuff set, merge is ok */ - if (rq->rq_disk == bio->bi_bdev->bd_disk && - !rq->waiting && !rq->special) + if (rq->rq_disk == bio->bi_bdev->bd_disk && !rq->special) return 1; return 0; @@ -151,27 +161,44 @@ static struct kobj_type elv_ktype; -static elevator_t *elevator_alloc(struct elevator_type *e) +static elevator_t *elevator_alloc(request_queue_t *q, struct elevator_type *e) { - elevator_t *eq = kmalloc(sizeof(elevator_t), GFP_KERNEL); - if (eq) { - memset(eq, 0, sizeof(*eq)); - eq->ops = &e->ops; - eq->elevator_type = e; - kobject_init(&eq->kobj); - snprintf(eq->kobj.name, KOBJ_NAME_LEN, "%s", "iosched"); - eq->kobj.ktype = &elv_ktype; - mutex_init(&eq->sysfs_lock); - } else { - elevator_put(e); - } + elevator_t *eq; + int i; + + eq = kmalloc_node(sizeof(elevator_t), GFP_KERNEL, q->node); + if (unlikely(!eq)) + goto err; + + memset(eq, 0, sizeof(*eq)); + eq->ops = &e->ops; + eq->elevator_type = e; + kobject_init(&eq->kobj); + snprintf(eq->kobj.name, KOBJ_NAME_LEN, "%s", "iosched"); + eq->kobj.ktype = &elv_ktype; + mutex_init(&eq->sysfs_lock); + + eq->hash = kmalloc_node(sizeof(struct hlist_head) * ELV_HASH_ENTRIES, + GFP_KERNEL, q->node); + if (!eq->hash) + goto err; + + for (i = 0; i < ELV_HASH_ENTRIES; i++) + INIT_HLIST_HEAD(&eq->hash[i]); + return eq; +err: + kfree(eq); + elevator_put(e); + return NULL; } static void elevator_release(struct kobject *kobj) { elevator_t *e = container_of(kobj, elevator_t, kobj); + elevator_put(e->elevator_type); + kfree(e->hash); kfree(e); } @@ -198,7 +225,7 @@ e = elevator_get("noop"); } - eq = elevator_alloc(e); + eq = elevator_alloc(q, e); if (!eq) return -ENOMEM; @@ -212,6 +239,8 @@ return ret; } +EXPORT_SYMBOL(elevator_init); + void elevator_exit(elevator_t *e) { mutex_lock(&e->sysfs_lock); @@ -223,10 +252,118 @@ kobject_put(&e->kobj); } +EXPORT_SYMBOL(elevator_exit); + +static inline void 
__elv_rqhash_del(struct request *rq) +{ + hlist_del_init(&rq->hash); +} + +static void elv_rqhash_del(request_queue_t *q, struct request *rq) +{ + if (ELV_ON_HASH(rq)) + __elv_rqhash_del(rq); +} + +static void elv_rqhash_add(request_queue_t *q, struct request *rq) +{ + elevator_t *e = q->elevator; + + BUG_ON(ELV_ON_HASH(rq)); + hlist_add_head(&rq->hash, &e->hash[ELV_HASH_FN(rq_hash_key(rq))]); +} + +static void elv_rqhash_reposition(request_queue_t *q, struct request *rq) +{ + __elv_rqhash_del(rq); + elv_rqhash_add(q, rq); +} + +static struct request *elv_rqhash_find(request_queue_t *q, sector_t offset) +{ + elevator_t *e = q->elevator; + struct hlist_head *hash_list = &e->hash[ELV_HASH_FN(offset)]; + struct hlist_node *entry, *next; + struct request *rq; + + hlist_for_each_entry_safe(rq, entry, next, hash_list, hash) { + BUG_ON(!ELV_ON_HASH(rq)); + + if (unlikely(!rq_mergeable(rq))) { + __elv_rqhash_del(rq); + continue; + } + + if (rq_hash_key(rq) == offset) + return rq; + } + + return NULL; +} + +/* + * RB-tree support functions for inserting/lookup/removal of requests + * in a sorted RB tree. 
+ */ +struct request *elv_rb_add(struct rb_root *root, struct request *rq) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct request *__rq; + + while (*p) { + parent = *p; + __rq = rb_entry(parent, struct request, rb_node); + + if (rq->sector < __rq->sector) + p = &(*p)->rb_left; + else if (rq->sector > __rq->sector) + p = &(*p)->rb_right; + else + return __rq; + } + + rb_link_node(&rq->rb_node, parent, p); + rb_insert_color(&rq->rb_node, root); + return NULL; +} + +EXPORT_SYMBOL(elv_rb_add); + +void elv_rb_del(struct rb_root *root, struct request *rq) +{ + BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); + rb_erase(&rq->rb_node, root); + RB_CLEAR_NODE(&rq->rb_node); +} + +EXPORT_SYMBOL(elv_rb_del); + +struct request *elv_rb_find(struct rb_root *root, sector_t sector) +{ + struct rb_node *n = root->rb_node; + struct request *rq; + + while (n) { + rq = rb_entry(n, struct request, rb_node); + + if (sector < rq->sector) + n = n->rb_left; + else if (sector > rq->sector) + n = n->rb_right; + else + return rq; + } + + return NULL; +} + +EXPORT_SYMBOL(elv_rb_find); + /* * Insert rq into dispatch queue of q. Queue lock must be held on - * entry. If sort != 0, rq is sort-inserted; otherwise, rq will be - * appended to the dispatch queue. To be used by specific elevators. + * entry. rq is sort-inserted into the dispatch queue. To be used by + * specific elevators. */ void elv_dispatch_sort(request_queue_t *q, struct request *rq) { @@ -235,6 +372,9 @@ if (q->last_merge == rq) q->last_merge = NULL; + + elv_rqhash_del(q, rq); + q->nr_sorted--; boundary = q->end_sector; @@ -258,11 +398,38 @@ list_add(&rq->queuelist, entry); } +EXPORT_SYMBOL(elv_dispatch_sort); + +/* + * Insert rq into dispatch queue of q. Queue lock must be held on + * entry. rq is added to the back of the dispatch queue. To be used by + * specific elevators.
+ */ +void elv_dispatch_add_tail(struct request_queue *q, struct request *rq) +{ + if (q->last_merge == rq) + q->last_merge = NULL; + + elv_rqhash_del(q, rq); + + q->nr_sorted--; + + q->end_sector = rq_end_sector(rq); + q->boundary_rq = rq; + list_add_tail(&rq->queuelist, &q->queue_head); +} + +EXPORT_SYMBOL(elv_dispatch_add_tail); + int elv_merge(request_queue_t *q, struct request **req, struct bio *bio) { elevator_t *e = q->elevator; + struct request *__rq; int ret; + /* + * First try one-hit cache. + */ if (q->last_merge) { ret = elv_try_merge(q->last_merge, bio); if (ret != ELEVATOR_NO_MERGE) { @@ -271,18 +438,30 @@ } } + /* + * See if our hash lookup can find a potential backmerge. + */ + __rq = elv_rqhash_find(q, bio->bi_sector); + if (__rq && elv_rq_merge_ok(__rq, bio)) { + *req = __rq; + return ELEVATOR_BACK_MERGE; + } + if (e->ops->elevator_merge_fn) return e->ops->elevator_merge_fn(q, req, bio); return ELEVATOR_NO_MERGE; } -void elv_merged_request(request_queue_t *q, struct request *rq) +void elv_merged_request(request_queue_t *q, struct request *rq, int type) { elevator_t *e = q->elevator; if (e->ops->elevator_merged_fn) - e->ops->elevator_merged_fn(q, rq); + e->ops->elevator_merged_fn(q, rq, type); + + if (type == ELEVATOR_BACK_MERGE) + elv_rqhash_reposition(q, rq); q->last_merge = rq; } @@ -294,8 +473,11 @@ if (e->ops->elevator_merge_req_fn) e->ops->elevator_merge_req_fn(q, rq, next); - q->nr_sorted--; + elv_rqhash_reposition(q, rq); + elv_rqhash_del(q, next); + + q->nr_sorted--; q->last_merge = rq; } @@ -371,8 +553,12 @@ BUG_ON(!blk_fs_request(rq)); rq->flags |= REQ_SORTED; q->nr_sorted++; - if (q->last_merge == NULL && rq_mergeable(rq)) - q->last_merge = rq; + if (rq_mergeable(rq)) { + elv_rqhash_add(q, rq); + if (!q->last_merge) + q->last_merge = rq; + } + /* * Some ioscheds (cfq) run q->request_fn directly, so * rq cannot be accessed after calling @@ -461,6 +647,8 @@ elv_insert(q, rq, where); } +EXPORT_SYMBOL(__elv_add_request); + void 
elv_add_request(request_queue_t *q, struct request *rq, int where, int plug) { @@ -471,6 +659,8 @@ spin_unlock_irqrestore(q->queue_lock, flags); } +EXPORT_SYMBOL(elv_add_request); + static inline struct request *__elv_next_request(request_queue_t *q) { struct request *rq; @@ -554,9 +744,12 @@ return rq; } +EXPORT_SYMBOL(elv_next_request); + void elv_dequeue_request(request_queue_t *q, struct request *rq) { BUG_ON(list_empty(&rq->queuelist)); + BUG_ON(ELV_ON_HASH(rq)); list_del_init(&rq->queuelist); @@ -569,6 +762,8 @@ q->in_flight++; } +EXPORT_SYMBOL(elv_dequeue_request); + int elv_queue_empty(request_queue_t *q) { elevator_t *e = q->elevator; @@ -582,6 +777,8 @@ return 1; } +EXPORT_SYMBOL(elv_queue_empty); + struct request *elv_latter_request(request_queue_t *q, struct request *rq) { elevator_t *e = q->elevator; @@ -600,13 +797,12 @@ return NULL; } -int elv_set_request(request_queue_t *q, struct request *rq, struct bio *bio, - gfp_t gfp_mask) +int elv_set_request(request_queue_t *q, struct request *rq, gfp_t gfp_mask) { elevator_t *e = q->elevator; if (e->ops->elevator_set_req_fn) - return e->ops->elevator_set_req_fn(q, rq, bio, gfp_mask); + return e->ops->elevator_set_req_fn(q, rq, gfp_mask); rq->elevator_private = NULL; return 0; @@ -620,12 +816,12 @@ e->ops->elevator_put_req_fn(q, rq); } -int elv_may_queue(request_queue_t *q, int rw, struct bio *bio) +int elv_may_queue(request_queue_t *q, int rw) { elevator_t *e = q->elevator; if (e->ops->elevator_may_queue_fn) - return e->ops->elevator_may_queue_fn(q, rw, bio); + return e->ops->elevator_may_queue_fn(q, rw); return ELV_MQUEUE_MAY; } @@ -791,7 +987,7 @@ /* * Allocate new elevator */ - e = elevator_alloc(new_e); + e = elevator_alloc(q, new_e); if (!e) return 0; @@ -907,11 +1103,26 @@ return len; } -EXPORT_SYMBOL(elv_dispatch_sort); -EXPORT_SYMBOL(elv_add_request); -EXPORT_SYMBOL(__elv_add_request); -EXPORT_SYMBOL(elv_next_request); -EXPORT_SYMBOL(elv_dequeue_request); -EXPORT_SYMBOL(elv_queue_empty); 
-EXPORT_SYMBOL(elevator_exit); -EXPORT_SYMBOL(elevator_init); +struct request *elv_rb_former_request(request_queue_t *q, struct request *rq) +{ + struct rb_node *rbprev = rb_prev(&rq->rb_node); + + if (rbprev) + return rb_entry_rq(rbprev); + + return NULL; +} + +EXPORT_SYMBOL(elv_rb_former_request); + +struct request *elv_rb_latter_request(request_queue_t *q, struct request *rq) +{ + struct rb_node *rbnext = rb_next(&rq->rb_node); + + if (rbnext) + return rb_entry_rq(rbnext); + + return NULL; +} + +EXPORT_SYMBOL(elv_rb_latter_request); diff -urN oldtree/block/ll_rw_blk.c newtree/block/ll_rw_blk.c --- oldtree/block/ll_rw_blk.c 2006-08-02 07:14:14.000000000 -0700 +++ newtree/block/ll_rw_blk.c 2006-08-03 13:27:10.000000000 -0700 @@ -39,6 +39,7 @@ static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io); static void init_request_from_bio(struct request *req, struct bio *bio); static int __make_request(request_queue_t *q, struct bio *bio); +static struct io_context *current_io_context(gfp_t gfp_flags, int node); /* * For the allocated request tables @@ -274,19 +275,19 @@ EXPORT_SYMBOL(blk_queue_make_request); -static inline void rq_init(request_queue_t *q, struct request *rq) +static void rq_init(request_queue_t *q, struct request *rq) { INIT_LIST_HEAD(&rq->queuelist); INIT_LIST_HEAD(&rq->donelist); rq->errors = 0; - rq->rq_status = RQ_ACTIVE; rq->bio = rq->biotail = NULL; + INIT_HLIST_NODE(&rq->hash); + RB_CLEAR_NODE(&rq->rb_node); rq->ioprio = 0; rq->buffer = NULL; rq->ref_count = 1; rq->q = q; - rq->waiting = NULL; rq->special = NULL; rq->data_len = 0; rq->data = NULL; @@ -446,8 +447,8 @@ rq_init(q, rq); rq->flags = REQ_HARDBARRIER; rq->elevator_private = NULL; + rq->elevator_private2 = NULL; rq->rq_disk = q->bar_rq.rq_disk; - rq->rl = NULL; rq->end_io = end_io; q->prepare_flush_fn(q, rq); @@ -472,7 +473,7 @@ rq->flags = bio_data_dir(q->orig_bar_rq->bio); rq->flags |= q->ordered & QUEUE_ORDERED_FUA ? 
REQ_FUA : 0; rq->elevator_private = NULL; - rq->rl = NULL; + rq->elevator_private2 = NULL; init_request_from_bio(rq, q->orig_bar_rq->bio); rq->end_io = bar_end_io; @@ -584,8 +585,8 @@ return 0; } -static inline int ordered_bio_endio(struct request *rq, struct bio *bio, - unsigned int nbytes, int error) +static int ordered_bio_endio(struct request *rq, struct bio *bio, + unsigned int nbytes, int error) { request_queue_t *q = rq->q; bio_end_io_t *endio; @@ -1967,9 +1968,8 @@ mempool_free(rq, q->rq.rq_pool); } -static inline struct request * -blk_alloc_request(request_queue_t *q, int rw, struct bio *bio, - int priv, gfp_t gfp_mask) +static struct request * +blk_alloc_request(request_queue_t *q, int rw, int priv, gfp_t gfp_mask) { struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); @@ -1980,10 +1980,10 @@ * first three bits are identical in rq->flags and bio->bi_rw, * see bio.h and blkdev.h */ - rq->flags = rw; + rq->flags = rw | REQ_ALLOCED; if (priv) { - if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) { + if (unlikely(elv_set_request(q, rq, gfp_mask))) { mempool_free(rq, q->rq.rq_pool); return NULL; } @@ -2074,13 +2074,13 @@ struct io_context *ioc = NULL; int may_queue, priv; - may_queue = elv_may_queue(q, rw, bio); + may_queue = elv_may_queue(q, rw); if (may_queue == ELV_MQUEUE_NO) goto rq_starved; if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) { if (rl->count[rw]+1 >= q->nr_requests) { - ioc = current_io_context(GFP_ATOMIC); + ioc = current_io_context(GFP_ATOMIC, q->node); /* * The queue will fill after this allocation, so set * it as full, and mark this process as "batching". @@ -2122,7 +2122,7 @@ spin_unlock_irq(q->queue_lock); - rq = blk_alloc_request(q, rw, bio, priv, gfp_mask); + rq = blk_alloc_request(q, rw, priv, gfp_mask); if (unlikely(!rq)) { /* * Allocation failed presumably due to memory. 
Undo anything @@ -2158,7 +2158,6 @@ ioc->nr_batch_requests--; rq_init(q, rq); - rq->rl = rl; blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ); out: @@ -2201,7 +2200,7 @@ * up to a big batch of them for a small period time. * See ioc_batching, ioc_set_batching */ - ioc = current_io_context(GFP_NOIO); + ioc = current_io_context(GFP_NOIO, q->node); ioc_set_batching(q, ioc); spin_lock_irq(q->queue_lock); @@ -2233,6 +2232,25 @@ EXPORT_SYMBOL(blk_get_request); /** + * blk_start_queueing - initiate dispatch of requests to device + * @q: request queue to kick into gear + * + * This is basically a helper to remove the need to know whether a queue + * is plugged or not if someone just wants to initiate dispatch of requests + * for this queue. + * + * The queue lock must be held with interrupts disabled. + */ +void blk_start_queueing(request_queue_t *q) +{ + if (!blk_queue_plugged(q)) + q->request_fn(q); + else + __generic_unplug_device(q); +} +EXPORT_SYMBOL(blk_start_queueing); + +/** * blk_requeue_request - put a request back on queue * @q: request queue where request should be inserted * @rq: request to be inserted @@ -2298,11 +2316,7 @@ drive_stat_acct(rq, rq->nr_sectors, 1); __elv_add_request(q, rq, where, 0); - - if (blk_queue_plugged(q)) - __generic_unplug_device(q); - else - q->request_fn(q); + blk_start_queueing(q); spin_unlock_irqrestore(q->queue_lock, flags); } @@ -2538,10 +2552,9 @@ rq->sense_len = 0; } - rq->waiting = &wait; + rq->end_io_data = &wait; blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); wait_for_completion(&wait); - rq->waiting = NULL; if (rq->errors) err = -EIO; @@ -2650,8 +2663,6 @@ */ void __blk_put_request(request_queue_t *q, struct request *req) { - struct request_list *rl = req->rl; - if (unlikely(!q)) return; if (unlikely(--req->ref_count)) @@ -2659,18 +2670,16 @@ elv_completed_request(q, req); - req->rq_status = RQ_INACTIVE; - req->rl = NULL; - /* * Request may not have originated from ll_rw_blk. 
if not, * it didn't come out of our reserved rq pools */ - if (rl) { + if (req->flags & REQ_ALLOCED) { int rw = rq_data_dir(req); int priv = req->flags & REQ_ELVPRIV; BUG_ON(!list_empty(&req->queuelist)); + BUG_ON(!hlist_unhashed(&req->hash)); blk_free_request(q, req); freed_request(q, rw, priv); @@ -2704,9 +2713,9 @@ */ void blk_end_sync_rq(struct request *rq, int error) { - struct completion *waiting = rq->waiting; + struct completion *waiting = rq->end_io_data; - rq->waiting = NULL; + rq->end_io_data = NULL; __blk_put_request(rq->q, rq); /* @@ -2757,7 +2766,7 @@ if (rq_data_dir(req) != rq_data_dir(next) || req->rq_disk != next->rq_disk - || next->waiting || next->special) + || next->special) return 0; /* @@ -2834,10 +2843,11 @@ if (bio_sync(bio)) req->flags |= REQ_RW_SYNC; + if (bio_rw_meta(bio)) + req->flags |= REQ_RW_META; req->errors = 0; req->hard_sector = req->sector = bio->bi_sector; - req->waiting = NULL; req->ioprio = bio_prio(bio); req->rq_disk = bio->bi_bdev->bd_disk; req->start_time = jiffies; @@ -2848,17 +2858,11 @@ static int __make_request(request_queue_t *q, struct bio *bio) { struct request *req; - int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err, sync; - unsigned short prio; - sector_t sector; + int el_ret, nr_sectors, barrier, err; + const unsigned short prio = bio_prio(bio); + const int sync = bio_sync(bio); - sector = bio->bi_sector; nr_sectors = bio_sectors(bio); - cur_nr_sectors = bio_cur_sectors(bio); - prio = bio_prio(bio); - - rw = bio_data_dir(bio); - sync = bio_sync(bio); /* * low level driver can indicate that it wants pages above a @@ -2867,8 +2871,6 @@ */ blk_queue_bounce(q, &bio); - spin_lock_prefetch(q->queue_lock); - barrier = bio_barrier(bio); if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) { err = -EOPNOTSUPP; @@ -2896,7 +2898,7 @@ req->ioprio = ioprio_best(req->ioprio, prio); drive_stat_acct(req, nr_sectors, 0); if (!attempt_back_merge(q, req)) - elv_merged_request(q, req); + elv_merged_request(q, 
req, el_ret); goto out; case ELEVATOR_FRONT_MERGE: @@ -2916,14 +2918,14 @@ * not touch req->buffer either... */ req->buffer = bio_data(bio); - req->current_nr_sectors = cur_nr_sectors; - req->hard_cur_sectors = cur_nr_sectors; - req->sector = req->hard_sector = sector; + req->current_nr_sectors = bio_cur_sectors(bio); + req->hard_cur_sectors = req->current_nr_sectors; + req->sector = req->hard_sector = bio->bi_sector; req->nr_sectors = req->hard_nr_sectors += nr_sectors; req->ioprio = ioprio_best(req->ioprio, prio); drive_stat_acct(req, nr_sectors, 0); if (!attempt_front_merge(q, req)) - elv_merged_request(q, req); + elv_merged_request(q, req, el_ret); goto out; /* ELV_NO_MERGE: elevator says don't/can't merge. */ @@ -2936,7 +2938,7 @@ * Grab a free request. This is might sleep but can not fail. * Returns with the queue unlocked. */ - req = get_request_wait(q, rw, bio); + req = get_request_wait(q, bio_data_dir(bio), bio); /* * After dropping the lock and possibly sleeping here, our request @@ -3660,7 +3662,7 @@ * but since the current task itself holds a reference, the context can be * used in general code, so long as it stays within `current` context. */ -struct io_context *current_io_context(gfp_t gfp_flags) +static struct io_context *current_io_context(gfp_t gfp_flags, int node) { struct task_struct *tsk = current; struct io_context *ret; @@ -3669,11 +3671,11 @@ if (likely(ret)) return ret; - ret = kmem_cache_alloc(iocontext_cachep, gfp_flags); + ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); if (ret) { atomic_set(&ret->refcount, 1); ret->task = current; - ret->set_ioprio = NULL; + ret->ioprio_changed = 0; ret->last_waited = jiffies; /* doesn't matter... */ ret->nr_batch_requests = 0; /* because this is 0 */ ret->aic = NULL; @@ -3691,10 +3693,10 @@ * * This is always called in the context of the task which submitted the I/O. 
*/ -struct io_context *get_io_context(gfp_t gfp_flags) +struct io_context *get_io_context(gfp_t gfp_flags, int node) { struct io_context *ret; - ret = current_io_context(gfp_flags); + ret = current_io_context(gfp_flags, node); if (likely(ret)) atomic_inc(&ret->refcount); return ret; @@ -3806,6 +3808,8 @@ unsigned long ra_kb; ssize_t ret = queue_var_store(&ra_kb, page, count); + spin_lock_irq(q->queue_lock); q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10); + spin_unlock_irq(q->queue_lock); return ret; diff -urN oldtree/block/noop-iosched.c newtree/block/noop-iosched.c --- oldtree/block/noop-iosched.c 2006-07-15 14:53:08.000000000 -0700 +++ newtree/block/noop-iosched.c 2006-08-03 13:27:10.000000000 -0700 @@ -69,7 +69,7 @@ { struct noop_data *nd; - nd = kmalloc(sizeof(*nd), GFP_KERNEL); + nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node); if (!nd) return NULL; INIT_LIST_HEAD(&nd->queue); diff -urN oldtree/drivers/block/DAC960.c newtree/drivers/block/DAC960.c --- oldtree/drivers/block/DAC960.c 2006-07-15 14:53:08.000000000 -0700 +++ newtree/drivers/block/DAC960.c 2006-08-03 13:27:10.000000000 -0700 @@ -3331,7 +3331,7 @@ Command->DmaDirection = PCI_DMA_TODEVICE; Command->CommandType = DAC960_WriteCommand; } - Command->Completion = Request->waiting; + Command->Completion = Request->end_io_data; Command->LogicalDriveNumber = (long)Request->rq_disk->private_data; Command->BlockNumber = Request->sector; Command->BlockCount = Request->nr_sectors; diff -urN oldtree/drivers/block/cciss.c newtree/drivers/block/cciss.c --- oldtree/drivers/block/cciss.c 2006-08-02 07:14:15.000000000 -0700 +++ newtree/drivers/block/cciss.c 2006-08-03 13:31:02.000000000 -0700 @@ -1227,7 +1227,6 @@ int nr_sectors = bio_sectors(bio); bio->bi_next = NULL; - blk_finished_io(len); bio_endio(bio, nr_sectors << 9, status ?
0 : -EIO); bio = xbh; } diff -urN oldtree/drivers/block/cpqarray.c newtree/drivers/block/cpqarray.c --- oldtree/drivers/block/cpqarray.c 2006-07-15 14:53:08.000000000 -0700 +++ newtree/drivers/block/cpqarray.c 2006-08-03 13:27:10.000000000 -0700 @@ -989,7 +989,6 @@ xbh = bio->bi_next; bio->bi_next = NULL; - blk_finished_io(nr_sectors); bio_endio(bio, nr_sectors << 9, ok ? 0 : -EIO); bio = xbh; diff -urN oldtree/drivers/block/paride/pd.c newtree/drivers/block/paride/pd.c --- oldtree/drivers/block/paride/pd.c 2006-07-15 14:53:08.000000000 -0700 +++ newtree/drivers/block/paride/pd.c 2006-08-03 13:27:10.000000000 -0700 @@ -719,14 +719,12 @@ memset(&rq, 0, sizeof(rq)); rq.errors = 0; - rq.rq_status = RQ_ACTIVE; rq.rq_disk = disk->gd; rq.ref_count = 1; - rq.waiting = &wait; + rq.end_io_data = &wait; rq.end_io = blk_end_sync_rq; blk_insert_request(disk->gd->queue, &rq, 0, func); wait_for_completion(&wait); - rq.waiting = NULL; if (rq.errors) err = -EIO; blk_put_request(&rq); diff -urN oldtree/drivers/block/pktcdvd.c newtree/drivers/block/pktcdvd.c --- oldtree/drivers/block/pktcdvd.c 2006-08-02 07:14:15.000000000 -0700 +++ newtree/drivers/block/pktcdvd.c 2006-08-03 13:27:10.000000000 -0700 @@ -375,7 +375,7 @@ rq->ref_count++; rq->flags |= REQ_NOMERGE; - rq->waiting = &wait; + rq->end_io_data = &wait; rq->end_io = blk_end_sync_rq; elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 1); generic_unplug_device(q); diff -urN oldtree/drivers/block/swim3.c newtree/drivers/block/swim3.c --- oldtree/drivers/block/swim3.c 2006-07-15 14:53:08.000000000 -0700 +++ newtree/drivers/block/swim3.c 2006-08-03 13:27:10.000000000 -0700 @@ -319,8 +319,8 @@ printk("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%ld buf=%p\n", req->rq_disk->disk_name, req->cmd, (long)req->sector, req->nr_sectors, req->buffer); - printk(" rq_status=%d errors=%d current_nr_sectors=%ld\n", - req->rq_status, req->errors, req->current_nr_sectors); + printk(" errors=%d current_nr_sectors=%ld\n", + req->errors, 
req->current_nr_sectors); #endif if (req->sector < 0 || req->sector >= fs->total_secs) { diff -urN oldtree/drivers/block/swim_iop.c newtree/drivers/block/swim_iop.c --- oldtree/drivers/block/swim_iop.c 2006-07-15 14:53:08.000000000 -0700 +++ newtree/drivers/block/swim_iop.c 2006-08-03 13:27:10.000000000 -0700 @@ -529,8 +529,8 @@ printk("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%ld buf=%p\n", CURRENT->rq_disk->disk_name, CURRENT->cmd, CURRENT->sector, CURRENT->nr_sectors, CURRENT->buffer); - printk(" rq_status=%d errors=%d current_nr_sectors=%ld\n", - CURRENT->rq_status, CURRENT->errors, CURRENT->current_nr_sectors); + printk(" errors=%d current_nr_sectors=%ld\n", + CURRENT->errors, CURRENT->current_nr_sectors); #endif if (CURRENT->sector < 0 || CURRENT->sector >= fs->total_secs) { diff -urN oldtree/drivers/fc4/fc.c newtree/drivers/fc4/fc.c --- oldtree/drivers/fc4/fc.c 2006-08-02 07:14:15.000000000 -0700 +++ newtree/drivers/fc4/fc.c 2006-08-03 13:27:10.000000000 -0700 @@ -974,7 +974,6 @@ */ fc->rst_pkt->device->host->eh_action = &sem; - fc->rst_pkt->request->rq_status = RQ_SCSI_BUSY; fc->rst_pkt->done = fcp_scsi_reset_done; diff -urN oldtree/drivers/ide/ide-floppy.c newtree/drivers/ide/ide-floppy.c --- oldtree/drivers/ide/ide-floppy.c 2006-07-15 14:53:08.000000000 -0700 +++ newtree/drivers/ide/ide-floppy.c 2006-08-03 13:27:10.000000000 -0700 @@ -1281,8 +1281,7 @@ idefloppy_pc_t *pc; unsigned long block = (unsigned long)block_s; - debug_log(KERN_INFO "rq_status: %d, dev: %s, flags: %lx, errors: %d\n", - rq->rq_status, + debug_log(KERN_INFO "dev: %s, flags: %lx, errors: %d\n", rq->rq_disk ? 
rq->rq_disk->disk_name : "?", rq->flags, rq->errors); debug_log(KERN_INFO "sector: %ld, nr_sectors: %ld, " diff -urN oldtree/drivers/ide/ide-io.c newtree/drivers/ide/ide-io.c --- oldtree/drivers/ide/ide-io.c 2006-08-02 07:14:15.000000000 -0700 +++ newtree/drivers/ide/ide-io.c 2006-08-03 13:27:11.000000000 -0700 @@ -1713,7 +1713,6 @@ int must_wait = (action == ide_wait || action == ide_head_wait); rq->errors = 0; - rq->rq_status = RQ_ACTIVE; /* * we need to hold an extra reference to request for safe inspection @@ -1721,7 +1720,7 @@ */ if (must_wait) { rq->ref_count++; - rq->waiting = &wait; + rq->end_io_data = &wait; rq->end_io = blk_end_sync_rq; } @@ -1739,7 +1738,6 @@ err = 0; if (must_wait) { wait_for_completion(&wait); - rq->waiting = NULL; if (rq->errors) err = -EIO; diff -urN oldtree/drivers/ide/ide-tape.c newtree/drivers/ide/ide-tape.c --- oldtree/drivers/ide/ide-tape.c 2006-07-15 14:53:08.000000000 -0700 +++ newtree/drivers/ide/ide-tape.c 2006-08-03 13:27:11.000000000 -0700 @@ -2423,8 +2423,8 @@ #if IDETAPE_DEBUG_LOG #if 0 if (tape->debug_level >= 5) - printk(KERN_INFO "ide-tape: rq_status: %d, " - "dev: %s, cmd: %ld, errors: %d\n", rq->rq_status, + printk(KERN_INFO "ide-tape: " + "dev: %s, cmd: %ld, errors: %d\n", rq->rq_disk->disk_name, rq->cmd[0], rq->errors); #endif if (tape->debug_level >= 2) @@ -2773,7 +2773,7 @@ return; } #endif /* IDETAPE_DEBUG_BUGS */ - rq->waiting = &wait; + rq->end_io_data = &wait; rq->end_io = blk_end_sync_rq; spin_unlock_irq(&tape->spinlock); wait_for_completion(&wait); diff -urN oldtree/drivers/scsi/ide-scsi.c newtree/drivers/scsi/ide-scsi.c --- oldtree/drivers/scsi/ide-scsi.c 2006-07-15 14:53:08.000000000 -0700 +++ newtree/drivers/scsi/ide-scsi.c 2006-08-03 13:27:11.000000000 -0700 @@ -708,7 +708,7 @@ static ide_startstop_t idescsi_do_request (ide_drive_t *drive, struct request *rq, sector_t block) { #if IDESCSI_DEBUG_LOG - printk (KERN_INFO "rq_status: %d, dev: %s, cmd: %x, errors: %d\n",rq->rq_status,
rq->rq_disk->disk_name,rq->cmd[0],rq->errors); + printk (KERN_INFO "dev: %s, cmd: %x, errors: %d\n", rq->rq_disk->disk_name,rq->cmd[0],rq->errors); printk (KERN_INFO "sector: %ld, nr_sectors: %ld, current_nr_sectors: %d\n",rq->sector,rq->nr_sectors,rq->current_nr_sectors); #endif /* IDESCSI_DEBUG_LOG */ diff -urN oldtree/drivers/scsi/scsi.c newtree/drivers/scsi/scsi.c --- oldtree/drivers/scsi/scsi.c 2006-08-02 07:14:20.000000000 -0700 +++ newtree/drivers/scsi/scsi.c 2006-08-03 13:27:11.000000000 -0700 @@ -1127,7 +1127,7 @@ spin_lock_irqsave(&sdev->list_lock, flags); list_for_each_entry(scmd, &sdev->cmd_list, list) { - if (scmd->request && scmd->request->rq_status != RQ_INACTIVE) { + if (scmd->request) { /* * If we are unable to remove the timer, it means * that the command has already timed out or diff -urN oldtree/fs/ext3/inode.c newtree/fs/ext3/inode.c --- oldtree/fs/ext3/inode.c 2006-08-02 07:14:21.000000000 -0700 +++ newtree/fs/ext3/inode.c 2006-08-03 13:27:11.000000000 -0700 @@ -36,6 +36,7 @@ #include #include #include +#include #include "xattr.h" #include "acl.h" @@ -1070,7 +1071,7 @@ return bh; if (buffer_uptodate(bh)) return bh; - ll_rw_block(READ, 1, &bh); + ll_rw_block(READ_META, 1, &bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; @@ -2537,7 +2538,7 @@ */ get_bh(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(READ, bh); + submit_bh(READ_META, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { ext3_error(inode->i_sb, "ext3_get_inode_loc", diff -urN oldtree/fs/ext3/namei.c newtree/fs/ext3/namei.c --- oldtree/fs/ext3/namei.c 2006-08-02 07:14:21.000000000 -0700 +++ newtree/fs/ext3/namei.c 2006-08-03 13:27:11.000000000 -0700 @@ -35,6 +35,7 @@ #include #include #include +#include #include #include "namei.h" @@ -870,7 +871,7 @@ bh = ext3_getblk(NULL, dir, b++, 0, &err); bh_use[ra_max] = bh; if (bh) - ll_rw_block(READ, 1, &bh); + ll_rw_block(READ_META, 1, &bh); } } if ((bh = bh_use[ra_ptr++]) == NULL) diff -urN oldtree/fs/ioprio.c 
newtree/fs/ioprio.c --- oldtree/fs/ioprio.c 2006-07-15 14:53:08.000000000 -0700 +++ newtree/fs/ioprio.c 2006-08-03 13:27:11.000000000 -0700 @@ -44,8 +44,8 @@ task->ioprio = ioprio; ioc = task->io_context; - if (ioc && ioc->set_ioprio) - ioc->set_ioprio(ioc, ioprio); + if (ioc) + ioc->ioprio_changed = 1; task_unlock(task); return 0; diff -urN oldtree/include/linux/bio.h newtree/include/linux/bio.h --- oldtree/include/linux/bio.h 2006-07-15 14:53:08.000000000 -0700 +++ newtree/include/linux/bio.h 2006-08-03 13:27:11.000000000 -0700 @@ -148,6 +148,7 @@ #define BIO_RW_BARRIER 2 #define BIO_RW_FAILFAST 3 #define BIO_RW_SYNC 4 +#define BIO_RW_META 5 /* * upper 16 bits of bi_rw define the io priority of this bio @@ -178,6 +179,7 @@ #define bio_sync(bio) ((bio)->bi_rw & (1 << BIO_RW_SYNC)) #define bio_failfast(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST)) #define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD)) +#define bio_rw_meta(bio) ((bio)->bi_rw & (1 << BIO_RW_META)) /* * will die diff -urN oldtree/include/linux/blkdev.h newtree/include/linux/blkdev.h --- oldtree/include/linux/blkdev.h 2006-08-02 07:14:22.000000000 -0700 +++ newtree/include/linux/blkdev.h 2006-08-03 13:27:11.000000000 -0700 @@ -90,7 +90,7 @@ atomic_t refcount; struct task_struct *task; - int (*set_ioprio)(struct io_context *, unsigned int); + unsigned int ioprio_changed; /* * For request batching @@ -104,8 +104,7 @@ void put_io_context(struct io_context *ioc); void exit_io_context(void); -struct io_context *current_io_context(gfp_t gfp_flags); -struct io_context *get_io_context(gfp_t gfp_flags); +struct io_context *get_io_context(gfp_t gfp_flags, int node); void copy_io_context(struct io_context **pdst, struct io_context **psrc); void swap_io_context(struct io_context **ioc1, struct io_context **ioc2); @@ -129,30 +128,46 @@ struct list_head queuelist; struct list_head donelist; + request_queue_t *q; + unsigned long flags; /* see REQ_ bits below */ /* Maintain bio traversal state for part by part 
I/O submission. * hard_* are block layer internals, no driver should touch them! */ sector_t sector; /* next sector to submit */ + sector_t hard_sector; /* next sector to complete */ unsigned long nr_sectors; /* no. of sectors left to submit */ + unsigned long hard_nr_sectors; /* no. of sectors left to complete */ /* no. of sectors left to submit in the current segment */ unsigned int current_nr_sectors; - sector_t hard_sector; /* next sector to complete */ - unsigned long hard_nr_sectors; /* no. of sectors left to complete */ /* no. of sectors left to complete in the current segment */ unsigned int hard_cur_sectors; struct bio *bio; struct bio *biotail; + struct hlist_node hash; /* merge hash */ + + /* + * The rb_node is only used inside the io scheduler, requests + * are pruned when moved to the dispatch queue. So let the + * completion_data share space with the rb_node. + */ + union { + struct rb_node rb_node; /* sort/lookup */ + void *completion_data; + }; + + /* + * two pointers are available for the IO schedulers, if they need + * more they have to dynamically allocate it. + */ void *elevator_private; - void *completion_data; + void *elevator_private2; - int rq_status; /* should split this into a few status bits */ - int errors; struct gendisk *rq_disk; unsigned long start_time; @@ -170,15 +185,13 @@ unsigned short ioprio; + void *special; + char *buffer; + int tag; + int errors; int ref_count; - request_queue_t *q; - struct request_list *rl; - - struct completion *waiting; - void *special; - char *buffer; /* * when request is used as a packet command carrier @@ -195,7 +208,7 @@ int retries; /* - * completion callback. end_io_data should be folded in with waiting + * completion callback.
*/ rq_end_io_fn *end_io; void *end_io_data; @@ -236,6 +249,8 @@ __REQ_PM_SHUTDOWN, /* shutdown request */ __REQ_ORDERED_COLOR, /* is before or after barrier */ __REQ_RW_SYNC, /* request is sync (O_DIRECT) */ + __REQ_ALLOCED, /* request came from our alloc pool */ + __REQ_RW_META, /* metadata io request */ __REQ_NR_BITS, /* stops here */ }; @@ -266,6 +281,8 @@ #define REQ_PM_SHUTDOWN (1 << __REQ_PM_SHUTDOWN) #define REQ_ORDERED_COLOR (1 << __REQ_ORDERED_COLOR) #define REQ_RW_SYNC (1 << __REQ_RW_SYNC) +#define REQ_ALLOCED (1 << __REQ_ALLOCED) +#define REQ_RW_META (1 << __REQ_RW_META) /* * State information carried for REQ_PM_SUSPEND and REQ_PM_RESUME @@ -432,9 +449,6 @@ struct mutex sysfs_lock; }; -#define RQ_INACTIVE (-1) -#define RQ_ACTIVE 1 - #define QUEUE_FLAG_CLUSTER 0 /* cluster several segments into 1 */ #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ #define QUEUE_FLAG_STOPPED 2 /* queue is stopped */ @@ -510,6 +524,12 @@ #define rq_data_dir(rq) ((rq)->flags & 1) +/* + * We regard a request as sync, if it's a READ or a SYNC write. 
+ */ +#define rq_is_sync(rq) (rq_data_dir((rq)) == READ || (rq)->flags & REQ_RW_SYNC) +#define rq_is_meta(rq) ((rq)->flags & REQ_RW_META) + static inline int blk_queue_full(struct request_queue *q, int rw) { if (rw == READ) @@ -544,12 +564,6 @@ (!((rq)->flags & RQ_NOMERGE_FLAGS) && blk_fs_request((rq))) /* - * noop, requests are automagically marked as active/inactive by I/O - * scheduler -- see elv_next_request - */ -#define blk_queue_headactive(q, head_active) - -/* * q->prep_rq_fn return values */ #define BLKPREP_OK 0 /* serve it */ @@ -586,11 +600,6 @@ if ((rq->bio)) \ for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next) -struct sec_size { - unsigned block_size; - unsigned block_size_bits; -}; - extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); extern void register_disk(struct gendisk *dev); @@ -612,6 +621,7 @@ extern void blk_sync_queue(struct request_queue *q); extern void __blk_stop_queue(request_queue_t *q); extern void blk_run_queue(request_queue_t *); +extern void blk_start_queueing(request_queue_t *); extern void blk_queue_activity_fn(request_queue_t *, activity_fn *, void *); extern int blk_rq_map_user(request_queue_t *, struct request *, void __user *, unsigned int); extern int blk_rq_unmap_user(struct bio *, unsigned int); @@ -656,16 +666,6 @@ extern void end_request(struct request *req, int uptodate); extern void blk_complete_request(struct request *); -static inline int rq_all_done(struct request *rq, unsigned int nr_bytes) -{ - if (blk_fs_request(rq)) - return (nr_bytes >= (rq->hard_nr_sectors << 9)); - else if (blk_pc_request(rq)) - return nr_bytes >= rq->data_len; - - return 0; -} - /* * end_that_request_first/chunk() takes an uptodate argument. we account * any value <= as an io error. 
0 means -EIO for compatability reasons, @@ -680,21 +680,6 @@ } /* - * This should be in elevator.h, but that requires pulling in rq and q - */ -static inline void elv_dispatch_add_tail(struct request_queue *q, - struct request *rq) -{ - if (q->last_merge == rq) - q->last_merge = NULL; - q->nr_sorted--; - - q->end_sector = rq_end_sector(rq); - q->boundary_rq = rq; - list_add_tail(&rq->queuelist, &q->queue_head); -} - -/* * Access functions for manipulating queue properties */ extern request_queue_t *blk_init_queue_node(request_fn_proc *rfn, @@ -785,14 +770,6 @@ return retval; } -static inline int bdev_dma_aligment(struct block_device *bdev) -{ - return queue_dma_alignment(bdev_get_queue(bdev)); -} - -#define blk_finished_io(nsects) do { } while (0) -#define blk_started_io(nsects) do { } while (0) - /* assumes size > 256 */ static inline unsigned int blksize_bits(unsigned int size) { diff -urN oldtree/include/linux/blktrace_api.h newtree/include/linux/blktrace_api.h --- oldtree/include/linux/blktrace_api.h 2006-07-15 14:53:08.000000000 -0700 +++ newtree/include/linux/blktrace_api.h 2006-08-03 13:27:11.000000000 -0700 @@ -20,6 +20,7 @@ BLK_TC_PC = 1 << 9, /* pc requests */ BLK_TC_NOTIFY = 1 << 10, /* special message */ BLK_TC_AHEAD = 1 << 11, /* readahead */ + BLK_TC_META = 1 << 12, /* metadata */ BLK_TC_END = 1 << 15, /* only 16-bits, reminder */ }; diff -urN oldtree/include/linux/elevator.h newtree/include/linux/elevator.h --- oldtree/include/linux/elevator.h 2006-07-15 14:53:08.000000000 -0700 +++ newtree/include/linux/elevator.h 2006-08-03 13:27:11.000000000 -0700 @@ -1,12 +1,14 @@ #ifndef _LINUX_ELEVATOR_H #define _LINUX_ELEVATOR_H +#include <linux/percpu.h> + typedef int (elevator_merge_fn) (request_queue_t *, struct request **, struct bio *); typedef void (elevator_merge_req_fn) (request_queue_t *, struct request *, struct request *); -typedef void (elevator_merged_fn) (request_queue_t *, struct request *); +typedef void (elevator_merged_fn) (request_queue_t *, struct request
*, int); typedef int (elevator_dispatch_fn) (request_queue_t *, int); @@ -14,9 +16,9 @@ typedef int (elevator_queue_empty_fn) (request_queue_t *); typedef struct request *(elevator_request_list_fn) (request_queue_t *, struct request *); typedef void (elevator_completed_req_fn) (request_queue_t *, struct request *); -typedef int (elevator_may_queue_fn) (request_queue_t *, int, struct bio *); +typedef int (elevator_may_queue_fn) (request_queue_t *, int); -typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, struct bio *, gfp_t); +typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, gfp_t); typedef void (elevator_put_req_fn) (request_queue_t *, struct request *); typedef void (elevator_activate_req_fn) (request_queue_t *, struct request *); typedef void (elevator_deactivate_req_fn) (request_queue_t *, struct request *); @@ -82,19 +84,21 @@ struct kobject kobj; struct elevator_type *elevator_type; struct mutex sysfs_lock; + struct hlist_head *hash; }; /* * block elevator interface */ extern void elv_dispatch_sort(request_queue_t *, struct request *); +extern void elv_dispatch_add_tail(request_queue_t *, struct request *); extern void elv_add_request(request_queue_t *, struct request *, int, int); extern void __elv_add_request(request_queue_t *, struct request *, int, int); extern void elv_insert(request_queue_t *, struct request *, int); extern int elv_merge(request_queue_t *, struct request **, struct bio *); extern void elv_merge_requests(request_queue_t *, struct request *, struct request *); -extern void elv_merged_request(request_queue_t *, struct request *); +extern void elv_merged_request(request_queue_t *, struct request *, int); extern void elv_dequeue_request(request_queue_t *, struct request *); extern void elv_requeue_request(request_queue_t *, struct request *); extern int elv_queue_empty(request_queue_t *); @@ -103,9 +107,9 @@ extern struct request *elv_latter_request(request_queue_t *, struct request *); extern int 
elv_register_queue(request_queue_t *q); extern void elv_unregister_queue(request_queue_t *q); -extern int elv_may_queue(request_queue_t *, int, struct bio *); +extern int elv_may_queue(request_queue_t *, int); extern void elv_completed_request(request_queue_t *, struct request *); -extern int elv_set_request(request_queue_t *, struct request *, struct bio *, gfp_t); +extern int elv_set_request(request_queue_t *, struct request *, gfp_t); extern void elv_put_request(request_queue_t *, struct request *); /* @@ -125,6 +129,19 @@ extern int elv_rq_merge_ok(struct request *, struct bio *); /* + * Helper functions. + */ +extern struct request *elv_rb_former_request(request_queue_t *, struct request *); +extern struct request *elv_rb_latter_request(request_queue_t *, struct request *); + +/* + * rb support functions. + */ +extern struct request *elv_rb_add(struct rb_root *, struct request *); +extern void elv_rb_del(struct rb_root *, struct request *); +extern struct request *elv_rb_find(struct rb_root *, sector_t); + +/* * Return values from elevator merger */ #define ELEVATOR_NO_MERGE 0 @@ -149,5 +166,41 @@ }; #define rq_end_sector(rq) ((rq)->sector + (rq)->nr_sectors) +#define rb_entry_rq(node) rb_entry((node), struct request, rb_node) + +/* + * Hack to reuse the donelist list_head as the fifo time holder while + * the request is in the io scheduler. Saves an unsigned long in rq. 
+ */ +#define rq_fifo_time(rq) ((unsigned long) (rq)->donelist.next) +#define rq_set_fifo_time(rq,exp) ((rq)->donelist.next = (void *) (exp)) +#define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist) +#define rq_fifo_clear(rq) do { \ + list_del_init(&(rq)->queuelist); \ + INIT_LIST_HEAD(&(rq)->donelist); \ + } while (0) + +/* + * io context count accounting + */ +#define elv_ioc_count_mod(name, __val) \ + do { \ + preempt_disable(); \ + __get_cpu_var(name) += (__val); \ + preempt_enable(); \ + } while (0) + +#define elv_ioc_count_inc(name) elv_ioc_count_mod(name, 1) +#define elv_ioc_count_dec(name) elv_ioc_count_mod(name, -1) + +#define elv_ioc_count_read(name) \ +({ \ + unsigned long __val = 0; \ + int __cpu; \ + smp_wmb(); \ + for_each_possible_cpu(__cpu) \ + __val += per_cpu(name, __cpu); \ + __val; \ +}) #endif diff -urN oldtree/include/linux/fs.h newtree/include/linux/fs.h --- oldtree/include/linux/fs.h 2006-08-02 07:14:22.000000000 -0700 +++ newtree/include/linux/fs.h 2006-08-03 13:27:11.000000000 -0700 @@ -79,8 +79,8 @@ #define WRITE 1 #define READA 2 /* read-ahead - don't block if no resources */ #define SWRITE 3 /* for ll_rw_block() - wait for buffer lock */ -#define SPECIAL 4 /* For non-blockdevice requests in request queue */ #define READ_SYNC (READ | (1 << BIO_RW_SYNC)) +#define READ_META (READ | (1 << BIO_RW_META)) #define WRITE_SYNC (WRITE | (1 << BIO_RW_SYNC)) #define WRITE_BARRIER ((1 << BIO_RW) | (1 << BIO_RW_BARRIER)) diff -urN oldtree/include/linux/rbtree.h newtree/include/linux/rbtree.h --- oldtree/include/linux/rbtree.h 2006-07-15 14:53:08.000000000 -0700 +++ newtree/include/linux/rbtree.h 2006-08-03 13:27:11.000000000 -0700 @@ -133,7 +133,7 @@ #define rb_entry(ptr, type, member) container_of(ptr, type, member) #define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) -#define RB_EMPTY_NODE(node) (rb_parent(node) != node) +#define RB_EMPTY_NODE(node) (rb_parent(node) == node) #define RB_CLEAR_NODE(node) (rb_set_parent(node, node)) 
extern void rb_insert_color(struct rb_node *, struct rb_root *); diff -urN oldtree/lib/rbtree.c newtree/lib/rbtree.c --- oldtree/lib/rbtree.c 2006-07-15 14:53:08.000000000 -0700 +++ newtree/lib/rbtree.c 2006-08-03 13:27:11.000000000 -0700 @@ -322,6 +322,9 @@ { struct rb_node *parent; + if (rb_parent(node) == node) + return NULL; + /* If we have a right-hand child, go down and then left as far as we can. */ if (node->rb_right) { @@ -348,6 +351,9 @@ { struct rb_node *parent; + if (rb_parent(node) == node) + return NULL; + /* If we have a left-hand child, go down and then right as far as we can. */ if (node->rb_left) {