diff options
Diffstat (limited to 'drivers/block')
-rw-r--r-- | drivers/block/aoe/aoechr.c | 10 | ||||
-rw-r--r-- | drivers/block/as-iosched.c | 27 | ||||
-rw-r--r-- | drivers/block/cciss.c | 79 | ||||
-rw-r--r-- | drivers/block/cciss.h | 4 | ||||
-rw-r--r-- | drivers/block/cfq-iosched.c | 2085 | ||||
-rw-r--r-- | drivers/block/deadline-iosched.c | 15 | ||||
-rw-r--r-- | drivers/block/elevator.c | 31 | ||||
-rw-r--r-- | drivers/block/genhd.c | 27 | ||||
-rw-r--r-- | drivers/block/ioctl.c | 74 | ||||
-rw-r--r-- | drivers/block/ll_rw_blk.c | 404 | ||||
-rw-r--r-- | drivers/block/loop.c | 81 | ||||
-rw-r--r-- | drivers/block/paride/pd.c | 2 | ||||
-rw-r--r-- | drivers/block/paride/pg.c | 14 | ||||
-rw-r--r-- | drivers/block/paride/pt.c | 20 | ||||
-rw-r--r-- | drivers/block/pktcdvd.c | 39 | ||||
-rw-r--r-- | drivers/block/swim3.c | 10 | ||||
-rw-r--r-- | drivers/block/sx8.c | 15 | ||||
-rw-r--r-- | drivers/block/ub.c | 217 |
18 files changed, 2041 insertions, 1113 deletions
diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c index 14aeca3..45a2430 100644 --- a/drivers/block/aoe/aoechr.c +++ b/drivers/block/aoe/aoechr.c @@ -36,7 +36,7 @@ static int emsgs_head_idx, emsgs_tail_idx; static struct semaphore emsgs_sema; static spinlock_t emsgs_lock; static int nblocked_emsgs_readers; -static struct class_simple *aoe_class; +static struct class *aoe_class; static struct aoe_chardev chardevs[] = { { MINOR_ERR, "err" }, { MINOR_DISCOVER, "discover" }, @@ -218,13 +218,13 @@ aoechr_init(void) } sema_init(&emsgs_sema, 0); spin_lock_init(&emsgs_lock); - aoe_class = class_simple_create(THIS_MODULE, "aoe"); + aoe_class = class_create(THIS_MODULE, "aoe"); if (IS_ERR(aoe_class)) { unregister_chrdev(AOE_MAJOR, "aoechr"); return PTR_ERR(aoe_class); } for (i = 0; i < ARRAY_SIZE(chardevs); ++i) - class_simple_device_add(aoe_class, + class_device_create(aoe_class, MKDEV(AOE_MAJOR, chardevs[i].minor), NULL, chardevs[i].name); @@ -237,8 +237,8 @@ aoechr_exit(void) int i; for (i = 0; i < ARRAY_SIZE(chardevs); ++i) - class_simple_device_remove(MKDEV(AOE_MAJOR, chardevs[i].minor)); - class_simple_destroy(aoe_class); + class_device_destroy(aoe_class, MKDEV(AOE_MAJOR, chardevs[i].minor)); + class_destroy(aoe_class); unregister_chrdev(AOE_MAJOR, "aoechr"); } diff --git a/drivers/block/as-iosched.c b/drivers/block/as-iosched.c index a9575bb..95c0a36 100644 --- a/drivers/block/as-iosched.c +++ b/drivers/block/as-iosched.c @@ -1806,7 +1806,8 @@ static void as_put_request(request_queue_t *q, struct request *rq) rq->elevator_private = NULL; } -static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask) +static int as_set_request(request_queue_t *q, struct request *rq, + struct bio *bio, int gfp_mask) { struct as_data *ad = q->elevator->elevator_data; struct as_rq *arq = mempool_alloc(ad->arq_pool, gfp_mask); @@ -1827,7 +1828,7 @@ static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask) return 1; } -static int as_may_queue(request_queue_t *q, int rw) +static int as_may_queue(request_queue_t *q, int rw, struct bio *bio) { int ret = ELV_MQUEUE_MAY; struct as_data *ad = q->elevator->elevator_data; @@ -1871,20 +1872,22 @@ static int as_init_queue(request_queue_t *q, elevator_t *e) if (!arq_pool) return -ENOMEM; - ad = kmalloc(sizeof(*ad), GFP_KERNEL); + ad = kmalloc_node(sizeof(*ad), GFP_KERNEL, q->node); if (!ad) return -ENOMEM; memset(ad, 0, sizeof(*ad)); ad->q = q; /* Identify what queue the data belongs to */ - ad->hash = kmalloc(sizeof(struct list_head)*AS_HASH_ENTRIES,GFP_KERNEL); + ad->hash = kmalloc_node(sizeof(struct list_head)*AS_HASH_ENTRIES, + GFP_KERNEL, q->node); if (!ad->hash) { kfree(ad); return -ENOMEM; } - ad->arq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, arq_pool); + ad->arq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, + mempool_free_slab, arq_pool, q->node); if (!ad->arq_pool) { kfree(ad->hash); kfree(ad); @@ -1932,23 +1935,15 @@ struct as_fs_entry { static ssize_t as_var_show(unsigned int var, char *page) { - var = (var * 1000) / HZ; return sprintf(page, "%d\n", var); } static ssize_t as_var_store(unsigned long *var, const char *page, size_t count) { - unsigned long tmp; char *p = (char *) page; - tmp = simple_strtoul(p, &p, 10); - if (tmp != 0) { - tmp = (tmp * HZ) / 1000; - if (tmp == 0) - tmp = 1; - } - *var = tmp; + *var = simple_strtoul(p, &p, 10); return count; } @@ -2044,7 +2039,7 @@ as_attr_show(struct kobject *kobj, struct attribute *attr, char *page) struct as_fs_entry *entry = to_as(attr); if (!entry->show) - return 0; + return -EIO; return entry->show(e->elevator_data, page); } @@ -2057,7 +2052,7 @@ as_attr_store(struct kobject *kobj, struct attribute *attr, struct as_fs_entry *entry = to_as(attr); if (!entry->store) - return -EINVAL; + return -EIO; return entry->store(e->elevator_data, page, length); } diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 8f7c1a1..418b146 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1,6 +1,6 @@ /* * Disk Array driver for HP SA 5xxx and 6xxx Controllers - * Copyright 2000, 2002 Hewlett-Packard Development Company, L.P. + * Copyright 2000, 2005 Hewlett-Packard Development Company, L.P. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -41,6 +41,7 @@ #include <asm/uaccess.h> #include <asm/io.h> +#include <linux/dma-mapping.h> #include <linux/blkdev.h> #include <linux/genhd.h> #include <linux/completion.h> @@ -53,7 +54,7 @@ MODULE_AUTHOR("Hewlett-Packard Company"); MODULE_DESCRIPTION("Driver for HP Controller SA5xxx SA6xxx version 2.6.6"); MODULE_SUPPORTED_DEVICE("HP SA5i SA5i+ SA532 SA5300 SA5312 SA641 SA642 SA6400" - " SA6i P600 P800 E400"); + " SA6i P600 P800 E400 E300"); MODULE_LICENSE("GPL"); #include "cciss_cmd.h" @@ -84,8 +85,10 @@ static const struct pci_device_id cciss_pci_device_id[] = { 0x103C, 0x3225, 0, 0, 0}, { PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSB, 0x103c, 0x3223, 0, 0, 0}, - { PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSB, + { PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC, 0x103c, 0x3231, 0, 0, 0}, + { PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC, + 0x103c, 0x3233, 0, 0, 0}, {0,} }; MODULE_DEVICE_TABLE(pci, cciss_pci_device_id); @@ -109,6 +112,7 @@ static struct board_type products[] = { { 0x3225103C, "Smart Array P600", &SA5_access}, { 0x3223103C, "Smart Array P800", &SA5_access}, { 0x3231103C, "Smart Array E400", &SA5_access}, + { 0x3233103C, "Smart Array E300", &SA5_access}, }; /* How long to wait (in millesconds) for board to go into simple mode */ @@ -126,8 +130,6 @@ static struct board_type products[] = { #define MAX_CTLR_ORIG 8 -#define CCISS_DMA_MASK 0xFFFFFFFF /* 32 bit DMA */ - static ctlr_info_t *hba[MAX_CTLR]; static void do_cciss_request(request_queue_t *q); @@ -636,6 +638,7 @@ static int cciss_ioctl(struct inode *inode, struct file *filep, cciss_pci_info_struct pciinfo; if (!arg) return -EINVAL; + pciinfo.domain = pci_domain_nr(host->pdev->bus); pciinfo.bus = host->pdev->bus->number; pciinfo.dev_fn = host->pdev->devfn; pciinfo.board_id = host->board_id; @@ -783,18 +786,10 @@ static int cciss_ioctl(struct inode *inode, struct file *filep, case CCISS_GETLUNINFO: { LogvolInfo_struct luninfo; - int i; luninfo.LunID = drv->LunID; luninfo.num_opens = drv->usage_count; luninfo.num_parts = 0; - /* count partitions 1 to 15 with sizes > 0 */ - for (i = 0; i < MAX_PART - 1; i++) { - if (!disk->part[i]) - continue; - if (disk->part[i]->nr_sects != 0) - luninfo.num_parts++; - } if (copy_to_user(argp, &luninfo, sizeof(LogvolInfo_struct))) return -EFAULT; @@ -1140,7 +1135,7 @@ static int revalidate_allvol(ctlr_info_t *host) /* this is for the online array utilities */ if (!drv->heads && i) continue; - blk_queue_hardsect_size(host->queue, drv->block_size); + blk_queue_hardsect_size(drv->queue, drv->block_size); set_capacity(disk, drv->nr_blocks); add_disk(disk); } @@ -1696,7 +1691,7 @@ static int cciss_revalidate(struct gendisk *disk) cciss_read_capacity(h->ctlr, logvol, size_buff, 1, &total_size, &block_size); cciss_geometry_inquiry(h->ctlr, logvol, 1, total_size, block_size, inq_buff, drv); - blk_queue_hardsect_size(h->queue, drv->block_size); + blk_queue_hardsect_size(drv->queue, drv->block_size); set_capacity(disk, drv->nr_blocks); kfree(size_buff); @@ -2253,12 +2248,12 @@ static irqreturn_t do_cciss_intr(int irq, void *dev_id, struct pt_regs *regs) * them up. We will also keep track of the next queue to run so * that every queue gets a chance to be started first. */ - for (j=0; j < NWD; j++){ - int curr_queue = (start_queue + j) % NWD; + for (j=0; j < h->highest_lun + 1; j++){ + int curr_queue = (start_queue + j) % (h->highest_lun + 1); /* make sure the disk has been added and the drive is real * because this can be called from the middle of init_one. */ - if(!(h->gendisk[curr_queue]->queue) || + if(!(h->drv[curr_queue].queue) || !(h->drv[curr_queue].heads)) continue; blk_start_queue(h->gendisk[curr_queue]->queue); @@ -2269,14 +2264,14 @@ static irqreturn_t do_cciss_intr(int irq, void *dev_id, struct pt_regs *regs) if ((find_first_zero_bit(h->cmd_pool_bits, NR_CMDS)) == NR_CMDS) { if (curr_queue == start_queue){ - h->next_to_run = (start_queue + 1) % NWD; + h->next_to_run = (start_queue + 1) % (h->highest_lun + 1); goto cleanup; } else { h->next_to_run = curr_queue; goto cleanup; } } else { - curr_queue = (curr_queue + 1) % NWD; + curr_queue = (curr_queue + 1) % (h->highest_lun + 1); } } @@ -2284,7 +2279,6 @@ cleanup: spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags); return IRQ_HANDLED; } - /* * We cannot read the structure directly, for portablity we must use * the io functions. @@ -2393,11 +2387,6 @@ static int cciss_pci_init(ctlr_info_t *c, struct pci_dev *pdev) printk(KERN_ERR "cciss: Unable to Enable PCI device\n"); return( -1); } - if (pci_set_dma_mask(pdev, CCISS_DMA_MASK ) != 0) - { - printk(KERN_ERR "cciss: Unable to set DMA mask\n"); - return(-1); - } subsystem_vendor_id = pdev->subsystem_vendor; subsystem_device_id = pdev->subsystem_device; @@ -2747,9 +2736,9 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, hba[i]->pdev = pdev; /* configure PCI DMA stuff */ - if (!pci_set_dma_mask(pdev, 0xffffffffffffffffULL)) + if (!pci_set_dma_mask(pdev, DMA_64BIT_MASK)) printk("cciss: using DAC cycles\n"); - else if (!pci_set_dma_mask(pdev, 0xffffffff)) + else if (!pci_set_dma_mask(pdev, DMA_32BIT_MASK)) printk("cciss: not using DAC cycles\n"); else { printk("cciss: no suitable DMA available\n"); @@ -2799,13 +2788,6 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, } spin_lock_init(&hba[i]->lock); - q = blk_init_queue(do_cciss_request, &hba[i]->lock); - if (!q) - goto clean4; - - q->backing_dev_info.ra_pages = READ_AHEAD; - hba[i]->queue = q; - q->queuedata = hba[i]; /* Initialize the pdev driver private data. have it point to hba[i]. */ @@ -2827,6 +2809,20 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, cciss_procinit(i); + for(j=0; j < NWD; j++) { /* mfm */ + drive_info_struct *drv = &(hba[i]->drv[j]); + struct gendisk *disk = hba[i]->gendisk[j]; + + q = blk_init_queue(do_cciss_request, &hba[i]->lock); + if (!q) { + printk(KERN_ERR + "cciss: unable to allocate queue for disk %d\n", + j); + break; + } + drv->queue = q; + + q->backing_dev_info.ra_pages = READ_AHEAD; blk_queue_bounce_limit(q, hba[i]->pdev->dma_mask); /* This is a hardware imposed limit. */ @@ -2837,26 +2833,23 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, blk_queue_max_sectors(q, 512); - - for(j=0; j<NWD; j++) { - drive_info_struct *drv = &(hba[i]->drv[j]); - struct gendisk *disk = hba[i]->gendisk[j]; - + q->queuedata = hba[i]; sprintf(disk->disk_name, "cciss/c%dd%d", i, j); sprintf(disk->devfs_name, "cciss/host%d/target%d", i, j); disk->major = hba[i]->major; disk->first_minor = j << NWD_SHIFT; disk->fops = &cciss_fops; - disk->queue = hba[i]->queue; + disk->queue = q; disk->private_data = drv; /* we must register the controller even if no disks exist */ /* this is for the online array utilities */ if(!drv->heads && j) continue; - blk_queue_hardsect_size(hba[i]->queue, drv->block_size); + blk_queue_hardsect_size(q, drv->block_size); set_capacity(disk, drv->nr_blocks); add_disk(disk); } + return(1); clean4: @@ -2922,10 +2915,10 @@ static void __devexit cciss_remove_one (struct pci_dev *pdev) for (j = 0; j < NWD; j++) { struct gendisk *disk = hba[i]->gendisk[j]; if (disk->flags & GENHD_FL_UP) + blk_cleanup_queue(disk->queue); del_gendisk(disk); } - blk_cleanup_queue(hba[i]->queue); pci_free_consistent(hba[i]->pdev, NR_CMDS * sizeof(CommandList_struct), hba[i]->cmd_pool, hba[i]->cmd_pool_dhandle); pci_free_consistent(hba[i]->pdev, NR_CMDS * sizeof( ErrorInfo_struct), diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h index 8fb1920..566587d 100644 --- a/drivers/block/cciss.h +++ b/drivers/block/cciss.h @@ -29,6 +29,7 @@ typedef struct _drive_info_struct { __u32 LunID; int usage_count; + struct request_queue *queue; sector_t nr_blocks; int block_size; int heads; @@ -72,7 +73,6 @@ struct ctlr_info unsigned int maxQsinceinit; unsigned int maxSG; spinlock_t lock; - struct request_queue *queue; //* pointers to command and error info pool */ CommandList_struct *cmd_pool; @@ -260,7 +260,7 @@ struct board_type { struct access_method *access; }; -#define CCISS_LOCK(i) (hba[i]->queue->queue_lock) +#define CCISS_LOCK(i) (&hba[i]->lock) #endif /* CCISS_H */ diff --git a/drivers/block/cfq-iosched.c b/drivers/block/cfq-iosched.c index 0ef7a00..2435a7c 100644 --- a/drivers/block/cfq-iosched.c +++ b/drivers/block/cfq-iosched.c @@ -21,22 +21,34 @@ #include <linux/hash.h> #include <linux/rbtree.h> #include <linux/mempool.h> - -static unsigned long max_elapsed_crq; -static unsigned long max_elapsed_dispatch; +#include <linux/ioprio.h> +#include <linux/writeback.h> /* * tunables */ static int cfq_quantum = 4; /* max queue in one round of service */ static int cfq_queued = 8; /* minimum rq allocate limit per-queue*/ -static int cfq_service = HZ; /* period over which service is avg */ -static int cfq_fifo_expire_r = HZ / 2; /* fifo timeout for sync requests */ -static int cfq_fifo_expire_w = 5 * HZ; /* fifo timeout for async requests */ -static int cfq_fifo_rate = HZ / 8; /* fifo expiry rate */ +static int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; static int cfq_back_max = 16 * 1024; /* maximum backwards seek, in KiB */ static int cfq_back_penalty = 2; /* penalty of a backwards seek */ +static int cfq_slice_sync = HZ / 10; +static int cfq_slice_async = HZ / 25; +static int cfq_slice_async_rq = 2; +static int cfq_slice_idle = HZ / 100; + +#define CFQ_IDLE_GRACE (HZ / 10) +#define CFQ_SLICE_SCALE (5) + +#define CFQ_KEY_ASYNC (0) +#define CFQ_KEY_ANY (0xffff) + +/* + * disable queueing at the driver/hardware level + */ +static int cfq_max_depth = 1; + /* * for the hash of cfqq inside the cfqd */ @@ -55,6 +67,7 @@ static int cfq_back_penalty = 2; /* penalty of a backwards seek */ #define list_entry_hash(ptr) hlist_entry((ptr), struct cfq_rq, hash) #define list_entry_cfqq(ptr) list_entry((ptr), struct cfq_queue, cfq_list) +#define list_entry_fifo(ptr) list_entry((ptr), struct request, queuelist) #define RQ_DATA(rq) (rq)->elevator_private @@ -75,78 +88,110 @@ static int cfq_back_penalty = 2; /* penalty of a backwards seek */ #define rb_entry_crq(node) rb_entry((node), struct cfq_rq, rb_node) #define rq_rb_key(rq) (rq)->sector -/* - * threshold for switching off non-tag accounting - */ -#define CFQ_MAX_TAG (4) - -/* - * sort key types and names - */ -enum { - CFQ_KEY_PGID, - CFQ_KEY_TGID, - CFQ_KEY_UID, - CFQ_KEY_GID, - CFQ_KEY_LAST, -}; - -static char *cfq_key_types[] = { "pgid", "tgid", "uid", "gid", NULL }; - static kmem_cache_t *crq_pool; static kmem_cache_t *cfq_pool; static kmem_cache_t *cfq_ioc_pool; +#define CFQ_PRIO_LISTS IOPRIO_BE_NR +#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) +#define cfq_class_be(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_BE) +#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) + +#define ASYNC (0) +#define SYNC (1) + +#define cfq_cfqq_dispatched(cfqq) \ + ((cfqq)->on_dispatch[ASYNC] + (cfqq)->on_dispatch[SYNC]) + +#define cfq_cfqq_class_sync(cfqq) ((cfqq)->key != CFQ_KEY_ASYNC) + +#define cfq_cfqq_sync(cfqq) \ + (cfq_cfqq_class_sync(cfqq) || (cfqq)->on_dispatch[SYNC]) + +/* + * Per block device queue structure + */ struct cfq_data { - struct list_head rr_list; + atomic_t ref; + request_queue_t *queue; + + /* + * rr list of queues with requests and the count of them + */ + struct list_head rr_list[CFQ_PRIO_LISTS]; + struct list_head busy_rr; + struct list_head cur_rr; + struct list_head idle_rr; + unsigned int busy_queues; + + /* + * non-ordered list of empty cfqq's + */ struct list_head empty_list; + /* + * cfqq lookup hash + */ struct hlist_head *cfq_hash; - struct hlist_head *crq_hash; - /* queues on rr_list (ie they have pending requests */ - unsigned int busy_queues; + /* + * global crq hash for all queues + */ + struct hlist_head *crq_hash; unsigned int max_queued; - atomic_t ref; + mempool_t *crq_pool; - int key_type; + int rq_in_driver; - mempool_t *crq_pool; + /* + * schedule slice state info + */ + /* + * idle window management + */ + struct timer_list idle_slice_timer; + struct work_struct unplug_work; - request_queue_t *queue; + struct cfq_queue *active_queue; + struct cfq_io_context *active_cic; + int cur_prio, cur_end_prio; + unsigned int dispatch_slice; + + struct timer_list idle_class_timer; sector_t last_sector; + unsigned long last_end_request; - int rq_in_driver; + unsigned int rq_starved; /* * tunables, see top of file */ unsigned int cfq_quantum; unsigned int cfq_queued; - unsigned int cfq_fifo_expire_r; - unsigned int cfq_fifo_expire_w; - unsigned int cfq_fifo_batch_expire; + unsigned int cfq_fifo_expire[2]; unsigned int cfq_back_penalty; unsigned int cfq_back_max; - unsigned int find_best_crq; - - unsigned int cfq_tagged; + unsigned int cfq_slice[2]; + unsigned int cfq_slice_async_rq; + unsigned int cfq_slice_idle; + unsigned int cfq_max_depth; }; +/* + * Per process-grouping structure + */ struct cfq_queue { /* reference count */ atomic_t ref; /* parent cfq_data */ struct cfq_data *cfqd; - /* hash of mergeable requests */ + /* cfqq lookup hash */ struct hlist_node cfq_hash; /* hash key */ - unsigned long key; - /* whether queue is on rr (or empty) list */ - int on_rr; + unsigned int key; /* on either rr or empty list of cfqd */ struct list_head cfq_list; /* sorted list of pending requests */ @@ -158,21 +203,22 @@ struct cfq_queue { /* currently allocated requests */ int allocated[2]; /* fifo list of requests in sort_list */ - struct list_head fifo[2]; - /* last time fifo expired */ - unsigned long last_fifo_expire; + struct list_head fifo; - int key_type; + unsigned long slice_start; + unsigned long slice_end; + unsigned long slice_left; + unsigned long service_last; - unsigned long service_start; - unsigned long service_used; + /* number of requests that are on the dispatch list */ + int on_dispatch[2]; - unsigned int max_rate; + /* io prio of this group */ + unsigned short ioprio, org_ioprio; + unsigned short ioprio_class, org_ioprio_class; - /* number of requests that have been handed to the driver */ - int in_flight; - /* number of currently allocated requests */ - int alloc_limit[2]; + /* various state flags, see below */ + unsigned int flags; }; struct cfq_rq { @@ -184,42 +230,78 @@ struct cfq_rq { struct cfq_queue *cfq_queue; struct cfq_io_context *io_context; - unsigned long service_start; - unsigned long queue_start; + unsigned int crq_flags; +}; + +enum cfqq_state_flags { + CFQ_CFQQ_FLAG_on_rr = 0, + CFQ_CFQQ_FLAG_wait_request, + CFQ_CFQQ_FLAG_must_alloc, + CFQ_CFQQ_FLAG_must_alloc_slice, + CFQ_CFQQ_FLAG_must_dispatch, + CFQ_CFQQ_FLAG_fifo_expire, + CFQ_CFQQ_FLAG_idle_window, + CFQ_CFQQ_FLAG_prio_changed, + CFQ_CFQQ_FLAG_expired, +}; - unsigned int in_flight : 1; - unsigned int accounted : 1; - unsigned int is_sync : 1; - unsigned int is_write : 1; +#define CFQ_CFQQ_FNS(name) \ +static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \ +{ \ + cfqq->flags |= (1 << CFQ_CFQQ_FLAG_##name); \ +} \ +static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \ +{ \ + cfqq->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \ +} \ +static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \ +{ \ + return (cfqq->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \ +} + +CFQ_CFQQ_FNS(on_rr); +CFQ_CFQQ_FNS(wait_request); +CFQ_CFQQ_FNS(must_alloc); +CFQ_CFQQ_FNS(must_alloc_slice); +CFQ_CFQQ_FNS(must_dispatch); +CFQ_CFQQ_FNS(fifo_expire); +CFQ_CFQQ_FNS(idle_window); +CFQ_CFQQ_FNS(prio_changed); +CFQ_CFQQ_FNS(expired); +#undef CFQ_CFQQ_FNS + +enum cfq_rq_state_flags { + CFQ_CRQ_FLAG_in_flight = 0, + CFQ_CRQ_FLAG_in_driver, + CFQ_CRQ_FLAG_is_sync, + CFQ_CRQ_FLAG_requeued, }; -static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned long); +#define CFQ_CRQ_FNS(name) \ +static inline void cfq_mark_crq_##name(struct cfq_rq *crq) \ +{ \ + crq->crq_flags |= (1 << CFQ_CRQ_FLAG_##name); \ +} \ +static inline void cfq_clear_crq_##name(struct cfq_rq *crq) \ +{ \ + crq->crq_flags &= ~(1 << CFQ_CRQ_FLAG_##name); \ +} \ +static inline int cfq_crq_##name(const struct cfq_rq *crq) \ +{ \ + return (crq->crq_flags & (1 << CFQ_CRQ_FLAG_##name)) != 0; \ +} + +CFQ_CRQ_FNS(in_flight); +CFQ_CRQ_FNS(in_driver); +CFQ_CRQ_FNS(is_sync); +CFQ_CRQ_FNS(requeued); +#undef CFQ_CRQ_FNS + +static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short); static void cfq_dispatch_sort(request_queue_t *, struct cfq_rq *); -static void cfq_update_next_crq(struct cfq_rq *); static void cfq_put_cfqd(struct cfq_data *cfqd); -/* - * what the fairness is based on (ie how processes are grouped and - * differentiated) - */ -static inline unsigned long -cfq_hash_key(struct cfq_data *cfqd, struct task_struct *tsk) -{ - /* - * optimize this so that ->key_type is the offset into the struct - */ - switch (cfqd->key_type) { - case CFQ_KEY_PGID: - return process_group(tsk); - default: - case CFQ_KEY_TGID: - return tsk->tgid; - case CFQ_KEY_UID: - return tsk->uid; - case CFQ_KEY_GID: - return tsk->gid; - } -} +#define process_sync(tsk) ((tsk)->flags & PF_SYNCWRITE) /* * lots of deadline iosched dupes, can be abstracted later... @@ -235,16 +317,12 @@ static void cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq) if (q->last_merge == crq->request) q->last_merge = NULL; - - cfq_update_next_crq(crq); } static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq) { const int hash_idx = CFQ_MHASH_FN(rq_hash_key(crq->request)); - BUG_ON(!hlist_unhashed(&crq->hash)); - hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]); } @@ -257,8 +335,6 @@ static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset) struct cfq_rq *crq = list_entry_hash(entry); struct request *__rq = crq->request; - BUG_ON(hlist_unhashed(&crq->hash)); - if (!rq_mergeable(__rq)) { cfq_del_crq_hash(crq); continue; @@ -271,6 +347,28 @@ static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset) return NULL; } +static inline int cfq_pending_requests(struct cfq_data *cfqd) +{ + return !list_empty(&cfqd->queue->queue_head) || cfqd->busy_queues; +} + +/* + * scheduler run of queue, if there are requests pending and no one in the + * driver that will restart queueing + */ +static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) +{ + if (!cfqd->rq_in_driver && cfq_pending_requests(cfqd)) + kblockd_schedule_work(&cfqd->unplug_work); +} + +static int cfq_queue_empty(request_queue_t *q) +{ + struct cfq_data *cfqd = q->elevator->elevator_data; + + return !cfq_pending_requests(cfqd); +} + /* * Lifted from AS - choose which of crq1 and crq2 that is best served now. * We choose the request that is closest to the head right now. Distance @@ -287,36 +385,16 @@ cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2) return crq2; if (crq2 == NULL) return crq1; + if (cfq_crq_requeued(crq1)) + return crq1; + if (cfq_crq_requeued(crq2)) + return crq2; s1 = crq1->request->sector; s2 = crq2->request->sector; last = cfqd->last_sector; -#if 0 - if (!list_empty(&cfqd->queue->queue_head)) { - struct list_head *entry = &cfqd->queue->queue_head; - unsigned long distance = ~0UL; - struct request *rq; - - while ((entry = entry->prev) != &cfqd->queue->queue_head) { - rq = list_entry_rq(entry); - - if (blk_barrier_rq(rq)) - break; - - if (distance < abs(s1 - rq->sector + rq->nr_sectors)) { - distance = abs(s1 - rq->sector +rq->nr_sectors); - last = rq->sector + rq->nr_sectors; - } - if (distance < abs(s2 - rq->sector + rq->nr_sectors)) { - distance = abs(s2 - rq->sector +rq->nr_sectors); - last = rq->sector + rq->nr_sectors; - } - } - } -#endif - /* * by definition, 1KiB is 2 sectors */ @@ -377,11 +455,14 @@ cfq_find_next_crq(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_rq *crq_next = NULL, *crq_prev = NULL; struct rb_node *rbnext, *rbprev; - if (!ON_RB(&last->rb_node)) - return NULL; - - if ((rbnext = rb_next(&last->rb_node)) == NULL) + rbnext = NULL; + if (ON_RB(&last->rb_node)) + rbnext = rb_next(&last->rb_node); + if (!rbnext) { rbnext = rb_first(&cfqq->sort_list); + if (rbnext == &last->rb_node) + rbnext = NULL; + } rbprev = rb_prev(&last->rb_node); @@ -401,67 +482,53 @@ static void cfq_update_next_crq(struct cfq_rq *crq) cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq); } -static int cfq_check_sort_rr_list(struct cfq_queue *cfqq) +static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) { - struct list_head *head = &cfqq->cfqd->rr_list; - struct list_head *next, *prev; - - /* - * list might still be ordered - */ - next = cfqq->cfq_list.next; - if (next != head) { - struct cfq_queue *cnext = list_entry_cfqq(next); + struct cfq_data *cfqd = cfqq->cfqd; + struct list_head *list, *entry; - if (cfqq->service_used > cnext->service_used) - return 1; - } + BUG_ON(!cfq_cfqq_on_rr(cfqq)); - prev = cfqq->cfq_list.prev; - if (prev != head) { - struct cfq_queue *cprev = list_entry_cfqq(prev); + list_del(&cfqq->cfq_list); - if (cfqq->service_used < cprev->service_used) - return 1; + if (cfq_class_rt(cfqq)) + list = &cfqd->cur_rr; + else if (cfq_class_idle(cfqq)) + list = &cfqd->idle_rr; + else { + /* + * if cfqq has requests in flight, don't allow it to be + * found in cfq_set_active_queue before it has finished them. + * this is done to increase fairness between a process that + * has lots of io pending vs one that only generates one + * sporadically or synchronously + */ + if (cfq_cfqq_dispatched(cfqq)) + list = &cfqd->busy_rr; + else + list = &cfqd->rr_list[cfqq->ioprio]; } - return 0; -} - -static void cfq_sort_rr_list(struct cfq_queue *cfqq, int new_queue) -{ - struct list_head *entry = &cfqq->cfqd->rr_list; - - if (!cfqq->on_rr) - return; - if (!new_queue && !cfq_check_sort_rr_list(cfqq)) + /* + * if queue was preempted, just add to front to be fair. busy_rr + * isn't sorted. + */ + if (preempted || list == &cfqd->busy_rr) { + list_add(&cfqq->cfq_list, list); return; - - list_del(&cfqq->cfq_list); + } /* - * sort by our mean service_used, sub-sort by in-flight requests + * sort by when queue was last serviced */ - while ((entry = entry->prev) != &cfqq->cfqd->rr_list) { + entry = list; + while ((entry = entry->prev) != list) { struct cfq_queue *__cfqq = list_entry_cfqq(entry); - if (cfqq->service_used > __cfqq->service_used) + if (!__cfqq->service_last) + break; + if (time_before(__cfqq->service_last, cfqq->service_last)) break; - else if (cfqq->service_used == __cfqq->service_used) { - struct list_head *prv; - - while ((prv = entry->prev) != &cfqq->cfqd->rr_list) { - __cfqq = list_entry_cfqq(prv); - - WARN_ON(__cfqq->service_used > cfqq->service_used); - if (cfqq->service_used != __cfqq->service_used) - break; - if (cfqq->in_flight > __cfqq->in_flight) - break; - - entry = prv; - } - } } list_add(&cfqq->cfq_list, entry); @@ -469,28 +536,24 @@ static void cfq_sort_rr_list(struct cfq_queue *cfqq, int new_queue) /* * add to busy list of queues for service, trying to be fair in ordering - * the pending list according to requests serviced + * the pending list according to last request service */ static inline void -cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) +cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq, int requeue) { - /* - * it's currently on the empty list - */ - cfqq->on_rr = 1; + BUG_ON(cfq_cfqq_on_rr(cfqq)); + cfq_mark_cfqq_on_rr(cfqq); cfqd->busy_queues++; - if (time_after(jiffies, cfqq->service_start + cfq_service)) - cfqq->service_used >>= 3; - - cfq_sort_rr_list(cfqq, 1); + cfq_resort_rr_list(cfqq, requeue); } static inline void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) { + BUG_ON(!cfq_cfqq_on_rr(cfqq)); + cfq_clear_cfqq_on_rr(cfqq); list_move(&cfqq->cfq_list, &cfqd->empty_list); - cfqq->on_rr = 0; BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; @@ -505,16 +568,17 @@ static inline void cfq_del_crq_rb(struct cfq_rq *crq) if (ON_RB(&crq->rb_node)) { struct cfq_data *cfqd = cfqq->cfqd; + const int sync = cfq_crq_is_sync(crq); - BUG_ON(!cfqq->queued[crq->is_sync]); + BUG_ON(!cfqq->queued[sync]); + cfqq->queued[sync]--; cfq_update_next_crq(crq); - cfqq->queued[crq->is_sync]--; rb_erase(&crq->rb_node, &cfqq->sort_list); RB_CLEAR_COLOR(&crq->rb_node); - if (RB_EMPTY(&cfqq->sort_list) && cfqq->on_rr) + if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY(&cfqq->sort_list)) cfq_del_cfqq_rr(cfqd, cfqq); } } @@ -550,7 +614,7 @@ static void cfq_add_crq_rb(struct cfq_rq *crq) struct cfq_rq *__alias; crq->rb_key = rq_rb_key(rq); - cfqq->queued[crq->is_sync]++; + cfqq->queued[cfq_crq_is_sync(crq)]++; /* * looks a little odd, but the first insert might return an alias. @@ -561,8 +625,8 @@ static void cfq_add_crq_rb(struct cfq_rq *crq) rb_insert_color(&crq->rb_node, &cfqq->sort_list); - if (!cfqq->on_rr) - cfq_add_cfqq_rr(cfqd, cfqq); + if (!cfq_cfqq_on_rr(cfqq)) + cfq_add_cfqq_rr(cfqd, cfqq, cfq_crq_requeued(crq)); /* * check if this request is a better next-serve candidate @@ -575,17 +639,16 @@ cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) { if (ON_RB(&crq->rb_node)) { rb_erase(&crq->rb_node, &cfqq->sort_list); - cfqq->queued[crq->is_sync]--; + cfqq->queued[cfq_crq_is_sync(crq)]--; } cfq_add_crq_rb(crq); } -static struct request * -cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector) +static struct request *cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector) + { - const unsigned long key = cfq_hash_key(cfqd, current); - struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, key); + struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->pid, CFQ_KEY_ANY); struct rb_node *n; if (!cfqq) @@ -609,20 +672,25 @@ out: static void cfq_deactivate_request(request_queue_t *q, struct request *rq) { + struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_rq *crq = RQ_DATA(rq); if (crq) { struct cfq_queue *cfqq = crq->cfq_queue; - if (cfqq->cfqd->cfq_tagged) { - cfqq->service_used--; - cfq_sort_rr_list(cfqq, 0); + if (cfq_crq_in_driver(crq)) { + cfq_clear_crq_in_driver(crq); + WARN_ON(!cfqd->rq_in_driver); + cfqd->rq_in_driver--; } + if (cfq_crq_in_flight(crq)) { + const int sync = cfq_crq_is_sync(crq); - if (crq->accounted) { - crq->accounted = 0; - cfqq->cfqd->rq_in_driver--; + cfq_clear_crq_in_flight(crq); + WARN_ON(!cfqq->on_dispatch[sync]); + cfqq->on_dispatch[sync]--; } + cfq_mark_crq_requeued(crq); } } @@ -640,11 +708,10 @@ static void cfq_remove_request(request_queue_t *q, struct request *rq) struct cfq_rq *crq = RQ_DATA(rq); if (crq) { - cfq_remove_merge_hints(q, crq); list_del_init(&rq->queuelist); + cfq_del_crq_rb(crq); + cfq_remove_merge_hints(q, crq); - if (crq->cfq_queue) - cfq_del_crq_rb(crq); } } @@ -662,21 +729,15 @@ cfq_merge(request_queue_t *q, struct request **req, struct bio *bio) } __rq = cfq_find_rq_hash(cfqd, bio->bi_sector); - if (__rq) { - BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector); - - if (elv_rq_merge_ok(__rq, bio)) { - ret = ELEVATOR_BACK_MERGE; - goto out; - } + if (__rq && elv_rq_merge_ok(__rq, bio)) { + ret = ELEVATOR_BACK_MERGE; + goto out; } __rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio)); - if (__rq) { - if (elv_rq_merge_ok(__rq, bio)) { - ret = ELEVATOR_FRONT_MERGE; - goto out; - } + if (__rq && elv_rq_merge_ok(__rq, bio)) { + ret = ELEVATOR_FRONT_MERGE; + goto out; } return ELEVATOR_NO_MERGE; @@ -709,20 +770,220 @@ static void cfq_merged_requests(request_queue_t *q, struct request *rq, struct request *next) { - struct cfq_rq *crq = RQ_DATA(rq); - struct cfq_rq *cnext = RQ_DATA(next); - cfq_merged_request(q, rq); - if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist)) { - if (time_before(cnext->queue_start, crq->queue_start)) { - list_move(&rq->queuelist, &next->queuelist); - crq->queue_start = cnext->queue_start; + /* + * reposition in fifo if next is older than rq + */ + if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && + time_before(next->start_time, rq->start_time)) + list_move(&rq->queuelist, &next->queuelist); + + cfq_remove_request(q, next); +} + +static inline void +__cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + if (cfqq) { + /* + * stop potential idle class queues waiting service + */ + del_timer(&cfqd->idle_class_timer); + + cfqq->slice_start = jiffies; + cfqq->slice_end = 0; + cfqq->slice_left = 0; + cfq_clear_cfqq_must_alloc_slice(cfqq); + cfq_clear_cfqq_fifo_expire(cfqq); + cfq_clear_cfqq_expired(cfqq); + } + + cfqd->active_queue = cfqq; +} + +/* + * 0 + * 0,1 + * 0,1,2 + * 0,1,2,3 + * 0,1,2,3,4 + * 0,1,2,3,4,5 + * 0,1,2,3,4,5,6 + * 0,1,2,3,4,5,6,7 + */ +static int cfq_get_next_prio_level(struct cfq_data *cfqd) +{ + int prio, wrap; + + prio = -1; + wrap = 0; + do { + int p; + + for (p = cfqd->cur_prio; p <= cfqd->cur_end_prio; p++) { + if (!list_empty(&cfqd->rr_list[p])) { + prio = p; + break; + } + } + + if (prio != -1) + break; + cfqd->cur_prio = 0; + if (++cfqd->cur_end_prio == CFQ_PRIO_LISTS) { + cfqd->cur_end_prio = 0; + if (wrap) + break; + wrap = 1; } + } while (1); + + if (unlikely(prio == -1)) + return -1; + + BUG_ON(prio >= CFQ_PRIO_LISTS); + + list_splice_init(&cfqd->rr_list[prio], &cfqd->cur_rr); + + cfqd->cur_prio = prio + 1; + if (cfqd->cur_prio > cfqd->cur_end_prio) { + cfqd->cur_end_prio = cfqd->cur_prio; + cfqd->cur_prio = 0; + } + if (cfqd->cur_end_prio == CFQ_PRIO_LISTS) { + cfqd->cur_prio = 0; + cfqd->cur_end_prio = 0; } - cfq_update_next_crq(cnext); - cfq_remove_request(q, next); + return prio; +} + +static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd) +{ + struct cfq_queue *cfqq; + + /* + * if current queue is expired but not done with its requests yet, + * wait for that to happen + */ + if ((cfqq = cfqd->active_queue) != NULL) { + if (cfq_cfqq_expired(cfqq) && cfq_cfqq_dispatched(cfqq)) + return NULL; + } + + /* + * if current list is non-empty, grab first entry. if it is empty, + * get next prio level and grab first entry then if any are spliced + */ + if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1) + cfqq = list_entry_cfqq(cfqd->cur_rr.next); + + /* + * if we have idle queues and no rt or be queues had pending + * requests, either allow immediate service if the grace period + * has passed or arm the idle grace timer + */ + if (!cfqq && !list_empty(&cfqd->idle_rr)) { + unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE; + + if (time_after_eq(jiffies, end)) + cfqq = list_entry_cfqq(cfqd->idle_rr.next); + else + mod_timer(&cfqd->idle_class_timer, end); + } + + __cfq_set_active_queue(cfqd, cfqq); + return cfqq; +} + +/* + * current cfqq expired its slice (or was too idle), select new one + */ +static void +__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, + int preempted) +{ + unsigned long now = jiffies; + + if (cfq_cfqq_wait_request(cfqq)) + del_timer(&cfqd->idle_slice_timer); + + if (!preempted && !cfq_cfqq_dispatched(cfqq)) + cfqq->service_last = now; + + cfq_clear_cfqq_must_dispatch(cfqq); + cfq_clear_cfqq_wait_request(cfqq); + + /* + * store what was left of this slice, if the queue idled out + * or was preempted + */ + if (time_after(now, cfqq->slice_end)) + cfqq->slice_left = now - cfqq->slice_end; + else + cfqq->slice_left = 0; + + if (cfq_cfqq_on_rr(cfqq)) + cfq_resort_rr_list(cfqq, preempted); + + if (cfqq == cfqd->active_queue) + cfqd->active_queue = NULL; + + if (cfqd->active_cic) { + put_io_context(cfqd->active_cic->ioc); + cfqd->active_cic = NULL; + } + + cfqd->dispatch_slice = 0; +} + +static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted) +{ + struct cfq_queue *cfqq = cfqd->active_queue; + + if (cfqq) { + /* + * use deferred expiry, if there are requests in progress as + * not to disturb the slice of the next queue + */ + if (cfq_cfqq_dispatched(cfqq)) + cfq_mark_cfqq_expired(cfqq); + else + __cfq_slice_expired(cfqd, cfqq, preempted); + } +} + +static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) + +{ + WARN_ON(!RB_EMPTY(&cfqq->sort_list)); + WARN_ON(cfqq != cfqd->active_queue); + + /* + * idle is disabled, either manually or by past process history + */ + if (!cfqd->cfq_slice_idle) + return 0; + if (!cfq_cfqq_idle_window(cfqq)) + return 0; + /* + * task has exited, don't wait + */ + if (cfqd->active_cic && !cfqd->active_cic->ioc->task) + return 0; + + cfq_mark_cfqq_must_dispatch(cfqq); + cfq_mark_cfqq_wait_request(cfqq); + + if (!timer_pending(&cfqd->idle_slice_timer)) { + unsigned long slice_left = min(cfqq->slice_end - 1, (unsigned long) cfqd->cfq_slice_idle); + + cfqd->idle_slice_timer.expires = jiffies + slice_left; + add_timer(&cfqd->idle_slice_timer); + } + + return 1; } /* @@ -738,31 +999,40 @@ static void cfq_dispatch_sort(request_queue_t *q, struct cfq_rq *crq) struct request *__rq; sector_t last; - cfq_del_crq_rb(crq); - cfq_remove_merge_hints(q, crq); list_del(&crq->request->queuelist); last = cfqd->last_sector; - while ((entry = entry->prev) != head) { - __rq = list_entry_rq(entry); + list_for_each_entry_reverse(__rq, head, queuelist) { + struct cfq_rq *__crq = RQ_DATA(__rq); - if (blk_barrier_rq(crq->request)) + if (blk_barrier_rq(__rq)) break; - if (!blk_fs_request(crq->request)) + if (!blk_fs_request(__rq)) + break; + if (cfq_crq_requeued(__crq)) break; - if (crq->request->sector > __rq->sector) + if (__rq->sector <= crq->request->sector) break; if (__rq->sector > last && crq->request->sector < last) { - last = crq->request->sector; + last = crq->request->sector + crq->request->nr_sectors; break; } + entry = &__rq->queuelist; } cfqd->last_sector = last; - crq->in_flight = 1; - cfqq->in_flight++; - list_add(&crq->request->queuelist, entry); + + cfqq->next_crq = cfq_find_next_crq(cfqd, cfqq, crq); + + cfq_del_crq_rb(crq); + cfq_remove_merge_hints(q, crq); + + cfq_mark_crq_in_flight(crq); + cfq_clear_crq_requeued(crq); + + cfqq->on_dispatch[cfq_crq_is_sync(crq)]++; + list_add_tail(&crq->request->queuelist, entry); } /* @@ -771,173 +1041,225 @@ static void cfq_dispatch_sort(request_queue_t *q, struct cfq_rq *crq) static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq) { struct cfq_data *cfqd = cfqq->cfqd; - const int reads = !list_empty(&cfqq->fifo[0]); - const int writes = !list_empty(&cfqq->fifo[1]); - unsigned long now = jiffies; + struct request *rq; struct cfq_rq *crq; - if (time_before(now, cfqq->last_fifo_expire + cfqd->cfq_fifo_batch_expire)) + if (cfq_cfqq_fifo_expire(cfqq)) return NULL; - crq = RQ_DATA(list_entry(cfqq->fifo[0].next, struct request, queuelist)); - if (reads && time_after(now, crq->queue_start + cfqd->cfq_fifo_expire_r)) { - cfqq->last_fifo_expire = now; - return crq; - } + if (!list_empty(&cfqq->fifo)) { + int fifo = cfq_cfqq_class_sync(cfqq); - crq = RQ_DATA(list_entry(cfqq->fifo[1].next, struct request, queuelist)); - if (writes && time_after(now, crq->queue_start + cfqd->cfq_fifo_expire_w)) { - cfqq->last_fifo_expire = now; - return crq; + crq = RQ_DATA(list_entry_fifo(cfqq->fifo.next)); + rq = crq->request; + if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) { + cfq_mark_cfqq_fifo_expire(cfqq); + return crq; + } } return NULL; } /* - * dispatch a single request from given queue + * Scale schedule slice based on io priority. Use the sync time slice only + * if a queue is marked sync and has sync io queued. A sync queue with async + * io only, should not get full sync slice length. */ +static inline int +cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)]; + + WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); + + return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - cfqq->ioprio)); +} + static inline void -cfq_dispatch_request(request_queue_t *q, struct cfq_data *cfqd, - struct cfq_queue *cfqq) +cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - struct cfq_rq *crq; + cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies; +} + +static inline int +cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + const int base_rq = cfqd->cfq_slice_async_rq; + + WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); + + return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); +} + +/* + * get next queue for service + */ +static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd, int force) +{ + unsigned long now = jiffies; + struct cfq_queue *cfqq; + + cfqq = cfqd->active_queue; + if (!cfqq) + goto new_queue; + + if (cfq_cfqq_expired(cfqq)) + goto new_queue; /* - * follow expired path, else get first next available + * slice has expired */ - if ((crq = cfq_check_fifo(cfqq)) == NULL) { - if (cfqd->find_best_crq) - crq = cfqq->next_crq; - else - crq = rb_entry_crq(rb_first(&cfqq->sort_list)); - } - - cfqd->last_sector = crq->request->sector + crq->request->nr_sectors; + if (!cfq_cfqq_must_dispatch(cfqq) && time_after(now, cfqq->slice_end)) + goto expire; /* - * finally, insert request into driver list + * if queue has requests, dispatch one. if not, check if + * enough slice is left to wait for one */ - cfq_dispatch_sort(q, crq); + if (!RB_EMPTY(&cfqq->sort_list)) + goto keep_queue; + else if (!force && cfq_cfqq_class_sync(cfqq) && + time_before(now, cfqq->slice_end)) { + if (cfq_arm_slice_timer(cfqd, cfqq)) + return NULL; + } + +expire: + cfq_slice_expired(cfqd, 0); +new_queue: + cfqq = cfq_set_active_queue(cfqd); +keep_queue: + return cfqq; } -static int cfq_dispatch_requests(request_queue_t *q, int max_dispatch) +static int +__cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq, + int max_dispatch) { - struct cfq_data *cfqd = q->elevator->elevator_data; - struct cfq_queue *cfqq; - struct list_head *entry, *tmp; - int queued, busy_queues, first_round; + int dispatched = 0; - if (list_empty(&cfqd->rr_list)) - return 0; + BUG_ON(RB_EMPTY(&cfqq->sort_list)); - queued = 0; - first_round = 1; -restart: - busy_queues = 0; - list_for_each_safe(entry, tmp, &cfqd->rr_list) { - cfqq = list_entry_cfqq(entry); + do { + struct cfq_rq *crq; - BUG_ON(RB_EMPTY(&cfqq->sort_list)); + /* + * follow expired path, else get first next available + */ + if ((crq = cfq_check_fifo(cfqq)) == NULL) + crq = cfqq->next_crq; /* - * first round of queueing, only select from queues that - * don't already have io in-flight + * finally, insert request into driver dispatch list */ - if (first_round && cfqq->in_flight) - continue; + cfq_dispatch_sort(cfqd->queue, crq); - cfq_dispatch_request(q, cfqd, cfqq); + cfqd->dispatch_slice++; + dispatched++; - if (!RB_EMPTY(&cfqq->sort_list)) - busy_queues++; + if (!cfqd->active_cic) { + atomic_inc(&crq->io_context->ioc->refcount); + cfqd->active_cic = crq->io_context; + } - queued++; - } + if (RB_EMPTY(&cfqq->sort_list)) + break; + + } while (dispatched < max_dispatch); + + /* + * if slice end isn't set yet, set it. if at least one request was + * sync, use the sync time slice value + */ + if (!cfqq->slice_end) + cfq_set_prio_slice(cfqd, cfqq); + + /* + * expire an async queue immediately if it has used up its slice. idle + * queue always expire after 1 dispatch round. + */ + if ((!cfq_cfqq_sync(cfqq) && + cfqd->dispatch_slice >= cfq_prio_to_maxrq(cfqd, cfqq)) || + cfq_class_idle(cfqq)) + cfq_slice_expired(cfqd, 0); + + return dispatched; +} + +static int +cfq_dispatch_requests(request_queue_t *q, int max_dispatch, int force) +{ + struct cfq_data *cfqd = q->elevator->elevator_data; + struct cfq_queue *cfqq; + + if (!cfqd->busy_queues) + return 0; + + cfqq = cfq_select_queue(cfqd, force); + if (cfqq) { + cfq_clear_cfqq_must_dispatch(cfqq); + cfq_clear_cfqq_wait_request(cfqq); + del_timer(&cfqd->idle_slice_timer); - if ((queued < max_dispatch) && (busy_queues || first_round)) { - first_round = 0; - goto restart; + if (cfq_class_idle(cfqq)) + max_dispatch = 1; + + return __cfq_dispatch_requests(cfqd, cfqq, max_dispatch); } - return queued; + return 0; } static inline void cfq_account_dispatch(struct cfq_rq *crq) { struct cfq_queue *cfqq = crq->cfq_queue; struct cfq_data *cfqd = cfqq->cfqd; - unsigned long now, elapsed; - if (!blk_fs_request(crq->request)) + if (unlikely(!blk_fs_request(crq->request))) return; /* * accounted bit is necessary since some drivers will call * elv_next_request() many times for the same request (eg ide) */ - if (crq->accounted) + if (cfq_crq_in_driver(crq)) return; - now = jiffies; - if (cfqq->service_start == ~0UL) - cfqq->service_start = now; - - /* - * on drives with tagged command queueing, command turn-around time - * doesn't necessarily reflect the time spent processing this very - * command inside the drive. so do the accounting differently there, - * by just sorting on the number of requests - */ - if (cfqd->cfq_tagged) { - if (time_after(now, cfqq->service_start + cfq_service)) { - cfqq->service_start = now; - cfqq->service_used /= 10; - } - - cfqq->service_used++; - cfq_sort_rr_list(cfqq, 0); - } - - elapsed = now - crq->queue_start; - if (elapsed > max_elapsed_dispatch) - max_elapsed_dispatch = elapsed; - - crq->accounted = 1; - crq->service_start = now; - - if (++cfqd->rq_in_driver >= CFQ_MAX_TAG && !cfqd->cfq_tagged) { - cfqq->cfqd->cfq_tagged = 1; - printk("cfq: depth %d reached, tagging now on\n", CFQ_MAX_TAG); - } + cfq_mark_crq_in_driver(crq); + cfqd->rq_in_driver++; } static inline void cfq_account_completion(struct cfq_queue *cfqq, struct cfq_rq *crq) { struct cfq_data *cfqd = cfqq->cfqd; + unsigned long now; - if (!crq->accounted) + if (!cfq_crq_in_driver(crq)) return; + now = jiffies; + WARN_ON(!cfqd->rq_in_driver); cfqd->rq_in_driver--; - if (!cfqd->cfq_tagged) { - unsigned long now = jiffies; - unsigned long duration = now - crq->service_start; + if (!cfq_class_idle(cfqq)) + cfqd->last_end_request = now; - if (time_after(now, cfqq->service_start + cfq_service)) { - cfqq->service_start = now; - cfqq->service_used >>= 3; + if (!cfq_cfqq_dispatched(cfqq)) { + if (cfq_cfqq_on_rr(cfqq)) { + cfqq->service_last = now; + cfq_resort_rr_list(cfqq, 0); + } + if (cfq_cfqq_expired(cfqq)) { + __cfq_slice_expired(cfqd, cfqq, 0); + cfq_schedule_dispatch(cfqd); } - - cfqq->service_used += duration; - cfq_sort_rr_list(cfqq, 0); - - if (duration > max_elapsed_crq) - max_elapsed_crq = duration; } + + if (cfq_crq_is_sync(crq)) + crq->io_context->last_end_request = now; } static struct request *cfq_next_request(request_queue_t *q) @@ -950,7 +1272,19 @@ static struct request *cfq_next_request(request_queue_t *q) dispatch: rq = list_entry_rq(q->queue_head.next); - if ((crq = RQ_DATA(rq)) != NULL) { + crq = RQ_DATA(rq); + if (crq) { + struct cfq_queue *cfqq = crq->cfq_queue; + + /* + * if idle window is disabled, allow queue buildup + */ + if (!cfq_crq_in_driver(crq) && + !cfq_cfqq_idle_window(cfqq) && + !blk_barrier_rq(rq) && + cfqd->rq_in_driver >= cfqd->cfq_max_depth) + return NULL; + cfq_remove_merge_hints(q, crq); cfq_account_dispatch(crq); } @@ -958,7 +1292,7 @@ dispatch: return rq; } - if (cfq_dispatch_requests(q, cfqd->cfq_quantum)) + if (cfq_dispatch_requests(q, cfqd->cfq_quantum, 0)) goto dispatch; return NULL; @@ -972,13 +1306,21 @@ dispatch: */ static void cfq_put_queue(struct cfq_queue *cfqq) { - BUG_ON(!atomic_read(&cfqq->ref)); + struct cfq_data *cfqd = cfqq->cfqd; + + BUG_ON(atomic_read(&cfqq->ref) <= 0); if (!atomic_dec_and_test(&cfqq->ref)) return; BUG_ON(rb_first(&cfqq->sort_list)); - BUG_ON(cfqq->on_rr); + BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); + BUG_ON(cfq_cfqq_on_rr(cfqq)); + + if (unlikely(cfqd->active_queue == cfqq)) { + __cfq_slice_expired(cfqd, cfqq, 0); + cfq_schedule_dispatch(cfqd); + } cfq_put_cfqd(cfqq->cfqd); @@ -991,15 +1333,17 @@ static void cfq_put_queue(struct cfq_queue *cfqq) } static inline struct cfq_queue * -__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key, const int hashval) +__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio, + const int hashval) { struct hlist_head *hash_list = &cfqd->cfq_hash[hashval]; struct hlist_node *entry, *next; hlist_for_each_safe(entry, next, hash_list) { struct cfq_queue *__cfqq = list_entry_qhash(entry); + const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->ioprio_class, __cfqq->ioprio); - if (__cfqq->key == key) + if (__cfqq->key == key && (__p == prio || prio == CFQ_KEY_ANY)) return __cfqq; } @@ -1007,94 +1351,220 @@ __cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key, const int hashval) } static struct cfq_queue * -cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key) +cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned short prio) { - return __cfq_find_cfq_hash(cfqd, key, hash_long(key, CFQ_QHASH_SHIFT)); + return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT)); } -static inline void -cfq_rehash_cfqq(struct cfq_data *cfqd, struct cfq_queue **cfqq, - struct cfq_io_context *cic) +static void cfq_free_io_context(struct cfq_io_context *cic) { - unsigned long hashkey = cfq_hash_key(cfqd, current); - unsigned long hashval = hash_long(hashkey, CFQ_QHASH_SHIFT); - struct cfq_queue *__cfqq; - unsigned long flags; - - spin_lock_irqsave(cfqd->queue->queue_lock, flags); + struct cfq_io_context *__cic; + struct list_head *entry, *next; - hlist_del(&(*cfqq)->cfq_hash); - - __cfqq = __cfq_find_cfq_hash(cfqd, hashkey, hashval); - if (!__cfqq || __cfqq == *cfqq) { - __cfqq = *cfqq; - hlist_add_head(&__cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); - __cfqq->key_type = cfqd->key_type; - } else { - atomic_inc(&__cfqq->ref); - cic->cfqq = __cfqq; - cfq_put_queue(*cfqq); - *cfqq = __cfqq; + list_for_each_safe(entry, next, &cic->list) { + __cic = list_entry(entry, struct cfq_io_context, list); + kmem_cache_free(cfq_ioc_pool, __cic); } - cic->cfqq = __cfqq; - spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); + kmem_cache_free(cfq_ioc_pool, cic); } -static void cfq_free_io_context(struct cfq_io_context *cic) +/* + * Called with interrupts disabled + */ +static void cfq_exit_single_io_context(struct cfq_io_context *cic) { - kmem_cache_free(cfq_ioc_pool, cic); + struct cfq_data *cfqd = cic->cfqq->cfqd; + request_queue_t *q = cfqd->queue; + + WARN_ON(!irqs_disabled()); + + spin_lock(q->queue_lock); + + if (unlikely(cic->cfqq == cfqd->active_queue)) { + __cfq_slice_expired(cfqd, cic->cfqq, 0); + cfq_schedule_dispatch(cfqd); + } + + cfq_put_queue(cic->cfqq); + cic->cfqq = NULL; + spin_unlock(q->queue_lock); } /* - * locking hierarchy is: io_context lock -> queue locks + * Another task may update the task cic list, if it is doing a queue lookup + * on its behalf. cfq_cic_lock excludes such concurrent updates */ static void cfq_exit_io_context(struct cfq_io_context *cic) { - struct cfq_queue *cfqq = cic->cfqq; - struct list_head *entry = &cic->list; - request_queue_t *q; + struct cfq_io_context *__cic; + struct list_head *entry; unsigned long flags; + local_irq_save(flags); + /* * put the reference this task is holding to the various queues */ - spin_lock_irqsave(&cic->ioc->lock, flags); - while ((entry = cic->list.next) != &cic->list) { - struct cfq_io_context *__cic; - + list_for_each(entry, &cic->list) { __cic = list_entry(entry, struct cfq_io_context, list); - list_del(entry); - - q = __cic->cfqq->cfqd->queue; - spin_lock(q->queue_lock); - cfq_put_queue(__cic->cfqq); - spin_unlock(q->queue_lock); + cfq_exit_single_io_context(__cic); } - q = cfqq->cfqd->queue; - spin_lock(q->queue_lock); - cfq_put_queue(cfqq); - spin_unlock(q->queue_lock); - - cic->cfqq = NULL; - spin_unlock_irqrestore(&cic->ioc->lock, flags); + cfq_exit_single_io_context(cic); + local_irq_restore(flags); } -static struct cfq_io_context *cfq_alloc_io_context(int gfp_flags) +static struct cfq_io_context * +cfq_alloc_io_context(struct cfq_data *cfqd, int gfp_mask) { - struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_flags); + struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_mask); if (cic) { - cic->dtor = cfq_free_io_context; - cic->exit = cfq_exit_io_context; INIT_LIST_HEAD(&cic->list); cic->cfqq = NULL; + cic->key = NULL; + cic->last_end_request = jiffies; + cic->ttime_total = 0; + cic->ttime_samples = 0; + cic->ttime_mean = 0; + cic->dtor = cfq_free_io_context; + cic->exit = cfq_exit_io_context; } return cic; } +static void cfq_init_prio_data(struct cfq_queue *cfqq) +{ + struct task_struct *tsk = current; + int ioprio_class; + + if (!cfq_cfqq_prio_changed(cfqq)) + return; + + ioprio_class = IOPRIO_PRIO_CLASS(tsk->ioprio); + switch (ioprio_class) { + default: + printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); + case IOPRIO_CLASS_NONE: + /* + * no prio set, place us in the middle of the BE classes + */ + cfqq->ioprio = task_nice_ioprio(tsk); + cfqq->ioprio_class = IOPRIO_CLASS_BE; + break; + case IOPRIO_CLASS_RT: + cfqq->ioprio = task_ioprio(tsk); + cfqq->ioprio_class = IOPRIO_CLASS_RT; + break; + case IOPRIO_CLASS_BE: + cfqq->ioprio = task_ioprio(tsk); + cfqq->ioprio_class = IOPRIO_CLASS_BE; + break; + case IOPRIO_CLASS_IDLE: + cfqq->ioprio_class = IOPRIO_CLASS_IDLE; + cfqq->ioprio = 7; + cfq_clear_cfqq_idle_window(cfqq); + break; + } + + /* + * keep track of original prio settings in case we have to temporarily + * elevate the priority of this queue + */ + cfqq->org_ioprio = cfqq->ioprio; + cfqq->org_ioprio_class = cfqq->ioprio_class; + + if (cfq_cfqq_on_rr(cfqq)) + cfq_resort_rr_list(cfqq, 0); + + cfq_clear_cfqq_prio_changed(cfqq); +} + +static inline void changed_ioprio(struct cfq_queue *cfqq) +{ + if (cfqq) { + struct cfq_data *cfqd = cfqq->cfqd; + + spin_lock(cfqd->queue->queue_lock); + cfq_mark_cfqq_prio_changed(cfqq); + cfq_init_prio_data(cfqq); + spin_unlock(cfqd->queue->queue_lock); + } +} + +/* + * callback from sys_ioprio_set, irqs are disabled + */ +static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio) +{ + struct cfq_io_context *cic = ioc->cic; + + changed_ioprio(cic->cfqq); + + list_for_each_entry(cic, &cic->list, list) + changed_ioprio(cic->cfqq); + + return 0; +} + +static struct cfq_queue * +cfq_get_queue(struct cfq_data *cfqd, unsigned int key, unsigned short ioprio, + int gfp_mask) +{ + const int hashval = hash_long(key, CFQ_QHASH_SHIFT); + struct cfq_queue *cfqq, *new_cfqq = NULL; + +retry: + cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval); + + if (!cfqq) { + if (new_cfqq) { + cfqq = new_cfqq; + new_cfqq = NULL; + } else if (gfp_mask & __GFP_WAIT) { + spin_unlock_irq(cfqd->queue->queue_lock); + new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask); + spin_lock_irq(cfqd->queue->queue_lock); + goto retry; + } else { + cfqq = kmem_cache_alloc(cfq_pool, gfp_mask); + if (!cfqq) + goto out; + } + + memset(cfqq, 0, sizeof(*cfqq)); + + INIT_HLIST_NODE(&cfqq->cfq_hash); + INIT_LIST_HEAD(&cfqq->cfq_list); + RB_CLEAR_ROOT(&cfqq->sort_list); + INIT_LIST_HEAD(&cfqq->fifo); + + cfqq->key = key; + hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); + atomic_set(&cfqq->ref, 0); + cfqq->cfqd = cfqd; + atomic_inc(&cfqd->ref); + cfqq->service_last = 0; + /* + * set ->slice_left to allow preemption for a new process + */ + cfqq->slice_left = 2 * cfqd->cfq_slice_idle; + cfq_mark_cfqq_idle_window(cfqq); + cfq_mark_cfqq_prio_changed(cfqq); + cfq_init_prio_data(cfqq); + } + + if (new_cfqq) + kmem_cache_free(cfq_pool, new_cfqq); + + atomic_inc(&cfqq->ref); +out: + WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq); + return cfqq; +} + /* * Setup general io context and cfq io context. There can be several cfq * io contexts per general io context, if this process is doing io to more @@ -1102,39 +1572,39 @@ static struct cfq_io_context *cfq_alloc_io_context(int gfp_flags) * cfqq, so we don't need to worry about it disappearing */ static struct cfq_io_context * -cfq_get_io_context(struct cfq_queue **cfqq, int gfp_flags) +cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, int gfp_mask) { - struct cfq_data *cfqd = (*cfqq)->cfqd; - struct cfq_queue *__cfqq = *cfqq; + struct io_context *ioc = NULL; struct cfq_io_context *cic; - struct io_context *ioc; - might_sleep_if(gfp_flags & __GFP_WAIT); + might_sleep_if(gfp_mask & __GFP_WAIT); - ioc = get_io_context(gfp_flags); + ioc = get_io_context(gfp_mask); if (!ioc) return NULL; if ((cic = ioc->cic) == NULL) { - cic = cfq_alloc_io_context(gfp_flags); + cic = cfq_alloc_io_context(cfqd, gfp_mask); if (cic == NULL) goto err; + /* + * manually increment generic io_context usage count, it + * cannot go away since we are already holding one ref to it + */ ioc->cic = cic; + ioc->set_ioprio = cfq_ioc_set_ioprio; cic->ioc = ioc; - cic->cfqq = __cfqq; - atomic_inc(&__cfqq->ref); + cic->key = cfqd; + atomic_inc(&cfqd->ref); } else { struct cfq_io_context *__cic; - unsigned long flags; /* - * since the first cic on the list is actually the head - * itself, need to check this here or we'll duplicate an - * cic per ioc for no reason + * the first cic on the list is actually the head itself */ - if (cic->cfqq == __cfqq) + if (cic->key == cfqd) goto out; /* @@ -1142,149 +1612,250 @@ cfq_get_io_context(struct cfq_queue **cfqq, int gfp_flags) * should be ok here, the list will usually not be more than * 1 or a few entries long */ - spin_lock_irqsave(&ioc->lock, flags); list_for_each_entry(__cic, &cic->list, list) { /* * this process is already holding a reference to * this queue, so no need to get one more */ - if (__cic->cfqq == __cfqq) { + if (__cic->key == cfqd) { cic = __cic; - spin_unlock_irqrestore(&ioc->lock, flags); goto out; } } - spin_unlock_irqrestore(&ioc->lock, flags); /* * nope, process doesn't have a cic assoicated with this * cfqq yet. get a new one and add to list */ - __cic = cfq_alloc_io_context(gfp_flags); + __cic = cfq_alloc_io_context(cfqd, gfp_mask); if (__cic == NULL) goto err; __cic->ioc = ioc; - __cic->cfqq = __cfqq; - atomic_inc(&__cfqq->ref); - spin_lock_irqsave(&ioc->lock, flags); + __cic->key = cfqd; + atomic_inc(&cfqd->ref); list_add(&__cic->list, &cic->list); - spin_unlock_irqrestore(&ioc->lock, flags); - cic = __cic; - *cfqq = __cfqq; } out: - /* - * if key_type has been changed on the fly, we lazily rehash - * each queue at lookup time - */ - if ((*cfqq)->key_type != cfqd->key_type) - cfq_rehash_cfqq(cfqd, cfqq, cic); - return cic; err: put_io_context(ioc); return NULL; } -static struct cfq_queue * -__cfq_get_queue(struct cfq_data *cfqd, unsigned long key, int gfp_mask) +static void +cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic) { - const int hashval = hash_long(key, CFQ_QHASH_SHIFT); - struct cfq_queue *cfqq, *new_cfqq = NULL; + unsigned long elapsed, ttime; -retry: - cfqq = __cfq_find_cfq_hash(cfqd, key, hashval); + /* + * if this context already has stuff queued, thinktime is from + * last queue not last end + */ +#if 0 + if (time_after(cic->last_end_request, cic->last_queue)) + elapsed = jiffies - cic->last_end_request; + else + elapsed = jiffies - cic->last_queue; +#else + elapsed = jiffies - cic->last_end_request; +#endif - if (!cfqq) { - if (new_cfqq) { - cfqq = new_cfqq; - new_cfqq = NULL; - } else if (gfp_mask & __GFP_WAIT) { - spin_unlock_irq(cfqd->queue->queue_lock); - new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask); - spin_lock_irq(cfqd->queue->queue_lock); - goto retry; - } else - goto out; + ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle); - memset(cfqq, 0, sizeof(*cfqq)); + cic->ttime_samples = (7*cic->ttime_samples + 256) / 8; + cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8; + cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples; +} - INIT_HLIST_NODE(&cfqq->cfq_hash); - INIT_LIST_HEAD(&cfqq->cfq_list); - RB_CLEAR_ROOT(&cfqq->sort_list); - INIT_LIST_HEAD(&cfqq->fifo[0]); - INIT_LIST_HEAD(&cfqq->fifo[1]); +#define sample_valid(samples) ((samples) > 80) - cfqq->key = key; - hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); - atomic_set(&cfqq->ref, 0); - cfqq->cfqd = cfqd; - atomic_inc(&cfqd->ref); - cfqq->key_type = cfqd->key_type; - cfqq->service_start = ~0UL; +/* + * Disable idle window if the process thinks too long or seeks so much that + * it doesn't matter + */ +static void +cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, + struct cfq_io_context *cic) +{ + int enable_idle = cfq_cfqq_idle_window(cfqq); + + if (!cic->ioc->task || !cfqd->cfq_slice_idle) + enable_idle = 0; + else if (sample_valid(cic->ttime_samples)) { + if (cic->ttime_mean > cfqd->cfq_slice_idle) + enable_idle = 0; + else + enable_idle = 1; } - if (new_cfqq) - kmem_cache_free(cfq_pool, new_cfqq); + if (enable_idle) + cfq_mark_cfqq_idle_window(cfqq); + else + cfq_clear_cfqq_idle_window(cfqq); +} - atomic_inc(&cfqq->ref); -out: - WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq); - return cfqq; + +/* + * Check if new_cfqq should preempt the currently active queue. Return 0 for + * no or if we aren't sure, a 1 will cause a preempt. + */ +static int +cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, + struct cfq_rq *crq) +{ + struct cfq_queue *cfqq = cfqd->active_queue; + + if (cfq_class_idle(new_cfqq)) + return 0; + + if (!cfqq) + return 1; + + if (cfq_class_idle(cfqq)) + return 1; + if (!cfq_cfqq_wait_request(new_cfqq)) + return 0; + /* + * if it doesn't have slice left, forget it + */ + if (new_cfqq->slice_left < cfqd->cfq_slice_idle) + return 0; + if (cfq_crq_is_sync(crq) && !cfq_cfqq_sync(cfqq)) + return 1; + + return 0; +} + +/* + * cfqq preempts the active queue. if we allowed preempt with no slice left, + * let it have half of its nominal slice. + */ +static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + struct cfq_queue *__cfqq, *next; + + list_for_each_entry_safe(__cfqq, next, &cfqd->cur_rr, cfq_list) + cfq_resort_rr_list(__cfqq, 1); + + if (!cfqq->slice_left) + cfqq->slice_left = cfq_prio_to_slice(cfqd, cfqq) / 2; + + cfqq->slice_end = cfqq->slice_left + jiffies; + __cfq_slice_expired(cfqd, cfqq, 1); + __cfq_set_active_queue(cfqd, cfqq); +} + +/* + * should really be a ll_rw_blk.c helper + */ +static void cfq_start_queueing(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + request_queue_t *q = cfqd->queue; + + if (!blk_queue_plugged(q)) + q->request_fn(q); + else + __generic_unplug_device(q); } -static void cfq_enqueue(struct cfq_data *cfqd, struct cfq_rq *crq) +/* + * Called when a new fs request (crq) is added (to cfqq). Check if there's + * something we should do about it + */ +static void +cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, + struct cfq_rq *crq) { - crq->is_sync = 0; - if (rq_data_dir(crq->request) == READ || current->flags & PF_SYNCWRITE) - crq->is_sync = 1; + const int sync = cfq_crq_is_sync(crq); + + cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq); + + if (sync) { + struct cfq_io_context *cic = crq->io_context; + + cfq_update_io_thinktime(cfqd, cic); + cfq_update_idle_window(cfqd, cfqq, cic); + + cic->last_queue = jiffies; + } + + if (cfqq == cfqd->active_queue) { + /* + * if we are waiting for a request for this queue, let it rip + * immediately and flag that we must not expire this queue + * just now + */ + if (cfq_cfqq_wait_request(cfqq)) { + cfq_mark_cfqq_must_dispatch(cfqq); + del_timer(&cfqd->idle_slice_timer); + cfq_start_queueing(cfqd, cfqq); + } + } else if (cfq_should_preempt(cfqd, cfqq, crq)) { + /* + * not the active queue - expire current slice if it is + * idle and has expired it's mean thinktime or this new queue + * has some old slice time left and is of higher priority + */ + cfq_preempt_queue(cfqd, cfqq); + cfq_mark_cfqq_must_dispatch(cfqq); + cfq_start_queueing(cfqd, cfqq); + } +} + +static void cfq_enqueue(struct cfq_data *cfqd, struct request *rq) +{ + struct cfq_rq *crq = RQ_DATA(rq); + struct cfq_queue *cfqq = crq->cfq_queue; + + cfq_init_prio_data(cfqq); cfq_add_crq_rb(crq); - crq->queue_start = jiffies; - list_add_tail(&crq->request->queuelist, &crq->cfq_queue->fifo[crq->is_sync]); + list_add_tail(&rq->queuelist, &cfqq->fifo); + + if (rq_mergeable(rq)) { + cfq_add_crq_hash(cfqd, crq); + + if (!cfqd->queue->last_merge) + cfqd->queue->last_merge = rq; + } + + cfq_crq_enqueued(cfqd, cfqq, crq); } static void cfq_insert_request(request_queue_t *q, struct request *rq, int where) { struct cfq_data *cfqd = q->elevator->elevator_data; - struct cfq_rq *crq = RQ_DATA(rq); switch (where) { case ELEVATOR_INSERT_BACK: - while (cfq_dispatch_requests(q, cfqd->cfq_quantum)) + while (cfq_dispatch_requests(q, INT_MAX, 1)) ; list_add_tail(&rq->queuelist, &q->queue_head); + /* + * If we were idling with pending requests on + * inactive cfqqs, force dispatching will + * remove the idle timer and the queue won't + * be kicked by __make_request() afterward. + * Kick it here. + */ + cfq_schedule_dispatch(cfqd); break; case ELEVATOR_INSERT_FRONT: list_add(&rq->queuelist, &q->queue_head); break; case ELEVATOR_INSERT_SORT: BUG_ON(!blk_fs_request(rq)); - cfq_enqueue(cfqd, crq); + cfq_enqueue(cfqd, rq); break; default: printk("%s: bad insert point %d\n", __FUNCTION__,where); return; } - - if (rq_mergeable(rq)) { - cfq_add_crq_hash(cfqd, crq); - - if (!q->last_merge) - q->last_merge = rq; - } -} - -static int cfq_queue_empty(request_queue_t *q) -{ - struct cfq_data *cfqd = q->elevator->elevator_data; - - return list_empty(&q->queue_head) && list_empty(&cfqd->rr_list); } static void cfq_completed_request(request_queue_t *q, struct request *rq) @@ -1297,9 +1868,11 @@ static void cfq_completed_request(request_queue_t *q, struct request *rq) cfqq = crq->cfq_queue; - if (crq->in_flight) { - WARN_ON(!cfqq->in_flight); - cfqq->in_flight--; + if (cfq_crq_in_flight(crq)) { + const int sync = cfq_crq_is_sync(crq); + + WARN_ON(!cfqq->on_dispatch[sync]); + cfqq->on_dispatch[sync]--; } cfq_account_completion(cfqq, crq); @@ -1329,51 +1902,136 @@ cfq_latter_request(request_queue_t *q, struct request *rq) return NULL; } -static int cfq_may_queue(request_queue_t *q, int rw) +/* + * we temporarily boost lower priority queues if they are holding fs exclusive + * resources. they are boosted to normal prio (CLASS_BE/4) + */ +static void cfq_prio_boost(struct cfq_queue *cfqq) { - struct cfq_data *cfqd = q->elevator->elevator_data; - struct cfq_queue *cfqq; - int ret = ELV_MQUEUE_MAY; + const int ioprio_class = cfqq->ioprio_class; + const int ioprio = cfqq->ioprio; - if (current->flags & PF_MEMALLOC) - return ELV_MQUEUE_MAY; + if (has_fs_excl()) { + /* + * boost idle prio on transactions that would lock out other + * users of the filesystem + */ + if (cfq_class_idle(cfqq)) + cfqq->ioprio_class = IOPRIO_CLASS_BE; + if (cfqq->ioprio > IOPRIO_NORM) + cfqq->ioprio = IOPRIO_NORM; + } else { + /* + * check if we need to unboost the queue + */ + if (cfqq->ioprio_class != cfqq->org_ioprio_class) + cfqq->ioprio_class = cfqq->org_ioprio_class; + if (cfqq->ioprio != cfqq->org_ioprio) + cfqq->ioprio = cfqq->org_ioprio; + } - cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(cfqd, current)); - if (cfqq) { - int limit = cfqd->max_queued; + /* + * refile between round-robin lists if we moved the priority class + */ + if ((ioprio_class != cfqq->ioprio_class || ioprio != cfqq->ioprio) && + cfq_cfqq_on_rr(cfqq)) + cfq_resort_rr_list(cfqq, 0); +} - if (cfqq->allocated[rw] < cfqd->cfq_queued) - return ELV_MQUEUE_MUST; +static inline pid_t cfq_queue_pid(struct task_struct *task, int rw) +{ + if (rw == READ || process_sync(task)) + return task->pid; - if (cfqd->busy_queues) - limit = q->nr_requests / cfqd->busy_queues; + return CFQ_KEY_ASYNC; +} - if (limit < cfqd->cfq_queued) - limit = cfqd->cfq_queued; - else if (limit > cfqd->max_queued) - limit = cfqd->max_queued; +static inline int +__cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq, + struct task_struct *task, int rw) +{ +#if 1 + if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) && + !cfq_cfqq_must_alloc_slice(cfqq)) { + cfq_mark_cfqq_must_alloc_slice(cfqq); + return ELV_MQUEUE_MUST; + } - if (cfqq->allocated[rw] >= limit) { - if (limit > cfqq->alloc_limit[rw]) - cfqq->alloc_limit[rw] = limit; + return ELV_MQUEUE_MAY; +#else + if (!cfqq || task->flags & PF_MEMALLOC) + return ELV_MQUEUE_MAY; + if (!cfqq->allocated[rw] || cfq_cfqq_must_alloc(cfqq)) { + if (cfq_cfqq_wait_request(cfqq)) + return ELV_MQUEUE_MUST; - ret = ELV_MQUEUE_NO; + /* + * only allow 1 ELV_MQUEUE_MUST per slice, otherwise we + * can quickly flood the queue with writes from a single task + */ + if (rw == READ || !cfq_cfqq_must_alloc_slice(cfqq)) { + cfq_mark_cfqq_must_alloc_slice(cfqq); + return ELV_MQUEUE_MUST; } + + return ELV_MQUEUE_MAY; } + if (cfq_class_idle(cfqq)) + return ELV_MQUEUE_NO; + if (cfqq->allocated[rw] >= cfqd->max_queued) { + struct io_context *ioc = get_io_context(GFP_ATOMIC); + int ret = ELV_MQUEUE_NO; - return ret; + if (ioc && ioc->nr_batch_requests) + ret = ELV_MQUEUE_MAY; + + put_io_context(ioc); + return ret; + } + + return ELV_MQUEUE_MAY; +#endif +} + +static int cfq_may_queue(request_queue_t *q, int rw, struct bio *bio) +{ + struct cfq_data *cfqd = q->elevator->elevator_data; + struct task_struct *tsk = current; + struct cfq_queue *cfqq; + + /* + * don't force setup of a queue from here, as a call to may_queue + * does not necessarily imply that a request actually will be queued. + * so just lookup a possibly existing queue, or return 'may queue' + * if that fails + */ + cfqq = cfq_find_cfq_hash(cfqd, cfq_queue_pid(tsk, rw), tsk->ioprio); + if (cfqq) { + cfq_init_prio_data(cfqq); + cfq_prio_boost(cfqq); + + return __cfq_may_queue(cfqd, cfqq, tsk, rw); + } + + return ELV_MQUEUE_MAY; } static void cfq_check_waiters(request_queue_t *q, struct cfq_queue *cfqq) { + struct cfq_data *cfqd = q->elevator->elevator_data; struct request_list *rl = &q->rq; - const int write = waitqueue_active(&rl->wait[WRITE]); - const int read = waitqueue_active(&rl->wait[READ]); - if (read && cfqq->allocated[READ] < cfqq->alloc_limit[READ]) - wake_up(&rl->wait[READ]); - if (write && cfqq->allocated[WRITE] < cfqq->alloc_limit[WRITE]) - wake_up(&rl->wait[WRITE]); + if (cfqq->allocated[READ] <= cfqd->max_queued || cfqd->rq_starved) { + smp_mb(); + if (waitqueue_active(&rl->wait[READ])) + wake_up(&rl->wait[READ]); + } + + if (cfqq->allocated[WRITE] <= cfqd->max_queued || cfqd->rq_starved) { + smp_mb(); + if (waitqueue_active(&rl->wait[WRITE])) + wake_up(&rl->wait[WRITE]); + } } /* @@ -1386,69 +2044,61 @@ static void cfq_put_request(request_queue_t *q, struct request *rq) if (crq) { struct cfq_queue *cfqq = crq->cfq_queue; + const int rw = rq_data_dir(rq); - BUG_ON(q->last_merge == rq); - BUG_ON(!hlist_unhashed(&crq->hash)); - - if (crq->io_context) - put_io_context(crq->io_context->ioc); + BUG_ON(!cfqq->allocated[rw]); + cfqq->allocated[rw]--; - BUG_ON(!cfqq->allocated[crq->is_write]); - cfqq->allocated[crq->is_write]--; + put_io_context(crq->io_context->ioc); mempool_free(crq, cfqd->crq_pool); rq->elevator_private = NULL; - smp_mb(); cfq_check_waiters(q, cfqq); cfq_put_queue(cfqq); } } /* - * Allocate cfq data structures associated with this request. A queue and + * Allocate cfq data structures associated with this request. */ -static int cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask) +static int +cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio, + int gfp_mask) { struct cfq_data *cfqd = q->elevator->elevator_data; + struct task_struct *tsk = current; struct cfq_io_context *cic; const int rw = rq_data_dir(rq); - struct cfq_queue *cfqq, *saved_cfqq; + pid_t key = cfq_queue_pid(tsk, rw); + struct cfq_queue *cfqq; struct cfq_rq *crq; unsigned long flags; might_sleep_if(gfp_mask & __GFP_WAIT); + cic = cfq_get_io_context(cfqd, key, gfp_mask); + spin_lock_irqsave(q->queue_lock, flags); - cfqq = __cfq_get_queue(cfqd, cfq_hash_key(cfqd, current), gfp_mask); - if (!cfqq) - goto out_lock; + if (!cic) + goto queue_fail; -repeat: - if (cfqq->allocated[rw] >= cfqd->max_queued) - goto out_lock; + if (!cic->cfqq) { + cfqq = cfq_get_queue(cfqd, key, tsk->ioprio, gfp_mask); + if (!cfqq) + goto queue_fail; + + cic->cfqq = cfqq; + } else + cfqq = cic->cfqq; cfqq->allocated[rw]++; + cfq_clear_cfqq_must_alloc(cfqq); + cfqd->rq_starved = 0; + atomic_inc(&cfqq->ref); spin_unlock_irqrestore(q->queue_lock, flags); - /* - * if hashing type has changed, the cfq_queue might change here. - */ - saved_cfqq = cfqq; - cic = cfq_get_io_context(&cfqq, gfp_mask); - if (!cic) - goto err; - - /* - * repeat allocation checks on queue change - */ - if (unlikely(saved_cfqq != cfqq)) { - spin_lock_irqsave(q->queue_lock, flags); - saved_cfqq->allocated[rw]--; - goto repeat; - } - crq = mempool_alloc(cfqd->crq_pool, gfp_mask); if (crq) { RB_CLEAR(&crq->rb_node); @@ -1457,24 +2107,141 @@ repeat: INIT_HLIST_NODE(&crq->hash); crq->cfq_queue = cfqq; crq->io_context = cic; - crq->service_start = crq->queue_start = 0; - crq->in_flight = crq->accounted = crq->is_sync = 0; - crq->is_write = rw; + cfq_clear_crq_in_flight(crq); + cfq_clear_crq_in_driver(crq); + cfq_clear_crq_requeued(crq); + + if (rw == READ || process_sync(tsk)) + cfq_mark_crq_is_sync(crq); + else + cfq_clear_crq_is_sync(crq); + rq->elevator_private = crq; - cfqq->alloc_limit[rw] = 0; return 0; } - put_io_context(cic->ioc); -err: spin_lock_irqsave(q->queue_lock, flags); cfqq->allocated[rw]--; + if (!(cfqq->allocated[0] + cfqq->allocated[1])) + cfq_mark_cfqq_must_alloc(cfqq); cfq_put_queue(cfqq); -out_lock: +queue_fail: + if (cic) + put_io_context(cic->ioc); + /* + * mark us rq allocation starved. we need to kickstart the process + * ourselves if there are no pending requests that can do it for us. + * that would be an extremely rare OOM situation + */ + cfqd->rq_starved = 1; + cfq_schedule_dispatch(cfqd); spin_unlock_irqrestore(q->queue_lock, flags); return 1; } +static void cfq_kick_queue(void *data) +{ + request_queue_t *q = data; + struct cfq_data *cfqd = q->elevator->elevator_data; + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + + if (cfqd->rq_starved) { + struct request_list *rl = &q->rq; + + /* + * we aren't guaranteed to get a request after this, but we + * have to be opportunistic + */ + smp_mb(); + if (waitqueue_active(&rl->wait[READ])) + wake_up(&rl->wait[READ]); + if (waitqueue_active(&rl->wait[WRITE])) + wake_up(&rl->wait[WRITE]); + } + + blk_remove_plug(q); + q->request_fn(q); + spin_unlock_irqrestore(q->queue_lock, flags); +} + +/* + * Timer running if the active_queue is currently idling inside its time slice + */ +static void cfq_idle_slice_timer(unsigned long data) +{ + struct cfq_data *cfqd = (struct cfq_data *) data; + struct cfq_queue *cfqq; + unsigned long flags; + + spin_lock_irqsave(cfqd->queue->queue_lock, flags); + + if ((cfqq = cfqd->active_queue) != NULL) { + unsigned long now = jiffies; + + /* + * expired + */ + if (time_after(now, cfqq->slice_end)) + goto expire; + + /* + * only expire and reinvoke request handler, if there are + * other queues with pending requests + */ + if (!cfq_pending_requests(cfqd)) { + cfqd->idle_slice_timer.expires = min(now + cfqd->cfq_slice_idle, cfqq->slice_end); + add_timer(&cfqd->idle_slice_timer); + goto out_cont; + } + + /* + * not expired and it has a request pending, let it dispatch + */ + if (!RB_EMPTY(&cfqq->sort_list)) { + cfq_mark_cfqq_must_dispatch(cfqq); + goto out_kick; + } + } +expire: + cfq_slice_expired(cfqd, 0); +out_kick: + cfq_schedule_dispatch(cfqd); +out_cont: + spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); +} + +/* + * Timer running if an idle class queue is waiting for service + */ +static void cfq_idle_class_timer(unsigned long data) +{ + struct cfq_data *cfqd = (struct cfq_data *) data; + unsigned long flags, end; + + spin_lock_irqsave(cfqd->queue->queue_lock, flags); + + /* + * race with a non-idle queue, reset timer + */ + end = cfqd->last_end_request + CFQ_IDLE_GRACE; + if (!time_after_eq(jiffies, end)) { + cfqd->idle_class_timer.expires = end; + add_timer(&cfqd->idle_class_timer); + } else + cfq_schedule_dispatch(cfqd); + + spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); +} + +static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) +{ + del_timer_sync(&cfqd->idle_slice_timer); + del_timer_sync(&cfqd->idle_class_timer); + blk_sync_queue(cfqd->queue); +} + static void cfq_put_cfqd(struct cfq_data *cfqd) { request_queue_t *q = cfqd->queue; @@ -1484,6 +2251,9 @@ static void cfq_put_cfqd(struct cfq_data *cfqd) blk_put_queue(q); + cfq_shutdown_timer_wq(cfqd); + q->elevator->elevator_data = NULL; + mempool_destroy(cfqd->crq_pool); kfree(cfqd->crq_hash); kfree(cfqd->cfq_hash); @@ -1492,7 +2262,10 @@ static void cfq_put_cfqd(struct cfq_data *cfqd) static void cfq_exit_queue(elevator_t *e) { - cfq_put_cfqd(e->elevator_data); + struct cfq_data *cfqd = e->elevator_data; + + cfq_shutdown_timer_wq(cfqd); + cfq_put_cfqd(cfqd); } static int cfq_init_queue(request_queue_t *q, elevator_t *e) @@ -1505,7 +2278,13 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e) return -ENOMEM; memset(cfqd, 0, sizeof(*cfqd)); - INIT_LIST_HEAD(&cfqd->rr_list); + + for (i = 0; i < CFQ_PRIO_LISTS; i++) + INIT_LIST_HEAD(&cfqd->rr_list[i]); + + INIT_LIST_HEAD(&cfqd->busy_rr); + INIT_LIST_HEAD(&cfqd->cur_rr); + INIT_LIST_HEAD(&cfqd->idle_rr); INIT_LIST_HEAD(&cfqd->empty_list); cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL); @@ -1530,24 +2309,32 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e) cfqd->queue = q; atomic_inc(&q->refcnt); - /* - * just set it to some high value, we want anyone to be able to queue - * some requests. fairness is handled differently - */ - q->nr_requests = 1024; - cfqd->max_queued = q->nr_requests / 16; + cfqd->max_queued = q->nr_requests / 4; q->nr_batching = cfq_queued; - cfqd->key_type = CFQ_KEY_TGID; - cfqd->find_best_crq = 1; + + init_timer(&cfqd->idle_slice_timer); + cfqd->idle_slice_timer.function = cfq_idle_slice_timer; + cfqd->idle_slice_timer.data = (unsigned long) cfqd; + + init_timer(&cfqd->idle_class_timer); + cfqd->idle_class_timer.function = cfq_idle_class_timer; + cfqd->idle_class_timer.data = (unsigned long) cfqd; + + INIT_WORK(&cfqd->unplug_work, cfq_kick_queue, q); + atomic_set(&cfqd->ref, 1); cfqd->cfq_queued = cfq_queued; cfqd->cfq_quantum = cfq_quantum; - cfqd->cfq_fifo_expire_r = cfq_fifo_expire_r; - cfqd->cfq_fifo_expire_w = cfq_fifo_expire_w; - cfqd->cfq_fifo_batch_expire = cfq_fifo_rate; + cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0]; + cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1]; cfqd->cfq_back_max = cfq_back_max; cfqd->cfq_back_penalty = cfq_back_penalty; + cfqd->cfq_slice[0] = cfq_slice_async; + cfqd->cfq_slice[1] = cfq_slice_sync; + cfqd->cfq_slice_async_rq = cfq_slice_async_rq; + cfqd->cfq_slice_idle = cfq_slice_idle; + cfqd->cfq_max_depth = cfq_max_depth; return 0; out_crqpool: @@ -1592,7 +2379,6 @@ fail: return -ENOMEM; } - /* * sysfs parts below --> */ @@ -1617,45 +2403,6 @@ cfq_var_store(unsigned int *var, const char *page, size_t count) return count; } -static ssize_t -cfq_clear_elapsed(struct cfq_data *cfqd, const char *page, size_t count) -{ - max_elapsed_dispatch = max_elapsed_crq = 0; - return count; -} - -static ssize_t -cfq_set_key_type(struct cfq_data *cfqd, const char *page, size_t count) -{ - spin_lock_irq(cfqd->queue->queue_lock); - if (!strncmp(page, "pgid", 4)) - cfqd->key_type = CFQ_KEY_PGID; - else if (!strncmp(page, "tgid", 4)) - cfqd->key_type = CFQ_KEY_TGID; - else if (!strncmp(page, "uid", 3)) - cfqd->key_type = CFQ_KEY_UID; - else if (!strncmp(page, "gid", 3)) - cfqd->key_type = CFQ_KEY_GID; - spin_unlock_irq(cfqd->queue->queue_lock); - return count; -} - -static ssize_t -cfq_read_key_type(struct cfq_data *cfqd, char *page) -{ - ssize_t len = 0; - int i; - - for (i = CFQ_KEY_PGID; i < CFQ_KEY_LAST; i++) { - if (cfqd->key_type == i) - len += sprintf(page+len, "[%s] ", cfq_key_types[i]); - else - len += sprintf(page+len, "%s ", cfq_key_types[i]); - } - len += sprintf(page+len, "\n"); - return len; -} - #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ static ssize_t __FUNC(struct cfq_data *cfqd, char *page) \ { \ @@ -1666,12 +2413,15 @@ static ssize_t __FUNC(struct cfq_data *cfqd, char *page) \ } SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0); SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0); -SHOW_FUNCTION(cfq_fifo_expire_r_show, cfqd->cfq_fifo_expire_r, 1); -SHOW_FUNCTION(cfq_fifo_expire_w_show, cfqd->cfq_fifo_expire_w, 1); -SHOW_FUNCTION(cfq_fifo_batch_expire_show, cfqd->cfq_fifo_batch_expire, 1); -SHOW_FUNCTION(cfq_find_best_show, cfqd->find_best_crq, 0); +SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1); +SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); SHOW_FUNCTION(cfq_back_max_show, cfqd->cfq_back_max, 0); SHOW_FUNCTION(cfq_back_penalty_show, cfqd->cfq_back_penalty, 0); +SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); +SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); +SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); +SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); +SHOW_FUNCTION(cfq_max_depth_show, cfqd->cfq_max_depth, 0); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ @@ -1691,12 +2441,15 @@ static ssize_t __FUNC(struct cfq_data *cfqd, const char *page, size_t count) \ } STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0); STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0); -STORE_FUNCTION(cfq_fifo_expire_r_store, &cfqd->cfq_fifo_expire_r, 1, UINT_MAX, 1); -STORE_FUNCTION(cfq_fifo_expire_w_store, &cfqd->cfq_fifo_expire_w, 1, UINT_MAX, 1); -STORE_FUNCTION(cfq_fifo_batch_expire_store, &cfqd->cfq_fifo_batch_expire, 0, UINT_MAX, 1); -STORE_FUNCTION(cfq_find_best_store, &cfqd->find_best_crq, 0, 1, 0); +STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1); +STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_back_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); STORE_FUNCTION(cfq_back_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0); +STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1); +STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); +STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); +STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0); +STORE_FUNCTION(cfq_max_depth_store, &cfqd->cfq_max_depth, 1, UINT_MAX, 0); #undef STORE_FUNCTION static struct cfq_fs_entry cfq_quantum_entry = { @@ -1709,25 +2462,15 @@ static struct cfq_fs_entry cfq_queued_entry = { .show = cfq_queued_show, .store = cfq_queued_store, }; -static struct cfq_fs_entry cfq_fifo_expire_r_entry = { +static struct cfq_fs_entry cfq_fifo_expire_sync_entry = { .attr = {.name = "fifo_expire_sync", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_fifo_expire_r_show, - .store = cfq_fifo_expire_r_store, + .show = cfq_fifo_expire_sync_show, + .store = cfq_fifo_expire_sync_store, }; -static struct cfq_fs_entry cfq_fifo_expire_w_entry = { +static struct cfq_fs_entry cfq_fifo_expire_async_entry = { .attr = {.name = "fifo_expire_async", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_fifo_expire_w_show, - .store = cfq_fifo_expire_w_store, -}; -static struct cfq_fs_entry cfq_fifo_batch_expire_entry = { - .attr = {.name = "fifo_batch_expire", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_fifo_batch_expire_show, - .store = cfq_fifo_batch_expire_store, -}; -static struct cfq_fs_entry cfq_find_best_entry = { - .attr = {.name = "find_best_crq", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_find_best_show, - .store = cfq_find_best_store, + .show = cfq_fifo_expire_async_show, + .store = cfq_fifo_expire_async_store, }; static struct cfq_fs_entry cfq_back_max_entry = { .attr = {.name = "back_seek_max", .mode = S_IRUGO | S_IWUSR }, @@ -1739,27 +2482,44 @@ static struct cfq_fs_entry cfq_back_penalty_entry = { .show = cfq_back_penalty_show, .store = cfq_back_penalty_store, }; -static struct cfq_fs_entry cfq_clear_elapsed_entry = { - .attr = {.name = "clear_elapsed", .mode = S_IWUSR }, - .store = cfq_clear_elapsed, +static struct cfq_fs_entry cfq_slice_sync_entry = { + .attr = {.name = "slice_sync", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_slice_sync_show, + .store = cfq_slice_sync_store, +}; +static struct cfq_fs_entry cfq_slice_async_entry = { + .attr = {.name = "slice_async", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_slice_async_show, + .store = cfq_slice_async_store, +}; +static struct cfq_fs_entry cfq_slice_async_rq_entry = { + .attr = {.name = "slice_async_rq", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_slice_async_rq_show, + .store = cfq_slice_async_rq_store, }; -static struct cfq_fs_entry cfq_key_type_entry = { - .attr = {.name = "key_type", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_read_key_type, - .store = cfq_set_key_type, +static struct cfq_fs_entry cfq_slice_idle_entry = { + .attr = {.name = "slice_idle", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_slice_idle_show, + .store = cfq_slice_idle_store, +}; +static struct cfq_fs_entry cfq_max_depth_entry = { + .attr = {.name = "max_depth", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_max_depth_show, + .store = cfq_max_depth_store, }; static struct attribute *default_attrs[] = { &cfq_quantum_entry.attr, &cfq_queued_entry.attr, - &cfq_fifo_expire_r_entry.attr, - &cfq_fifo_expire_w_entry.attr, - &cfq_fifo_batch_expire_entry.attr, - &cfq_key_type_entry.attr, - &cfq_find_best_entry.attr, + &cfq_fifo_expire_sync_entry.attr, + &cfq_fifo_expire_async_entry.attr, &cfq_back_max_entry.attr, &cfq_back_penalty_entry.attr, - &cfq_clear_elapsed_entry.attr, + &cfq_slice_sync_entry.attr, + &cfq_slice_async_entry.attr, + &cfq_slice_async_rq_entry.attr, + &cfq_slice_idle_entry.attr, + &cfq_max_depth_entry.attr, NULL, }; @@ -1772,7 +2532,7 @@ cfq_attr_show(struct kobject *kobj, struct attribute *attr, char *page) struct cfq_fs_entry *entry = to_cfq(attr); if (!entry->show) - return 0; + return -EIO; return entry->show(e->elevator_data, page); } @@ -1785,7 +2545,7 @@ cfq_attr_store(struct kobject *kobj, struct attribute *attr, struct cfq_fs_entry *entry = to_cfq(attr); if (!entry->store) - return -EINVAL; + return -EIO; return entry->store(e->elevator_data, page, length); } @@ -1829,21 +2589,46 @@ static int __init cfq_init(void) { int ret; + /* + * could be 0 on HZ < 1000 setups + */ + if (!cfq_slice_async) + cfq_slice_async = 1; + if (!cfq_slice_idle) + cfq_slice_idle = 1; + if (cfq_slab_setup()) return -ENOMEM; ret = elv_register(&iosched_cfq); - if (!ret) { - __module_get(THIS_MODULE); - return 0; - } + if (ret) + cfq_slab_kill(); - cfq_slab_kill(); return ret; } static void __exit cfq_exit(void) { + struct task_struct *g, *p; + unsigned long flags; + + read_lock_irqsave(&tasklist_lock, flags); + + /* + * iterate each process in the system, removing our io_context + */ + do_each_thread(g, p) { + struct io_context *ioc = p->io_context; + + if (ioc && ioc->cic) { + ioc->cic->exit(ioc->cic); + cfq_free_io_context(ioc->cic); + ioc->cic = NULL; + } + } while_each_thread(g, p); + + read_unlock_irqrestore(&tasklist_lock, flags); + cfq_slab_kill(); elv_unregister(&iosched_cfq); } diff --git a/drivers/block/deadline-iosched.c b/drivers/block/deadline-iosched.c index d63d34c..ff5201e 100644 --- a/drivers/block/deadline-iosched.c +++ b/drivers/block/deadline-iosched.c @@ -711,18 +711,20 @@ static int deadline_init_queue(request_queue_t *q, elevator_t *e) if (!drq_pool) return -ENOMEM; - dd = kmalloc(sizeof(*dd), GFP_KERNEL); + dd = kmalloc_node(sizeof(*dd), GFP_KERNEL, q->node); if (!dd) return -ENOMEM; memset(dd, 0, sizeof(*dd)); - dd->hash = kmalloc(sizeof(struct list_head)*DL_HASH_ENTRIES,GFP_KERNEL); + dd->hash = kmalloc_node(sizeof(struct list_head)*DL_HASH_ENTRIES, + GFP_KERNEL, q->node); if (!dd->hash) { kfree(dd); return -ENOMEM; } - dd->drq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, drq_pool); + dd->drq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, + mempool_free_slab, drq_pool, q->node); if (!dd->drq_pool) { kfree(dd->hash); kfree(dd); @@ -758,7 +760,8 @@ static void deadline_put_request(request_queue_t *q, struct request *rq) } static int -deadline_set_request(request_queue_t *q, struct request *rq, int gfp_mask) +deadline_set_request(request_queue_t *q, struct request *rq, struct bio *bio, + int gfp_mask) { struct deadline_data *dd = q->elevator->elevator_data; struct deadline_rq *drq; @@ -886,7 +889,7 @@ deadline_attr_show(struct kobject *kobj, struct attribute *attr, char *page) struct deadline_fs_entry *entry = to_deadline(attr); if (!entry->show) - return 0; + return -EIO; return entry->show(e->elevator_data, page); } @@ -899,7 +902,7 @@ deadline_attr_store(struct kobject *kobj, struct attribute *attr, struct deadline_fs_entry *entry = to_deadline(attr); if (!entry->store) - return -EINVAL; + return -EIO; return entry->store(e->elevator_data, page, length); } diff --git a/drivers/block/elevator.c b/drivers/block/elevator.c index 6b79b43..98f0126 100644 --- a/drivers/block/elevator.c +++ b/drivers/block/elevator.c @@ -220,11 +220,6 @@ void elevator_exit(elevator_t *e) kfree(e); } -static int elevator_global_init(void) -{ - return 0; -} - int elv_merge(request_queue_t *q, struct request **req, struct bio *bio) { elevator_t *e = q->elevator; @@ -291,6 +286,13 @@ void elv_requeue_request(request_queue_t *q, struct request *rq) } /* + * the request is prepped and may have some resources allocated. + * allowing unprepped requests to pass this one may cause resource + * deadlock. turn on softbarrier. + */ + rq->flags |= REQ_SOFTBARRIER; + + /* * if iosched has an explicit requeue hook, then use that. otherwise * just put the request at the front of the queue */ @@ -322,7 +324,7 @@ void __elv_add_request(request_queue_t *q, struct request *rq, int where, int nrq = q->rq.count[READ] + q->rq.count[WRITE] - q->in_flight; - if (nrq == q->unplug_thresh) + if (nrq >= q->unplug_thresh) __generic_unplug_device(q); } } else @@ -386,6 +388,12 @@ struct request *elv_next_request(request_queue_t *q) if (ret == BLKPREP_OK) { break; } else if (ret == BLKPREP_DEFER) { + /* + * the request may have been (partially) prepped. + * we need to keep this request in the front to + * avoid resource deadlock. turn on softbarrier. + */ + rq->flags |= REQ_SOFTBARRIER; rq = NULL; break; } else if (ret == BLKPREP_KILL) { @@ -478,12 +486,13 @@ struct request *elv_former_request(request_queue_t *q, struct request *rq) return NULL; } -int elv_set_request(request_queue_t *q, struct request *rq, int gfp_mask) +int elv_set_request(request_queue_t *q, struct request *rq, struct bio *bio, + int gfp_mask) { elevator_t *e = q->elevator; if (e->ops->elevator_set_req_fn) - return e->ops->elevator_set_req_fn(q, rq, gfp_mask); + return e->ops->elevator_set_req_fn(q, rq, bio, gfp_mask); rq->elevator_private = NULL; return 0; @@ -497,12 +506,12 @@ void elv_put_request(request_queue_t *q, struct request *rq) e->ops->elevator_put_req_fn(q, rq); } -int elv_may_queue(request_queue_t *q, int rw) +int elv_may_queue(request_queue_t *q, int rw, struct bio *bio) { elevator_t *e = q->elevator; if (e->ops->elevator_may_queue_fn) - return e->ops->elevator_may_queue_fn(q, rw); + return e->ops->elevator_may_queue_fn(q, rw, bio); return ELV_MQUEUE_MAY; } @@ -692,8 +701,6 @@ ssize_t elv_iosched_show(request_queue_t *q, char *name) return len; } -module_init(elevator_global_init); - EXPORT_SYMBOL(elv_add_request); EXPORT_SYMBOL(__elv_add_request); EXPORT_SYMBOL(elv_requeue_request); diff --git a/drivers/block/genhd.c b/drivers/block/genhd.c index 8bbe01d..47fd365 100644 --- a/drivers/block/genhd.c +++ b/drivers/block/genhd.c @@ -40,7 +40,7 @@ static inline int major_to_index(int major) #ifdef CONFIG_PROC_FS /* get block device names in somewhat random order */ -int get_blkdev_list(char *p) +int get_blkdev_list(char *p, int used) { struct blk_major_name *n; int i, len; @@ -49,10 +49,18 @@ int get_blkdev_list(char *p) down(&block_subsys_sem); for (i = 0; i < ARRAY_SIZE(major_names); i++) { - for (n = major_names[i]; n; n = n->next) + for (n = major_names[i]; n; n = n->next) { + /* + * If the curent string plus the 5 extra characters + * in the line would run us off the page, then we're done + */ + if ((len + used + strlen(n->name) + 5) >= PAGE_SIZE) + goto page_full; len += sprintf(p+len, "%3d %s\n", n->major, n->name); + } } +page_full: up(&block_subsys_sem); return len; @@ -322,7 +330,7 @@ static ssize_t disk_attr_show(struct kobject *kobj, struct attribute *attr, struct gendisk *disk = to_disk(kobj); struct disk_attribute *disk_attr = container_of(attr,struct disk_attribute,attr); - ssize_t ret = 0; + ssize_t ret = -EIO; if (disk_attr->show) ret = disk_attr->show(disk,page); @@ -582,10 +590,16 @@ struct seq_operations diskstats_op = { .show = diskstats_show }; - struct gendisk *alloc_disk(int minors) { - struct gendisk *disk = kmalloc(sizeof(struct gendisk), GFP_KERNEL); + return alloc_disk_node(minors, -1); +} + +struct gendisk *alloc_disk_node(int minors, int node_id) +{ + struct gendisk *disk; + + disk = kmalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); if (disk) { memset(disk, 0, sizeof(struct gendisk)); if (!init_disk_stats(disk)) { @@ -594,7 +608,7 @@ struct gendisk *alloc_disk(int minors) } if (minors > 1) { int size = (minors - 1) * sizeof(struct hd_struct *); - disk->part = kmalloc(size, GFP_KERNEL); + disk->part = kmalloc_node(size, GFP_KERNEL, node_id); if (!disk->part) { kfree(disk); return NULL; @@ -610,6 +624,7 @@ struct gendisk *alloc_disk(int minors) } EXPORT_SYMBOL(alloc_disk); +EXPORT_SYMBOL(alloc_disk_node); struct kobject *get_disk(struct gendisk *disk) { diff --git a/drivers/block/ioctl.c b/drivers/block/ioctl.c index 6d7bcc9..6e27847 100644 --- a/drivers/block/ioctl.c +++ b/drivers/block/ioctl.c @@ -133,11 +133,9 @@ static int put_u64(unsigned long arg, u64 val) return put_user(val, (u64 __user *)arg); } -int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd, - unsigned long arg) +static int blkdev_locked_ioctl(struct file *file, struct block_device *bdev, + unsigned cmd, unsigned long arg) { - struct block_device *bdev = inode->i_bdev; - struct gendisk *disk = bdev->bd_disk; struct backing_dev_info *bdi; int ret, n; @@ -190,36 +188,72 @@ int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd, return put_ulong(arg, bdev->bd_inode->i_size >> 9); case BLKGETSIZE64: return put_u64(arg, bdev->bd_inode->i_size); + } + return -ENOIOCTLCMD; +} + +static int blkdev_driver_ioctl(struct inode *inode, struct file *file, + struct gendisk *disk, unsigned cmd, unsigned long arg) +{ + int ret; + if (disk->fops->unlocked_ioctl) + return disk->fops->unlocked_ioctl(file, cmd, arg); + + if (disk->fops->ioctl) { + lock_kernel(); + ret = disk->fops->ioctl(inode, file, cmd, arg); + unlock_kernel(); + return ret; + } + + return -ENOTTY; +} + +int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd, + unsigned long arg) +{ + struct block_device *bdev = inode->i_bdev; + struct gendisk *disk = bdev->bd_disk; + int ret, n; + + switch(cmd) { case BLKFLSBUF: if (!capable(CAP_SYS_ADMIN)) return -EACCES; - if (disk->fops->ioctl) { - ret = disk->fops->ioctl(inode, file, cmd, arg); - /* -EINVAL to handle old uncorrected drivers */ - if (ret != -EINVAL && ret != -ENOTTY) - return ret; - } + + ret = blkdev_driver_ioctl(inode, file, disk, cmd, arg); + /* -EINVAL to handle old uncorrected drivers */ + if (ret != -EINVAL && ret != -ENOTTY) + return ret; + + lock_kernel(); fsync_bdev(bdev); invalidate_bdev(bdev, 0); + unlock_kernel(); return 0; + case BLKROSET: - if (disk->fops->ioctl) { - ret = disk->fops->ioctl(inode, file, cmd, arg); - /* -EINVAL to handle old uncorrected drivers */ - if (ret != -EINVAL && ret != -ENOTTY) - return ret; - } + ret = blkdev_driver_ioctl(inode, file, disk, cmd, arg); + /* -EINVAL to handle old uncorrected drivers */ + if (ret != -EINVAL && ret != -ENOTTY) + return ret; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (get_user(n, (int __user *)(arg))) return -EFAULT; + lock_kernel(); set_device_ro(bdev, n); + unlock_kernel(); return 0; - default: - if (disk->fops->ioctl) - return disk->fops->ioctl(inode, file, cmd, arg); } - return -ENOTTY; + + lock_kernel(); + ret = blkdev_locked_ioctl(file, bdev, cmd, arg); + unlock_kernel(); + if (ret != -ENOIOCTLCMD) + return ret; + + return blkdev_driver_ioctl(inode, file, disk, cmd, arg); } /* Most of the generic ioctls are handled in the normal fallback path. diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 11ef9d9..3c81854 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -28,6 +28,7 @@ #include <linux/slab.h> #include <linux/swap.h> #include <linux/writeback.h> +#include <linux/blkdev.h> /* * for max sense size @@ -36,6 +37,7 @@ static void blk_unplug_work(void *data); static void blk_unplug_timeout(unsigned long data); +static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io); /* * For the allocated request tables @@ -274,6 +276,7 @@ static inline void rq_init(request_queue_t *q, struct request *rq) rq->errors = 0; rq->rq_status = RQ_ACTIVE; rq->bio = rq->biotail = NULL; + rq->ioprio = 0; rq->buffer = NULL; rq->ref_count = 1; rq->q = q; @@ -774,9 +777,9 @@ EXPORT_SYMBOL(blk_queue_free_tags); static int init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth) { - int bits, i; struct request **tag_index; unsigned long *tag_map; + int nr_ulongs; if (depth > q->nr_requests * 2) { depth = q->nr_requests * 2; @@ -788,24 +791,18 @@ init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth) if (!tag_index) goto fail; - bits = (depth / BLK_TAGS_PER_LONG) + 1; - tag_map = kmalloc(bits * sizeof(unsigned long), GFP_ATOMIC); + nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG; + tag_map = kmalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC); if (!tag_map) goto fail; memset(tag_index, 0, depth * sizeof(struct request *)); - memset(tag_map, 0, bits * sizeof(unsigned long)); + memset(tag_map, 0, nr_ulongs * sizeof(unsigned long)); + tags->real_max_depth = depth; tags->max_depth = depth; - tags->real_max_depth = bits * BITS_PER_LONG; tags->tag_index = tag_index; tags->tag_map = tag_map; - /* - * set the upper bits if the depth isn't a multiple of the word size - */ - for (i = depth; i < bits * BLK_TAGS_PER_LONG; i++) - __set_bit(i, tag_map); - return 0; fail: kfree(tag_index); @@ -870,13 +867,16 @@ int blk_queue_resize_tags(request_queue_t *q, int new_depth) struct blk_queue_tag *bqt = q->queue_tags; struct request **tag_index; unsigned long *tag_map; - int bits, max_depth; + int max_depth, nr_ulongs; if (!bqt) return -ENXIO; /* - * don't bother sizing down + * if we already have large enough real_max_depth. just + * adjust max_depth. *NOTE* as requests with tag value + * between new_depth and real_max_depth can be in-flight, tag + * map can not be shrunk blindly here. */ if (new_depth <= bqt->real_max_depth) { bqt->max_depth = new_depth; @@ -894,8 +894,8 @@ int blk_queue_resize_tags(request_queue_t *q, int new_depth) return -ENOMEM; memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *)); - bits = max_depth / BLK_TAGS_PER_LONG; - memcpy(bqt->tag_map, tag_map, bits * sizeof(unsigned long)); + nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG; + memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long)); kfree(tag_index); kfree(tag_map); @@ -926,10 +926,15 @@ void blk_queue_end_tag(request_queue_t *q, struct request *rq) BUG_ON(tag == -1); if (unlikely(tag >= bqt->real_max_depth)) + /* + * This can happen after tag depth has been reduced. + * FIXME: how about a warning or info message here? + */ return; if (unlikely(!__test_and_clear_bit(tag, bqt->tag_map))) { - printk("attempt to clear non-busy tag (%d)\n", tag); + printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n", + __FUNCTION__, tag); return; } @@ -938,7 +943,8 @@ void blk_queue_end_tag(request_queue_t *q, struct request *rq) rq->tag = -1; if (unlikely(bqt->tag_index[tag] == NULL)) - printk("tag %d is missing\n", tag); + printk(KERN_ERR "%s: tag %d is missing\n", + __FUNCTION__, tag); bqt->tag_index[tag] = NULL; bqt->busy--; @@ -967,24 +973,20 @@ EXPORT_SYMBOL(blk_queue_end_tag); int blk_queue_start_tag(request_queue_t *q, struct request *rq) { struct blk_queue_tag *bqt = q->queue_tags; - unsigned long *map = bqt->tag_map; - int tag = 0; + int tag; if (unlikely((rq->flags & REQ_QUEUED))) { printk(KERN_ERR - "request %p for device [%s] already tagged %d", - rq, rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag); + "%s: request %p for device [%s] already tagged %d", + __FUNCTION__, rq, + rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag); BUG(); } - for (map = bqt->tag_map; *map == -1UL; map++) { - tag += BLK_TAGS_PER_LONG; - - if (tag >= bqt->max_depth) - return 1; - } + tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth); + if (tag >= bqt->max_depth) + return 1; - tag += ffz(*map); __set_bit(tag, bqt->tag_map); rq->flags |= REQ_QUEUED; @@ -1020,7 +1022,8 @@ void blk_queue_invalidate_tags(request_queue_t *q) rq = list_entry_rq(tmp); if (rq->tag == -1) { - printk("bad tag found on list\n"); + printk(KERN_ERR + "%s: bad tag found on list\n", __FUNCTION__); list_del_init(&rq->queuelist); rq->flags &= ~REQ_QUEUED; } else @@ -1148,7 +1151,7 @@ new_hw_segment: } -int blk_phys_contig_segment(request_queue_t *q, struct bio *bio, +static int blk_phys_contig_segment(request_queue_t *q, struct bio *bio, struct bio *nxt) { if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER))) @@ -1169,9 +1172,7 @@ int blk_phys_contig_segment(request_queue_t *q, struct bio *bio, return 0; } -EXPORT_SYMBOL(blk_phys_contig_segment); - -int blk_hw_contig_segment(request_queue_t *q, struct bio *bio, +static int blk_hw_contig_segment(request_queue_t *q, struct bio *bio, struct bio *nxt) { if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) @@ -1187,8 +1188,6 @@ int blk_hw_contig_segment(request_queue_t *q, struct bio *bio, return 1; } -EXPORT_SYMBOL(blk_hw_contig_segment); - /* * map a request to scatterlist, return number of sg entries setup. Caller * must make sure sg can hold rq->nr_phys_segments entries @@ -1358,8 +1357,8 @@ static int ll_front_merge_fn(request_queue_t *q, struct request *req, static int ll_merge_requests_fn(request_queue_t *q, struct request *req, struct request *next) { - int total_phys_segments = req->nr_phys_segments +next->nr_phys_segments; - int total_hw_segments = req->nr_hw_segments + next->nr_hw_segments; + int total_phys_segments; + int total_hw_segments; /* * First check if the either of the requests are re-queued @@ -1369,7 +1368,7 @@ static int ll_merge_requests_fn(request_queue_t *q, struct request *req, return 0; /* - * Will it become to large? + * Will it become too large? */ if ((req->nr_sectors + next->nr_sectors) > q->max_sectors) return 0; @@ -1450,17 +1449,13 @@ EXPORT_SYMBOL(blk_remove_plug); */ void __generic_unplug_device(request_queue_t *q) { - if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) + if (unlikely(test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))) return; if (!blk_remove_plug(q)) return; - /* - * was plugged, fire request_fn if queue has stuff to do - */ - if (elv_next_request(q)) - q->request_fn(q); + q->request_fn(q); } EXPORT_SYMBOL(__generic_unplug_device); @@ -1645,7 +1640,8 @@ static int blk_init_free_list(request_queue_t *q) init_waitqueue_head(&rl->wait[WRITE]); init_waitqueue_head(&rl->drain); - rl->rq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, request_cachep); + rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, + mempool_free_slab, request_cachep, q->node); if (!rl->rq_pool) return -ENOMEM; @@ -1657,8 +1653,15 @@ static int __make_request(request_queue_t *, struct bio *); request_queue_t *blk_alloc_queue(int gfp_mask) { - request_queue_t *q = kmem_cache_alloc(requestq_cachep, gfp_mask); + return blk_alloc_queue_node(gfp_mask, -1); +} +EXPORT_SYMBOL(blk_alloc_queue); +request_queue_t *blk_alloc_queue_node(int gfp_mask, int node_id) +{ + request_queue_t *q; + + q = kmem_cache_alloc_node(requestq_cachep, gfp_mask, node_id); if (!q) return NULL; @@ -1671,8 +1674,7 @@ request_queue_t *blk_alloc_queue(int gfp_mask) return q; } - -EXPORT_SYMBOL(blk_alloc_queue); +EXPORT_SYMBOL(blk_alloc_queue_node); /** * blk_init_queue - prepare a request queue for use with a block device @@ -1705,13 +1707,22 @@ EXPORT_SYMBOL(blk_alloc_queue); * blk_init_queue() must be paired with a blk_cleanup_queue() call * when the block device is deactivated (such as at module unload). **/ + request_queue_t *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) { - request_queue_t *q = blk_alloc_queue(GFP_KERNEL); + return blk_init_queue_node(rfn, lock, -1); +} +EXPORT_SYMBOL(blk_init_queue); + +request_queue_t * +blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) +{ + request_queue_t *q = blk_alloc_queue_node(GFP_KERNEL, node_id); if (!q) return NULL; + q->node = node_id; if (blk_init_free_list(q)) goto out_init; @@ -1754,12 +1765,11 @@ out_init: kmem_cache_free(requestq_cachep, q); return NULL; } - -EXPORT_SYMBOL(blk_init_queue); +EXPORT_SYMBOL(blk_init_queue_node); int blk_get_queue(request_queue_t *q) { - if (!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { + if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { atomic_inc(&q->refcnt); return 0; } @@ -1775,8 +1785,8 @@ static inline void blk_free_request(request_queue_t *q, struct request *rq) mempool_free(rq, q->rq.rq_pool); } -static inline struct request *blk_alloc_request(request_queue_t *q, int rw, - int gfp_mask) +static inline struct request * +blk_alloc_request(request_queue_t *q, int rw, struct bio *bio, int gfp_mask) { struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); @@ -1789,7 +1799,7 @@ static inline struct request *blk_alloc_request(request_queue_t *q, int rw, */ rq->flags = rw; - if (!elv_set_request(q, rq, gfp_mask)) + if (!elv_set_request(q, rq, bio, gfp_mask)) return rq; mempool_free(rq, q->rq.rq_pool); @@ -1821,7 +1831,7 @@ static inline int ioc_batching(request_queue_t *q, struct io_context *ioc) * is the behaviour we want though - once it gets a wakeup it should be given * a nice run. */ -void ioc_set_batching(request_queue_t *q, struct io_context *ioc) +static void ioc_set_batching(request_queue_t *q, struct io_context *ioc) { if (!ioc || ioc_batching(q, ioc)) return; @@ -1838,7 +1848,6 @@ static void __freed_request(request_queue_t *q, int rw) clear_queue_congested(q, rw); if (rl->count[rw] + 1 <= q->nr_requests) { - smp_mb(); if (waitqueue_active(&rl->wait[rw])) wake_up(&rl->wait[rw]); @@ -1870,18 +1879,20 @@ static void freed_request(request_queue_t *q, int rw) #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist) /* - * Get a free request, queue_lock must not be held + * Get a free request, queue_lock must be held. + * Returns NULL on failure, with queue_lock held. + * Returns !NULL on success, with queue_lock *not held*. */ -static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) +static struct request *get_request(request_queue_t *q, int rw, struct bio *bio, + int gfp_mask) { struct request *rq = NULL; struct request_list *rl = &q->rq; - struct io_context *ioc = get_io_context(gfp_mask); + struct io_context *ioc = current_io_context(GFP_ATOMIC); if (unlikely(test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags))) goto out; - spin_lock_irq(q->queue_lock); if (rl->count[rw]+1 >= q->nr_requests) { /* * The queue will fill after this allocation, so set it as @@ -1895,7 +1906,7 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) } } - switch (elv_may_queue(q, rw)) { + switch (elv_may_queue(q, rw, bio)) { case ELV_MQUEUE_NO: goto rq_starved; case ELV_MQUEUE_MAY: @@ -1909,18 +1920,25 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) * The queue is full and the allocating process is not a * "batcher", and not exempted by the IO scheduler */ - spin_unlock_irq(q->queue_lock); goto out; } get_rq: + /* + * Only allow batching queuers to allocate up to 50% over the defined + * limit of requests, otherwise we could have thousands of requests + * allocated with any setting of ->nr_requests + */ + if (rl->count[rw] >= (3 * q->nr_requests / 2)) + goto out; + rl->count[rw]++; rl->starved[rw] = 0; if (rl->count[rw] >= queue_congestion_on_threshold(q)) set_queue_congested(q, rw); spin_unlock_irq(q->queue_lock); - rq = blk_alloc_request(q, rw, gfp_mask); + rq = blk_alloc_request(q, rw, bio, gfp_mask); if (!rq) { /* * Allocation failed presumably due to memory. Undo anything @@ -1943,7 +1961,6 @@ rq_starved: if (unlikely(rl->count[rw] == 0)) rl->starved[rw] = 1; - spin_unlock_irq(q->queue_lock); goto out; } @@ -1953,31 +1970,35 @@ rq_starved: rq_init(q, rq); rq->rl = rl; out: - put_io_context(ioc); return rq; } /* * No available requests for this queue, unplug the device and wait for some * requests to become available. + * + * Called with q->queue_lock held, and returns with it unlocked. */ -static struct request *get_request_wait(request_queue_t *q, int rw) +static struct request *get_request_wait(request_queue_t *q, int rw, + struct bio *bio) { - DEFINE_WAIT(wait); struct request *rq; - generic_unplug_device(q); - do { + rq = get_request(q, rw, bio, GFP_NOIO); + while (!rq) { + DEFINE_WAIT(wait); struct request_list *rl = &q->rq; prepare_to_wait_exclusive(&rl->wait[rw], &wait, TASK_UNINTERRUPTIBLE); - rq = get_request(q, rw, GFP_NOIO); + rq = get_request(q, rw, bio, GFP_NOIO); if (!rq) { struct io_context *ioc; + __generic_unplug_device(q); + spin_unlock_irq(q->queue_lock); io_schedule(); /* @@ -1986,12 +2007,13 @@ static struct request *get_request_wait(request_queue_t *q, int rw) * up to a big batch of them for a small period time. * See ioc_batching, ioc_set_batching */ - ioc = get_io_context(GFP_NOIO); + ioc = current_io_context(GFP_NOIO); ioc_set_batching(q, ioc); - put_io_context(ioc); + + spin_lock_irq(q->queue_lock); } finish_wait(&rl->wait[rw], &wait); - } while (!rq); + } return rq; } @@ -2002,14 +2024,18 @@ struct request *blk_get_request(request_queue_t *q, int rw, int gfp_mask) BUG_ON(rw != READ && rw != WRITE); - if (gfp_mask & __GFP_WAIT) - rq = get_request_wait(q, rw); - else - rq = get_request(q, rw, gfp_mask); + spin_lock_irq(q->queue_lock); + if (gfp_mask & __GFP_WAIT) { + rq = get_request_wait(q, rw, NULL); + } else { + rq = get_request(q, rw, NULL, gfp_mask); + if (!rq) + spin_unlock_irq(q->queue_lock); + } + /* q->queue_lock is unlocked at this point */ return rq; } - EXPORT_SYMBOL(blk_get_request); /** @@ -2038,7 +2064,6 @@ EXPORT_SYMBOL(blk_requeue_request); * @rq: request to be inserted * @at_head: insert request at head or tail of queue * @data: private data - * @reinsert: true if request it a reinsertion of previously processed one * * Description: * Many block devices need to execute commands asynchronously, so they don't @@ -2053,8 +2078,9 @@ EXPORT_SYMBOL(blk_requeue_request); * host that is unable to accept a particular command. */ void blk_insert_request(request_queue_t *q, struct request *rq, - int at_head, void *data, int reinsert) + int at_head, void *data) { + int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; unsigned long flags; /* @@ -2071,20 +2097,12 @@ void blk_insert_request(request_queue_t *q, struct request *rq, /* * If command is tagged, release the tag */ - if (reinsert) - blk_requeue_request(q, rq); - else { - int where = ELEVATOR_INSERT_BACK; + if (blk_rq_tagged(rq)) + blk_queue_end_tag(q, rq); - if (at_head) - where = ELEVATOR_INSERT_FRONT; + drive_stat_acct(rq, rq->nr_sectors, 1); + __elv_add_request(q, rq, where, 0); - if (blk_rq_tagged(rq)) - blk_queue_end_tag(q, rq); - - drive_stat_acct(rq, rq->nr_sectors, 1); - __elv_add_request(q, rq, where, 0); - } if (blk_queue_plugged(q)) __generic_unplug_device(q); else @@ -2259,45 +2277,7 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) EXPORT_SYMBOL(blkdev_issue_flush); -/** - * blkdev_scsi_issue_flush_fn - issue flush for SCSI devices - * @q: device queue - * @disk: gendisk - * @error_sector: error offset - * - * Description: - * Devices understanding the SCSI command set, can use this function as - * a helper for issuing a cache flush. Note: driver is required to store - * the error offset (in case of error flushing) in ->sector of struct - * request. - */ -int blkdev_scsi_issue_flush_fn(request_queue_t *q, struct gendisk *disk, - sector_t *error_sector) -{ - struct request *rq = blk_get_request(q, WRITE, __GFP_WAIT); - int ret; - - rq->flags |= REQ_BLOCK_PC | REQ_SOFTBARRIER; - rq->sector = 0; - memset(rq->cmd, 0, sizeof(rq->cmd)); - rq->cmd[0] = 0x35; - rq->cmd_len = 12; - rq->data = NULL; - rq->data_len = 0; - rq->timeout = 60 * HZ; - - ret = blk_execute_rq(q, disk, rq); - - if (ret && error_sector) - *error_sector = rq->sector; - - blk_put_request(rq); - return ret; -} - -EXPORT_SYMBOL(blkdev_scsi_issue_flush_fn); - -void drive_stat_acct(struct request *rq, int nr_sectors, int new_io) +static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io) { int rw = rq_data_dir(rq); @@ -2379,7 +2359,6 @@ static void __blk_put_request(request_queue_t *q, struct request *req) return; req->rq_status = RQ_INACTIVE; - req->q = NULL; req->rl = NULL; /* @@ -2508,6 +2487,8 @@ static int attempt_merge(request_queue_t *q, struct request *req, req->rq_disk->in_flight--; } + req->ioprio = ioprio_best(req->ioprio, next->ioprio); + __blk_put_request(q, next); return 1; } @@ -2556,25 +2537,17 @@ void blk_attempt_remerge(request_queue_t *q, struct request *rq) EXPORT_SYMBOL(blk_attempt_remerge); -/* - * Non-locking blk_attempt_remerge variant. - */ -void __blk_attempt_remerge(request_queue_t *q, struct request *rq) -{ - attempt_back_merge(q, rq); -} - -EXPORT_SYMBOL(__blk_attempt_remerge); - static int __make_request(request_queue_t *q, struct bio *bio) { - struct request *req, *freereq = NULL; + struct request *req; int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err, sync; + unsigned short prio; sector_t sector; sector = bio->bi_sector; nr_sectors = bio_sectors(bio); cur_nr_sectors = bio_cur_sectors(bio); + prio = bio_prio(bio); rw = bio_data_dir(bio); sync = bio_sync(bio); @@ -2589,19 +2562,14 @@ static int __make_request(request_queue_t *q, struct bio *bio) spin_lock_prefetch(q->queue_lock); barrier = bio_barrier(bio); - if (barrier && (q->ordered == QUEUE_ORDERED_NONE)) { + if (unlikely(barrier) && (q->ordered == QUEUE_ORDERED_NONE)) { err = -EOPNOTSUPP; goto end_io; } -again: spin_lock_irq(q->queue_lock); - if (elv_queue_empty(q)) { - blk_plug_device(q); - goto get_rq; - } - if (barrier) + if (unlikely(barrier) || elv_queue_empty(q)) goto get_rq; el_ret = elv_merge(q, &req, bio); @@ -2615,6 +2583,7 @@ again: req->biotail->bi_next = bio; req->biotail = bio; req->nr_sectors = req->hard_nr_sectors += nr_sectors; + req->ioprio = ioprio_best(req->ioprio, prio); drive_stat_acct(req, nr_sectors, 0); if (!attempt_back_merge(q, req)) elv_merged_request(q, req); @@ -2639,45 +2608,30 @@ again: req->hard_cur_sectors = cur_nr_sectors; req->sector = req->hard_sector = sector; req->nr_sectors = req->hard_nr_sectors += nr_sectors; + req->ioprio = ioprio_best(req->ioprio, prio); drive_stat_acct(req, nr_sectors, 0); if (!attempt_front_merge(q, req)) elv_merged_request(q, req); goto out; - /* - * elevator says don't/can't merge. get new request - */ - case ELEVATOR_NO_MERGE: - break; - + /* ELV_NO_MERGE: elevator says don't/can't merge. */ default: - printk("elevator returned crap (%d)\n", el_ret); - BUG(); + ; } +get_rq: /* - * Grab a free request from the freelist - if that is empty, check - * if we are doing read ahead and abort instead of blocking for - * a free slot. + * Grab a free request. This is might sleep but can not fail. + * Returns with the queue unlocked. + */ + req = get_request_wait(q, rw, bio); + + /* + * After dropping the lock and possibly sleeping here, our request + * may now be mergeable after it had proven unmergeable (above). + * We don't worry about that case for efficiency. It won't happen + * often, and the elevators are able to handle it. */ -get_rq: - if (freereq) { - req = freereq; - freereq = NULL; - } else { - spin_unlock_irq(q->queue_lock); - if ((freereq = get_request(q, rw, GFP_ATOMIC)) == NULL) { - /* - * READA bit set - */ - err = -EWOULDBLOCK; - if (bio_rw_ahead(bio)) - goto end_io; - - freereq = get_request_wait(q, rw); - } - goto again; - } req->flags |= REQ_CMD; @@ -2690,7 +2644,7 @@ get_rq: /* * REQ_BARRIER implies no merging, but lets make it explicit */ - if (barrier) + if (unlikely(barrier)) req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE); req->errors = 0; @@ -2702,13 +2656,15 @@ get_rq: req->buffer = bio_data(bio); /* see ->buffer comment above */ req->waiting = NULL; req->bio = req->biotail = bio; + req->ioprio = prio; req->rq_disk = bio->bi_bdev->bd_disk; req->start_time = jiffies; + spin_lock_irq(q->queue_lock); + if (elv_queue_empty(q)) + blk_plug_device(q); add_request(q, req); out: - if (freereq) - __blk_put_request(q, freereq); if (sync) __generic_unplug_device(q); @@ -2730,7 +2686,7 @@ static inline void blk_partition_remap(struct bio *bio) if (bdev != bdev->bd_contains) { struct hd_struct *p = bdev->bd_part; - switch (bio->bi_rw) { + switch (bio_data_dir(bio)) { case READ: p->read_sectors += bio_sectors(bio); p->reads++; @@ -2749,6 +2705,7 @@ void blk_finish_queue_drain(request_queue_t *q) { struct request_list *rl = &q->rq; struct request *rq; + int requeued = 0; spin_lock_irq(q->queue_lock); clear_bit(QUEUE_FLAG_DRAIN, &q->queue_flags); @@ -2757,9 +2714,13 @@ void blk_finish_queue_drain(request_queue_t *q) rq = list_entry_rq(q->drain_list.next); list_del_init(&rq->queuelist); - __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 1); + elv_requeue_request(q, rq); + requeued++; } + if (requeued) + q->request_fn(q); + spin_unlock_irq(q->queue_lock); wake_up(&rl->wait[0]); @@ -2814,7 +2775,7 @@ static inline void block_wait_queue_running(request_queue_t *q) { DEFINE_WAIT(wait); - while (test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags)) { + while (unlikely(test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags))) { struct request_list *rl = &q->rq; prepare_to_wait_exclusive(&rl->drain, &wait, @@ -2923,7 +2884,7 @@ end_io: goto end_io; } - if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) + if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) goto end_io; block_wait_queue_running(q); @@ -2956,7 +2917,7 @@ void submit_bio(int rw, struct bio *bio) BIO_BUG_ON(!bio->bi_size); BIO_BUG_ON(!bio->bi_io_vec); - bio->bi_rw = rw; + bio->bi_rw |= rw; if (rw & WRITE) mod_page_state(pgpgout, count); else @@ -2976,7 +2937,7 @@ void submit_bio(int rw, struct bio *bio) EXPORT_SYMBOL(submit_bio); -void blk_recalc_rq_segments(struct request *rq) +static void blk_recalc_rq_segments(struct request *rq) { struct bio *bio, *prevbio = NULL; int nr_phys_segs, nr_hw_segs; @@ -3018,7 +2979,7 @@ void blk_recalc_rq_segments(struct request *rq) rq->nr_hw_segments = nr_hw_segs; } -void blk_recalc_rq_sectors(struct request *rq, int nsect) +static void blk_recalc_rq_sectors(struct request *rq, int nsect) { if (blk_fs_request(rq)) { rq->hard_sector += nsect; @@ -3313,8 +3274,11 @@ void exit_io_context(void) struct io_context *ioc; local_irq_save(flags); + task_lock(current); ioc = current->io_context; current->io_context = NULL; + ioc->task = NULL; + task_unlock(current); local_irq_restore(flags); if (ioc->aic && ioc->aic->exit) @@ -3327,53 +3291,49 @@ void exit_io_context(void) /* * If the current task has no IO context then create one and initialise it. - * If it does have a context, take a ref on it. + * Otherwise, return its existing IO context. * - * This is always called in the context of the task which submitted the I/O. - * But weird things happen, so we disable local interrupts to ensure exclusive - * access to *current. + * This returned IO context doesn't have a specifically elevated refcount, + * but since the current task itself holds a reference, the context can be + * used in general code, so long as it stays within `current` context. */ -struct io_context *get_io_context(int gfp_flags) +struct io_context *current_io_context(int gfp_flags) { struct task_struct *tsk = current; - unsigned long flags; struct io_context *ret; - local_irq_save(flags); ret = tsk->io_context; - if (ret) - goto out; - - local_irq_restore(flags); + if (likely(ret)) + return ret; ret = kmem_cache_alloc(iocontext_cachep, gfp_flags); if (ret) { atomic_set(&ret->refcount, 1); - ret->pid = tsk->pid; + ret->task = current; + ret->set_ioprio = NULL; ret->last_waited = jiffies; /* doesn't matter... */ ret->nr_batch_requests = 0; /* because this is 0 */ ret->aic = NULL; ret->cic = NULL; - spin_lock_init(&ret->lock); - - local_irq_save(flags); + tsk->io_context = ret; + } - /* - * very unlikely, someone raced with us in setting up the task - * io context. free new context and just grab a reference. - */ - if (!tsk->io_context) - tsk->io_context = ret; - else { - kmem_cache_free(iocontext_cachep, ret); - ret = tsk->io_context; - } + return ret; +} +EXPORT_SYMBOL(current_io_context); -out: +/* + * If the current task has no IO context then create one and initialise it. + * If it does have a context, take a ref on it. + * + * This is always called in the context of the task which submitted the I/O. + */ +struct io_context *get_io_context(int gfp_flags) +{ + struct io_context *ret; + ret = current_io_context(gfp_flags); + if (likely(ret)) atomic_inc(&ret->refcount); - local_irq_restore(flags); - } - return ret; } EXPORT_SYMBOL(get_io_context); @@ -3582,7 +3542,7 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) q = container_of(kobj, struct request_queue, kobj); if (!entry->show) - return 0; + return -EIO; return entry->show(q, page); } @@ -3596,7 +3556,7 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, q = container_of(kobj, struct request_queue, kobj); if (!entry->store) - return -EINVAL; + return -EIO; return entry->store(q, page, length); } @@ -3606,7 +3566,7 @@ static struct sysfs_ops queue_sysfs_ops = { .store = queue_attr_store, }; -struct kobj_type queue_ktype = { +static struct kobj_type queue_ktype = { .sysfs_ops = &queue_sysfs_ops, .default_attrs = default_attrs, }; diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 6f011d0..b35e088 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -472,17 +472,11 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) */ static void loop_add_bio(struct loop_device *lo, struct bio *bio) { - unsigned long flags; - - spin_lock_irqsave(&lo->lo_lock, flags); if (lo->lo_biotail) { lo->lo_biotail->bi_next = bio; lo->lo_biotail = bio; } else lo->lo_bio = lo->lo_biotail = bio; - spin_unlock_irqrestore(&lo->lo_lock, flags); - - up(&lo->lo_bh_mutex); } /* @@ -492,14 +486,12 @@ static struct bio *loop_get_bio(struct loop_device *lo) { struct bio *bio; - spin_lock_irq(&lo->lo_lock); if ((bio = lo->lo_bio)) { if (bio == lo->lo_biotail) lo->lo_biotail = NULL; lo->lo_bio = bio->bi_next; bio->bi_next = NULL; } - spin_unlock_irq(&lo->lo_lock); return bio; } @@ -509,35 +501,28 @@ static int loop_make_request(request_queue_t *q, struct bio *old_bio) struct loop_device *lo = q->queuedata; int rw = bio_rw(old_bio); - if (!lo) - goto out; + if (rw == READA) + rw = READ; + + BUG_ON(!lo || (rw != READ && rw != WRITE)); spin_lock_irq(&lo->lo_lock); if (lo->lo_state != Lo_bound) - goto inactive; - atomic_inc(&lo->lo_pending); - spin_unlock_irq(&lo->lo_lock); - - if (rw == WRITE) { - if (lo->lo_flags & LO_FLAGS_READ_ONLY) - goto err; - } else if (rw == READA) { - rw = READ; - } else if (rw != READ) { - printk(KERN_ERR "loop: unknown command (%x)\n", rw); - goto err; - } + goto out; + if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY))) + goto out; + lo->lo_pending++; loop_add_bio(lo, old_bio); + spin_unlock_irq(&lo->lo_lock); + up(&lo->lo_bh_mutex); return 0; -err: - if (atomic_dec_and_test(&lo->lo_pending)) - up(&lo->lo_bh_mutex); + out: + if (lo->lo_pending == 0) + up(&lo->lo_bh_mutex); + spin_unlock_irq(&lo->lo_lock); bio_io_error(old_bio, old_bio->bi_size); return 0; -inactive: - spin_unlock_irq(&lo->lo_lock); - goto out; } /* @@ -560,13 +545,11 @@ static void do_loop_switch(struct loop_device *, struct switch_request *); static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio) { - int ret; - if (unlikely(!bio->bi_bdev)) { do_loop_switch(lo, bio->bi_private); bio_put(bio); } else { - ret = do_bio_filebacked(lo, bio); + int ret = do_bio_filebacked(lo, bio); bio_endio(bio, bio->bi_size, ret); } } @@ -594,7 +577,7 @@ static int loop_thread(void *data) set_user_nice(current, -20); lo->lo_state = Lo_bound; - atomic_inc(&lo->lo_pending); + lo->lo_pending = 1; /* * up sem, we are running @@ -602,26 +585,37 @@ static int loop_thread(void *data) up(&lo->lo_sem); for (;;) { - down_interruptible(&lo->lo_bh_mutex); + int pending; + /* - * could be upped because of tear-down, not because of - * pending work + * interruptible just to not contribute to load avg */ - if (!atomic_read(&lo->lo_pending)) + if (down_interruptible(&lo->lo_bh_mutex)) + continue; + + spin_lock_irq(&lo->lo_lock); + + /* + * could be upped because of tear-down, not pending work + */ + if (unlikely(!lo->lo_pending)) { + spin_unlock_irq(&lo->lo_lock); break; + } bio = loop_get_bio(lo); - if (!bio) { - printk("loop: missing bio\n"); - continue; - } + lo->lo_pending--; + pending = lo->lo_pending; + spin_unlock_irq(&lo->lo_lock); + + BUG_ON(!bio); loop_handle_bio(lo, bio); /* * upped both for pending work and tear-down, lo_pending * will hit zero then */ - if (atomic_dec_and_test(&lo->lo_pending)) + if (unlikely(!pending)) break; } @@ -900,7 +894,8 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev) spin_lock_irq(&lo->lo_lock); lo->lo_state = Lo_rundown; - if (atomic_dec_and_test(&lo->lo_pending)) + lo->lo_pending--; + if (!lo->lo_pending) up(&lo->lo_bh_mutex); spin_unlock_irq(&lo->lo_lock); diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 202a5a7..fa49d62 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -723,7 +723,7 @@ static int pd_special_command(struct pd_unit *disk, rq.ref_count = 1; rq.waiting = &wait; rq.end_io = blk_end_sync_rq; - blk_insert_request(disk->gd->queue, &rq, 0, func, 0); + blk_insert_request(disk->gd->queue, &rq, 0, func); wait_for_completion(&wait); rq.waiting = NULL; if (rq.errors) diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c index dbeb107..84d8e29 100644 --- a/drivers/block/paride/pg.c +++ b/drivers/block/paride/pg.c @@ -222,7 +222,7 @@ static int pg_identify(struct pg *dev, int log); static char pg_scratch[512]; /* scratch block buffer */ -static struct class_simple *pg_class; +static struct class *pg_class; /* kernel glue structures */ @@ -666,7 +666,7 @@ static int __init pg_init(void) err = -1; goto out; } - pg_class = class_simple_create(THIS_MODULE, "pg"); + pg_class = class_create(THIS_MODULE, "pg"); if (IS_ERR(pg_class)) { err = PTR_ERR(pg_class); goto out_chrdev; @@ -675,7 +675,7 @@ static int __init pg_init(void) for (unit = 0; unit < PG_UNITS; unit++) { struct pg *dev = &devices[unit]; if (dev->present) { - class_simple_device_add(pg_class, MKDEV(major, unit), + class_device_create(pg_class, MKDEV(major, unit), NULL, "pg%u", unit); err = devfs_mk_cdev(MKDEV(major, unit), S_IFCHR | S_IRUSR | S_IWUSR, "pg/%u", @@ -688,8 +688,8 @@ static int __init pg_init(void) goto out; out_class: - class_simple_device_remove(MKDEV(major, unit)); - class_simple_destroy(pg_class); + class_device_destroy(pg_class, MKDEV(major, unit)); + class_destroy(pg_class); out_chrdev: unregister_chrdev(major, "pg"); out: @@ -703,11 +703,11 @@ static void __exit pg_exit(void) for (unit = 0; unit < PG_UNITS; unit++) { struct pg *dev = &devices[unit]; if (dev->present) { - class_simple_device_remove(MKDEV(major, unit)); + class_device_destroy(pg_class, MKDEV(major, unit)); devfs_remove("pg/%u", unit); } } - class_simple_destroy(pg_class); + class_destroy(pg_class); devfs_remove("pg"); unregister_chrdev(major, name); diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c index 8fbd692..5fe8ee8 100644 --- a/drivers/block/paride/pt.c +++ b/drivers/block/paride/pt.c @@ -242,7 +242,7 @@ static struct file_operations pt_fops = { }; /* sysfs class support */ -static struct class_simple *pt_class; +static struct class *pt_class; static inline int status_reg(struct pi_adapter *pi) { @@ -963,7 +963,7 @@ static int __init pt_init(void) err = -1; goto out; } - pt_class = class_simple_create(THIS_MODULE, "pt"); + pt_class = class_create(THIS_MODULE, "pt"); if (IS_ERR(pt_class)) { err = PTR_ERR(pt_class); goto out_chrdev; @@ -972,29 +972,29 @@ static int __init pt_init(void) devfs_mk_dir("pt"); for (unit = 0; unit < PT_UNITS; unit++) if (pt[unit].present) { - class_simple_device_add(pt_class, MKDEV(major, unit), + class_device_create(pt_class, MKDEV(major, unit), NULL, "pt%d", unit); err = devfs_mk_cdev(MKDEV(major, unit), S_IFCHR | S_IRUSR | S_IWUSR, "pt/%d", unit); if (err) { - class_simple_device_remove(MKDEV(major, unit)); + class_device_destroy(pt_class, MKDEV(major, unit)); goto out_class; } - class_simple_device_add(pt_class, MKDEV(major, unit + 128), + class_device_create(pt_class, MKDEV(major, unit + 128), NULL, "pt%dn", unit); err = devfs_mk_cdev(MKDEV(major, unit + 128), S_IFCHR | S_IRUSR | S_IWUSR, "pt/%dn", unit); if (err) { - class_simple_device_remove(MKDEV(major, unit + 128)); + class_device_destroy(pt_class, MKDEV(major, unit + 128)); goto out_class; } } goto out; out_class: - class_simple_destroy(pt_class); + class_destroy(pt_class); out_chrdev: unregister_chrdev(major, "pt"); out: @@ -1006,12 +1006,12 @@ static void __exit pt_exit(void) int unit; for (unit = 0; unit < PT_UNITS; unit++) if (pt[unit].present) { - class_simple_device_remove(MKDEV(major, unit)); + class_device_destroy(pt_class, MKDEV(major, unit)); devfs_remove("pt/%d", unit); - class_simple_device_remove(MKDEV(major, unit + 128)); + class_device_destroy(pt_class, MKDEV(major, unit + 128)); devfs_remove("pt/%dn", unit); } - class_simple_destroy(pt_class); + class_destroy(pt_class); devfs_remove("pt"); unregister_chrdev(major, name); for (unit = 0; unit < PT_UNITS; unit++) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index bc56770..7b83834 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -467,14 +467,12 @@ static int pkt_set_speed(struct pktcdvd_device *pd, unsigned write_speed, unsign * Queue a bio for processing by the low-level CD device. Must be called * from process context. */ -static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio, int high_prio_read) +static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio) { spin_lock(&pd->iosched.lock); if (bio_data_dir(bio) == READ) { pkt_add_list_last(bio, &pd->iosched.read_queue, &pd->iosched.read_queue_tail); - if (high_prio_read) - pd->iosched.high_prio_read = 1; } else { pkt_add_list_last(bio, &pd->iosched.write_queue, &pd->iosched.write_queue_tail); @@ -490,15 +488,16 @@ static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio, int high_p * requirements for CDRW drives: * - A cache flush command must be inserted before a read request if the * previous request was a write. - * - Switching between reading and writing is slow, so don't it more often + * - Switching between reading and writing is slow, so don't do it more often * than necessary. + * - Optimize for throughput at the expense of latency. This means that streaming + * writes will never be interrupted by a read, but if the drive has to seek + * before the next write, switch to reading instead if there are any pending + * read requests. * - Set the read speed according to current usage pattern. When only reading * from the device, it's best to use the highest possible read speed, but * when switching often between reading and writing, it's better to have the * same read and write speeds. - * - Reads originating from user space should have higher priority than reads - * originating from pkt_gather_data, because some process is usually waiting - * on reads of the first kind. */ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) { @@ -512,21 +511,24 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) for (;;) { struct bio *bio; - int reads_queued, writes_queued, high_prio_read; + int reads_queued, writes_queued; spin_lock(&pd->iosched.lock); reads_queued = (pd->iosched.read_queue != NULL); writes_queued = (pd->iosched.write_queue != NULL); - if (!reads_queued) - pd->iosched.high_prio_read = 0; - high_prio_read = pd->iosched.high_prio_read; spin_unlock(&pd->iosched.lock); if (!reads_queued && !writes_queued) break; if (pd->iosched.writing) { - if (high_prio_read || (!writes_queued && reads_queued)) { + int need_write_seek = 1; + spin_lock(&pd->iosched.lock); + bio = pd->iosched.write_queue; + spin_unlock(&pd->iosched.lock); + if (bio && (bio->bi_sector == pd->iosched.last_write)) + need_write_seek = 0; + if (need_write_seek && reads_queued) { if (atomic_read(&pd->cdrw.pending_bios) > 0) { VPRINTK("pktcdvd: write, waiting\n"); break; @@ -559,8 +561,10 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) if (bio_data_dir(bio) == READ) pd->iosched.successive_reads += bio->bi_size >> 10; - else + else { pd->iosched.successive_reads = 0; + pd->iosched.last_write = bio->bi_sector + bio_sectors(bio); + } if (pd->iosched.successive_reads >= HI_SPEED_SWITCH) { if (pd->read_speed == pd->write_speed) { pd->read_speed = MAX_SPEED; @@ -765,7 +769,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) atomic_inc(&pkt->io_wait); bio->bi_rw = READ; - pkt_queue_bio(pd, bio, 0); + pkt_queue_bio(pd, bio); frames_read++; } @@ -1062,7 +1066,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) atomic_set(&pkt->io_wait, 1); pkt->w_bio->bi_rw = WRITE; - pkt_queue_bio(pd, pkt->w_bio, 0); + pkt_queue_bio(pd, pkt->w_bio); } static void pkt_finish_packet(struct packet_data *pkt, int uptodate) @@ -1247,8 +1251,7 @@ static int kcdrwd(void *foobar) VPRINTK("kcdrwd: wake up\n"); /* make swsusp happy with our thread */ - if (current->flags & PF_FREEZE) - refrigerator(PF_FREEZE); + try_to_freeze(); list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { if (!pkt->sleep_time) @@ -2120,7 +2123,7 @@ static int pkt_make_request(request_queue_t *q, struct bio *bio) cloned_bio->bi_private = psd; cloned_bio->bi_end_io = pkt_end_io_read_cloned; pd->stats.secs_r += bio->bi_size >> 9; - pkt_queue_bio(pd, cloned_bio, 1); + pkt_queue_bio(pd, cloned_bio); return 0; } diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index 5b09cf1..e5f7494 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -253,7 +253,7 @@ static int floppy_revalidate(struct gendisk *disk); static int swim3_add_device(struct device_node *swims); int swim3_init(void); -#ifndef CONFIG_PMAC_PBOOK +#ifndef CONFIG_PMAC_MEDIABAY #define check_media_bay(which, what) 1 #endif @@ -297,9 +297,11 @@ static void do_fd_request(request_queue_t * q) int i; for(i=0;i<floppy_count;i++) { +#ifdef CONFIG_PMAC_MEDIABAY if (floppy_states[i].media_bay && check_media_bay(floppy_states[i].media_bay, MB_FD)) continue; +#endif /* CONFIG_PMAC_MEDIABAY */ start_request(&floppy_states[i]); } sti(); @@ -856,8 +858,10 @@ static int floppy_ioctl(struct inode *inode, struct file *filp, if ((cmd & 0x80) && !capable(CAP_SYS_ADMIN)) return -EPERM; +#ifdef CONFIG_PMAC_MEDIABAY if (fs->media_bay && check_media_bay(fs->media_bay, MB_FD)) return -ENXIO; +#endif switch (cmd) { case FDEJECT: @@ -881,8 +885,10 @@ static int floppy_open(struct inode *inode, struct file *filp) int n, err = 0; if (fs->ref_count == 0) { +#ifdef CONFIG_PMAC_MEDIABAY if (fs->media_bay && check_media_bay(fs->media_bay, MB_FD)) return -ENXIO; +#endif out_8(&sw->setup, S_IBM_DRIVE | S_FCLK_DIV2); out_8(&sw->control_bic, 0xff); out_8(&sw->mode, 0x95); @@ -967,8 +973,10 @@ static int floppy_revalidate(struct gendisk *disk) struct swim3 __iomem *sw; int ret, n; +#ifdef CONFIG_PMAC_MEDIABAY if (fs->media_bay && check_media_bay(fs->media_bay, MB_FD)) return -ENXIO; +#endif sw = fs->swim3; grab_drive(fs, revalidating, 0); diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index 797f598..d57007b9 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -26,6 +26,7 @@ #include <linux/delay.h> #include <linux/time.h> #include <linux/hdreg.h> +#include <linux/dma-mapping.h> #include <asm/io.h> #include <asm/semaphore.h> #include <asm/uaccess.h> @@ -614,7 +615,7 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx) spin_unlock_irq(&host->lock); DPRINTK("blk_insert_request, tag == %u\n", idx); - blk_insert_request(host->oob_q, crq->rq, 1, crq, 0); + blk_insert_request(host->oob_q, crq->rq, 1, crq); return 0; @@ -653,7 +654,7 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func) crq->msg_bucket = (u32) rc; DPRINTK("blk_insert_request, tag == %u\n", idx); - blk_insert_request(host->oob_q, crq->rq, 1, crq, 0); + blk_insert_request(host->oob_q, crq->rq, 1, crq); return 0; } @@ -1581,10 +1582,10 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) if (rc) goto err_out; -#if IF_64BIT_DMA_IS_POSSIBLE /* grrrr... */ - rc = pci_set_dma_mask(pdev, 0xffffffffffffffffULL); +#ifdef IF_64BIT_DMA_IS_POSSIBLE /* grrrr... */ + rc = pci_set_dma_mask(pdev, DMA_64BIT_MASK); if (!rc) { - rc = pci_set_consistent_dma_mask(pdev, 0xffffffffffffffffULL); + rc = pci_set_consistent_dma_mask(pdev, DMA_64BIT_MASK); if (rc) { printk(KERN_ERR DRV_NAME "(%s): consistent DMA mask failure\n", pci_name(pdev)); @@ -1593,14 +1594,14 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) pci_dac = 1; } else { #endif - rc = pci_set_dma_mask(pdev, 0xffffffffULL); + rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK); if (rc) { printk(KERN_ERR DRV_NAME "(%s): DMA mask failure\n", pci_name(pdev)); goto err_out_regions; } pci_dac = 0; -#if IF_64BIT_DMA_IS_POSSIBLE /* grrrr... */ +#ifdef IF_64BIT_DMA_IS_POSSIBLE /* grrrr... */ } #endif diff --git a/drivers/block/ub.c b/drivers/block/ub.c index adc4dcc..a026567 100644 --- a/drivers/block/ub.c +++ b/drivers/block/ub.c @@ -23,6 +23,7 @@ * -- Exterminate P3 printks * -- Resove XXX's * -- Redo "benh's retries", perhaps have spin-up code to handle them. V:D=? + * -- CLEAR, CLR2STS, CLRRS seem to be ripe for refactoring. */ #include <linux/kernel.h> #include <linux/module.h> @@ -38,6 +39,73 @@ #define UB_MAJOR 180 /* + * The command state machine is the key model for understanding of this driver. + * + * The general rule is that all transitions are done towards the bottom + * of the diagram, thus preventing any loops. + * + * An exception to that is how the STAT state is handled. A counter allows it + * to be re-entered along the path marked with [C]. + * + * +--------+ + * ! INIT ! + * +--------+ + * ! + * ub_scsi_cmd_start fails ->--------------------------------------\ + * ! ! + * V ! + * +--------+ ! + * ! CMD ! ! + * +--------+ ! + * ! +--------+ ! + * was -EPIPE -->-------------------------------->! CLEAR ! ! + * ! +--------+ ! + * ! ! ! + * was error -->------------------------------------- ! --------->\ + * ! ! ! + * /--<-- cmd->dir == NONE ? ! ! + * ! ! ! ! + * ! V ! ! + * ! +--------+ ! ! + * ! ! DATA ! ! ! + * ! +--------+ ! ! + * ! ! +---------+ ! ! + * ! was -EPIPE -->--------------->! CLR2STS ! ! ! + * ! ! +---------+ ! ! + * ! ! ! ! ! + * ! ! was error -->---- ! --------->\ + * ! was error -->--------------------- ! ------------- ! --------->\ + * ! ! ! ! ! + * ! V ! ! ! + * \--->+--------+ ! ! ! + * ! STAT !<--------------------------/ ! ! + * /--->+--------+ ! ! + * ! ! ! ! + * [C] was -EPIPE -->-----------\ ! ! + * ! ! ! ! ! + * +<---- len == 0 ! ! ! + * ! ! ! ! ! + * ! was error -->--------------------------------------!---------->\ + * ! ! ! ! ! + * +<---- bad CSW ! ! ! + * +<---- bad tag ! ! ! + * ! ! V ! ! + * ! ! +--------+ ! ! + * ! ! ! CLRRS ! ! ! + * ! ! +--------+ ! ! + * ! ! ! ! ! + * \------- ! --------------------[C]--------\ ! ! + * ! ! ! ! + * cmd->error---\ +--------+ ! ! + * ! +--------------->! SENSE !<----------/ ! + * STAT_FAIL----/ +--------+ ! + * ! ! V + * ! V +--------+ + * \--------------------------------\--------------------->! DONE ! + * +--------+ + */ + +/* * Definitions which have to be scattered once we understand the layout better. */ @@ -51,7 +119,7 @@ * This many LUNs per USB device. * Every one of them takes a host, see UB_MAX_HOSTS. */ -#define UB_MAX_LUNS 4 +#define UB_MAX_LUNS 9 /* */ @@ -91,8 +159,6 @@ struct bulk_cs_wrap { #define US_BULK_CS_WRAP_LEN 13 #define US_BULK_CS_SIGN 0x53425355 /* spells out 'USBS' */ -/* This is for Olympus Camedia digital cameras */ -#define US_BULK_CS_OLYMPUS_SIGN 0x55425355 /* spells out 'USBU' */ #define US_BULK_STAT_OK 0 #define US_BULK_STAT_FAIL 1 #define US_BULK_STAT_PHASE 2 @@ -135,6 +201,7 @@ enum ub_scsi_cmd_state { UB_CMDST_CLR2STS, /* Clearing before requesting status */ UB_CMDST_STAT, /* Status phase */ UB_CMDST_CLEAR, /* Clearing a stall (halt, actually) */ + UB_CMDST_CLRRS, /* Clearing before retrying status */ UB_CMDST_SENSE, /* Sending Request Sense */ UB_CMDST_DONE /* Final state */ }; @@ -146,6 +213,7 @@ static char *ub_scsi_cmd_stname[] = { "c2s", "sts", "clr", + "crs", "Sen", "fin" }; @@ -316,6 +384,7 @@ struct ub_dev { struct urb work_urb; struct timer_list work_timer; int last_pipe; /* What might need clearing */ + __le32 signature; /* Learned signature */ struct bulk_cb_wrap work_bcb; struct bulk_cs_wrap work_bcs; struct usb_ctrlrequest work_cr; @@ -339,8 +408,9 @@ static void ub_scsi_action(unsigned long _dev); static void ub_scsi_dispatch(struct ub_dev *sc); static void ub_scsi_urb_compl(struct ub_dev *sc, struct ub_scsi_cmd *cmd); static void ub_state_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd, int rc); -static void __ub_state_stat(struct ub_dev *sc, struct ub_scsi_cmd *cmd); +static int __ub_state_stat(struct ub_dev *sc, struct ub_scsi_cmd *cmd); static void ub_state_stat(struct ub_dev *sc, struct ub_scsi_cmd *cmd); +static void ub_state_stat_counted(struct ub_dev *sc, struct ub_scsi_cmd *cmd); static void ub_state_sense(struct ub_dev *sc, struct ub_scsi_cmd *cmd); static int ub_submit_clear_stall(struct ub_dev *sc, struct ub_scsi_cmd *cmd, int stalled_pipe); @@ -430,7 +500,7 @@ static void ub_cmdtr_sense(struct ub_dev *sc, struct ub_scsi_cmd *cmd, } } -static ssize_t ub_diag_show(struct device *dev, char *page) +static ssize_t ub_diag_show(struct device *dev, struct device_attribute *attr, char *page) { struct usb_interface *intf; struct ub_dev *sc; @@ -1085,6 +1155,28 @@ static void ub_scsi_urb_compl(struct ub_dev *sc, struct ub_scsi_cmd *cmd) ub_state_stat(sc, cmd); + } else if (cmd->state == UB_CMDST_CLRRS) { + if (urb->status == -EPIPE) { + /* + * STALL while clearning STALL. + * The control pipe clears itself - nothing to do. + * XXX Might try to reset the device here and retry. + */ + printk(KERN_NOTICE "%s: stall on control pipe\n", + sc->name); + goto Bad_End; + } + + /* + * We ignore the result for the halt clear. + */ + + /* reset the endpoint toggle */ + usb_settoggle(sc->dev, usb_pipeendpoint(sc->last_pipe), + usb_pipeout(sc->last_pipe), 0); + + ub_state_stat_counted(sc, cmd); + } else if (cmd->state == UB_CMDST_CMD) { if (urb->status == -EPIPE) { rc = ub_submit_clear_stall(sc, cmd, sc->last_pipe); @@ -1190,52 +1282,57 @@ static void ub_scsi_urb_compl(struct ub_dev *sc, struct ub_scsi_cmd *cmd) */ goto Bad_End; } - cmd->state = UB_CMDST_CLEAR; + + /* + * Having a stall when getting CSW is an error, so + * make sure uppper levels are not oblivious to it. + */ + cmd->error = -EIO; /* A cheap trick... */ + + cmd->state = UB_CMDST_CLRRS; ub_cmdtr_state(sc, cmd); return; } + if (urb->status == -EOVERFLOW) { + /* + * XXX We are screwed here. Retrying is pointless, + * because the pipelined data will not get in until + * we read with a big enough buffer. We must reset XXX. + */ + goto Bad_End; + } if (urb->status != 0) goto Bad_End; if (urb->actual_length == 0) { - /* - * Some broken devices add unnecessary zero-length - * packets to the end of their data transfers. - * Such packets show up as 0-length CSWs. If we - * encounter such a thing, try to read the CSW again. - */ - if (++cmd->stat_count >= 4) { - printk(KERN_NOTICE "%s: unable to get CSW\n", - sc->name); - goto Bad_End; - } - __ub_state_stat(sc, cmd); + ub_state_stat_counted(sc, cmd); return; } /* * Check the returned Bulk protocol status. + * The status block has to be validated first. */ bcs = &sc->work_bcs; - rc = le32_to_cpu(bcs->Residue); - if (rc != cmd->len - cmd->act_len) { + + if (sc->signature == cpu_to_le32(0)) { /* - * It is all right to transfer less, the caller has - * to check. But it's not all right if the device - * counts disagree with our counts. + * This is the first reply, so do not perform the check. + * Instead, remember the signature the device uses + * for future checks. But do not allow a nul. */ - /* P3 */ printk("%s: resid %d len %d act %d\n", - sc->name, rc, cmd->len, cmd->act_len); - goto Bad_End; - } - -#if 0 - if (bcs->Signature != cpu_to_le32(US_BULK_CS_SIGN) && - bcs->Signature != cpu_to_le32(US_BULK_CS_OLYMPUS_SIGN)) { - /* Windows ignores signatures, so do we. */ + sc->signature = bcs->Signature; + if (sc->signature == cpu_to_le32(0)) { + ub_state_stat_counted(sc, cmd); + return; + } + } else { + if (bcs->Signature != sc->signature) { + ub_state_stat_counted(sc, cmd); + return; + } } -#endif if (bcs->Tag != cmd->tag) { /* @@ -1245,16 +1342,22 @@ static void ub_scsi_urb_compl(struct ub_dev *sc, struct ub_scsi_cmd *cmd) * commands and reply at commands we timed out before. * Without flushing these replies we loop forever. */ - if (++cmd->stat_count >= 4) { - printk(KERN_NOTICE "%s: " - "tag mismatch orig 0x%x reply 0x%x\n", - sc->name, cmd->tag, bcs->Tag); - goto Bad_End; - } - __ub_state_stat(sc, cmd); + ub_state_stat_counted(sc, cmd); return; } + rc = le32_to_cpu(bcs->Residue); + if (rc != cmd->len - cmd->act_len) { + /* + * It is all right to transfer less, the caller has + * to check. But it's not all right if the device + * counts disagree with our counts. + */ + /* P3 */ printk("%s: resid %d len %d act %d\n", + sc->name, rc, cmd->len, cmd->act_len); + goto Bad_End; + } + switch (bcs->Status) { case US_BULK_STAT_OK: break; @@ -1272,6 +1375,10 @@ static void ub_scsi_urb_compl(struct ub_dev *sc, struct ub_scsi_cmd *cmd) } /* Not zeroing error to preserve a babble indicator */ + if (cmd->error != 0) { + ub_state_sense(sc, cmd); + return; + } cmd->state = UB_CMDST_DONE; ub_cmdtr_state(sc, cmd); ub_cmdq_pop(sc); @@ -1310,7 +1417,7 @@ static void ub_state_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd, int rc) * Factorization helper for the command state machine: * Submit a CSW read. */ -static void __ub_state_stat(struct ub_dev *sc, struct ub_scsi_cmd *cmd) +static int __ub_state_stat(struct ub_dev *sc, struct ub_scsi_cmd *cmd) { int rc; @@ -1328,11 +1435,12 @@ static void __ub_state_stat(struct ub_dev *sc, struct ub_scsi_cmd *cmd) /* XXX Clear stalls */ ub_complete(&sc->work_done); ub_state_done(sc, cmd, rc); - return; + return -1; } sc->work_timer.expires = jiffies + UB_STAT_TIMEOUT; add_timer(&sc->work_timer); + return 0; } /* @@ -1341,7 +1449,9 @@ static void __ub_state_stat(struct ub_dev *sc, struct ub_scsi_cmd *cmd) */ static void ub_state_stat(struct ub_dev *sc, struct ub_scsi_cmd *cmd) { - __ub_state_stat(sc, cmd); + + if (__ub_state_stat(sc, cmd) != 0) + return; cmd->stat_count = 0; cmd->state = UB_CMDST_STAT; @@ -1350,6 +1460,25 @@ static void ub_state_stat(struct ub_dev *sc, struct ub_scsi_cmd *cmd) /* * Factorization helper for the command state machine: + * Submit a CSW read and go to STAT state with counter (along [C] path). + */ +static void ub_state_stat_counted(struct ub_dev *sc, struct ub_scsi_cmd *cmd) +{ + + if (++cmd->stat_count >= 4) { + ub_state_sense(sc, cmd); + return; + } + + if (__ub_state_stat(sc, cmd) != 0) + return; + + cmd->state = UB_CMDST_STAT; + ub_cmdtr_state(sc, cmd); +} + +/* + * Factorization helper for the command state machine: * Submit a REQUEST SENSE and go to SENSE state. */ static void ub_state_sense(struct ub_dev *sc, struct ub_scsi_cmd *cmd) @@ -2100,7 +2229,7 @@ static int ub_probe(struct usb_interface *intf, nluns = rc; break; } - mdelay(100); + msleep(100); } for (i = 0; i < nluns; i++) { |