diff options
author | Ingo Molnar <mingo@elte.hu> | 2008-10-13 11:05:51 +0200 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2008-10-13 11:05:51 +0200 |
commit | accba5f3965d6a9d1bf7c1e1a7995d17e9d521b6 (patch) | |
tree | 8fb40782e79472ed882ff2098d4dd295557278ee /drivers/md | |
parent | 6852fd9b86d05063c6ef49d2e12e061cc7f6a105 (diff) | |
parent | 4480f15b3306f43bbb0310d461142b4e897ca45b (diff) | |
download | op-kernel-dev-accba5f3965d6a9d1bf7c1e1a7995d17e9d521b6.zip op-kernel-dev-accba5f3965d6a9d1bf7c1e1a7995d17e9d521b6.tar.gz |
Merge branch 'linus' into oprofile-v2
Conflicts:
arch/x86/kernel/apic_32.c
arch/x86/oprofile/nmi_int.c
include/linux/pci_ids.h
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/bitmap.c | 47 | ||||
-rw-r--r-- | drivers/md/dm-crypt.c | 109 | ||||
-rw-r--r-- | drivers/md/dm-exception-store.c | 29 | ||||
-rw-r--r-- | drivers/md/dm-ioctl.c | 10 | ||||
-rw-r--r-- | drivers/md/dm-mpath.c | 79 | ||||
-rw-r--r-- | drivers/md/dm-mpath.h | 2 | ||||
-rw-r--r-- | drivers/md/dm-raid1.c | 4 | ||||
-rw-r--r-- | drivers/md/dm-stripe.c | 4 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 126 | ||||
-rw-r--r-- | drivers/md/dm.c | 52 | ||||
-rw-r--r-- | drivers/md/dm.h | 10 | ||||
-rw-r--r-- | drivers/md/linear.c | 10 | ||||
-rw-r--r-- | drivers/md/md.c | 66 | ||||
-rw-r--r-- | drivers/md/multipath.c | 8 | ||||
-rw-r--r-- | drivers/md/raid0.c | 10 | ||||
-rw-r--r-- | drivers/md/raid1.c | 13 | ||||
-rw-r--r-- | drivers/md/raid10.c | 26 | ||||
-rw-r--r-- | drivers/md/raid5.c | 136 |
18 files changed, 485 insertions, 256 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 621a272..ac89a5d 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -238,15 +238,47 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde } +static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev) +{ + /* Iterate the disks of an mddev, using rcu to protect access to the + * linked list, and raising the refcount of devices we return to ensure + * they don't disappear while in use. + * As devices are only added or removed when raid_disk is < 0 and + * nr_pending is 0 and In_sync is clear, the entries we return will + * still be in the same position on the list when we re-enter + * list_for_each_continue_rcu. + */ + struct list_head *pos; + rcu_read_lock(); + if (rdev == NULL) + /* start at the beginning */ + pos = &mddev->disks; + else { + /* release the previous rdev and start from there. */ + rdev_dec_pending(rdev, mddev); + pos = &rdev->same_set; + } + list_for_each_continue_rcu(pos, &mddev->disks) { + rdev = list_entry(pos, mdk_rdev_t, same_set); + if (rdev->raid_disk >= 0 && + test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) { + /* this is a usable devices */ + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + return rdev; + } + } + rcu_read_unlock(); + return NULL; +} + static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) { - mdk_rdev_t *rdev; + mdk_rdev_t *rdev = NULL; mddev_t *mddev = bitmap->mddev; - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (test_bit(In_sync, &rdev->flags) - && !test_bit(Faulty, &rdev->flags)) { + while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { int size = PAGE_SIZE; if (page->index == bitmap->file_pages-1) size = roundup(bitmap->last_page_size, @@ -281,8 +313,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) + page->index * (PAGE_SIZE/512), size, page); - } - rcu_read_unlock(); + } if (wait) md_super_wait(mddev); @@ -1234,7 
+1265,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect case 0: bitmap_file_set_bit(bitmap, offset); bitmap_count_page(bitmap,offset, 1); - blk_plug_device(bitmap->mddev->queue); + blk_plug_device_unlocked(bitmap->mddev->queue); /* fall through */ case 1: *bmc = 2; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 1395643..682ef9e 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -333,7 +333,6 @@ static void crypt_convert_init(struct crypt_config *cc, ctx->idx_out = bio_out ? bio_out->bi_idx : 0; ctx->sector = sector + cc->iv_offset; init_completion(&ctx->restart); - atomic_set(&ctx->pending, 1); } static int crypt_convert_block(struct crypt_config *cc, @@ -408,6 +407,8 @@ static int crypt_convert(struct crypt_config *cc, { int r; + atomic_set(&ctx->pending, 1); + while(ctx->idx_in < ctx->bio_in->bi_vcnt && ctx->idx_out < ctx->bio_out->bi_vcnt) { @@ -456,9 +457,11 @@ static void dm_crypt_bio_destructor(struct bio *bio) /* * Generate a new unfragmented bio with the given size * This should never violate the device limitations - * May return a smaller bio when running out of pages + * May return a smaller bio when running out of pages, indicated by + * *out_of_pages set to 1. 
*/ -static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size) +static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size, + unsigned *out_of_pages) { struct crypt_config *cc = io->target->private; struct bio *clone; @@ -472,11 +475,14 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size) return NULL; clone_init(io, clone); + *out_of_pages = 0; for (i = 0; i < nr_iovecs; i++) { page = mempool_alloc(cc->page_pool, gfp_mask); - if (!page) + if (!page) { + *out_of_pages = 1; break; + } /* * if additional pages cannot be allocated without waiting, @@ -517,6 +523,27 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone) } } +static struct dm_crypt_io *crypt_io_alloc(struct dm_target *ti, + struct bio *bio, sector_t sector) +{ + struct crypt_config *cc = ti->private; + struct dm_crypt_io *io; + + io = mempool_alloc(cc->io_pool, GFP_NOIO); + io->target = ti; + io->base_bio = bio; + io->sector = sector; + io->error = 0; + atomic_set(&io->pending, 0); + + return io; +} + +static void crypt_inc_pending(struct dm_crypt_io *io) +{ + atomic_inc(&io->pending); +} + /* * One of the bios was finished. Check for completion of * the whole request and correctly clean up the buffer. 
@@ -591,7 +618,7 @@ static void kcryptd_io_read(struct dm_crypt_io *io) struct bio *base_bio = io->base_bio; struct bio *clone; - atomic_inc(&io->pending); + crypt_inc_pending(io); /* * The block layer might modify the bvec array, so always @@ -653,6 +680,7 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, crypt_free_buffer_pages(cc, clone); bio_put(clone); io->error = -EIO; + crypt_dec_pending(io); return; } @@ -664,28 +692,34 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, if (async) kcryptd_queue_io(io); - else { - atomic_inc(&io->pending); + else generic_make_request(clone); - } } -static void kcryptd_crypt_write_convert_loop(struct dm_crypt_io *io) +static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) { struct crypt_config *cc = io->target->private; struct bio *clone; + int crypt_finished; + unsigned out_of_pages = 0; unsigned remaining = io->base_bio->bi_size; int r; /* + * Prevent io from disappearing until this function completes. + */ + crypt_inc_pending(io); + crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, io->sector); + + /* * The allocated buffers can be smaller than the whole bio, * so repeat the whole process until all the data can be handled. 
*/ while (remaining) { - clone = crypt_alloc_buffer(io, remaining); + clone = crypt_alloc_buffer(io, remaining, &out_of_pages); if (unlikely(!clone)) { io->error = -ENOMEM; - return; + break; } io->ctx.bio_out = clone; @@ -693,37 +727,32 @@ static void kcryptd_crypt_write_convert_loop(struct dm_crypt_io *io) remaining -= clone->bi_size; + crypt_inc_pending(io); r = crypt_convert(cc, &io->ctx); + crypt_finished = atomic_dec_and_test(&io->ctx.pending); - if (atomic_dec_and_test(&io->ctx.pending)) { - /* processed, no running async crypto */ + /* Encryption was already finished, submit io now */ + if (crypt_finished) { kcryptd_crypt_write_io_submit(io, r, 0); - if (unlikely(r < 0)) - return; - } else - atomic_inc(&io->pending); - /* out of memory -> run queues */ - if (unlikely(remaining)) { - /* wait for async crypto then reinitialize pending */ - wait_event(cc->writeq, !atomic_read(&io->ctx.pending)); - atomic_set(&io->ctx.pending, 1); - congestion_wait(WRITE, HZ/100); + /* + * If there was an error, do not try next fragments. + * For async, error is processed in async handler. + */ + if (unlikely(r < 0)) + break; } - } -} -static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) -{ - struct crypt_config *cc = io->target->private; - - /* - * Prevent io from disappearing until this function completes. 
- */ - atomic_inc(&io->pending); + /* + * Out of memory -> run queues + * But don't wait if split was due to the io size restriction + */ + if (unlikely(out_of_pages)) + congestion_wait(WRITE, HZ/100); - crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, io->sector); - kcryptd_crypt_write_convert_loop(io); + if (unlikely(remaining)) + wait_event(cc->writeq, !atomic_read(&io->ctx.pending)); + } crypt_dec_pending(io); } @@ -741,7 +770,7 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) struct crypt_config *cc = io->target->private; int r = 0; - atomic_inc(&io->pending); + crypt_inc_pending(io); crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio, io->sector); @@ -1108,15 +1137,9 @@ static void crypt_dtr(struct dm_target *ti) static int crypt_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) { - struct crypt_config *cc = ti->private; struct dm_crypt_io *io; - io = mempool_alloc(cc->io_pool, GFP_NOIO); - io->target = ti; - io->base_bio = bio; - io->sector = bio->bi_sector - ti->begin; - io->error = 0; - atomic_set(&io->pending, 0); + io = crypt_io_alloc(ti, bio, bio->bi_sector - ti->begin); if (bio_data_dir(io->base_bio) == READ) kcryptd_queue_io(io); diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index 41f4080..769ab67 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -108,12 +108,12 @@ struct pstore { * Used to keep track of which metadata area the data in * 'chunk' refers to. */ - uint32_t current_area; + chunk_t current_area; /* * The next free chunk for an exception. */ - uint32_t next_free; + chunk_t next_free; /* * The index of next free exception in the current @@ -175,7 +175,7 @@ static void do_metadata(struct work_struct *work) /* * Read or write a chunk aligned and sized block of data from a device. 
*/ -static int chunk_io(struct pstore *ps, uint32_t chunk, int rw, int metadata) +static int chunk_io(struct pstore *ps, chunk_t chunk, int rw, int metadata) { struct dm_io_region where = { .bdev = ps->snap->cow->bdev, @@ -209,16 +209,23 @@ static int chunk_io(struct pstore *ps, uint32_t chunk, int rw, int metadata) } /* + * Convert a metadata area index to a chunk index. + */ +static chunk_t area_location(struct pstore *ps, chunk_t area) +{ + return 1 + ((ps->exceptions_per_area + 1) * area); +} + +/* * Read or write a metadata area. Remembering to skip the first * chunk which holds the header. */ -static int area_io(struct pstore *ps, uint32_t area, int rw) +static int area_io(struct pstore *ps, chunk_t area, int rw) { int r; - uint32_t chunk; + chunk_t chunk; - /* convert a metadata area index to a chunk index */ - chunk = 1 + ((ps->exceptions_per_area + 1) * area); + chunk = area_location(ps, area); r = chunk_io(ps, chunk, rw, 0); if (r) @@ -228,7 +235,7 @@ static int area_io(struct pstore *ps, uint32_t area, int rw) return 0; } -static int zero_area(struct pstore *ps, uint32_t area) +static int zero_area(struct pstore *ps, chunk_t area) { memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT); return area_io(ps, area, WRITE); @@ -404,7 +411,7 @@ static int insert_exceptions(struct pstore *ps, int *full) static int read_exceptions(struct pstore *ps) { - uint32_t area; + chunk_t area; int r, full = 1; /* @@ -517,6 +524,7 @@ static int persistent_prepare(struct exception_store *store, { struct pstore *ps = get_info(store); uint32_t stride; + chunk_t next_free; sector_t size = get_dev_size(store->snap->cow->bdev); /* Is there enough room ? */ @@ -530,7 +538,8 @@ static int persistent_prepare(struct exception_store *store, * into account the location of the metadata chunks. 
*/ stride = (ps->exceptions_per_area + 1); - if ((++ps->next_free % stride) == 1) + next_free = ++ps->next_free; + if (sector_div(next_free, stride) == 1) ps->next_free++; atomic_inc(&ps->pending_count); diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index b262c00..dca401d 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -426,7 +426,7 @@ static int list_devices(struct dm_ioctl *param, size_t param_size) old_nl->next = (uint32_t) ((void *) nl - (void *) old_nl); disk = dm_disk(hc->md); - nl->dev = huge_encode_dev(MKDEV(disk->major, disk->first_minor)); + nl->dev = huge_encode_dev(disk_devt(disk)); nl->next = 0; strcpy(nl->name, hc->name); @@ -539,7 +539,7 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param) if (dm_suspended(md)) param->flags |= DM_SUSPEND_FLAG; - param->dev = huge_encode_dev(MKDEV(disk->major, disk->first_minor)); + param->dev = huge_encode_dev(disk_devt(disk)); /* * Yes, this will be out of date by the time it gets back @@ -548,7 +548,7 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param) */ param->open_count = dm_open_count(md); - if (disk->policy) + if (get_disk_ro(disk)) param->flags |= DM_READONLY_FLAG; param->event_nr = dm_get_event_nr(md); @@ -1131,7 +1131,7 @@ static void retrieve_deps(struct dm_table *table, unsigned int count = 0; struct list_head *tmp; size_t len, needed; - struct dm_dev *dd; + struct dm_dev_internal *dd; struct dm_target_deps *deps; deps = get_result_buffer(param, param_size, &len); @@ -1157,7 +1157,7 @@ static void retrieve_deps(struct dm_table *table, deps->count = count; count = 0; list_for_each_entry (dd, dm_table_get_devices(table), list) - deps->dev[count++] = huge_encode_dev(dd->bdev->bd_dev); + deps->dev[count++] = huge_encode_dev(dd->dm_dev.bdev->bd_dev); param->data_size = param->data_start + needed; } diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index fea966d..103304c 100644 --- a/drivers/md/dm-mpath.c +++ 
b/drivers/md/dm-mpath.c @@ -30,9 +30,11 @@ struct pgpath { struct list_head list; struct priority_group *pg; /* Owning PG */ + unsigned is_active; /* Path status */ unsigned fail_count; /* Cumulative failure count */ struct dm_path path; + struct work_struct deactivate_path; }; #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) @@ -63,6 +65,7 @@ struct multipath { const char *hw_handler_name; struct work_struct activate_path; + struct pgpath *pgpath_to_activate; unsigned nr_priority_groups; struct list_head priority_groups; unsigned pg_init_required; /* pg_init needs calling? */ @@ -111,6 +114,7 @@ static struct workqueue_struct *kmultipathd, *kmpath_handlerd; static void process_queued_ios(struct work_struct *work); static void trigger_event(struct work_struct *work); static void activate_path(struct work_struct *work); +static void deactivate_path(struct work_struct *work); /*----------------------------------------------- @@ -121,8 +125,10 @@ static struct pgpath *alloc_pgpath(void) { struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL); - if (pgpath) - pgpath->path.is_active = 1; + if (pgpath) { + pgpath->is_active = 1; + INIT_WORK(&pgpath->deactivate_path, deactivate_path); + } return pgpath; } @@ -132,6 +138,14 @@ static void free_pgpath(struct pgpath *pgpath) kfree(pgpath); } +static void deactivate_path(struct work_struct *work) +{ + struct pgpath *pgpath = + container_of(work, struct pgpath, deactivate_path); + + blk_abort_queue(pgpath->path.dev->bdev->bd_disk->queue); +} + static struct priority_group *alloc_priority_group(void) { struct priority_group *pg; @@ -146,11 +160,19 @@ static struct priority_group *alloc_priority_group(void) static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) { + unsigned long flags; struct pgpath *pgpath, *tmp; + struct multipath *m = ti->private; list_for_each_entry_safe(pgpath, tmp, pgpaths, list) { list_del(&pgpath->list); + if (m->hw_handler_name) + 
scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev)); dm_put_device(ti, pgpath->path.dev); + spin_lock_irqsave(&m->lock, flags); + if (m->pgpath_to_activate == pgpath) + m->pgpath_to_activate = NULL; + spin_unlock_irqrestore(&m->lock, flags); free_pgpath(pgpath); } } @@ -418,6 +440,7 @@ static void process_queued_ios(struct work_struct *work) __choose_pgpath(m); pgpath = m->current_pgpath; + m->pgpath_to_activate = m->current_pgpath; if ((pgpath && !m->queue_io) || (!pgpath && !m->queue_if_no_path)) @@ -548,16 +571,17 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, { int r; struct pgpath *p; + struct multipath *m = ti->private; /* we need at least a path arg */ if (as->argc < 1) { ti->error = "no device given"; - return NULL; + return ERR_PTR(-EINVAL); } p = alloc_pgpath(); if (!p) - return NULL; + return ERR_PTR(-ENOMEM); r = dm_get_device(ti, shift(as), ti->begin, ti->len, dm_table_get_mode(ti->table), &p->path.dev); @@ -566,6 +590,15 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, goto bad; } + if (m->hw_handler_name) { + r = scsi_dh_attach(bdev_get_queue(p->path.dev->bdev), + m->hw_handler_name); + if (r < 0) { + dm_put_device(ti, p->path.dev); + goto bad; + } + } + r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error); if (r) { dm_put_device(ti, p->path.dev); @@ -576,7 +609,7 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, bad: free_pgpath(p); - return NULL; + return ERR_PTR(r); } static struct priority_group *parse_priority_group(struct arg_set *as, @@ -594,14 +627,14 @@ static struct priority_group *parse_priority_group(struct arg_set *as, if (as->argc < 2) { as->argc = 0; - ti->error = "not enough priority group aruments"; - return NULL; + ti->error = "not enough priority group arguments"; + return ERR_PTR(-EINVAL); } pg = alloc_priority_group(); if (!pg) { ti->error = "couldn't allocate priority group"; - return NULL; + return 
ERR_PTR(-ENOMEM); } pg->m = m; @@ -634,8 +667,10 @@ static struct priority_group *parse_priority_group(struct arg_set *as, path_args.argv = as->argv; pgpath = parse_path(&path_args, &pg->ps, ti); - if (!pgpath) + if (IS_ERR(pgpath)) { + r = PTR_ERR(pgpath); goto bad; + } pgpath->pg = pg; list_add_tail(&pgpath->list, &pg->pgpaths); @@ -646,7 +681,7 @@ static struct priority_group *parse_priority_group(struct arg_set *as, bad: free_priority_group(pg, ti); - return NULL; + return ERR_PTR(r); } static int parse_hw_handler(struct arg_set *as, struct multipath *m) @@ -765,8 +800,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, struct priority_group *pg; pg = parse_priority_group(&as, m); - if (!pg) { - r = -EINVAL; + if (IS_ERR(pg)) { + r = PTR_ERR(pg); goto bad; } @@ -832,13 +867,13 @@ static int fail_path(struct pgpath *pgpath) spin_lock_irqsave(&m->lock, flags); - if (!pgpath->path.is_active) + if (!pgpath->is_active) goto out; DMWARN("Failing path %s.", pgpath->path.dev->name); pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path); - pgpath->path.is_active = 0; + pgpath->is_active = 0; pgpath->fail_count++; m->nr_valid_paths--; @@ -850,6 +885,7 @@ static int fail_path(struct pgpath *pgpath) pgpath->path.dev->name, m->nr_valid_paths); queue_work(kmultipathd, &m->trigger_event); + queue_work(kmultipathd, &pgpath->deactivate_path); out: spin_unlock_irqrestore(&m->lock, flags); @@ -868,7 +904,7 @@ static int reinstate_path(struct pgpath *pgpath) spin_lock_irqsave(&m->lock, flags); - if (pgpath->path.is_active) + if (pgpath->is_active) goto out; if (!pgpath->pg->ps.type->reinstate_path) { @@ -882,7 +918,7 @@ static int reinstate_path(struct pgpath *pgpath) if (r) goto out; - pgpath->path.is_active = 1; + pgpath->is_active = 1; m->current_pgpath = NULL; if (!m->nr_valid_paths++ && m->queue_size) @@ -1080,8 +1116,15 @@ static void activate_path(struct work_struct *work) int ret; struct multipath *m = container_of(work, struct multipath, 
activate_path); - struct dm_path *path = &m->current_pgpath->path; + struct dm_path *path; + unsigned long flags; + spin_lock_irqsave(&m->lock, flags); + path = &m->pgpath_to_activate->path; + m->pgpath_to_activate = NULL; + spin_unlock_irqrestore(&m->lock, flags); + if (!path) + return; ret = scsi_dh_activate(bdev_get_queue(path->dev->bdev)); pg_init_done(path, ret); } @@ -1263,7 +1306,7 @@ static int multipath_status(struct dm_target *ti, status_type_t type, list_for_each_entry(p, &pg->pgpaths, list) { DMEMIT("%s %s %u ", p->path.dev->name, - p->path.is_active ? "A" : "F", + p->is_active ? "A" : "F", p->fail_count); if (pg->ps.type->status) sz += pg->ps.type->status(&pg->ps, diff --git a/drivers/md/dm-mpath.h b/drivers/md/dm-mpath.h index c198b85..e230f71 100644 --- a/drivers/md/dm-mpath.h +++ b/drivers/md/dm-mpath.h @@ -13,8 +13,6 @@ struct dm_dev; struct dm_path { struct dm_dev *dev; /* Read-only */ - unsigned is_active; /* Read-only */ - void *pscontext; /* For path-selector use */ }; diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index ff05fe8..29913e4 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -842,7 +842,9 @@ static int recover(struct mirror_set *ms, struct region *reg) } /* hand to kcopyd */ - set_bit(DM_KCOPYD_IGNORE_ERROR, &flags); + if (!errors_handled(ms)) + set_bit(DM_KCOPYD_IGNORE_ERROR, &flags); + r = dm_kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags, recovery_complete, reg); diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 4de90ab..b745d8a 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -284,8 +284,8 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, memset(major_minor, 0, sizeof(major_minor)); sprintf(major_minor, "%d:%d", - bio->bi_bdev->bd_disk->major, - bio->bi_bdev->bd_disk->first_minor); + MAJOR(disk_devt(bio->bi_bdev->bd_disk)), + MINOR(disk_devt(bio->bi_bdev->bd_disk))); /* * Test to see which stripe drive triggered the event diff 
--git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 798e468..a740a69 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -250,7 +250,8 @@ static void free_devices(struct list_head *devices) struct list_head *tmp, *next; list_for_each_safe(tmp, next, devices) { - struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); + struct dm_dev_internal *dd = + list_entry(tmp, struct dm_dev_internal, list); kfree(dd); } } @@ -316,40 +317,23 @@ static inline int check_space(struct dm_table *t) */ static int lookup_device(const char *path, dev_t *dev) { - int r; - struct nameidata nd; - struct inode *inode; - - if ((r = path_lookup(path, LOOKUP_FOLLOW, &nd))) - return r; - - inode = nd.path.dentry->d_inode; - if (!inode) { - r = -ENOENT; - goto out; - } - - if (!S_ISBLK(inode->i_mode)) { - r = -ENOTBLK; - goto out; - } - - *dev = inode->i_rdev; - - out: - path_put(&nd.path); - return r; + struct block_device *bdev = lookup_bdev(path); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + *dev = bdev->bd_dev; + bdput(bdev); + return 0; } /* * See if we've already got a device in the list. */ -static struct dm_dev *find_device(struct list_head *l, dev_t dev) +static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev) { - struct dm_dev *dd; + struct dm_dev_internal *dd; list_for_each_entry (dd, l, list) - if (dd->bdev->bd_dev == dev) + if (dd->dm_dev.bdev->bd_dev == dev) return dd; return NULL; @@ -358,45 +342,47 @@ static struct dm_dev *find_device(struct list_head *l, dev_t dev) /* * Open a device so we can use it as a map destination. 
*/ -static int open_dev(struct dm_dev *d, dev_t dev, struct mapped_device *md) +static int open_dev(struct dm_dev_internal *d, dev_t dev, + struct mapped_device *md) { static char *_claim_ptr = "I belong to device-mapper"; struct block_device *bdev; int r; - BUG_ON(d->bdev); + BUG_ON(d->dm_dev.bdev); - bdev = open_by_devnum(dev, d->mode); + bdev = open_by_devnum(dev, d->dm_dev.mode); if (IS_ERR(bdev)) return PTR_ERR(bdev); r = bd_claim_by_disk(bdev, _claim_ptr, dm_disk(md)); if (r) blkdev_put(bdev); else - d->bdev = bdev; + d->dm_dev.bdev = bdev; return r; } /* * Close a device that we've been using. */ -static void close_dev(struct dm_dev *d, struct mapped_device *md) +static void close_dev(struct dm_dev_internal *d, struct mapped_device *md) { - if (!d->bdev) + if (!d->dm_dev.bdev) return; - bd_release_from_disk(d->bdev, dm_disk(md)); - blkdev_put(d->bdev); - d->bdev = NULL; + bd_release_from_disk(d->dm_dev.bdev, dm_disk(md)); + blkdev_put(d->dm_dev.bdev); + d->dm_dev.bdev = NULL; } /* * If possible, this checks an area of a destination device is valid. */ -static int check_device_area(struct dm_dev *dd, sector_t start, sector_t len) +static int check_device_area(struct dm_dev_internal *dd, sector_t start, + sector_t len) { - sector_t dev_size = dd->bdev->bd_inode->i_size >> SECTOR_SHIFT; + sector_t dev_size = dd->dm_dev.bdev->bd_inode->i_size >> SECTOR_SHIFT; if (!dev_size) return 1; @@ -409,16 +395,17 @@ static int check_device_area(struct dm_dev *dd, sector_t start, sector_t len) * careful to leave things as they were if we fail to reopen the * device. 
*/ -static int upgrade_mode(struct dm_dev *dd, int new_mode, struct mapped_device *md) +static int upgrade_mode(struct dm_dev_internal *dd, int new_mode, + struct mapped_device *md) { int r; - struct dm_dev dd_copy; - dev_t dev = dd->bdev->bd_dev; + struct dm_dev_internal dd_copy; + dev_t dev = dd->dm_dev.bdev->bd_dev; dd_copy = *dd; - dd->mode |= new_mode; - dd->bdev = NULL; + dd->dm_dev.mode |= new_mode; + dd->dm_dev.bdev = NULL; r = open_dev(dd, dev, md); if (!r) close_dev(&dd_copy, md); @@ -438,7 +425,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, { int r; dev_t uninitialized_var(dev); - struct dm_dev *dd; + struct dm_dev_internal *dd; unsigned int major, minor; BUG_ON(!t); @@ -460,20 +447,20 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, if (!dd) return -ENOMEM; - dd->mode = mode; - dd->bdev = NULL; + dd->dm_dev.mode = mode; + dd->dm_dev.bdev = NULL; if ((r = open_dev(dd, dev, t->md))) { kfree(dd); return r; } - format_dev_t(dd->name, dev); + format_dev_t(dd->dm_dev.name, dev); atomic_set(&dd->count, 0); list_add(&dd->list, &t->devices); - } else if (dd->mode != (mode | dd->mode)) { + } else if (dd->dm_dev.mode != (mode | dd->dm_dev.mode)) { r = upgrade_mode(dd, mode, t->md); if (r) return r; @@ -482,11 +469,11 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, if (!check_device_area(dd, start, len)) { DMWARN("device %s too small for target", path); - dm_put_device(ti, dd); + dm_put_device(ti, &dd->dm_dev); return -EINVAL; } - *result = dd; + *result = &dd->dm_dev; return 0; } @@ -495,6 +482,13 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); struct io_restrictions *rs = &ti->limits; + char b[BDEVNAME_SIZE]; + + if (unlikely(!q)) { + DMWARN("%s: Cannot set limits for nonexistent device %s", + dm_device_name(ti->table->md), bdevname(bdev, b)); + return; + } /* * Combine the device limits low. 
@@ -557,8 +551,11 @@ int dm_get_device(struct dm_target *ti, const char *path, sector_t start, /* * Decrement a devices use count and remove it if necessary. */ -void dm_put_device(struct dm_target *ti, struct dm_dev *dd) +void dm_put_device(struct dm_target *ti, struct dm_dev *d) { + struct dm_dev_internal *dd = container_of(d, struct dm_dev_internal, + dm_dev); + if (atomic_dec_and_test(&dd->count)) { close_dev(dd, ti->table->md); list_del(&dd->list); @@ -954,13 +951,20 @@ int dm_table_resume_targets(struct dm_table *t) int dm_table_any_congested(struct dm_table *t, int bdi_bits) { - struct dm_dev *dd; + struct dm_dev_internal *dd; struct list_head *devices = dm_table_get_devices(t); int r = 0; list_for_each_entry(dd, devices, list) { - struct request_queue *q = bdev_get_queue(dd->bdev); - r |= bdi_congested(&q->backing_dev_info, bdi_bits); + struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev); + char b[BDEVNAME_SIZE]; + + if (likely(q)) + r |= bdi_congested(&q->backing_dev_info, bdi_bits); + else + DMWARN_LIMIT("%s: any_congested: nonexistent device %s", + dm_device_name(t->md), + bdevname(dd->dm_dev.bdev, b)); } return r; @@ -968,13 +972,19 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) void dm_table_unplug_all(struct dm_table *t) { - struct dm_dev *dd; + struct dm_dev_internal *dd; struct list_head *devices = dm_table_get_devices(t); list_for_each_entry(dd, devices, list) { - struct request_queue *q = bdev_get_queue(dd->bdev); - - blk_unplug(q); + struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev); + char b[BDEVNAME_SIZE]; + + if (likely(q)) + blk_unplug(q); + else + DMWARN_LIMIT("%s: Cannot unplug nonexistent device %s", + dm_device_name(t->md), + bdevname(dd->dm_dev.bdev, b)); } } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index bca448e..327de03 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -377,13 +377,14 @@ static void free_tio(struct mapped_device *md, struct dm_target_io *tio) static void start_io_acct(struct 
dm_io *io) { struct mapped_device *md = io->md; + int cpu; io->start_time = jiffies; - preempt_disable(); - disk_round_stats(dm_disk(md)); - preempt_enable(); - dm_disk(md)->in_flight = atomic_inc_return(&md->pending); + cpu = part_stat_lock(); + part_round_stats(cpu, &dm_disk(md)->part0); + part_stat_unlock(); + dm_disk(md)->part0.in_flight = atomic_inc_return(&md->pending); } static int end_io_acct(struct dm_io *io) @@ -391,15 +392,16 @@ static int end_io_acct(struct dm_io *io) struct mapped_device *md = io->md; struct bio *bio = io->bio; unsigned long duration = jiffies - io->start_time; - int pending; + int pending, cpu; int rw = bio_data_dir(bio); - preempt_disable(); - disk_round_stats(dm_disk(md)); - preempt_enable(); - dm_disk(md)->in_flight = pending = atomic_dec_return(&md->pending); + cpu = part_stat_lock(); + part_round_stats(cpu, &dm_disk(md)->part0); + part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration); + part_stat_unlock(); - disk_stat_add(dm_disk(md), ticks[rw], duration); + dm_disk(md)->part0.in_flight = pending = + atomic_dec_return(&md->pending); return !pending; } @@ -837,12 +839,14 @@ static int dm_merge_bvec(struct request_queue *q, struct dm_table *map = dm_get_table(md); struct dm_target *ti; sector_t max_sectors; - int max_size; + int max_size = 0; if (unlikely(!map)) - return 0; + goto out; ti = dm_table_find_target(map, bvm->bi_sector); + if (!dm_target_is_valid(ti)) + goto out_table; /* * Find maximum amount of I/O that won't need splitting @@ -861,14 +865,16 @@ static int dm_merge_bvec(struct request_queue *q, if (max_size && ti->type->merge) max_size = ti->type->merge(ti, bvm, biovec, max_size); +out_table: + dm_table_put(map); + +out: /* * Always allow an entire first page */ if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT)) max_size = biovec->bv_len; - dm_table_put(map); - return max_size; } @@ -881,6 +887,7 @@ static int dm_request(struct request_queue *q, struct bio *bio) int r = -EIO; int rw = 
bio_data_dir(bio); struct mapped_device *md = q->queuedata; + int cpu; /* * There is no use in forwarding any barrier request since we can't @@ -893,8 +900,10 @@ static int dm_request(struct request_queue *q, struct bio *bio) down_read(&md->io_lock); - disk_stat_inc(dm_disk(md), ios[rw]); - disk_stat_add(dm_disk(md), sectors[rw], bio_sectors(bio)); + cpu = part_stat_lock(); + part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]); + part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); + part_stat_unlock(); /* * If we're suspended we have to queue @@ -1142,7 +1151,7 @@ static void unlock_fs(struct mapped_device *md); static void free_dev(struct mapped_device *md) { - int minor = md->disk->first_minor; + int minor = MINOR(disk_devt(md->disk)); if (md->suspended_bdev) { unlock_fs(md); @@ -1178,7 +1187,7 @@ static void event_callback(void *context) list_splice_init(&md->uevent_list, &uevents); spin_unlock_irqrestore(&md->uevent_lock, flags); - dm_send_uevents(&uevents, &md->disk->dev.kobj); + dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); atomic_inc(&md->event_nr); wake_up(&md->eventq); @@ -1263,7 +1272,7 @@ static struct mapped_device *dm_find_md(dev_t dev) md = idr_find(&_minor_idr, minor); if (md && (md == MINOR_ALLOCED || - (dm_disk(md)->first_minor != minor) || + (MINOR(disk_devt(dm_disk(md))) != minor) || test_bit(DMF_FREEING, &md->flags))) { md = NULL; goto out; @@ -1314,7 +1323,8 @@ void dm_put(struct mapped_device *md) if (atomic_dec_and_lock(&md->holders, &_minor_lock)) { map = dm_get_table(md); - idr_replace(&_minor_idr, MINOR_ALLOCED, dm_disk(md)->first_minor); + idr_replace(&_minor_idr, MINOR_ALLOCED, + MINOR(disk_devt(dm_disk(md)))); set_bit(DMF_FREEING, &md->flags); spin_unlock(&_minor_lock); if (!dm_suspended(md)) { @@ -1634,7 +1644,7 @@ out: *---------------------------------------------------------------*/ void dm_kobject_uevent(struct mapped_device *md) { - kobject_uevent(&md->disk->dev.kobj, KOBJ_CHANGE); + 
kobject_uevent(&disk_to_dev(md->disk)->kobj, KOBJ_CHANGE); } uint32_t dm_next_uevent_seq(struct mapped_device *md) diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 1e59a0b..cd189da 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -25,13 +25,10 @@ /* * List of devices that a metadevice uses and should open/close. */ -struct dm_dev { +struct dm_dev_internal { struct list_head list; - atomic_t count; - int mode; - struct block_device *bdev; - char name[16]; + struct dm_dev dm_dev; }; struct dm_table; @@ -49,7 +46,6 @@ void dm_table_presuspend_targets(struct dm_table *t); void dm_table_postsuspend_targets(struct dm_table *t); int dm_table_resume_targets(struct dm_table *t); int dm_table_any_congested(struct dm_table *t, int bdi_bits); -void dm_table_unplug_all(struct dm_table *t); /* * To check the return value from dm_table_find_target(). @@ -93,8 +89,6 @@ void dm_linear_exit(void); int dm_stripe_init(void); void dm_stripe_exit(void); -void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size); -union map_info *dm_get_mapinfo(struct bio *bio); int dm_open_count(struct mapped_device *md); int dm_lock_for_deletion(struct mapped_device *md); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index b1eebf8..b9cbee68 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -318,14 +318,18 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) mddev_t *mddev = q->queuedata; dev_info_t *tmp_dev; sector_t block; + int cpu; if (unlikely(bio_barrier(bio))) { bio_endio(bio, -EOPNOTSUPP); return 0; } - disk_stat_inc(mddev->gendisk, ios[rw]); - disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); + cpu = part_stat_lock(); + part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); + part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], + bio_sectors(bio)); + part_stat_unlock(); tmp_dev = which_dev(mddev, bio->bi_sector); block = bio->bi_sector >> 1; @@ -349,7 +353,7 @@ static int linear_make_request (struct request_queue *q, struct 
bio *bio) * split it. */ struct bio_pair *bp; - bp = bio_split(bio, bio_split_pool, + bp = bio_split(bio, ((tmp_dev->offset + tmp_dev->size)<<1) - bio->bi_sector); if (linear_make_request(q, &bp->bio1)) generic_make_request(&bp->bio1); diff --git a/drivers/md/md.c b/drivers/md/md.c index c2ff77c..0a3a4bd 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1464,10 +1464,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) goto fail; - if (rdev->bdev->bd_part) - ko = &rdev->bdev->bd_part->dev.kobj; - else - ko = &rdev->bdev->bd_disk->dev.kobj; + ko = &part_to_dev(rdev->bdev->bd_part)->kobj; if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) { kobject_del(&rdev->kobj); goto fail; @@ -2393,6 +2390,8 @@ static void analyze_sbs(mddev_t * mddev) } +static void md_safemode_timeout(unsigned long data); + static ssize_t safe_delay_show(mddev_t *mddev, char *page) { @@ -2432,9 +2431,12 @@ safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) if (msec == 0) mddev->safemode_delay = 0; else { + unsigned long old_delay = mddev->safemode_delay; mddev->safemode_delay = (msec*HZ)/1000; if (mddev->safemode_delay == 0) mddev->safemode_delay = 1; + if (mddev->safemode_delay < old_delay) + md_safemode_timeout((unsigned long)mddev); } return len; } @@ -3465,8 +3467,8 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) disk->queue = mddev->queue; add_disk(disk); mddev->gendisk = disk; - error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk->dev.kobj, - "%s", "md"); + error = kobject_init_and_add(&mddev->kobj, &md_ktype, + &disk_to_dev(disk)->kobj, "%s", "md"); mutex_unlock(&disks_mutex); if (error) printk(KERN_WARNING "md: cannot register %s/md - name in use\n", @@ -3483,7 +3485,7 @@ static void md_safemode_timeout(unsigned long data) if (!atomic_read(&mddev->writes_pending)) { mddev->safemode = 1; if (mddev->external) - sysfs_notify(&mddev->kobj, NULL, 
"array_state"); + set_bit(MD_NOTIFY_ARRAY_STATE, &mddev->flags); } md_wakeup_thread(mddev->thread); } @@ -3756,7 +3758,7 @@ static int do_md_run(mddev_t * mddev) sysfs_notify(&mddev->kobj, NULL, "array_state"); sysfs_notify(&mddev->kobj, NULL, "sync_action"); sysfs_notify(&mddev->kobj, NULL, "degraded"); - kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE); + kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); return 0; } @@ -3836,8 +3838,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) del_timer_sync(&mddev->safemode_timer); - invalidate_partition(disk, 0); - switch(mode) { case 1: /* readonly */ err = -ENXIO; @@ -4634,6 +4634,11 @@ static int update_size(mddev_t *mddev, sector_t num_sectors) */ if (mddev->sync_thread) return -EBUSY; + if (mddev->bitmap) + /* Sorry, cannot grow a bitmap yet, just remove it, + * grow, and re-add. + */ + return -EBUSY; rdev_for_each(rdev, tmp, mddev) { sector_t avail; avail = rdev->size * 2; @@ -5541,8 +5546,8 @@ static int is_mddev_idle(mddev_t *mddev) rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; - curr_events = disk_stat_read(disk, sectors[0]) + - disk_stat_read(disk, sectors[1]) - + curr_events = part_stat_read(&disk->part0, sectors[0]) + + part_stat_read(&disk->part0, sectors[1]) - atomic_read(&disk->sync_io); /* sync IO will cause sync_io to increase before the disk_stats * as sync_io is counted when a request starts, and @@ -5753,7 +5758,11 @@ void md_do_sync(mddev_t *mddev) * time 'round when curr_resync == 2 */ continue; - prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE); + /* We need to wait 'interruptible' so as not to + * contribute to the load average, and not to + * be caught by 'softlockup' + */ + prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); if (!kthread_should_stop() && mddev2->curr_resync >= mddev->curr_resync) { printk(KERN_INFO "md: delaying %s of %s" @@ -5761,6 +5770,8 @@ void md_do_sync(mddev_t 
*mddev) " share one or more physical units)\n", desc, mdname(mddev), mdname(mddev2)); mddev_put(mddev2); + if (signal_pending(current)) + flush_signals(current); schedule(); finish_wait(&resync_wait, &wq); goto try_again; @@ -5993,10 +6004,11 @@ static int remove_and_add_spares(mddev_t *mddev) } } - if (mddev->degraded) { + if (mddev->degraded && ! mddev->ro) { rdev_for_each(rdev, rtmp, mddev) { if (rdev->raid_disk >= 0 && - !test_bit(In_sync, &rdev->flags)) + !test_bit(In_sync, &rdev->flags) && + !test_bit(Blocked, &rdev->flags)) spares++; if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { @@ -6051,6 +6063,9 @@ void md_check_recovery(mddev_t *mddev) if (mddev->bitmap) bitmap_daemon_work(mddev->bitmap); + if (test_and_clear_bit(MD_NOTIFY_ARRAY_STATE, &mddev->flags)) + sysfs_notify(&mddev->kobj, NULL, "array_state"); + if (mddev->ro) return; @@ -6063,6 +6078,8 @@ void md_check_recovery(mddev_t *mddev) flush_signals(current); } + if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) + return; if ( ! ( (mddev->flags && !mddev->external) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || @@ -6076,6 +6093,15 @@ void md_check_recovery(mddev_t *mddev) if (mddev_trylock(mddev)) { int spares = 0; + if (mddev->ro) { + /* Only thing we do on a ro array is remove + * failed devices. 
+ */ + remove_and_add_spares(mddev); + clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + goto unlock; + } + if (!mddev->external) { int did_change = 0; spin_lock_irq(&mddev->write_lock); @@ -6113,7 +6139,8 @@ void md_check_recovery(mddev_t *mddev) /* resync has finished, collect result */ md_unregister_thread(mddev->sync_thread); mddev->sync_thread = NULL; - if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { /* success...*/ /* activate any spares */ if (mddev->pers->spare_active(mddev)) @@ -6165,6 +6192,7 @@ void md_check_recovery(mddev_t *mddev) } else if ((spares = remove_and_add_spares(mddev))) { clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); } else if (mddev->recovery_cp < MaxSector) { set_bit(MD_RECOVERY_SYNC, &mddev->recovery); @@ -6228,7 +6256,11 @@ static int md_notify_reboot(struct notifier_block *this, for_each_mddev(mddev, tmp) if (mddev_trylock(mddev)) { - do_md_stop (mddev, 1, 0); + /* Force a switch to readonly even array + * appears to still be in use. Hence + * the '100'. 
+ */ + do_md_stop (mddev, 1, 100); mddev_unlock(mddev); } /* diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index c4779cc..8bb8794 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -147,6 +147,7 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio) struct multipath_bh * mp_bh; struct multipath_info *multipath; const int rw = bio_data_dir(bio); + int cpu; if (unlikely(bio_barrier(bio))) { bio_endio(bio, -EOPNOTSUPP); @@ -158,8 +159,11 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio) mp_bh->master_bio = bio; mp_bh->mddev = mddev; - disk_stat_inc(mddev->gendisk, ios[rw]); - disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); + cpu = part_stat_lock(); + part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); + part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], + bio_sectors(bio)); + part_stat_unlock(); mp_bh->path = multipath_map(conf); if (mp_bh->path < 0) { diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 1836106..53508a8 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -399,14 +399,18 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio) sector_t chunk; sector_t block, rsect; const int rw = bio_data_dir(bio); + int cpu; if (unlikely(bio_barrier(bio))) { bio_endio(bio, -EOPNOTSUPP); return 0; } - disk_stat_inc(mddev->gendisk, ios[rw]); - disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); + cpu = part_stat_lock(); + part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); + part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], + bio_sectors(bio)); + part_stat_unlock(); chunk_size = mddev->chunk_size >> 10; chunk_sects = mddev->chunk_size >> 9; @@ -423,7 +427,7 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio) /* This is a one page bio that upper layers * refuse to split for us, so we need to split it. 
*/ - bp = bio_split(bio, bio_split_pool, chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); + bp = bio_split(bio, chunk_sects - (bio->bi_sector & (chunk_sects - 1))); if (raid0_make_request(q, &bp->bio1)) generic_make_request(&bp->bio1); if (raid0_make_request(q, &bp->bio2)) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 03a5ab7..b976442 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -779,7 +779,7 @@ static int make_request(struct request_queue *q, struct bio * bio) struct page **behind_pages = NULL; const int rw = bio_data_dir(bio); const int do_sync = bio_sync(bio); - int do_barriers; + int cpu, do_barriers; mdk_rdev_t *blocked_rdev; /* @@ -804,8 +804,11 @@ static int make_request(struct request_queue *q, struct bio * bio) bitmap = mddev->bitmap; - disk_stat_inc(mddev->gendisk, ios[rw]); - disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); + cpu = part_stat_lock(); + part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); + part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], + bio_sectors(bio)); + part_stat_unlock(); /* * make_request() can abort the operation when READA is being @@ -1302,9 +1305,6 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) sbio->bi_size = r1_bio->sectors << 9; sbio->bi_idx = 0; sbio->bi_phys_segments = 0; - sbio->bi_hw_segments = 0; - sbio->bi_hw_front_size = 0; - sbio->bi_hw_back_size = 0; sbio->bi_flags &= ~(BIO_POOL_MASK - 1); sbio->bi_flags |= 1 << BIO_UPTODATE; sbio->bi_next = NULL; @@ -1790,7 +1790,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i bio->bi_vcnt = 0; bio->bi_idx = 0; bio->bi_phys_segments = 0; - bio->bi_hw_segments = 0; bio->bi_size = 0; bio->bi_end_io = NULL; bio->bi_private = NULL; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 159535d..8bdc9bf 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -76,11 +76,13 @@ static void r10bio_pool_free(void *r10_bio, void *data) kfree(r10_bio); } +/* Maximum size 
of each resync request */ #define RESYNC_BLOCK_SIZE (64*1024) -//#define RESYNC_BLOCK_SIZE PAGE_SIZE -#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) -#define RESYNC_WINDOW (2048*1024) +/* amount of memory to reserve for resync requests */ +#define RESYNC_WINDOW (1024*1024) +/* maximum number of concurrent requests, memory permitting */ +#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE) /* * When performing a resync, we need to read and compare, so @@ -215,6 +217,9 @@ static void reschedule_retry(r10bio_t *r10_bio) conf->nr_queued ++; spin_unlock_irqrestore(&conf->device_lock, flags); + /* wake up frozen array... */ + wake_up(&conf->wait_barrier); + md_wakeup_thread(mddev->thread); } @@ -687,7 +692,6 @@ static int flush_pending_writes(conf_t *conf) * there is no normal IO happeing. It must arrange to call * lower_barrier when the particular background IO completes. */ -#define RESYNC_DEPTH 32 static void raise_barrier(conf_t *conf, int force) { @@ -785,6 +789,7 @@ static int make_request(struct request_queue *q, struct bio * bio) mirror_info_t *mirror; r10bio_t *r10_bio; struct bio *read_bio; + int cpu; int i; int chunk_sects = conf->chunk_mask + 1; const int rw = bio_data_dir(bio); @@ -812,7 +817,7 @@ static int make_request(struct request_queue *q, struct bio * bio) /* This is a one page bio that upper layers * refuse to split for us, so we need to split it. 
*/ - bp = bio_split(bio, bio_split_pool, + bp = bio_split(bio, chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); if (make_request(q, &bp->bio1)) generic_make_request(&bp->bio1); @@ -839,8 +844,11 @@ static int make_request(struct request_queue *q, struct bio * bio) */ wait_barrier(conf); - disk_stat_inc(mddev->gendisk, ios[rw]); - disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); + cpu = part_stat_lock(); + part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); + part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], + bio_sectors(bio)); + part_stat_unlock(); r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); @@ -1341,9 +1349,6 @@ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) tbio->bi_size = r10_bio->sectors << 9; tbio->bi_idx = 0; tbio->bi_phys_segments = 0; - tbio->bi_hw_segments = 0; - tbio->bi_hw_front_size = 0; - tbio->bi_hw_back_size = 0; tbio->bi_flags &= ~(BIO_POOL_MASK - 1); tbio->bi_flags |= 1 << BIO_UPTODATE; tbio->bi_next = NULL; @@ -1943,7 +1948,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i bio->bi_vcnt = 0; bio->bi_idx = 0; bio->bi_phys_segments = 0; - bio->bi_hw_segments = 0; bio->bi_size = 0; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 55e7c56..ae16794 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -101,6 +101,40 @@ const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); #endif +/* + * We maintain a biased count of active stripes in the bottom 16 bits of + * bi_phys_segments, and a count of processed stripes in the upper 16 bits + */ +static inline int raid5_bi_phys_segments(struct bio *bio) +{ + return bio->bi_phys_segments & 0xffff; +} + +static inline int raid5_bi_hw_segments(struct bio *bio) +{ + return (bio->bi_phys_segments >> 16) & 0xffff; +} + +static inline int raid5_dec_bi_phys_segments(struct bio *bio) +{ + --bio->bi_phys_segments; + return raid5_bi_phys_segments(bio); +} + +static inline int 
raid5_dec_bi_hw_segments(struct bio *bio) +{ + unsigned short val = raid5_bi_hw_segments(bio); + + --val; + bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio); + return val; +} + +static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt) +{ + bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16); +} + static inline int raid6_next_disk(int disk, int raid_disks) { disk++; @@ -507,7 +541,7 @@ static void ops_complete_biofill(void *stripe_head_ref) while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { rbi2 = r5_next_bio(rbi, dev->sector); - if (--rbi->bi_phys_segments == 0) { + if (!raid5_dec_bi_phys_segments(rbi)) { rbi->bi_next = return_bi; return_bi = rbi; } @@ -1725,7 +1759,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in if (*bip) bi->bi_next = *bip; *bip = bi; - bi->bi_phys_segments ++; + bi->bi_phys_segments++; spin_unlock_irq(&conf->device_lock); spin_unlock(&sh->lock); @@ -1819,7 +1853,7 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (--bi->bi_phys_segments == 0) { + if (!raid5_dec_bi_phys_segments(bi)) { md_write_end(conf->mddev); bi->bi_next = *return_bi; *return_bi = bi; @@ -1834,7 +1868,7 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, sh->dev[i].sector + STRIPE_SECTORS) { struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (--bi->bi_phys_segments == 0) { + if (!raid5_dec_bi_phys_segments(bi)) { md_write_end(conf->mddev); bi->bi_next = *return_bi; *return_bi = bi; @@ -1858,7 +1892,7 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (--bi->bi_phys_segments == 0) { + if (!raid5_dec_bi_phys_segments(bi)) { bi->bi_next = 
*return_bi; *return_bi = bi; } @@ -2033,7 +2067,7 @@ static void handle_stripe_clean_event(raid5_conf_t *conf, while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { wbi2 = r5_next_bio(wbi, dev->sector); - if (--wbi->bi_phys_segments == 0) { + if (!raid5_dec_bi_phys_segments(wbi)) { md_write_end(conf->mddev); wbi->bi_next = *return_bi; *return_bi = wbi; @@ -2507,7 +2541,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, * */ -static void handle_stripe5(struct stripe_head *sh) +static bool handle_stripe5(struct stripe_head *sh) { raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks, i; @@ -2568,10 +2602,10 @@ static void handle_stripe5(struct stripe_head *sh) if (dev->written) s.written++; rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { + if (blocked_rdev == NULL && + rdev && unlikely(test_bit(Blocked, &rdev->flags))) { blocked_rdev = rdev; atomic_inc(&rdev->nr_pending); - break; } if (!rdev || !test_bit(In_sync, &rdev->flags)) { /* The ReadError flag will just be confusing now */ @@ -2588,8 +2622,14 @@ static void handle_stripe5(struct stripe_head *sh) rcu_read_unlock(); if (unlikely(blocked_rdev)) { - set_bit(STRIPE_HANDLE, &sh->state); - goto unlock; + if (s.syncing || s.expanding || s.expanded || + s.to_write || s.written) { + set_bit(STRIPE_HANDLE, &sh->state); + goto unlock; + } + /* There is nothing for the blocked_rdev to block */ + rdev_dec_pending(blocked_rdev, conf->mddev); + blocked_rdev = NULL; } if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { @@ -2717,10 +2757,11 @@ static void handle_stripe5(struct stripe_head *sh) if (sh->reconstruct_state == reconstruct_state_result) { sh->reconstruct_state = reconstruct_state_idle; clear_bit(STRIPE_EXPANDING, &sh->state); - for (i = conf->raid_disks; i--; ) + for (i = conf->raid_disks; i--; ) { set_bit(R5_Wantwrite, &sh->dev[i].flags); - set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_LOCKED, 
&sh->dev[i].flags); s.locked++; + } } if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && @@ -2754,9 +2795,11 @@ static void handle_stripe5(struct stripe_head *sh) ops_run_io(sh, &s); return_io(return_bi); + + return blocked_rdev == NULL; } -static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) +static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) { raid6_conf_t *conf = sh->raid_conf; int disks = sh->disks; @@ -2805,7 +2848,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) copy_data(0, rbi, dev->page, dev->sector); rbi2 = r5_next_bio(rbi, dev->sector); spin_lock_irq(&conf->device_lock); - if (--rbi->bi_phys_segments == 0) { + if (!raid5_dec_bi_phys_segments(rbi)) { rbi->bi_next = return_bi; return_bi = rbi; } @@ -2829,10 +2872,10 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) if (dev->written) s.written++; rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { + if (blocked_rdev == NULL && + rdev && unlikely(test_bit(Blocked, &rdev->flags))) { blocked_rdev = rdev; atomic_inc(&rdev->nr_pending); - break; } if (!rdev || !test_bit(In_sync, &rdev->flags)) { /* The ReadError flag will just be confusing now */ @@ -2850,9 +2893,16 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) rcu_read_unlock(); if (unlikely(blocked_rdev)) { - set_bit(STRIPE_HANDLE, &sh->state); - goto unlock; + if (s.syncing || s.expanding || s.expanded || + s.to_write || s.written) { + set_bit(STRIPE_HANDLE, &sh->state); + goto unlock; + } + /* There is nothing for the blocked_rdev to block */ + rdev_dec_pending(blocked_rdev, conf->mddev); + blocked_rdev = NULL; } + pr_debug("locked=%d uptodate=%d to_read=%d" " to_write=%d failed=%d failed_num=%d,%d\n", s.locked, s.uptodate, s.to_read, s.to_write, s.failed, @@ -2967,14 +3017,17 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) ops_run_io(sh, 
&s); return_io(return_bi); + + return blocked_rdev == NULL; } -static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) +/* returns true if the stripe was handled */ +static bool handle_stripe(struct stripe_head *sh, struct page *tmp_page) { if (sh->raid_conf->level == 6) - handle_stripe6(sh, tmp_page); + return handle_stripe6(sh, tmp_page); else - handle_stripe5(sh); + return handle_stripe5(sh); } @@ -3136,8 +3189,11 @@ static struct bio *remove_bio_from_retry(raid5_conf_t *conf) if(bi) { conf->retry_read_aligned_list = bi->bi_next; bi->bi_next = NULL; + /* + * this sets the active stripe count to 1 and the processed + * stripe count to zero (upper 16 bits) + */ bi->bi_phys_segments = 1; /* biased count of active stripes */ - bi->bi_hw_segments = 0; /* count of processed stripes */ } return bi; @@ -3187,8 +3243,7 @@ static int bio_fits_rdev(struct bio *bi) if ((bi->bi_size>>9) > q->max_sectors) return 0; blk_recount_segments(q, bi); - if (bi->bi_phys_segments > q->max_phys_segments || - bi->bi_hw_segments > q->max_hw_segments) + if (bi->bi_phys_segments > q->max_phys_segments) return 0; if (q->merge_bvec_fn) @@ -3332,7 +3387,7 @@ static int make_request(struct request_queue *q, struct bio * bi) sector_t logical_sector, last_sector; struct stripe_head *sh; const int rw = bio_data_dir(bi); - int remaining; + int cpu, remaining; if (unlikely(bio_barrier(bi))) { bio_endio(bi, -EOPNOTSUPP); @@ -3341,8 +3396,11 @@ static int make_request(struct request_queue *q, struct bio * bi) md_write_start(mddev, bi); - disk_stat_inc(mddev->gendisk, ios[rw]); - disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi)); + cpu = part_stat_lock(); + part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); + part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], + bio_sectors(bi)); + part_stat_unlock(); if (rw == READ && mddev->reshape_position == MaxSector && @@ -3449,7 +3507,7 @@ static int make_request(struct request_queue *q, struct bio * bi) } 
spin_lock_irq(&conf->device_lock); - remaining = --bi->bi_phys_segments; + remaining = raid5_dec_bi_phys_segments(bi); spin_unlock_irq(&conf->device_lock); if (remaining == 0) { @@ -3692,7 +3750,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski clear_bit(STRIPE_INSYNC, &sh->state); spin_unlock(&sh->lock); - handle_stripe(sh, NULL); + /* wait for any blocked device to be handled */ + while(unlikely(!handle_stripe(sh, NULL))) + ; release_stripe(sh); return STRIPE_SECTORS; @@ -3731,7 +3791,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) sector += STRIPE_SECTORS, scnt++) { - if (scnt < raid_bio->bi_hw_segments) + if (scnt < raid5_bi_hw_segments(raid_bio)) /* already done this stripe */ continue; @@ -3739,7 +3799,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) if (!sh) { /* failed to get a stripe - must wait */ - raid_bio->bi_hw_segments = scnt; + raid5_set_bi_hw_segments(raid_bio, scnt); conf->retry_read_aligned = raid_bio; return handled; } @@ -3747,7 +3807,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) set_bit(R5_ReadError, &sh->dev[dd_idx].flags); if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { release_stripe(sh); - raid_bio->bi_hw_segments = scnt; + raid5_set_bi_hw_segments(raid_bio, scnt); conf->retry_read_aligned = raid_bio; return handled; } @@ -3757,7 +3817,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) handled++; } spin_lock_irq(&conf->device_lock); - remaining = --raid_bio->bi_phys_segments; + remaining = raid5_dec_bi_phys_segments(raid_bio); spin_unlock_irq(&conf->device_lock); if (remaining == 0) bio_endio(raid_bio, 0); @@ -3811,10 +3871,8 @@ static void raid5d(mddev_t *mddev) sh = __get_priority_stripe(conf); - if (!sh) { - async_tx_issue_pending_all(); + if (!sh) break; - } spin_unlock_irq(&conf->device_lock); handled++; @@ -3827,6 +3885,7 @@ static void raid5d(mddev_t *mddev) 
spin_unlock_irq(&conf->device_lock); + async_tx_issue_pending_all(); unplug_slaves(mddev); pr_debug("--- raid5d inactive\n"); @@ -4439,6 +4498,9 @@ static int raid5_check_reshape(mddev_t *mddev) return -EINVAL; /* Cannot shrink array or change level yet */ if (mddev->delta_disks == 0) return 0; /* nothing to do */ + if (mddev->bitmap) + /* Cannot grow a bitmap yet */ + return -EBUSY; /* Can only proceed if there are plenty of stripe_heads. * We need a minimum of one full stripe,, and for sensible progress |