diff options
-rw-r--r-- | arch/x86/Makefile | 5 | ||||
-rw-r--r-- | arch/x86/include/asm/xor_32.h | 6 | ||||
-rw-r--r-- | arch/x86/include/asm/xor_64.h | 8 | ||||
-rw-r--r-- | arch/x86/include/asm/xor_avx.h | 214 | ||||
-rw-r--r-- | crypto/xor.c | 13 | ||||
-rw-r--r-- | drivers/md/bitmap.c | 1100 | ||||
-rw-r--r-- | drivers/md/bitmap.h | 60 | ||||
-rw-r--r-- | drivers/md/dm-raid.c | 22 | ||||
-rw-r--r-- | drivers/md/md.c | 370 | ||||
-rw-r--r-- | drivers/md/md.h | 12 | ||||
-rw-r--r-- | drivers/md/raid1.c | 22 | ||||
-rw-r--r-- | drivers/md/raid10.c | 1281 | ||||
-rw-r--r-- | drivers/md/raid10.h | 34 | ||||
-rw-r--r-- | drivers/md/raid5.c | 252 | ||||
-rw-r--r-- | drivers/md/raid5.h | 7 | ||||
-rw-r--r-- | include/linux/raid/md_p.h | 15 | ||||
-rw-r--r-- | include/linux/raid/pq.h | 18 | ||||
-rw-r--r-- | lib/raid6/Makefile | 2 | ||||
-rw-r--r-- | lib/raid6/algos.c | 127 | ||||
-rw-r--r-- | lib/raid6/mktables.c | 25 | ||||
-rw-r--r-- | lib/raid6/recov.c | 15 | ||||
-rw-r--r-- | lib/raid6/recov_ssse3.c | 335 | ||||
-rw-r--r-- | lib/raid6/test/Makefile | 2 | ||||
-rw-r--r-- | lib/raid6/test/test.c | 32 | ||||
-rw-r--r-- | lib/raid6/x86.h | 15 |
25 files changed, 3124 insertions, 868 deletions
diff --git a/arch/x86/Makefile b/arch/x86/Makefile index dc611a4..1f25214 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -115,9 +115,10 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI # does binutils support specific instructions? asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1) +avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1) -KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) -KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) +KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) +KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) LDFLAGS := -m elf_$(UTS_MACHINE) diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h index 133b40a..4545708 100644 --- a/arch/x86/include/asm/xor_32.h +++ b/arch/x86/include/asm/xor_32.h @@ -861,6 +861,9 @@ static struct xor_block_template xor_block_pIII_sse = { .do_5 = xor_sse_5, }; +/* Also try the AVX routines */ +#include "xor_avx.h" + /* Also try the generic routines. */ #include <asm-generic/xor.h> @@ -871,6 +874,7 @@ do { \ xor_speed(&xor_block_8regs_p); \ xor_speed(&xor_block_32regs); \ xor_speed(&xor_block_32regs_p); \ + AVX_XOR_SPEED; \ if (cpu_has_xmm) \ xor_speed(&xor_block_pIII_sse); \ if (cpu_has_mmx) { \ @@ -883,6 +887,6 @@ do { \ We may also be able to load into the L1 only depending on how the cpu deals with a load to a line that is being prefetched. */ #define XOR_SELECT_TEMPLATE(FASTEST) \ - (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST) + AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST) #endif /* _ASM_X86_XOR_32_H */ diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h index 1549b5e..b9b2323 100644 --- a/arch/x86/include/asm/xor_64.h +++ b/arch/x86/include/asm/xor_64.h @@ -347,15 +347,21 @@ static struct xor_block_template xor_block_sse = { .do_5 = xor_sse_5, }; + +/* Also try the AVX routines */ +#include "xor_avx.h" + #undef XOR_TRY_TEMPLATES #define XOR_TRY_TEMPLATES \ do { \ + AVX_XOR_SPEED; \ xor_speed(&xor_block_sse); \ } while (0) /* We force the use of the SSE xor block because it can write around L2. We may also be able to load into the L1 only depending on how the cpu deals with a load to a line that is being prefetched. */ -#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse) +#define XOR_SELECT_TEMPLATE(FASTEST) \ + AVX_SELECT(&xor_block_sse) #endif /* _ASM_X86_XOR_64_H */ diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h new file mode 100644 index 0000000..2510d35 --- /dev/null +++ b/arch/x86/include/asm/xor_avx.h @@ -0,0 +1,214 @@ +#ifndef _ASM_X86_XOR_AVX_H +#define _ASM_X86_XOR_AVX_H + +/* + * Optimized RAID-5 checksumming functions for AVX + * + * Copyright (C) 2012 Intel Corporation + * Author: Jim Kukunas <james.t.kukunas@linux.intel.com> + * + * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#ifdef CONFIG_AS_AVX + +#include <linux/compiler.h> +#include <asm/i387.h> + +#define ALIGN32 __aligned(32) + +#define YMM_SAVED_REGS 4 + +#define YMMS_SAVE \ +do { \ + preempt_disable(); \ + cr0 = read_cr0(); \ + clts(); \ + asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \ + asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \ + asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \ + asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \ +} while (0); + +#define YMMS_RESTORE \ +do { \ + asm volatile("sfence" : : : "memory"); \ + asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \ + asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \ + asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \ + asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \ + write_cr0(cr0); \ + preempt_enable(); \ +} while (0); + +#define BLOCK4(i) \ + BLOCK(32 * i, 0) \ + BLOCK(32 * (i + 1), 1) \ + BLOCK(32 * (i + 2), 2) \ + BLOCK(32 * (i + 3), 3) + +#define BLOCK16() \ + BLOCK4(0) \ + BLOCK4(4) \ + BLOCK4(8) \ + BLOCK4(12) + +static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1) +{ + unsigned long cr0, lines = bytes >> 9; + char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; + + YMMS_SAVE + + while (lines--) { +#undef BLOCK +#define BLOCK(i, reg) \ +do { \ + asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p0[i / sizeof(*p0)])); \ + asm volatile("vmovdqa %%ymm" #reg ", %0" : \ + "=m" (p0[i / sizeof(*p0)])); \ +} while (0); + + BLOCK16() + + p0 = (unsigned long *)((uintptr_t)p0 + 512); + p1 = (unsigned long *)((uintptr_t)p1 + 512); + } + + YMMS_RESTORE +} + +static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1, + unsigned long *p2) +{ + unsigned long cr0, lines = bytes >> 9; + char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; + + YMMS_SAVE + + while (lines--) { +#undef BLOCK +#define BLOCK(i, reg) \ +do { \ + asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p1[i / sizeof(*p1)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p0[i / sizeof(*p0)])); \ + asm volatile("vmovdqa %%ymm" #reg ", %0" : \ + "=m" (p0[i / sizeof(*p0)])); \ +} while (0); + + BLOCK16() + + p0 = (unsigned long *)((uintptr_t)p0 + 512); + p1 = (unsigned long *)((uintptr_t)p1 + 512); + p2 = (unsigned long *)((uintptr_t)p2 + 512); + } + + YMMS_RESTORE +} + +static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1, + unsigned long *p2, unsigned long *p3) +{ + unsigned long cr0, lines = bytes >> 9; + char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; + + YMMS_SAVE + + while (lines--) { +#undef BLOCK +#define BLOCK(i, reg) \ +do { \ + asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p2[i / sizeof(*p2)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p1[i / sizeof(*p1)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p0[i / sizeof(*p0)])); \ + asm volatile("vmovdqa %%ymm" #reg ", %0" : \ + "=m" (p0[i / sizeof(*p0)])); \ +} while (0); + + BLOCK16(); + + p0 = (unsigned long *)((uintptr_t)p0 + 512); + p1 = (unsigned long *)((uintptr_t)p1 + 512); + p2 = (unsigned long *)((uintptr_t)p2 + 512); + p3 = (unsigned long *)((uintptr_t)p3 + 512); + } + + YMMS_RESTORE +} + +static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1, + unsigned long *p2, unsigned long *p3, unsigned long *p4) +{ + unsigned long cr0, lines = bytes >> 9; + char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; + + YMMS_SAVE + + while (lines--) { +#undef BLOCK +#define BLOCK(i, reg) \ +do { \ + asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p3[i / sizeof(*p3)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p2[i / sizeof(*p2)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p1[i / sizeof(*p1)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p0[i / sizeof(*p0)])); \ + asm volatile("vmovdqa %%ymm" #reg ", %0" : \ + "=m" (p0[i / sizeof(*p0)])); \ +} while (0); + + BLOCK16() + + p0 = (unsigned long *)((uintptr_t)p0 + 512); + p1 = (unsigned long *)((uintptr_t)p1 + 512); + p2 = (unsigned long *)((uintptr_t)p2 + 512); + p3 = (unsigned long *)((uintptr_t)p3 + 512); + p4 = (unsigned long *)((uintptr_t)p4 + 512); + } + + YMMS_RESTORE +} + +static struct xor_block_template xor_block_avx = { + .name = "avx", + .do_2 = xor_avx_2, + .do_3 = xor_avx_3, + .do_4 = xor_avx_4, + .do_5 = xor_avx_5, +}; + +#define AVX_XOR_SPEED \ +do { \ + if (cpu_has_avx) \ + xor_speed(&xor_block_avx); \ +} while (0) + +#define AVX_SELECT(FASTEST) \ + (cpu_has_avx ? &xor_block_avx : FASTEST) + +#else + +#define AVX_XOR_SPEED {} + +#define AVX_SELECT(FASTEST) (FASTEST) + +#endif +#endif diff --git a/crypto/xor.c b/crypto/xor.c index 664b6df..65c7b41 100644 --- a/crypto/xor.c +++ b/crypto/xor.c @@ -21,6 +21,7 @@ #include <linux/gfp.h> #include <linux/raid/xor.h> #include <linux/jiffies.h> +#include <linux/preempt.h> #include <asm/xor.h> /* The xor routines to use. */ @@ -63,12 +64,14 @@ static void do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2) { int speed; - unsigned long now; + unsigned long now, j; int i, count, max; tmpl->next = template_list; template_list = tmpl; + preempt_disable(); + /* * Count the number of XORs done during a whole jiffy, and use * this to calculate the speed of checksumming. We use a 2-page @@ -76,9 +79,11 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2) */ max = 0; for (i = 0; i < 5; i++) { - now = jiffies; + j = jiffies; count = 0; - while (jiffies == now) { + while ((now = jiffies) == j) + cpu_relax(); + while (time_before(jiffies, now + 1)) { mb(); /* prevent loop optimzation */ tmpl->do_2(BENCH_SIZE, b1, b2); mb(); @@ -89,6 +94,8 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2) max = count; } + preempt_enable(); + speed = max * (HZ * BENCH_SIZE / 1024); tmpl->speed = speed; diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 17e2b47..15dbe03 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -45,7 +45,7 @@ static inline char *bmname(struct bitmap *bitmap) * if we find our page, we increment the page's refcount so that it stays * allocated while we're using it */ -static int bitmap_checkpage(struct bitmap *bitmap, +static int bitmap_checkpage(struct bitmap_counts *bitmap, unsigned long page, int create) __releases(bitmap->lock) __acquires(bitmap->lock) @@ -76,8 +76,7 @@ __acquires(bitmap->lock) spin_lock_irq(&bitmap->lock); if (mappage == NULL) { - pr_debug("%s: bitmap map page allocation failed, hijacking\n", - bmname(bitmap)); + pr_debug("md/bitmap: map page allocation failed, hijacking\n"); /* failed - set the hijacked flag so that we can use the * pointer as a counter */ if (!bitmap->bp[page].map) @@ -100,7 +99,7 @@ __acquires(bitmap->lock) /* if page is completely empty, put it back on the free list, or dealloc it */ /* if page was hijacked, unmark the flag so it might get alloced next time */ /* Note: lock should be held when calling this */ -static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page) +static void bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page) { char *ptr; @@ -130,22 +129,14 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page) */ /* IO operations when bitmap is stored near all superblocks */ -static struct page *read_sb_page(struct mddev *mddev, loff_t offset, - struct page *page, - unsigned long index, int size) +static int read_sb_page(struct mddev *mddev, loff_t offset, + struct page *page, + unsigned long index, int size) { /* choose a good rdev and read the page from there */ struct md_rdev *rdev; sector_t target; - int did_alloc = 0; - - if (!page) { - page = alloc_page(GFP_KERNEL); - if (!page) - return ERR_PTR(-ENOMEM); - did_alloc = 1; - } rdev_for_each(rdev, mddev) { if (! test_bit(In_sync, &rdev->flags) @@ -158,15 +149,10 @@ static struct page *read_sb_page(struct mddev *mddev, loff_t offset, roundup(size, bdev_logical_block_size(rdev->bdev)), page, READ, true)) { page->index = index; - attach_page_buffers(page, NULL); /* so that free_buffer will - * quietly no-op */ - return page; + return 0; } } - if (did_alloc) - put_page(page); - return ERR_PTR(-EIO); - + return -EIO; } static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mddev) @@ -208,6 +194,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) struct md_rdev *rdev = NULL; struct block_device *bdev; struct mddev *mddev = bitmap->mddev; + struct bitmap_storage *store = &bitmap->storage; while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { int size = PAGE_SIZE; @@ -215,9 +202,13 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; - if (page->index == bitmap->file_pages-1) - size = roundup(bitmap->last_page_size, + if (page->index == store->file_pages-1) { + int last_page_size = store->bytes & (PAGE_SIZE-1); + if (last_page_size == 0) + last_page_size = PAGE_SIZE; + size = roundup(last_page_size, bdev_logical_block_size(bdev)); + } /* Just make sure we aren't corrupting data or * metadata */ @@ -276,10 +267,10 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait) { struct buffer_head *bh; - if (bitmap->file == NULL) { + if (bitmap->storage.file == NULL) { switch (write_sb_page(bitmap, page, wait)) { case -EINVAL: - bitmap->flags |= BITMAP_WRITE_ERROR; + set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); } } else { @@ -297,20 +288,16 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait) wait_event(bitmap->write_wait, atomic_read(&bitmap->pending_writes)==0); } - if (bitmap->flags & BITMAP_WRITE_ERROR) + if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) bitmap_file_kick(bitmap); } static void end_bitmap_write(struct buffer_head *bh, int uptodate) { struct bitmap *bitmap = bh->b_private; - unsigned long flags; - if (!uptodate) { - spin_lock_irqsave(&bitmap->lock, flags); - bitmap->flags |= BITMAP_WRITE_ERROR; - spin_unlock_irqrestore(&bitmap->lock, flags); - } + if (!uptodate) + set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); if (atomic_dec_and_test(&bitmap->pending_writes)) wake_up(&bitmap->write_wait); } @@ -325,8 +312,12 @@ __clear_page_buffers(struct page *page) } static void free_buffers(struct page *page) { - struct buffer_head *bh = page_buffers(page); + struct buffer_head *bh; + if (!PagePrivate(page)) + return; + + bh = page_buffers(page); while (bh) { struct buffer_head *next = bh->b_this_page; free_buffer_head(bh); @@ -343,11 +334,12 @@ static void free_buffers(struct page *page) * This usage is similar to how swap files are handled, and allows us * to write to a file with no concerns of memory allocation failing. */ -static struct page *read_page(struct file *file, unsigned long index, - struct bitmap *bitmap, - unsigned long count) +static int read_page(struct file *file, unsigned long index, + struct bitmap *bitmap, + unsigned long count, + struct page *page) { - struct page *page = NULL; + int ret = 0; struct inode *inode = file->f_path.dentry->d_inode; struct buffer_head *bh; sector_t block; @@ -355,16 +347,9 @@ static struct page *read_page(struct file *file, unsigned long index, pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, (unsigned long long)index << PAGE_SHIFT); - page = alloc_page(GFP_KERNEL); - if (!page) - page = ERR_PTR(-ENOMEM); - if (IS_ERR(page)) - goto out; - bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0); if (!bh) { - put_page(page); - page = ERR_PTR(-ENOMEM); + ret = -ENOMEM; goto out; } attach_page_buffers(page, bh); @@ -376,8 +361,7 @@ static struct page *read_page(struct file *file, unsigned long index, bh->b_blocknr = bmap(inode, block); if (bh->b_blocknr == 0) { /* Cannot use this file! */ - free_buffers(page); - page = ERR_PTR(-EINVAL); + ret = -EINVAL; goto out; } bh->b_bdev = inode->i_sb->s_bdev; @@ -400,17 +384,15 @@ static struct page *read_page(struct file *file, unsigned long index, wait_event(bitmap->write_wait, atomic_read(&bitmap->pending_writes)==0); - if (bitmap->flags & BITMAP_WRITE_ERROR) { - free_buffers(page); - page = ERR_PTR(-EIO); - } + if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) + ret = -EIO; out: - if (IS_ERR(page)) - printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %ld\n", + if (ret) + printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %d\n", (int)PAGE_SIZE, (unsigned long long)index << PAGE_SHIFT, - PTR_ERR(page)); - return page; + ret); + return ret; } /* @@ -426,9 +408,9 @@ void bitmap_update_sb(struct bitmap *bitmap) return; if (bitmap->mddev->bitmap_info.external) return; - if (!bitmap->sb_page) /* no superblock */ + if (!bitmap->storage.sb_page) /* no superblock */ return; - sb = kmap_atomic(bitmap->sb_page); + sb = kmap_atomic(bitmap->storage.sb_page); sb->events = cpu_to_le64(bitmap->mddev->events); if (bitmap->mddev->events < bitmap->events_cleared) /* rocking back to read-only */ @@ -438,8 +420,13 @@ void bitmap_update_sb(struct bitmap *bitmap) /* Just in case these have been changed via sysfs: */ sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); + /* This might have been changed by a reshape */ + sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); + sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize); + sb->sectors_reserved = cpu_to_le32(bitmap->mddev-> + bitmap_info.space); kunmap_atomic(sb); - write_page(bitmap, bitmap->sb_page, 1); + write_page(bitmap, bitmap->storage.sb_page, 1); } /* print out the bitmap file superblock */ @@ -447,9 +434,9 @@ void bitmap_print_sb(struct bitmap *bitmap) { bitmap_super_t *sb; - if (!bitmap || !bitmap->sb_page) + if (!bitmap || !bitmap->storage.sb_page) return; - sb = kmap_atomic(bitmap->sb_page); + sb = kmap_atomic(bitmap->storage.sb_page); printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); @@ -488,15 +475,15 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) unsigned long chunksize, daemon_sleep, write_behind; int err = -EINVAL; - bitmap->sb_page = alloc_page(GFP_KERNEL); - if (IS_ERR(bitmap->sb_page)) { - err = PTR_ERR(bitmap->sb_page); - bitmap->sb_page = NULL; + bitmap->storage.sb_page = alloc_page(GFP_KERNEL); + if (IS_ERR(bitmap->storage.sb_page)) { + err = PTR_ERR(bitmap->storage.sb_page); + bitmap->storage.sb_page = NULL; return err; } - bitmap->sb_page->index = 0; + bitmap->storage.sb_page->index = 0; - sb = kmap_atomic(bitmap->sb_page); + sb = kmap_atomic(bitmap->storage.sb_page); sb->magic = cpu_to_le32(BITMAP_MAGIC); sb->version = cpu_to_le32(BITMAP_MAJOR_HI); @@ -534,8 +521,8 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) memcpy(sb->uuid, bitmap->mddev->uuid, 16); - bitmap->flags |= BITMAP_STALE; - sb->state |= cpu_to_le32(BITMAP_STALE); + set_bit(BITMAP_STALE, &bitmap->flags); + sb->state = cpu_to_le32(bitmap->flags); bitmap->events_cleared = bitmap->mddev->events; sb->events_cleared = cpu_to_le64(bitmap->mddev->events); @@ -551,31 +538,45 @@ static int bitmap_read_sb(struct bitmap *bitmap) bitmap_super_t *sb; unsigned long chunksize, daemon_sleep, write_behind; unsigned long long events; + unsigned long sectors_reserved = 0; int err = -EINVAL; + struct page *sb_page; + if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) { + chunksize = 128 * 1024 * 1024; + daemon_sleep = 5 * HZ; + write_behind = 0; + set_bit(BITMAP_STALE, &bitmap->flags); + err = 0; + goto out_no_sb; + } /* page 0 is the superblock, read it... */ - if (bitmap->file) { - loff_t isize = i_size_read(bitmap->file->f_mapping->host); + sb_page = alloc_page(GFP_KERNEL); + if (!sb_page) + return -ENOMEM; + bitmap->storage.sb_page = sb_page; + + if (bitmap->storage.file) { + loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host); int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize; - bitmap->sb_page = read_page(bitmap->file, 0, bitmap, bytes); + err = read_page(bitmap->storage.file, 0, + bitmap, bytes, sb_page); } else { - bitmap->sb_page = read_sb_page(bitmap->mddev, - bitmap->mddev->bitmap_info.offset, - NULL, - 0, sizeof(bitmap_super_t)); + err = read_sb_page(bitmap->mddev, + bitmap->mddev->bitmap_info.offset, + sb_page, + 0, sizeof(bitmap_super_t)); } - if (IS_ERR(bitmap->sb_page)) { - err = PTR_ERR(bitmap->sb_page); - bitmap->sb_page = NULL; + if (err) return err; - } - sb = kmap_atomic(bitmap->sb_page); + sb = kmap_atomic(sb_page); chunksize = le32_to_cpu(sb->chunksize); daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; write_behind = le32_to_cpu(sb->write_behind); + sectors_reserved = le32_to_cpu(sb->sectors_reserved); /* verify that the bitmap-specific fields are valid */ if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) @@ -618,60 +619,32 @@ static int bitmap_read_sb(struct bitmap *bitmap) "-- forcing full recovery\n", bmname(bitmap), events, (unsigned long long) bitmap->mddev->events); - sb->state |= cpu_to_le32(BITMAP_STALE); + set_bit(BITMAP_STALE, &bitmap->flags); } } /* assign fields using values from superblock */ - bitmap->mddev->bitmap_info.chunksize = chunksize; - bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; - bitmap->mddev->bitmap_info.max_write_behind = write_behind; bitmap->flags |= le32_to_cpu(sb->state); if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) - bitmap->flags |= BITMAP_HOSTENDIAN; + set_bit(BITMAP_HOSTENDIAN, &bitmap->flags); bitmap->events_cleared = le64_to_cpu(sb->events_cleared); - if (bitmap->flags & BITMAP_STALE) - bitmap->events_cleared = bitmap->mddev->events; err = 0; out: kunmap_atomic(sb); +out_no_sb: + if (test_bit(BITMAP_STALE, &bitmap->flags)) + bitmap->events_cleared = bitmap->mddev->events; + bitmap->mddev->bitmap_info.chunksize = chunksize; + bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; + bitmap->mddev->bitmap_info.max_write_behind = write_behind; + if (bitmap->mddev->bitmap_info.space == 0 || + bitmap->mddev->bitmap_info.space > sectors_reserved) + bitmap->mddev->bitmap_info.space = sectors_reserved; if (err) bitmap_print_sb(bitmap); return err; } -enum bitmap_mask_op { - MASK_SET, - MASK_UNSET -}; - -/* record the state of the bitmap in the superblock. Return the old value */ -static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, - enum bitmap_mask_op op) -{ - bitmap_super_t *sb; - int old; - - if (!bitmap->sb_page) /* can't set the state */ - return 0; - sb = kmap_atomic(bitmap->sb_page); - old = le32_to_cpu(sb->state) & bits; - switch (op) { - case MASK_SET: - sb->state |= cpu_to_le32(bits); - bitmap->flags |= bits; - break; - case MASK_UNSET: - sb->state &= cpu_to_le32(~bits); - bitmap->flags &= ~bits; - break; - default: - BUG(); - } - kunmap_atomic(sb); - return old; -} - /* * general bitmap file operations */ @@ -683,17 +656,19 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, * file a page at a time. There's a superblock at the start of the file. */ /* calculate the index of the page that contains this bit */ -static inline unsigned long file_page_index(struct bitmap *bitmap, unsigned long chunk) +static inline unsigned long file_page_index(struct bitmap_storage *store, + unsigned long chunk) { - if (!bitmap->mddev->bitmap_info.external) + if (store->sb_page) chunk += sizeof(bitmap_super_t) << 3; return chunk >> PAGE_BIT_SHIFT; } /* calculate the (bit) offset of this bit within a page */ -static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned long chunk) +static inline unsigned long file_page_offset(struct bitmap_storage *store, + unsigned long chunk) { - if (!bitmap->mddev->bitmap_info.external) + if (store->sb_page) chunk += sizeof(bitmap_super_t) << 3; return chunk & (PAGE_BITS - 1); } @@ -705,57 +680,86 @@ static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned lon * 1 page (e.g., x86) or less than 1 page -- so the bitmap might start on page * 0 or page 1 */ -static inline struct page *filemap_get_page(struct bitmap *bitmap, +static inline struct page *filemap_get_page(struct bitmap_storage *store, unsigned long chunk) { - if (file_page_index(bitmap, chunk) >= bitmap->file_pages) + if (file_page_index(store, chunk) >= store->file_pages) return NULL; - return bitmap->filemap[file_page_index(bitmap, chunk) - - file_page_index(bitmap, 0)]; + return store->filemap[file_page_index(store, chunk) + - file_page_index(store, 0)]; } -static void bitmap_file_unmap(struct bitmap *bitmap) +static int bitmap_storage_alloc(struct bitmap_storage *store, + unsigned long chunks, int with_super) +{ + int pnum; + unsigned long num_pages; + unsigned long bytes; + + bytes = DIV_ROUND_UP(chunks, 8); + if (with_super) + bytes += sizeof(bitmap_super_t); + + num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE); + + store->filemap = kmalloc(sizeof(struct page *) + * num_pages, GFP_KERNEL); + if (!store->filemap) + return -ENOMEM; + + if (with_super && !store->sb_page) { + store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO); + if (store->sb_page == NULL) + return -ENOMEM; + store->sb_page->index = 0; + } + pnum = 0; + if (store->sb_page) { + store->filemap[0] = store->sb_page; + pnum = 1; + } + for ( ; pnum < num_pages; pnum++) { + store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO); + if (!store->filemap[pnum]) { + store->file_pages = pnum; + return -ENOMEM; + } + store->filemap[pnum]->index = pnum; + } + store->file_pages = pnum; + + /* We need 4 bits per page, rounded up to a multiple + * of sizeof(unsigned long) */ + store->filemap_attr = kzalloc( + roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)), + GFP_KERNEL); + if (!store->filemap_attr) + return -ENOMEM; + + store->bytes = bytes; + + return 0; +} + +static void bitmap_file_unmap(struct bitmap_storage *store) { struct page **map, *sb_page; - unsigned long *attr; int pages; - unsigned long flags; + struct file *file; - spin_lock_irqsave(&bitmap->lock, flags); - map = bitmap->filemap; - bitmap->filemap = NULL; - attr = bitmap->filemap_attr; - bitmap->filemap_attr = NULL; - pages = bitmap->file_pages; - bitmap->file_pages = 0; - sb_page = bitmap->sb_page; - bitmap->sb_page = NULL; - spin_unlock_irqrestore(&bitmap->lock, flags); + file = store->file; + map = store->filemap; + pages = store->file_pages; + sb_page = store->sb_page; while (pages--) if (map[pages] != sb_page) /* 0 is sb_page, release it below */ free_buffers(map[pages]); kfree(map); - kfree(attr); + kfree(store->filemap_attr); if (sb_page) free_buffers(sb_page); -} - -static void bitmap_file_put(struct bitmap *bitmap) -{ - struct file *file; - unsigned long flags; - - spin_lock_irqsave(&bitmap->lock, flags); - file = bitmap->file; - bitmap->file = NULL; - spin_unlock_irqrestore(&bitmap->lock, flags); - - if (file) - wait_event(bitmap->write_wait, - atomic_read(&bitmap->pending_writes)==0); - bitmap_file_unmap(bitmap); if (file) { struct inode *inode = file->f_path.dentry->d_inode; @@ -773,14 +777,14 @@ static void bitmap_file_kick(struct bitmap *bitmap) { char *path, *ptr = NULL; - if (bitmap_mask_state(bitmap, BITMAP_STALE, MASK_SET) == 0) { + if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) { bitmap_update_sb(bitmap); - if (bitmap->file) { + if (bitmap->storage.file) { path = kmalloc(PAGE_SIZE, GFP_KERNEL); if (path) - ptr = d_path(&bitmap->file->f_path, path, - PAGE_SIZE); + ptr = d_path(&bitmap->storage.file->f_path, + path, PAGE_SIZE); printk(KERN_ALERT "%s: kicking failed bitmap file %s from array!\n", @@ -792,10 +796,6 @@ static void bitmap_file_kick(struct bitmap *bitmap) "%s: disabling internal bitmap due to errors\n", bmname(bitmap)); } - - bitmap_file_put(bitmap); - - return; } enum bitmap_page_attr { @@ -805,24 +805,30 @@ enum bitmap_page_attr { BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */ }; -static inline void set_page_attr(struct bitmap *bitmap, struct page *page, - enum bitmap_page_attr attr) +static inline void set_page_attr(struct bitmap *bitmap, int pnum, + enum bitmap_page_attr attr) { - __set_bit((page->index<<2) + attr, bitmap->filemap_attr); + set_bit((pnum<<2) + attr, bitmap->storage.filemap_attr); } -static inline void clear_page_attr(struct bitmap *bitmap, struct page *page, - enum bitmap_page_attr attr) +static inline void clear_page_attr(struct bitmap *bitmap, int pnum, + enum bitmap_page_attr attr) { - __clear_bit((page->index<<2) + attr, bitmap->filemap_attr); + clear_bit((pnum<<2) + attr, bitmap->storage.filemap_attr); } -static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page, - enum bitmap_page_attr attr) +static inline int test_page_attr(struct bitmap *bitmap, int pnum, + enum bitmap_page_attr attr) { - return test_bit((page->index<<2) + attr, bitmap->filemap_attr); + return test_bit((pnum<<2) + attr, bitmap->storage.filemap_attr); } +static inline int test_and_clear_page_attr(struct bitmap *bitmap, int pnum, + enum bitmap_page_attr attr) +{ + return test_and_clear_bit((pnum<<2) + attr, + bitmap->storage.filemap_attr); +} /* * bitmap_file_set_bit -- called before performing a write to the md device * to set (and eventually sync) a particular bit in the bitmap file @@ -835,26 +841,46 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) unsigned long bit; struct page *page; void *kaddr; - unsigned long chunk = block >> bitmap->chunkshift; + unsigned long chunk = block >> bitmap->counts.chunkshift; - if (!bitmap->filemap) - return; - - page = filemap_get_page(bitmap, chunk); + page = filemap_get_page(&bitmap->storage, chunk); if (!page) return; - bit = file_page_offset(bitmap, chunk); + bit = file_page_offset(&bitmap->storage, chunk); /* set the bit */ kaddr = kmap_atomic(page); - if (bitmap->flags & BITMAP_HOSTENDIAN) + if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) set_bit(bit, kaddr); else - __set_bit_le(bit, kaddr); + test_and_set_bit_le(bit, kaddr); kunmap_atomic(kaddr); pr_debug("set file bit %lu page %lu\n", bit, page->index); /* record page number so it gets flushed to disk when unplug occurs */ - set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); + set_page_attr(bitmap, page->index, BITMAP_PAGE_DIRTY); +} + +static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) +{ + unsigned long bit; + struct page *page; + void *paddr; + unsigned long chunk = block >> bitmap->counts.chunkshift; + + page = filemap_get_page(&bitmap->storage, chunk); + if (!page) + return; + bit = file_page_offset(&bitmap->storage, chunk); + paddr = kmap_atomic(page); + if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) + clear_bit(bit, paddr); + else + test_and_clear_bit_le(bit, paddr); + kunmap_atomic(paddr); + if (!test_page_attr(bitmap, page->index, BITMAP_PAGE_NEEDWRITE)) { + set_page_attr(bitmap, page->index, BITMAP_PAGE_PENDING); + bitmap->allclean = 0; + } } /* this gets called when the md device is ready to unplug its underlying @@ -862,42 +888,37 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) * sync the dirty pages of the bitmap file to disk */ void bitmap_unplug(struct bitmap *bitmap) { - unsigned long i, flags; + unsigned long i; int dirty, need_write; - struct page *page; int wait = 0; - if (!bitmap) + if (!bitmap || !bitmap->storage.filemap || + test_bit(BITMAP_STALE, &bitmap->flags)) return; /* look at each page to see if there are any set bits that need to be * flushed out to disk */ - for (i = 0; i < bitmap->file_pages; i++) { - spin_lock_irqsave(&bitmap->lock, flags); - if (!bitmap->filemap) { - spin_unlock_irqrestore(&bitmap->lock, flags); + for (i = 0; i < bitmap->storage.file_pages; i++) { + if (!bitmap->storage.filemap) return; + dirty = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); + need_write = test_and_clear_page_attr(bitmap, i, + BITMAP_PAGE_NEEDWRITE); + if (dirty || need_write) { + clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING); + write_page(bitmap, bitmap->storage.filemap[i], 0); } - page = bitmap->filemap[i]; - dirty = test_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); - need_write = test_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); - clear_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); - clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); if (dirty) wait = 1; - spin_unlock_irqrestore(&bitmap->lock, flags); - - if (dirty || need_write) - write_page(bitmap, page, 0); } if (wait) { /* if any writes were performed, we need to wait on them */ - if (bitmap->file) + if (bitmap->storage.file) wait_event(bitmap->write_wait, atomic_read(&bitmap->pending_writes)==0); else md_super_wait(bitmap->mddev); } - if (bitmap->flags & BITMAP_WRITE_ERROR) + if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) bitmap_file_kick(bitmap); } EXPORT_SYMBOL(bitmap_unplug); @@ -917,98 +938,77 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) { unsigned long i, chunks, index, oldindex, bit; - struct page *page = NULL, *oldpage = NULL; - unsigned long num_pages, bit_cnt = 0; + struct page *page = NULL; + unsigned long bit_cnt = 0; struct file *file; - unsigned long bytes, offset; + unsigned long offset; int outofdate; int ret = -ENOSPC; void *paddr; + struct bitmap_storage *store = &bitmap->storage; - chunks = bitmap->chunks; - file = bitmap->file; + chunks = bitmap->counts.chunks; + file = store->file; - BUG_ON(!file && !bitmap->mddev->bitmap_info.offset); + if (!file && !bitmap->mddev->bitmap_info.offset) { + /* No permanent bitmap - fill with '1s'. */ + store->filemap = NULL; + store->file_pages = 0; + for (i = 0; i < chunks ; i++) { + /* if the disk bit is set, set the memory bit */ + int needed = ((sector_t)(i+1) << (bitmap->counts.chunkshift) + >= start); + bitmap_set_memory_bits(bitmap, + (sector_t)i << bitmap->counts.chunkshift, + needed); + } + return 0; + } - outofdate = bitmap->flags & BITMAP_STALE; + outofdate = test_bit(BITMAP_STALE, &bitmap->flags); if (outofdate) printk(KERN_INFO "%s: bitmap file is out of date, doing full " "recovery\n", bmname(bitmap)); - bytes = DIV_ROUND_UP(bitmap->chunks, 8); - if (!bitmap->mddev->bitmap_info.external) - bytes += sizeof(bitmap_super_t); - - num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE); - - if (file && i_size_read(file->f_mapping->host) < bytes) { + if (file && i_size_read(file->f_mapping->host) < store->bytes) { printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", - bmname(bitmap), - (unsigned long) i_size_read(file->f_mapping->host), - bytes); + bmname(bitmap), + (unsigned long) i_size_read(file->f_mapping->host), + store->bytes); goto err; } - ret = -ENOMEM; - - bitmap->filemap = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL); - if (!bitmap->filemap) - goto err; - - /* We need 4 bits per page, rounded up to a multiple of sizeof(unsigned long) */ - bitmap->filemap_attr = kzalloc( - roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)), - GFP_KERNEL); - if (!bitmap->filemap_attr) - goto err; - oldindex = ~0L; + offset = 0; + if (!bitmap->mddev->bitmap_info.external) + offset = sizeof(bitmap_super_t); for (i = 0; i < chunks; i++) { int b; - index = file_page_index(bitmap, i); - bit = file_page_offset(bitmap, i); + index = file_page_index(&bitmap->storage, i); + bit = file_page_offset(&bitmap->storage, i); if (index != oldindex) { /* this is a new page, read it in */ int count; /* unmap the old page, we're done with it */ - if (index == num_pages-1) - count = bytes - index * PAGE_SIZE; + if (index == store->file_pages-1) + count = store->bytes - index * PAGE_SIZE; else count = PAGE_SIZE; - if (index == 0 && bitmap->sb_page) { - /* - * if we're here then the superblock page - * contains some bits (PAGE_SIZE != sizeof sb) - * we've already read it in, so just use it - */ - page = bitmap->sb_page; - offset = sizeof(bitmap_super_t); - if (!file) - page = read_sb_page( - bitmap->mddev, - bitmap->mddev->bitmap_info.offset, - page, - index, count); - } else if (file) { - page = read_page(file, index, bitmap, count); - offset = 0; - } else { - page = read_sb_page(bitmap->mddev, - bitmap->mddev->bitmap_info.offset, - NULL, - index, count); - offset = 0; - } - if (IS_ERR(page)) { /* read error */ - ret = PTR_ERR(page); + page = store->filemap[index]; + if (file) + ret = read_page(file, index, bitmap, + count, page); + else + ret = read_sb_page( + bitmap->mddev, + bitmap->mddev->bitmap_info.offset, + page, + index, count); + + if (ret) goto err; - } oldindex = index; - oldpage = page; - - bitmap->filemap[bitmap->file_pages++] = page; - bitmap->last_page_size = count; if (outofdate) { /* @@ -1022,39 +1022,33 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) write_page(bitmap, page, 1); ret = -EIO; - if (bitmap->flags & BITMAP_WRITE_ERROR) + if (test_bit(BITMAP_WRITE_ERROR, + &bitmap->flags)) goto err; } } paddr = kmap_atomic(page); - if (bitmap->flags & BITMAP_HOSTENDIAN) + if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) b = test_bit(bit, paddr); else b = test_bit_le(bit, paddr); kunmap_atomic(paddr); if (b) { /* if the disk bit is set, set the memory bit */ - int needed = ((sector_t)(i+1) << bitmap->chunkshift + int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift >= start); bitmap_set_memory_bits(bitmap, - (sector_t)i << bitmap->chunkshift, + (sector_t)i << bitmap->counts.chunkshift, needed); bit_cnt++; } - } - - /* everything went OK */ - ret = 0; - bitmap_mask_state(bitmap, BITMAP_STALE, MASK_UNSET); - - if (bit_cnt) { /* Kick recovery if any bits were set */ - set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery); - md_wakeup_thread(bitmap->mddev->thread); + offset = 0; } printk(KERN_INFO "%s: bitmap initialized from disk: " - "read %lu/%lu pages, set %lu of %lu bits\n", - bmname(bitmap), bitmap->file_pages, num_pages, bit_cnt, chunks); + "read %lu pages, set %lu of %lu bits\n", + bmname(bitmap), store->file_pages, + bit_cnt, chunks); return 0; @@ -1071,22 +1065,38 @@ void bitmap_write_all(struct bitmap *bitmap) */ int i; - spin_lock_irq(&bitmap->lock); - for (i = 0; i < bitmap->file_pages; i++) - set_page_attr(bitmap, bitmap->filemap[i], + if (!bitmap || !bitmap->storage.filemap) + return; + if (bitmap->storage.file) + /* Only one copy, so nothing needed */ + return; + + for (i = 0; i < bitmap->storage.file_pages; i++) + set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE); bitmap->allclean = 0; - spin_unlock_irq(&bitmap->lock); } -static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc) +static void bitmap_count_page(struct bitmap_counts *bitmap, + sector_t offset, int inc) { sector_t chunk = offset >> bitmap->chunkshift; unsigned long page = chunk >> PAGE_COUNTER_SHIFT; bitmap->bp[page].count += inc; bitmap_checkfree(bitmap, page); } -static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, + +static void bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset) +{ + sector_t chunk = offset >> bitmap->chunkshift; + unsigned long page = chunk >> PAGE_COUNTER_SHIFT; + struct bitmap_page *bp = &bitmap->bp[page]; + + if (!bp->pending) + bp->pending = 1; +} + +static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap, sector_t offset, sector_t *blocks, int create); @@ -1099,10 +1109,9 @@ void bitmap_daemon_work(struct mddev *mddev) { struct bitmap *bitmap; unsigned long j; - unsigned long flags; - struct page *page = NULL, *lastpage = NULL; + unsigned long nextpage; sector_t blocks; - void *paddr; + struct bitmap_counts *counts; /* Use a mutex to guard daemon_work against * bitmap_destroy. @@ -1124,112 +1133,90 @@ void bitmap_daemon_work(struct mddev *mddev) } bitmap->allclean = 1; - spin_lock_irqsave(&bitmap->lock, flags); - for (j = 0; j < bitmap->chunks; j++) { + /* Any file-page which is PENDING now needs to be written. + * So set NEEDWRITE now, then after we make any last-minute changes + * we will write it. + */ + for (j = 0; j < bitmap->storage.file_pages; j++) + if (test_and_clear_page_attr(bitmap, j, + BITMAP_PAGE_PENDING)) + set_page_attr(bitmap, j, + BITMAP_PAGE_NEEDWRITE); + + if (bitmap->need_sync && + mddev->bitmap_info.external == 0) { + /* Arrange for superblock update as well as + * other changes */ + bitmap_super_t *sb; + bitmap->need_sync = 0; + if (bitmap->storage.filemap) { + sb = kmap_atomic(bitmap->storage.sb_page); + sb->events_cleared = + cpu_to_le64(bitmap->events_cleared); + kunmap_atomic(sb); + set_page_attr(bitmap, 0, + BITMAP_PAGE_NEEDWRITE); + } + } + /* Now look at the bitmap counters and if any are '2' or '1', + * decrement and handle accordingly. + */ + counts = &bitmap->counts; + spin_lock_irq(&counts->lock); + nextpage = 0; + for (j = 0; j < counts->chunks; j++) { bitmap_counter_t *bmc; - if (!bitmap->filemap) - /* error or shutdown */ - break; + sector_t block = (sector_t)j << counts->chunkshift; - page = filemap_get_page(bitmap, j); - - if (page != lastpage) { - /* skip this page unless it's marked as needing cleaning */ - if (!test_page_attr(bitmap, page, BITMAP_PAGE_PENDING)) { - int need_write = test_page_attr(bitmap, page, - BITMAP_PAGE_NEEDWRITE); - if (need_write) - clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); - - spin_unlock_irqrestore(&bitmap->lock, flags); - if (need_write) - write_page(bitmap, page, 0); - spin_lock_irqsave(&bitmap->lock, flags); - j |= (PAGE_BITS - 1); + if (j == nextpage) { + nextpage += PAGE_COUNTER_RATIO; + if (!counts->bp[j >> PAGE_COUNTER_SHIFT].pending) { + j |= PAGE_COUNTER_MASK; continue; } - - /* grab the new page, sync and release the old */ - if (lastpage != NULL) { - if (test_page_attr(bitmap, lastpage, - BITMAP_PAGE_NEEDWRITE)) { - clear_page_attr(bitmap, lastpage, - BITMAP_PAGE_NEEDWRITE); - spin_unlock_irqrestore(&bitmap->lock, flags); - write_page(bitmap, lastpage, 0); - } else { - set_page_attr(bitmap, lastpage, - BITMAP_PAGE_NEEDWRITE); - bitmap->allclean = 0; - spin_unlock_irqrestore(&bitmap->lock, flags); - } - } else - spin_unlock_irqrestore(&bitmap->lock, flags); - lastpage = page; - - /* We are possibly going to clear some bits, so make - * sure that events_cleared is up-to-date. - */ - if (bitmap->need_sync && - mddev->bitmap_info.external == 0) { - bitmap_super_t *sb; - bitmap->need_sync = 0; - sb = kmap_atomic(bitmap->sb_page); - sb->events_cleared = - cpu_to_le64(bitmap->events_cleared); - kunmap_atomic(sb); - write_page(bitmap, bitmap->sb_page, 1); - } - spin_lock_irqsave(&bitmap->lock, flags); - if (!bitmap->need_sync) - clear_page_attr(bitmap, page, BITMAP_PAGE_PENDING); - else - bitmap->allclean = 0; + counts->bp[j >> PAGE_COUNTER_SHIFT].pending = 0; } - bmc = bitmap_get_counter(bitmap, - (sector_t)j << bitmap->chunkshift, + bmc = bitmap_get_counter(counts, + block, &blocks, 0); - if (!bmc) + + if (!bmc) { j |= PAGE_COUNTER_MASK; - else if (*bmc) { - if (*bmc == 1 && !bitmap->need_sync) { - /* we can clear the bit */ - *bmc = 0; - bitmap_count_page(bitmap, - (sector_t)j << bitmap->chunkshift, - -1); - - /* clear the bit */ - paddr = kmap_atomic(page); - if (bitmap->flags & BITMAP_HOSTENDIAN) - clear_bit(file_page_offset(bitmap, j), - paddr); - else - __clear_bit_le( - file_page_offset(bitmap, - j), - paddr); - kunmap_atomic(paddr); - } else if (*bmc <= 2) { - *bmc = 1; /* maybe clear the bit next time */ - set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); - bitmap->allclean = 0; - } + continue; } - } - spin_unlock_irqrestore(&bitmap->lock, flags); - - /* now sync the final page */ - if (lastpage != NULL) { - spin_lock_irqsave(&bitmap->lock, flags); - if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) { - clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); - spin_unlock_irqrestore(&bitmap->lock, flags); - write_page(bitmap, lastpage, 0); - } else { - set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); + if (*bmc == 1 && !bitmap->need_sync) { + /* We can clear the bit */ + *bmc = 0; + bitmap_count_page(counts, block, -1); + bitmap_file_clear_bit(bitmap, block); + } else if (*bmc && *bmc <= 2) { + *bmc = 1; + bitmap_set_pending(counts, block); bitmap->allclean = 0; - spin_unlock_irqrestore(&bitmap->lock, flags); + } + } + spin_unlock_irq(&counts->lock); + + /* Now start writeout on any page in NEEDWRITE that isn't DIRTY. + * DIRTY pages need to be written by bitmap_unplug so it can wait + * for them. + * If we find any DIRTY page we stop there and let bitmap_unplug + * handle all the rest. This is important in the case where + * the first blocking holds the superblock and it has been updated. + * We mustn't write any other blocks before the superblock. + */ + for (j = 0; + j < bitmap->storage.file_pages + && !test_bit(BITMAP_STALE, &bitmap->flags); + j++) { + + if (test_page_attr(bitmap, j, + BITMAP_PAGE_DIRTY)) + /* bitmap_unplug will handle the rest */ + break; + if (test_and_clear_page_attr(bitmap, j, + BITMAP_PAGE_NEEDWRITE)) { + write_page(bitmap, bitmap->storage.filemap[j], 0); } } @@ -1240,7 +1227,7 @@ void bitmap_daemon_work(struct mddev *mddev) mutex_unlock(&mddev->bitmap_info.mutex); } -static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, +static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap, sector_t offset, sector_t *blocks, int create) __releases(bitmap->lock) @@ -1302,10 +1289,10 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect sector_t blocks; bitmap_counter_t *bmc; - spin_lock_irq(&bitmap->lock); - bmc = bitmap_get_counter(bitmap, offset, &blocks, 1); + spin_lock_irq(&bitmap->counts.lock); + bmc = bitmap_get_counter(&bitmap->counts, offset, &blocks, 1); if (!bmc) { - spin_unlock_irq(&bitmap->lock); + spin_unlock_irq(&bitmap->counts.lock); return 0; } @@ -1317,7 +1304,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect */ prepare_to_wait(&bitmap->overflow_wait, &__wait, TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&bitmap->lock); + spin_unlock_irq(&bitmap->counts.lock); io_schedule(); finish_wait(&bitmap->overflow_wait, &__wait); continue; @@ -1326,7 +1313,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect switch (*bmc) { case 0: bitmap_file_set_bit(bitmap, offset); - bitmap_count_page(bitmap, offset, 1); + bitmap_count_page(&bitmap->counts, offset, 1); /* fall through */ case 1: *bmc = 2; @@ -1334,7 +1321,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect (*bmc)++; - spin_unlock_irq(&bitmap->lock); + spin_unlock_irq(&bitmap->counts.lock); offset += blocks; if (sectors > blocks) @@ -1364,10 +1351,10 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto unsigned long flags; bitmap_counter_t *bmc; - spin_lock_irqsave(&bitmap->lock, flags); - bmc = bitmap_get_counter(bitmap, offset, &blocks, 0); + spin_lock_irqsave(&bitmap->counts.lock, flags); + bmc = bitmap_get_counter(&bitmap->counts, offset, &blocks, 0); if (!bmc) { - spin_unlock_irqrestore(&bitmap->lock, flags); + spin_unlock_irqrestore(&bitmap->counts.lock, flags); return; } @@ -1386,14 +1373,10 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto (*bmc)--; if (*bmc <= 2) { - set_page_attr(bitmap, - filemap_get_page( - bitmap, - offset >> bitmap->chunkshift), - BITMAP_PAGE_PENDING); + bitmap_set_pending(&bitmap->counts, offset); bitmap->allclean = 0; } - spin_unlock_irqrestore(&bitmap->lock, flags); + spin_unlock_irqrestore(&bitmap->counts.lock, flags); offset += blocks; if (sectors > blocks) sectors -= blocks; @@ -1412,8 +1395,8 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks = 1024; return 1; /* always resync if no bitmap */ } - spin_lock_irq(&bitmap->lock); - bmc = bitmap_get_counter(bitmap, offset, blocks, 0); + spin_lock_irq(&bitmap->counts.lock); + bmc = bitmap_get_counter(&bitmap->counts, offset, blocks, 0); rv = 0; if (bmc) { /* locked */ @@ -1427,7 +1410,7 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t } } } - spin_unlock_irq(&bitmap->lock); + spin_unlock_irq(&bitmap->counts.lock); return rv; } @@ -1464,8 +1447,8 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i *blocks = 1024; return; } - spin_lock_irqsave(&bitmap->lock, flags); - bmc = bitmap_get_counter(bitmap, offset, blocks, 0); + spin_lock_irqsave(&bitmap->counts.lock, flags); + bmc = bitmap_get_counter(&bitmap->counts, offset, blocks, 0); if (bmc == NULL) goto unlock; /* locked */ @@ -1476,15 +1459,13 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i *bmc |= NEEDED_MASK; else { if (*bmc <= 2) { - set_page_attr(bitmap, - filemap_get_page(bitmap, offset >> bitmap->chunkshift), - BITMAP_PAGE_PENDING); + bitmap_set_pending(&bitmap->counts, offset); bitmap->allclean = 0; } } } unlock: - spin_unlock_irqrestore(&bitmap->lock, flags); + spin_unlock_irqrestore(&bitmap->counts.lock, flags); } EXPORT_SYMBOL(bitmap_end_sync); @@ -1524,7 +1505,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) bitmap->mddev->curr_resync_completed = sector; set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); - sector &= ~((1ULL << bitmap->chunkshift) - 1); + sector &= ~((1ULL << bitmap->counts.chunkshift) - 1); s = 0; while (s < sector && s < bitmap->mddev->resync_max_sectors) { bitmap_end_sync(bitmap, s, &blocks, 0); @@ -1538,27 +1519,25 @@ EXPORT_SYMBOL(bitmap_cond_end_sync); static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) { /* For each chunk covered by any of these sectors, set the - * counter to 1 and set resync_needed. They should all + * counter to 2 and possibly set resync_needed. They should all * be 0 at this point */ sector_t secs; bitmap_counter_t *bmc; - spin_lock_irq(&bitmap->lock); - bmc = bitmap_get_counter(bitmap, offset, &secs, 1); + spin_lock_irq(&bitmap->counts.lock); + bmc = bitmap_get_counter(&bitmap->counts, offset, &secs, 1); if (!bmc) { - spin_unlock_irq(&bitmap->lock); + spin_unlock_irq(&bitmap->counts.lock); return; } if (!*bmc) { - struct page *page; *bmc = 2 | (needed ? NEEDED_MASK : 0); - bitmap_count_page(bitmap, offset, 1); - page = filemap_get_page(bitmap, offset >> bitmap->chunkshift); - set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); + bitmap_count_page(&bitmap->counts, offset, 1); + bitmap_set_pending(&bitmap->counts, offset); bitmap->allclean = 0; } - spin_unlock_irq(&bitmap->lock); + spin_unlock_irq(&bitmap->counts.lock); } /* dirty the memory and file bits for bitmap chunks "s" to "e" */ @@ -1567,11 +1546,9 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) unsigned long chunk; for (chunk = s; chunk <= e; chunk++) { - sector_t sec = (sector_t)chunk << bitmap->chunkshift; + sector_t sec = (sector_t)chunk << bitmap->counts.chunkshift; bitmap_set_memory_bits(bitmap, sec, 1); - spin_lock_irq(&bitmap->lock); bitmap_file_set_bit(bitmap, sec); - spin_unlock_irq(&bitmap->lock); if (sec < bitmap->mddev->recovery_cp) /* We are asserting that the array is dirty, * so move the recovery_cp address back so @@ -1616,11 +1593,15 @@ static void bitmap_free(struct bitmap *bitmap) if (!bitmap) /* there was no bitmap */ return; - /* release the bitmap file and kill the daemon */ - bitmap_file_put(bitmap); + /* Shouldn't be needed - but just in case.... */ + wait_event(bitmap->write_wait, + atomic_read(&bitmap->pending_writes) == 0); + + /* release the bitmap file */ + bitmap_file_unmap(&bitmap->storage); - bp = bitmap->bp; - pages = bitmap->pages; + bp = bitmap->counts.bp; + pages = bitmap->counts.pages; /* free all allocated memory */ @@ -1659,25 +1640,19 @@ int bitmap_create(struct mddev *mddev) { struct bitmap *bitmap; sector_t blocks = mddev->resync_max_sectors; - unsigned long chunks; - unsigned long pages; struct file *file = mddev->bitmap_info.file; int err; struct sysfs_dirent *bm = NULL; BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); - if (!file - && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */ - return 0; - BUG_ON(file && mddev->bitmap_info.offset); bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); if (!bitmap) return -ENOMEM; - spin_lock_init(&bitmap->lock); + spin_lock_init(&bitmap->counts.lock); atomic_set(&bitmap->pending_writes, 0); init_waitqueue_head(&bitmap->write_wait); init_waitqueue_head(&bitmap->overflow_wait); @@ -1693,7 +1668,7 @@ int bitmap_create(struct mddev *mddev) } else bitmap->sysfs_can_clear = NULL; - bitmap->file = file; + bitmap->storage.file = file; if (file) { get_file(file); /* As future accesses to this file will use bmap, @@ -1724,32 +1699,15 @@ int bitmap_create(struct mddev *mddev) goto error; bitmap->daemon_lastrun = jiffies; - bitmap->chunkshift = (ffz(~mddev->bitmap_info.chunksize) - - BITMAP_BLOCK_SHIFT); - - chunks = (blocks + (1 << bitmap->chunkshift) - 1) >> - bitmap->chunkshift; - pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO; - - BUG_ON(!pages); - - bitmap->chunks = chunks; - bitmap->pages = pages; - bitmap->missing_pages = pages; - - bitmap->bp = kzalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL); - - err = -ENOMEM; - if (!bitmap->bp) + err = bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize, 1); + if (err) goto error; printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", - pages, bmname(bitmap)); + bitmap->counts.pages, bmname(bitmap)); mddev->bitmap = bitmap; - - - return (bitmap->flags & BITMAP_WRITE_ERROR) ? -EIO : 0; + return test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0; error: bitmap_free(bitmap); @@ -1790,13 +1748,17 @@ int bitmap_load(struct mddev *mddev) if (err) goto out; + clear_bit(BITMAP_STALE, &bitmap->flags); + + /* Kick recovery in case any bits were set */ + set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery); mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; md_wakeup_thread(mddev->thread); bitmap_update_sb(bitmap); - if (bitmap->flags & BITMAP_WRITE_ERROR) + if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) err = -EIO; out: return err; @@ -1806,30 +1768,194 @@ EXPORT_SYMBOL_GPL(bitmap_load); void bitmap_status(struct seq_file *seq, struct bitmap *bitmap) { unsigned long chunk_kb; - unsigned long flags; + struct bitmap_counts *counts; if (!bitmap) return; - spin_lock_irqsave(&bitmap->lock, flags); + counts = &bitmap->counts; + chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10; seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " "%lu%s chunk", - bitmap->pages - bitmap->missing_pages, - bitmap->pages, - (bitmap->pages - bitmap->missing_pages) + counts->pages - counts->missing_pages, + counts->pages, + (counts->pages - counts->missing_pages) << (PAGE_SHIFT - 10), chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize, chunk_kb ? "KB" : "B"); - if (bitmap->file) { + if (bitmap->storage.file) { seq_printf(seq, ", file: "); - seq_path(seq, &bitmap->file->f_path, " \t\n"); + seq_path(seq, &bitmap->storage.file->f_path, " \t\n"); } seq_printf(seq, "\n"); - spin_unlock_irqrestore(&bitmap->lock, flags); } +int bitmap_resize(struct bitmap *bitmap, sector_t blocks, + int chunksize, int init) +{ + /* If chunk_size is 0, choose an appropriate chunk size. + * Then possibly allocate new storage space. + * Then quiesce, copy bits, replace bitmap, and re-start + * + * This function is called both to set up the initial bitmap + * and to resize the bitmap while the array is active. + * If this happens as a result of the array being resized, + * chunksize will be zero, and we need to choose a suitable + * chunksize, otherwise we use what we are given. + */ + struct bitmap_storage store; + struct bitmap_counts old_counts; + unsigned long chunks; + sector_t block; + sector_t old_blocks, new_blocks; + int chunkshift; + int ret = 0; + long pages; + struct bitmap_page *new_bp; + + if (chunksize == 0) { + /* If there is enough space, leave the chunk size unchanged, + * else increase by factor of two until there is enough space. + */ + long bytes; + long space = bitmap->mddev->bitmap_info.space; + + if (space == 0) { + /* We don't know how much space there is, so limit + * to current size - in sectors. + */ + bytes = DIV_ROUND_UP(bitmap->counts.chunks, 8); + if (!bitmap->mddev->bitmap_info.external) + bytes += sizeof(bitmap_super_t); + space = DIV_ROUND_UP(bytes, 512); + bitmap->mddev->bitmap_info.space = space; + } + chunkshift = bitmap->counts.chunkshift; + chunkshift--; + do { + /* 'chunkshift' is shift from block size to chunk size */ + chunkshift++; + chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift); + bytes = DIV_ROUND_UP(chunks, 8); + if (!bitmap->mddev->bitmap_info.external) + bytes += sizeof(bitmap_super_t); + } while (bytes > (space << 9)); + } else + chunkshift = ffz(~chunksize) - BITMAP_BLOCK_SHIFT; + + chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift); + memset(&store, 0, sizeof(store)); + if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file) + ret = bitmap_storage_alloc(&store, chunks, + !bitmap->mddev->bitmap_info.external); + if (ret) + goto err; + + pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO); + + new_bp = kzalloc(pages * sizeof(*new_bp), GFP_KERNEL); + ret = -ENOMEM; + if (!new_bp) { + bitmap_file_unmap(&store); + goto err; + } + + if (!init) + bitmap->mddev->pers->quiesce(bitmap->mddev, 1); + + store.file = bitmap->storage.file; + bitmap->storage.file = NULL; + + if (store.sb_page && bitmap->storage.sb_page) + memcpy(page_address(store.sb_page), + page_address(bitmap->storage.sb_page), + sizeof(bitmap_super_t)); + bitmap_file_unmap(&bitmap->storage); + bitmap->storage = store; + + old_counts = bitmap->counts; + bitmap->counts.bp = new_bp; + bitmap->counts.pages = pages; + bitmap->counts.missing_pages = pages; + bitmap->counts.chunkshift = chunkshift; + bitmap->counts.chunks = chunks; + bitmap->mddev->bitmap_info.chunksize = 1 << (chunkshift + + BITMAP_BLOCK_SHIFT); + + blocks = min(old_counts.chunks << old_counts.chunkshift, + chunks << chunkshift); + + spin_lock_irq(&bitmap->counts.lock); + for (block = 0; block < blocks; ) { + bitmap_counter_t *bmc_old, *bmc_new; + int set; + + bmc_old = bitmap_get_counter(&old_counts, block, + &old_blocks, 0); + set = bmc_old && NEEDED(*bmc_old); + + if (set) { + bmc_new = bitmap_get_counter(&bitmap->counts, block, + &new_blocks, 1); + if (*bmc_new == 0) { + /* need to set on-disk bits too. */ + sector_t end = block + new_blocks; + sector_t start = block >> chunkshift; + start <<= chunkshift; + while (start < end) { + bitmap_file_set_bit(bitmap, block); + start += 1 << chunkshift; + } + *bmc_new = 2; + bitmap_count_page(&bitmap->counts, + block, 1); + bitmap_set_pending(&bitmap->counts, + block); + } + *bmc_new |= NEEDED_MASK; + if (new_blocks < old_blocks) + old_blocks = new_blocks; + } + block += old_blocks; + } + + if (!init) { + int i; + while (block < (chunks << chunkshift)) { + bitmap_counter_t *bmc; + bmc = bitmap_get_counter(&bitmap->counts, block, + &new_blocks, 1); + if (bmc) { + /* new space. It needs to be resynced, so + * we set NEEDED_MASK. + */ + if (*bmc == 0) { + *bmc = NEEDED_MASK | 2; + bitmap_count_page(&bitmap->counts, + block, 1); + bitmap_set_pending(&bitmap->counts, + block); + } + } + block += new_blocks; + } + for (i = 0; i < bitmap->storage.file_pages; i++) + set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); + } + spin_unlock_irq(&bitmap->counts.lock); + + if (!init) { + bitmap_unplug(bitmap); + bitmap->mddev->pers->quiesce(bitmap->mddev, 0); + } + ret = 0; +err: + return ret; +} +EXPORT_SYMBOL_GPL(bitmap_resize); + static ssize_t location_show(struct mddev *mddev, char *page) { @@ -1923,6 +2049,43 @@ location_store(struct mddev *mddev, const char *buf, size_t len) static struct md_sysfs_entry bitmap_location = __ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store); +/* 'bitmap/space' is the space available at 'location' for the + * bitmap. This allows the kernel to know when it is safe to + * resize the bitmap to match a resized array. + */ +static ssize_t +space_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%lu\n", mddev->bitmap_info.space); +} + +static ssize_t +space_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned long sectors; + int rv; + + rv = kstrtoul(buf, 10, §ors); + if (rv) + return rv; + + if (sectors == 0) + return -EINVAL; + + if (mddev->bitmap && + sectors < (mddev->bitmap->storage.bytes + 511) >> 9) + return -EFBIG; /* Bitmap is too big for this small space */ + + /* could make sure it isn't too big, but that isn't really + * needed - user-space should be careful. + */ + mddev->bitmap_info.space = sectors; + return len; +} + +static struct md_sysfs_entry bitmap_space = +__ATTR(space, S_IRUGO|S_IWUSR, space_show, space_store); + static ssize_t timeout_show(struct mddev *mddev, char *page) { @@ -2098,6 +2261,7 @@ __ATTR(max_backlog_used, S_IRUGO | S_IWUSR, static struct attribute *md_bitmap_attrs[] = { &bitmap_location.attr, + &bitmap_space.attr, &bitmap_timeout.attr, &bitmap_backlog.attr, &bitmap_chunksize.attr, diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index b44b0aba..df4aeb6 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h @@ -111,9 +111,9 @@ typedef __u16 bitmap_counter_t; /* use these for bitmap->flags and bitmap->sb->state bit-fields */ enum bitmap_state { - BITMAP_STALE = 0x002, /* the bitmap file is out of date or had -EIO */ - BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */ - BITMAP_HOSTENDIAN = 0x8000, + BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */ + BITMAP_WRITE_ERROR = 2, /* A write error has occurred */ + BITMAP_HOSTENDIAN =15, }; /* the superblock at the front of the bitmap file -- little endian */ @@ -128,8 +128,10 @@ typedef struct bitmap_super_s { __le32 chunksize; /* 52 the bitmap chunk size in bytes */ __le32 daemon_sleep; /* 56 seconds between disk flushes */ __le32 write_behind; /* 60 number of outstanding write-behind writes */ + __le32 sectors_reserved; /* 64 number of 512-byte sectors that are + * reserved for the bitmap. */ - __u8 pad[256 - 64]; /* set to zero */ + __u8 pad[256 - 68]; /* set to zero */ } bitmap_super_t; /* notes: @@ -160,35 +162,48 @@ struct bitmap_page { */ unsigned int hijacked:1; /* + * If any counter in this page is '1' or '2' - and so could be + * cleared then that page is marked as 'pending' + */ + unsigned int pending:1; + /* * count of dirty bits on the page */ - unsigned int count:31; + unsigned int count:30; }; /* the main bitmap structure - one per mddev */ struct bitmap { - struct bitmap_page *bp; - unsigned long pages; /* total number of pages in the bitmap */ - unsigned long missing_pages; /* number of pages not yet allocated */ - struct mddev *mddev; /* the md device that the bitmap is for */ + struct bitmap_counts { + spinlock_t lock; + struct bitmap_page *bp; + unsigned long pages; /* total number of pages + * in the bitmap */ + unsigned long missing_pages; /* number of pages + * not yet allocated */ + unsigned long chunkshift; /* chunksize = 2^chunkshift + * (for bitops) */ + unsigned long chunks; /* Total number of data + * chunks for the array */ + } counts; - /* bitmap chunksize -- how much data does each bit represent? */ - unsigned long chunkshift; /* chunksize = 2^(chunkshift+9) (for bitops) */ - unsigned long chunks; /* total number of data chunks for the array */ + struct mddev *mddev; /* the md device that the bitmap is for */ __u64 events_cleared; int need_sync; - /* bitmap spinlock */ - spinlock_t lock; - - struct file *file; /* backing disk file */ - struct page *sb_page; /* cached copy of the bitmap file superblock */ - struct page **filemap; /* list of cache pages for the file */ - unsigned long *filemap_attr; /* attributes associated w/ filemap pages */ - unsigned long file_pages; /* number of pages in the file */ - int last_page_size; /* bytes in the last page */ + struct bitmap_storage { + struct file *file; /* backing disk file */ + struct page *sb_page; /* cached copy of the bitmap + * file superblock */ + struct page **filemap; /* list of cache pages for + * the file */ + unsigned long *filemap_attr; /* attributes associated + * w/ filemap pages */ + unsigned long file_pages; /* number of pages in the file*/ + unsigned long bytes; /* total bytes in the bitmap */ + } storage; unsigned long flags; @@ -242,6 +257,9 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); void bitmap_unplug(struct bitmap *bitmap); void bitmap_daemon_work(struct mddev *mddev); + +int bitmap_resize(struct bitmap *bitmap, sector_t blocks, + int chunksize, int init); #endif #endif diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 68965e6..017c34d 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -155,10 +155,7 @@ static void context_free(struct raid_set *rs) for (i = 0; i < rs->md.raid_disks; i++) { if (rs->dev[i].meta_dev) dm_put_device(rs->ti, rs->dev[i].meta_dev); - if (rs->dev[i].rdev.sb_page) - put_page(rs->dev[i].rdev.sb_page); - rs->dev[i].rdev.sb_page = NULL; - rs->dev[i].rdev.sb_loaded = 0; + md_rdev_clear(&rs->dev[i].rdev); if (rs->dev[i].data_dev) dm_put_device(rs->ti, rs->dev[i].data_dev); } @@ -606,7 +603,7 @@ static int read_disk_sb(struct md_rdev *rdev, int size) if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) { DMERR("Failed to read superblock of device at position %d", rdev->raid_disk); - set_bit(Faulty, &rdev->flags); + md_error(rdev->mddev, rdev); return -EINVAL; } @@ -617,16 +614,18 @@ static int read_disk_sb(struct md_rdev *rdev, int size) static void super_sync(struct mddev *mddev, struct md_rdev *rdev) { - struct md_rdev *r; + int i; uint64_t failed_devices; struct dm_raid_superblock *sb; + struct raid_set *rs = container_of(mddev, struct raid_set, md); sb = page_address(rdev->sb_page); failed_devices = le64_to_cpu(sb->failed_devices); - rdev_for_each(r, mddev) - if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags)) - failed_devices |= (1ULL << r->raid_disk); + for (i = 0; i < mddev->raid_disks; i++) + if (!rs->dev[i].data_dev || + test_bit(Faulty, &(rs->dev[i].rdev.flags))) + failed_devices |= (1ULL << i); memset(sb, 0, sizeof(*sb)); @@ -1252,12 +1251,13 @@ static void raid_resume(struct dm_target *ti) { struct raid_set *rs = ti->private; + set_bit(MD_CHANGE_DEVS, &rs->md.flags); if (!rs->bitmap_loaded) { bitmap_load(&rs->md); rs->bitmap_loaded = 1; - } else - md_wakeup_thread(rs->md.thread); + } + clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery); mddev_resume(&rs->md); } diff --git a/drivers/md/md.c b/drivers/md/md.c index 01233d8..1c2f904 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -402,6 +402,7 @@ void mddev_resume(struct mddev *mddev) wake_up(&mddev->sb_wait); mddev->pers->quiesce(mddev, 0); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ } @@ -452,7 +453,7 @@ static void submit_flushes(struct work_struct *ws) atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending); rcu_read_unlock(); - bi = bio_alloc_mddev(GFP_KERNEL, 0, mddev); + bi = bio_alloc_mddev(GFP_NOIO, 0, mddev); bi->bi_end_io = md_end_flush; bi->bi_private = rdev; bi->bi_bdev = rdev->bdev; @@ -607,6 +608,7 @@ void mddev_init(struct mddev *mddev) init_waitqueue_head(&mddev->sb_wait); init_waitqueue_head(&mddev->recovery_wait); mddev->reshape_position = MaxSector; + mddev->reshape_backwards = 0; mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->level = LEVEL_NONE; @@ -802,7 +804,7 @@ static int alloc_disk_sb(struct md_rdev * rdev) return 0; } -static void free_disk_sb(struct md_rdev * rdev) +void md_rdev_clear(struct md_rdev *rdev) { if (rdev->sb_page) { put_page(rdev->sb_page); @@ -815,8 +817,10 @@ static void free_disk_sb(struct md_rdev * rdev) put_page(rdev->bb_page); rdev->bb_page = NULL; } + kfree(rdev->badblocks.page); + rdev->badblocks.page = NULL; } - +EXPORT_SYMBOL_GPL(md_rdev_clear); static void super_written(struct bio *bio, int error) { @@ -887,6 +891,10 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, rdev->meta_bdev : rdev->bdev; if (metadata_op) bio->bi_sector = sector + rdev->sb_start; + else if (rdev->mddev->reshape_position != MaxSector && + (rdev->mddev->reshape_backwards == + (sector >= rdev->mddev->reshape_position))) + bio->bi_sector = sector + rdev->new_data_offset; else bio->bi_sector = sector + rdev->data_offset; bio_add_page(bio, page, size, 0); @@ -1034,12 +1042,17 @@ static unsigned int calc_sb_csum(mdp_super_t * sb) struct super_type { char *name; struct module *owner; - int (*load_super)(struct md_rdev *rdev, struct md_rdev *refdev, + int (*load_super)(struct md_rdev *rdev, + struct md_rdev *refdev, int minor_version); - int (*validate_super)(struct mddev *mddev, struct md_rdev *rdev); - void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); + int (*validate_super)(struct mddev *mddev, + struct md_rdev *rdev); + void (*sync_super)(struct mddev *mddev, + struct md_rdev *rdev); unsigned long long (*rdev_size_change)(struct md_rdev *rdev, sector_t num_sectors); + int (*allow_new_offset)(struct md_rdev *rdev, + unsigned long long new_offset); }; /* @@ -1111,6 +1124,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor rdev->preferred_minor = sb->md_minor; rdev->data_offset = 0; + rdev->new_data_offset = 0; rdev->sb_size = MD_SB_BYTES; rdev->badblocks.shift = -1; @@ -1184,7 +1198,11 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) mddev->dev_sectors = ((sector_t)sb->size) * 2; mddev->events = ev1; mddev->bitmap_info.offset = 0; + mddev->bitmap_info.space = 0; + /* bitmap can use 60 K after the 4K superblocks */ mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; + mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); + mddev->reshape_backwards = 0; if (mddev->minor_version >= 91) { mddev->reshape_position = sb->reshape_position; @@ -1192,6 +1210,8 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) mddev->new_level = sb->new_level; mddev->new_layout = sb->new_layout; mddev->new_chunk_sectors = sb->new_chunk >> 9; + if (mddev->delta_disks < 0) + mddev->reshape_backwards = 1; } else { mddev->reshape_position = MaxSector; mddev->delta_disks = 0; @@ -1218,9 +1238,12 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) mddev->max_disks = MD_SB_DISKS; if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && - mddev->bitmap_info.file == NULL) + mddev->bitmap_info.file == NULL) { mddev->bitmap_info.offset = mddev->bitmap_info.default_offset; + mddev->bitmap_info.space = + mddev->bitmap_info.space; + } } else if (mddev->pers == NULL) { /* Insist on good event counter while assembling, except @@ -1434,6 +1457,12 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) return num_sectors; } +static int +super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) +{ + /* non-zero offset changes not possible with v0.90 */ + return new_offset == 0; +} /* * version 1 superblock @@ -1469,6 +1498,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ struct mdp_superblock_1 *sb; int ret; sector_t sb_start; + sector_t sectors; char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; int bmask; @@ -1523,9 +1553,18 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ bdevname(rdev->bdev,b)); return -EINVAL; } + if (sb->pad0 || + sb->pad3[0] || + memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) + /* Some padding is non-zero, might be a new feature */ + return -EINVAL; rdev->preferred_minor = 0xffff; rdev->data_offset = le64_to_cpu(sb->data_offset); + rdev->new_data_offset = rdev->data_offset; + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && + (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) + rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset); atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; @@ -1536,6 +1575,9 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ if (minor_version && rdev->data_offset < sb_start + (rdev->sb_size/512)) return -EINVAL; + if (minor_version + && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) + return -EINVAL; if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) rdev->desc_nr = -1; @@ -1607,16 +1649,14 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ else ret = 0; } - if (minor_version) - rdev->sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - - le64_to_cpu(sb->data_offset); - else - rdev->sectors = rdev->sb_start; - if (rdev->sectors < le64_to_cpu(sb->data_size)) + if (minor_version) { + sectors = (i_size_read(rdev->bdev->bd_inode) >> 9); + sectors -= rdev->data_offset; + } else + sectors = rdev->sb_start; + if (sectors < le64_to_cpu(sb->data_size)) return -EINVAL; rdev->sectors = le64_to_cpu(sb->data_size); - if (le64_to_cpu(sb->size) > rdev->sectors) - return -EINVAL; return ret; } @@ -1644,17 +1684,37 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) mddev->dev_sectors = le64_to_cpu(sb->size); mddev->events = ev1; mddev->bitmap_info.offset = 0; + mddev->bitmap_info.space = 0; + /* Default location for bitmap is 1K after superblock + * using 3K - total of 4K + */ mddev->bitmap_info.default_offset = 1024 >> 9; - + mddev->bitmap_info.default_space = (4096-1024) >> 9; + mddev->reshape_backwards = 0; + mddev->recovery_cp = le64_to_cpu(sb->resync_offset); memcpy(mddev->uuid, sb->set_uuid, 16); mddev->max_disks = (4096-256)/2; if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && - mddev->bitmap_info.file == NULL ) + mddev->bitmap_info.file == NULL) { mddev->bitmap_info.offset = (__s32)le32_to_cpu(sb->bitmap_offset); + /* Metadata doesn't record how much space is available. + * For 1.0, we assume we can use up to the superblock + * if before, else to 4K beyond superblock. + * For others, assume no change is possible. + */ + if (mddev->minor_version > 0) + mddev->bitmap_info.space = 0; + else if (mddev->bitmap_info.offset > 0) + mddev->bitmap_info.space = + 8 - mddev->bitmap_info.offset; + else + mddev->bitmap_info.space = + -mddev->bitmap_info.offset; + } if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { mddev->reshape_position = le64_to_cpu(sb->reshape_position); @@ -1662,6 +1722,11 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) mddev->new_level = le32_to_cpu(sb->new_level); mddev->new_layout = le32_to_cpu(sb->new_layout); mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); + if (mddev->delta_disks < 0 || + (mddev->delta_disks == 0 && + (le32_to_cpu(sb->feature_map) + & MD_FEATURE_RESHAPE_BACKWARDS))) + mddev->reshape_backwards = 1; } else { mddev->reshape_position = MaxSector; mddev->delta_disks = 0; @@ -1735,7 +1800,6 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->feature_map = 0; sb->pad0 = 0; sb->recovery_offset = cpu_to_le64(0); - memset(sb->pad1, 0, sizeof(sb->pad1)); memset(sb->pad3, 0, sizeof(sb->pad3)); sb->utime = cpu_to_le64((__u64)mddev->utime); @@ -1757,6 +1821,8 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->devflags |= WriteMostly1; else sb->devflags &= ~WriteMostly1; + sb->data_offset = cpu_to_le64(rdev->data_offset); + sb->data_size = cpu_to_le64(rdev->sectors); if (mddev->bitmap && mddev->bitmap_info.file == NULL) { sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); @@ -1781,6 +1847,16 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->delta_disks = cpu_to_le32(mddev->delta_disks); sb->new_level = cpu_to_le32(mddev->new_level); sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); + if (mddev->delta_disks == 0 && + mddev->reshape_backwards) + sb->feature_map + |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); + if (rdev->new_data_offset != rdev->data_offset) { + sb->feature_map + |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); + sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset + - rdev->data_offset)); + } } if (rdev->badblocks.count == 0) @@ -1857,6 +1933,8 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) sector_t max_sectors; if (num_sectors && num_sectors < rdev->mddev->dev_sectors) return 0; /* component must fit device */ + if (rdev->data_offset != rdev->new_data_offset) + return 0; /* too confusing */ if (rdev->sb_start < rdev->data_offset) { /* minor versions 1 and 2; superblock before data */ max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9; @@ -1884,6 +1962,40 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) rdev->sb_page); md_super_wait(rdev->mddev); return num_sectors; + +} + +static int +super_1_allow_new_offset(struct md_rdev *rdev, + unsigned long long new_offset) +{ + /* All necessary checks on new >= old have been done */ + struct bitmap *bitmap; + if (new_offset >= rdev->data_offset) + return 1; + + /* with 1.0 metadata, there is no metadata to tread on + * so we can always move back */ + if (rdev->mddev->minor_version == 0) + return 1; + + /* otherwise we must be sure not to step on + * any metadata, so stay: + * 36K beyond start of superblock + * beyond end of badblocks + * beyond write-intent bitmap + */ + if (rdev->sb_start + (32+4)*2 > new_offset) + return 0; + bitmap = rdev->mddev->bitmap; + if (bitmap && !rdev->mddev->bitmap_info.file && + rdev->sb_start + rdev->mddev->bitmap_info.offset + + bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) + return 0; + if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) + return 0; + + return 1; } static struct super_type super_types[] = { @@ -1894,6 +2006,7 @@ static struct super_type super_types[] = { .validate_super = super_90_validate, .sync_super = super_90_sync, .rdev_size_change = super_90_rdev_size_change, + .allow_new_offset = super_90_allow_new_offset, }, [1] = { .name = "md-1", @@ -1902,6 +2015,7 @@ static struct super_type super_types[] = { .validate_super = super_1_validate, .sync_super = super_1_sync, .rdev_size_change = super_1_rdev_size_change, + .allow_new_offset = super_1_allow_new_offset, }, }; @@ -2105,9 +2219,7 @@ static void unbind_rdev_from_array(struct md_rdev * rdev) sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); rdev->sysfs_state = NULL; - kfree(rdev->badblocks.page); rdev->badblocks.count = 0; - rdev->badblocks.page = NULL; /* We need to delay this, otherwise we can deadlock when * writing to 'remove' to "dev/state". We also need * to delay it due to rcu usage. @@ -2158,7 +2270,7 @@ static void export_rdev(struct md_rdev * rdev) bdevname(rdev->bdev,b)); if (rdev->mddev) MD_BUG(); - free_disk_sb(rdev); + md_rdev_clear(rdev); #ifndef MODULE if (test_bit(AutoDetected, &rdev->flags)) md_autodetect_dev(rdev->bdev->bd_dev); @@ -2809,9 +2921,8 @@ offset_show(struct md_rdev *rdev, char *page) static ssize_t offset_store(struct md_rdev *rdev, const char *buf, size_t len) { - char *e; - unsigned long long offset = simple_strtoull(buf, &e, 10); - if (e==buf || (*e && *e != '\n')) + unsigned long long offset; + if (strict_strtoull(buf, 10, &offset) < 0) return -EINVAL; if (rdev->mddev->pers && rdev->raid_disk >= 0) return -EBUSY; @@ -2826,6 +2937,63 @@ offset_store(struct md_rdev *rdev, const char *buf, size_t len) static struct rdev_sysfs_entry rdev_offset = __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); +static ssize_t new_offset_show(struct md_rdev *rdev, char *page) +{ + return sprintf(page, "%llu\n", + (unsigned long long)rdev->new_data_offset); +} + +static ssize_t new_offset_store(struct md_rdev *rdev, + const char *buf, size_t len) +{ + unsigned long long new_offset; + struct mddev *mddev = rdev->mddev; + + if (strict_strtoull(buf, 10, &new_offset) < 0) + return -EINVAL; + + if (mddev->sync_thread) + return -EBUSY; + if (new_offset == rdev->data_offset) + /* reset is always permitted */ + ; + else if (new_offset > rdev->data_offset) { + /* must not push array size beyond rdev_sectors */ + if (new_offset - rdev->data_offset + + mddev->dev_sectors > rdev->sectors) + return -E2BIG; + } + /* Metadata worries about other space details. */ + + /* decreasing the offset is inconsistent with a backwards + * reshape. + */ + if (new_offset < rdev->data_offset && + mddev->reshape_backwards) + return -EINVAL; + /* Increasing offset is inconsistent with forwards + * reshape. reshape_direction should be set to + * 'backwards' first. + */ + if (new_offset > rdev->data_offset && + !mddev->reshape_backwards) + return -EINVAL; + + if (mddev->pers && mddev->persistent && + !super_types[mddev->major_version] + .allow_new_offset(rdev, new_offset)) + return -E2BIG; + rdev->new_data_offset = new_offset; + if (new_offset > rdev->data_offset) + mddev->reshape_backwards = 1; + else if (new_offset < rdev->data_offset) + mddev->reshape_backwards = 0; + + return len; +} +static struct rdev_sysfs_entry rdev_new_offset = +__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); + static ssize_t rdev_size_show(struct md_rdev *rdev, char *page) { @@ -2870,6 +3038,8 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) if (strict_blocks_to_sectors(buf, §ors) < 0) return -EINVAL; + if (rdev->data_offset != rdev->new_data_offset) + return -EINVAL; /* too confusing */ if (my_mddev->pers && rdev->raid_disk >= 0) { if (my_mddev->persistent) { sectors = super_types[my_mddev->major_version]. @@ -3006,6 +3176,7 @@ static struct attribute *rdev_default_attrs[] = { &rdev_errors.attr, &rdev_slot.attr, &rdev_offset.attr, + &rdev_new_offset.attr, &rdev_size.attr, &rdev_recovery_start.attr, &rdev_bad_blocks.attr, @@ -3080,6 +3251,7 @@ int md_rdev_init(struct md_rdev *rdev) rdev->raid_disk = -1; rdev->flags = 0; rdev->data_offset = 0; + rdev->new_data_offset = 0; rdev->sb_events = 0; rdev->last_read_error.tv_sec = 0; rdev->last_read_error.tv_nsec = 0; @@ -3178,8 +3350,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe abort_free: if (rdev->bdev) unlock_rdev(rdev); - free_disk_sb(rdev); - kfree(rdev->badblocks.page); + md_rdev_clear(rdev); kfree(rdev); return ERR_PTR(err); } @@ -3419,6 +3590,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->raid_disks -= mddev->delta_disks; mddev->delta_disks = 0; + mddev->reshape_backwards = 0; module_put(pers->owner); printk(KERN_WARNING "md: %s: %s would not accept array\n", mdname(mddev), clevel); @@ -3492,6 +3664,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->layout = mddev->new_layout; mddev->chunk_sectors = mddev->new_chunk_sectors; mddev->delta_disks = 0; + mddev->reshape_backwards = 0; mddev->degraded = 0; if (mddev->pers->sync_request == NULL) { /* this is now an array without redundancy, so @@ -3501,10 +3674,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) del_timer_sync(&mddev->safemode_timer); } pers->run(mddev); - mddev_resume(mddev); set_bit(MD_CHANGE_DEVS, &mddev->flags); - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); + mddev_resume(mddev); sysfs_notify(&mddev->kobj, NULL, "level"); md_new_event(mddev); return rv; @@ -3582,9 +3753,20 @@ raid_disks_store(struct mddev *mddev, const char *buf, size_t len) if (mddev->pers) rv = update_raid_disks(mddev, n); else if (mddev->reshape_position != MaxSector) { + struct md_rdev *rdev; int olddisks = mddev->raid_disks - mddev->delta_disks; + + rdev_for_each(rdev, mddev) { + if (olddisks < n && + rdev->data_offset < rdev->new_data_offset) + return -EINVAL; + if (olddisks > n && + rdev->data_offset > rdev->new_data_offset) + return -EINVAL; + } mddev->delta_disks = n - olddisks; mddev->raid_disks = n; + mddev->reshape_backwards = (mddev->delta_disks < 0); } else mddev->raid_disks = n; return rv ? rv : len; @@ -4266,7 +4448,8 @@ sync_completed_show(struct mddev *mddev, char *page) if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return sprintf(page, "none\n"); - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || + test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) max_sectors = mddev->resync_max_sectors; else max_sectors = mddev->dev_sectors; @@ -4428,6 +4611,7 @@ reshape_position_show(struct mddev *mddev, char *page) static ssize_t reshape_position_store(struct mddev *mddev, const char *buf, size_t len) { + struct md_rdev *rdev; char *e; unsigned long long new = simple_strtoull(buf, &e, 10); if (mddev->pers) @@ -4436,9 +4620,12 @@ reshape_position_store(struct mddev *mddev, const char *buf, size_t len) return -EINVAL; mddev->reshape_position = new; mddev->delta_disks = 0; + mddev->reshape_backwards = 0; mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; mddev->new_chunk_sectors = mddev->chunk_sectors; + rdev_for_each(rdev, mddev) + rdev->new_data_offset = rdev->data_offset; return len; } @@ -4447,6 +4634,42 @@ __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, reshape_position_store); static ssize_t +reshape_direction_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%s\n", + mddev->reshape_backwards ? "backwards" : "forwards"); +} + +static ssize_t +reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) +{ + int backwards = 0; + if (cmd_match(buf, "forwards")) + backwards = 0; + else if (cmd_match(buf, "backwards")) + backwards = 1; + else + return -EINVAL; + if (mddev->reshape_backwards == backwards) + return len; + + /* check if we are allowed to change */ + if (mddev->delta_disks) + return -EBUSY; + + if (mddev->persistent && + mddev->major_version == 0) + return -EINVAL; + + mddev->reshape_backwards = backwards; + return len; +} + +static struct md_sysfs_entry md_reshape_direction = +__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, + reshape_direction_store); + +static ssize_t array_size_show(struct mddev *mddev, char *page) { if (mddev->external_size) @@ -4501,6 +4724,7 @@ static struct attribute *md_default_attrs[] = { &md_safe_delay.attr, &md_array_state.attr, &md_reshape_position.attr, + &md_reshape_direction.attr, &md_array_size.attr, &max_corr_read_errors.attr, NULL, @@ -4914,7 +5138,8 @@ int md_run(struct mddev *mddev) err = -EINVAL; mddev->pers->stop(mddev); } - if (err == 0 && mddev->pers->sync_request) { + if (err == 0 && mddev->pers->sync_request && + (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { err = bitmap_create(mddev); if (err) { printk(KERN_ERR "%s: failed to create bitmap (%d)\n", @@ -5064,6 +5289,7 @@ static void md_clean(struct mddev *mddev) mddev->events = 0; mddev->can_decrease_events = 0; mddev->delta_disks = 0; + mddev->reshape_backwards = 0; mddev->new_level = LEVEL_NONE; mddev->new_layout = 0; mddev->new_chunk_sectors = 0; @@ -5079,6 +5305,7 @@ static void md_clean(struct mddev *mddev) mddev->merge_check_needed = 0; mddev->bitmap_info.offset = 0; mddev->bitmap_info.default_offset = 0; + mddev->bitmap_info.default_space = 0; mddev->bitmap_info.chunksize = 0; mddev->bitmap_info.daemon_sleep = 0; mddev->bitmap_info.max_write_behind = 0; @@ -5421,7 +5648,7 @@ static int get_bitmap_file(struct mddev * mddev, void __user * arg) goto out; /* bitmap disabled, zero the first byte and copy out */ - if (!mddev->bitmap || !mddev->bitmap->file) { + if (!mddev->bitmap || !mddev->bitmap->storage.file) { file->pathname[0] = '\0'; goto copy_out; } @@ -5430,7 +5657,8 @@ static int get_bitmap_file(struct mddev * mddev, void __user * arg) if (!buf) goto out; - ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname)); + ptr = d_path(&mddev->bitmap->storage.file->f_path, + buf, sizeof(file->pathname)); if (IS_ERR(ptr)) goto out; @@ -5875,6 +6103,7 @@ static int set_array_info(struct mddev * mddev, mdu_array_info_t *info) set_bit(MD_CHANGE_DEVS, &mddev->flags); mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; + mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); mddev->bitmap_info.offset = 0; mddev->reshape_position = MaxSector; @@ -5888,6 +6117,7 @@ static int set_array_info(struct mddev * mddev, mdu_array_info_t *info) mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->new_layout = mddev->layout; mddev->delta_disks = 0; + mddev->reshape_backwards = 0; return 0; } @@ -5922,11 +6152,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) */ if (mddev->sync_thread) return -EBUSY; - if (mddev->bitmap) - /* Sorry, cannot grow a bitmap yet, just remove it, - * grow, and re-add. - */ - return -EBUSY; + rdev_for_each(rdev, mddev) { sector_t avail = rdev->sectors; @@ -5944,6 +6170,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) static int update_raid_disks(struct mddev *mddev, int raid_disks) { int rv; + struct md_rdev *rdev; /* change the number of raid disks */ if (mddev->pers->check_reshape == NULL) return -EINVAL; @@ -5952,11 +6179,27 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks) return -EINVAL; if (mddev->sync_thread || mddev->reshape_position != MaxSector) return -EBUSY; + + rdev_for_each(rdev, mddev) { + if (mddev->raid_disks < raid_disks && + rdev->data_offset < rdev->new_data_offset) + return -EINVAL; + if (mddev->raid_disks > raid_disks && + rdev->data_offset > rdev->new_data_offset) + return -EINVAL; + } + mddev->delta_disks = raid_disks - mddev->raid_disks; + if (mddev->delta_disks < 0) + mddev->reshape_backwards = 1; + else if (mddev->delta_disks > 0) + mddev->reshape_backwards = 0; rv = mddev->pers->check_reshape(mddev); - if (rv < 0) + if (rv < 0) { mddev->delta_disks = 0; + mddev->reshape_backwards = 0; + } return rv; } @@ -6039,6 +6282,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) return -EINVAL; mddev->bitmap_info.offset = mddev->bitmap_info.default_offset; + mddev->bitmap_info.space = + mddev->bitmap_info.default_space; mddev->pers->quiesce(mddev, 1); rv = bitmap_create(mddev); if (!rv) @@ -6050,7 +6295,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) /* remove the bitmap */ if (!mddev->bitmap) return -ENOENT; - if (mddev->bitmap->file) + if (mddev->bitmap->storage.file) return -EINVAL; mddev->pers->quiesce(mddev, 1); bitmap_destroy(mddev); @@ -6373,6 +6618,9 @@ static int md_open(struct block_device *bdev, fmode_t mode) struct mddev *mddev = mddev_find(bdev->bd_dev); int err; + if (!mddev) + return -ENODEV; + if (mddev->gendisk != bdev->bd_disk) { /* we are racing with mddev_put which is discarding this * bd_disk. @@ -6584,7 +6832,8 @@ static void status_resync(struct seq_file *seq, struct mddev * mddev) resync = mddev->curr_resync - atomic_read(&mddev->recovery_active); - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || + test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) max_sectors = mddev->resync_max_sectors; else max_sectors = mddev->dev_sectors; @@ -7147,7 +7396,7 @@ void md_do_sync(struct mddev *mddev) j = mddev->recovery_cp; } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) - max_sectors = mddev->dev_sectors; + max_sectors = mddev->resync_max_sectors; else { /* recovery follows the physical size of devices */ max_sectors = mddev->dev_sectors; @@ -7598,7 +7847,7 @@ void md_check_recovery(struct mddev *mddev) goto unlock; if (mddev->pers->sync_request) { - if (spares && mddev->bitmap && ! mddev->bitmap->file) { + if (spares) { /* We are adding a device or devices to an array * which has the bitmap stored on all devices. * So make sure all bitmap pages get written @@ -7646,6 +7895,20 @@ void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) } EXPORT_SYMBOL(md_wait_for_blocked_rdev); +void md_finish_reshape(struct mddev *mddev) +{ + /* called be personality module when reshape completes. */ + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) { + if (rdev->data_offset > rdev->new_data_offset) + rdev->sectors += rdev->data_offset - rdev->new_data_offset; + else + rdev->sectors -= rdev->new_data_offset - rdev->data_offset; + rdev->data_offset = rdev->new_data_offset; + } +} +EXPORT_SYMBOL(md_finish_reshape); /* Bad block management. * We can record which blocks on each device are 'bad' and so just @@ -7894,10 +8157,15 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, } int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, - int acknowledged) + int is_new) { - int rv = md_set_badblocks(&rdev->badblocks, - s + rdev->data_offset, sectors, acknowledged); + int rv; + if (is_new) + s += rdev->new_data_offset; + else + s += rdev->data_offset; + rv = md_set_badblocks(&rdev->badblocks, + s, sectors, 0); if (rv) { /* Make sure they get written out promptly */ sysfs_notify_dirent_safe(rdev->sysfs_state); @@ -8003,11 +8271,15 @@ out: return rv; } -int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors) +int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new) { + if (is_new) + s += rdev->new_data_offset; + else + s += rdev->data_offset; return md_clear_badblocks(&rdev->badblocks, - s + rdev->data_offset, - sectors); + s, sectors); } EXPORT_SYMBOL_GPL(rdev_clear_badblocks); diff --git a/drivers/md/md.h b/drivers/md/md.h index 1c2063c..7b4a3c3 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -55,6 +55,7 @@ struct md_rdev { int sb_loaded; __u64 sb_events; sector_t data_offset; /* start of data in array */ + sector_t new_data_offset;/* only relevant while reshaping */ sector_t sb_start; /* offset of the super block (in 512byte sectors) */ int sb_size; /* bytes in the superblock */ int preferred_minor; /* autorun support */ @@ -193,8 +194,9 @@ static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, return 0; } extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, - int acknowledged); -extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors); + int is_new); +extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new); extern void md_ack_all_badblocks(struct badblocks *bb); struct mddev { @@ -262,6 +264,7 @@ struct mddev { sector_t reshape_position; int delta_disks, new_level, new_layout; int new_chunk_sectors; + int reshape_backwards; atomic_t plug_cnt; /* If device is expecting * more bios soon. @@ -390,10 +393,13 @@ struct mddev { * For external metadata, offset * from start of device. */ + unsigned long space; /* space available at this offset */ loff_t default_offset; /* this is the offset to use when * hot-adding a bitmap. It should * eventually be settable by sysfs. */ + unsigned long default_space; /* space available at + * default offset */ struct mutex mutex; unsigned long chunksize; unsigned long daemon_sleep; /* how many jiffies between updates? */ @@ -591,6 +597,7 @@ extern void md_write_start(struct mddev *mddev, struct bio *bi); extern void md_write_end(struct mddev *mddev); extern void md_done_sync(struct mddev *mddev, int blocks, int ok); extern void md_error(struct mddev *mddev, struct md_rdev *rdev); +extern void md_finish_reshape(struct mddev *mddev); extern int mddev_congested(struct mddev *mddev, int bits); extern void md_flush_request(struct mddev *mddev, struct bio *bio); @@ -615,6 +622,7 @@ extern int md_run(struct mddev *mddev); extern void md_stop(struct mddev *mddev); extern void md_stop_writes(struct mddev *mddev); extern int md_rdev_init(struct md_rdev *rdev); +extern void md_rdev_clear(struct md_rdev *rdev); extern void mddev_suspend(struct mddev *mddev); extern void mddev_resume(struct mddev *mddev); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 15dd59b..835de71 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1859,7 +1859,9 @@ static void fix_read_error(struct r1conf *conf, int read_disk, rdev = conf->mirrors[d].rdev; if (rdev && - test_bit(In_sync, &rdev->flags) && + (test_bit(In_sync, &rdev->flags) || + (!test_bit(Faulty, &rdev->flags) && + rdev->recovery_offset >= sect + s)) && is_badblock(rdev, sect, s, &first_bad, &bad_sectors) == 0 && sync_page_io(rdev, sect, s<<9, @@ -2024,7 +2026,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio continue; if (test_bit(BIO_UPTODATE, &bio->bi_flags) && test_bit(R1BIO_MadeGood, &r1_bio->state)) { - rdev_clear_badblocks(rdev, r1_bio->sector, s); + rdev_clear_badblocks(rdev, r1_bio->sector, s, 0); } if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && test_bit(R1BIO_WriteError, &r1_bio->state)) { @@ -2044,7 +2046,7 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) struct md_rdev *rdev = conf->mirrors[m].rdev; rdev_clear_badblocks(rdev, r1_bio->sector, - r1_bio->sectors); + r1_bio->sectors, 0); rdev_dec_pending(rdev, conf->mddev); } else if (r1_bio->bios[m] != NULL) { /* This drive got a write error. We need to @@ -2598,7 +2600,8 @@ static struct r1conf *setup_conf(struct mddev *mddev) if (!disk->rdev || !test_bit(In_sync, &disk->rdev->flags)) { disk->head_position = 0; - if (disk->rdev) + if (disk->rdev && + (disk->rdev->saved_raid_disk < 0)) conf->fullsync = 1; } else if (conf->last_used < 0) /* @@ -2750,9 +2753,16 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors) * any io in the removed space completes, but it hardly seems * worth it. */ - md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0)); - if (mddev->array_sectors > raid1_size(mddev, sectors, 0)) + sector_t newsize = raid1_size(mddev, sectors, 0); + if (mddev->external_size && + mddev->array_sectors > newsize) return -EINVAL; + if (mddev->bitmap) { + int ret = bitmap_resize(mddev->bitmap, newsize, 0, 0); + if (ret) + return ret; + } + md_set_array_sectors(mddev, newsize); set_capacity(mddev->gendisk, mddev->array_sectors); revalidate_disk(mddev->gendisk); if (sectors > mddev->dev_sectors && diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 3f91c2e..987db37 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -24,6 +24,7 @@ #include <linux/module.h> #include <linux/seq_file.h> #include <linux/ratelimit.h> +#include <linux/kthread.h> #include "md.h" #include "raid10.h" #include "raid0.h" @@ -68,6 +69,11 @@ static int max_queued_requests = 1024; static void allow_barrier(struct r10conf *conf); static void lower_barrier(struct r10conf *conf); static int enough(struct r10conf *conf, int ignore); +static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, + int *skipped); +static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio); +static void end_reshape_write(struct bio *bio, int error); +static void end_reshape(struct r10conf *conf); static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) { @@ -112,7 +118,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) if (!r10_bio) return NULL; - if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) + if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) || + test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery)) nalloc = conf->copies; /* resync */ else nalloc = 2; /* recovery */ @@ -140,9 +147,10 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) struct bio *rbio = r10_bio->devs[j].repl_bio; bio = r10_bio->devs[j].bio; for (i = 0; i < RESYNC_PAGES; i++) { - if (j == 1 && !test_bit(MD_RECOVERY_SYNC, - &conf->mddev->recovery)) { - /* we can share bv_page's during recovery */ + if (j > 0 && !test_bit(MD_RECOVERY_SYNC, + &conf->mddev->recovery)) { + /* we can share bv_page's during recovery + * and reshape */ struct bio *rbio = r10_bio->devs[0].bio; page = rbio->bi_io_vec[i].bv_page; get_page(page); @@ -165,10 +173,11 @@ out_free_pages: while (j--) for (i = 0; i < RESYNC_PAGES ; i++) safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); - j = -1; + j = 0; out_free_bio: - while (++j < nalloc) { - bio_put(r10_bio->devs[j].bio); + for ( ; j < nalloc; j++) { + if (r10_bio->devs[j].bio) + bio_put(r10_bio->devs[j].bio); if (r10_bio->devs[j].repl_bio) bio_put(r10_bio->devs[j].repl_bio); } @@ -504,79 +513,96 @@ static void raid10_end_write_request(struct bio *bio, int error) * sector offset to a virtual address */ -static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio) +static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio) { int n,f; sector_t sector; sector_t chunk; sector_t stripe; int dev; - int slot = 0; /* now calculate first sector/dev */ - chunk = r10bio->sector >> conf->chunk_shift; - sector = r10bio->sector & conf->chunk_mask; + chunk = r10bio->sector >> geo->chunk_shift; + sector = r10bio->sector & geo->chunk_mask; - chunk *= conf->near_copies; + chunk *= geo->near_copies; stripe = chunk; - dev = sector_div(stripe, conf->raid_disks); - if (conf->far_offset) - stripe *= conf->far_copies; + dev = sector_div(stripe, geo->raid_disks); + if (geo->far_offset) + stripe *= geo->far_copies; - sector += stripe << conf->chunk_shift; + sector += stripe << geo->chunk_shift; /* and calculate all the others */ - for (n=0; n < conf->near_copies; n++) { + for (n = 0; n < geo->near_copies; n++) { int d = dev; sector_t s = sector; r10bio->devs[slot].addr = sector; r10bio->devs[slot].devnum = d; slot++; - for (f = 1; f < conf->far_copies; f++) { - d += conf->near_copies; - if (d >= conf->raid_disks) - d -= conf->raid_disks; - s += conf->stride; + for (f = 1; f < geo->far_copies; f++) { + d += geo->near_copies; + if (d >= geo->raid_disks) + d -= geo->raid_disks; + s += geo->stride; r10bio->devs[slot].devnum = d; r10bio->devs[slot].addr = s; slot++; } dev++; - if (dev >= conf->raid_disks) { + if (dev >= geo->raid_disks) { dev = 0; - sector += (conf->chunk_mask + 1); + sector += (geo->chunk_mask + 1); } } - BUG_ON(slot != conf->copies); +} + +static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio) +{ + struct geom *geo = &conf->geo; + + if (conf->reshape_progress != MaxSector && + ((r10bio->sector >= conf->reshape_progress) != + conf->mddev->reshape_backwards)) { + set_bit(R10BIO_Previous, &r10bio->state); + geo = &conf->prev; + } else + clear_bit(R10BIO_Previous, &r10bio->state); + + __raid10_find_phys(geo, r10bio); } static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) { sector_t offset, chunk, vchunk; + /* Never use conf->prev as this is only called during resync + * or recovery, so reshape isn't happening + */ + struct geom *geo = &conf->geo; - offset = sector & conf->chunk_mask; - if (conf->far_offset) { + offset = sector & geo->chunk_mask; + if (geo->far_offset) { int fc; - chunk = sector >> conf->chunk_shift; - fc = sector_div(chunk, conf->far_copies); - dev -= fc * conf->near_copies; + chunk = sector >> geo->chunk_shift; + fc = sector_div(chunk, geo->far_copies); + dev -= fc * geo->near_copies; if (dev < 0) - dev += conf->raid_disks; + dev += geo->raid_disks; } else { - while (sector >= conf->stride) { - sector -= conf->stride; - if (dev < conf->near_copies) - dev += conf->raid_disks - conf->near_copies; + while (sector >= geo->stride) { + sector -= geo->stride; + if (dev < geo->near_copies) + dev += geo->raid_disks - geo->near_copies; else - dev -= conf->near_copies; + dev -= geo->near_copies; } - chunk = sector >> conf->chunk_shift; + chunk = sector >> geo->chunk_shift; } - vchunk = chunk * conf->raid_disks + dev; - sector_div(vchunk, conf->near_copies); - return (vchunk << conf->chunk_shift) + offset; + vchunk = chunk * geo->raid_disks + dev; + sector_div(vchunk, geo->near_copies); + return (vchunk << geo->chunk_shift) + offset; } /** @@ -597,10 +623,17 @@ static int raid10_mergeable_bvec(struct request_queue *q, struct r10conf *conf = mddev->private; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; - unsigned int chunk_sectors = mddev->chunk_sectors; + unsigned int chunk_sectors; unsigned int bio_sectors = bvm->bi_size >> 9; + struct geom *geo = &conf->geo; + + chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1; + if (conf->reshape_progress != MaxSector && + ((sector >= conf->reshape_progress) != + conf->mddev->reshape_backwards)) + geo = &conf->prev; - if (conf->near_copies < conf->raid_disks) { + if (geo->near_copies < geo->raid_disks) { max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; if (max < 0) @@ -614,6 +647,12 @@ static int raid10_mergeable_bvec(struct request_queue *q, if (mddev->merge_check_needed) { struct r10bio r10_bio; int s; + if (conf->reshape_progress != MaxSector) { + /* Cannot give any guidance during reshape */ + if (max <= biovec->bv_len && bio_sectors == 0) + return biovec->bv_len; + return 0; + } r10_bio.sector = sector; raid10_find_phys(conf, &r10_bio); rcu_read_lock(); @@ -681,6 +720,7 @@ static struct md_rdev *read_balance(struct r10conf *conf, struct md_rdev *rdev, *best_rdev; int do_balance; int best_slot; + struct geom *geo = &conf->geo; raid10_find_phys(conf, r10_bio); rcu_read_lock(); @@ -761,11 +801,11 @@ retry: * sequential read speed for 'far copies' arrays. So only * keep it for 'near' arrays, and review those later. */ - if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) + if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending)) break; /* for far > 1 always use the lowest address */ - if (conf->far_copies > 1) + if (geo->far_copies > 1) new_distance = r10_bio->devs[slot].addr; else new_distance = abs(r10_bio->devs[slot].addr - @@ -812,7 +852,10 @@ static int raid10_congested(void *data, int bits) if (mddev_congested(mddev, bits)) return 1; rcu_read_lock(); - for (i = 0; i < conf->raid_disks && ret == 0; i++) { + for (i = 0; + (i < conf->geo.raid_disks || i < conf->prev.raid_disks) + && ret == 0; + i++) { struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags)) { struct request_queue *q = bdev_get_queue(rdev->bdev); @@ -973,13 +1016,24 @@ static void unfreeze_array(struct r10conf *conf) spin_unlock_irq(&conf->resync_lock); } +static sector_t choose_data_offset(struct r10bio *r10_bio, + struct md_rdev *rdev) +{ + if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) || + test_bit(R10BIO_Previous, &r10_bio->state)) + return rdev->data_offset; + else + return rdev->new_data_offset; +} + static void make_request(struct mddev *mddev, struct bio * bio) { struct r10conf *conf = mddev->private; struct r10bio *r10_bio; struct bio *read_bio; int i; - int chunk_sects = conf->chunk_mask + 1; + sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask); + int chunk_sects = chunk_mask + 1; const int rw = bio_data_dir(bio); const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_fua = (bio->bi_rw & REQ_FUA); @@ -988,6 +1042,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) int plugged; int sectors_handled; int max_sectors; + int sectors; if (unlikely(bio->bi_rw & REQ_FLUSH)) { md_flush_request(mddev, bio); @@ -997,9 +1052,10 @@ static void make_request(struct mddev *mddev, struct bio * bio) /* If this request crosses a chunk boundary, we need to * split it. This will only happen for 1 PAGE (or less) requests. */ - if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9) - > chunk_sects && - conf->near_copies < conf->raid_disks)) { + if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9) + > chunk_sects + && (conf->geo.near_copies < conf->geo.raid_disks + || conf->prev.near_copies < conf->prev.raid_disks))) { struct bio_pair *bp; /* Sanity check -- queue functions should prevent this happening */ if (bio->bi_vcnt != 1 || @@ -1051,10 +1107,41 @@ static void make_request(struct mddev *mddev, struct bio * bio) */ wait_barrier(conf); + sectors = bio->bi_size >> 9; + while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + bio->bi_sector < conf->reshape_progress && + bio->bi_sector + sectors > conf->reshape_progress) { + /* IO spans the reshape position. Need to wait for + * reshape to pass + */ + allow_barrier(conf); + wait_event(conf->wait_barrier, + conf->reshape_progress <= bio->bi_sector || + conf->reshape_progress >= bio->bi_sector + sectors); + wait_barrier(conf); + } + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + bio_data_dir(bio) == WRITE && + (mddev->reshape_backwards + ? (bio->bi_sector < conf->reshape_safe && + bio->bi_sector + sectors > conf->reshape_progress) + : (bio->bi_sector + sectors > conf->reshape_safe && + bio->bi_sector < conf->reshape_progress))) { + /* Need to update reshape_position in metadata */ + mddev->reshape_position = conf->reshape_progress; + set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); + md_wakeup_thread(mddev->thread); + wait_event(mddev->sb_wait, + !test_bit(MD_CHANGE_PENDING, &mddev->flags)); + + conf->reshape_safe = mddev->reshape_position; + } + r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); r10_bio->master_bio = bio; - r10_bio->sectors = bio->bi_size >> 9; + r10_bio->sectors = sectors; r10_bio->mddev = mddev; r10_bio->sector = bio->bi_sector; @@ -1093,7 +1180,7 @@ read_again: r10_bio->devs[slot].rdev = rdev; read_bio->bi_sector = r10_bio->devs[slot].addr + - rdev->data_offset; + choose_data_offset(r10_bio, rdev); read_bio->bi_bdev = rdev->bdev; read_bio->bi_end_io = raid10_end_read_request; read_bio->bi_rw = READ | do_sync; @@ -1297,7 +1384,8 @@ retry_write: r10_bio->devs[i].bio = mbio; mbio->bi_sector = (r10_bio->devs[i].addr+ - conf->mirrors[d].rdev->data_offset); + choose_data_offset(r10_bio, + conf->mirrors[d].rdev)); mbio->bi_bdev = conf->mirrors[d].rdev->bdev; mbio->bi_end_io = raid10_end_write_request; mbio->bi_rw = WRITE | do_sync | do_fua; @@ -1321,8 +1409,10 @@ retry_write: * so it cannot disappear, so the replacement cannot * become NULL here */ - mbio->bi_sector = (r10_bio->devs[i].addr+ - conf->mirrors[d].replacement->data_offset); + mbio->bi_sector = (r10_bio->devs[i].addr + + choose_data_offset( + r10_bio, + conf->mirrors[d].replacement)); mbio->bi_bdev = conf->mirrors[d].replacement->bdev; mbio->bi_end_io = raid10_end_write_request; mbio->bi_rw = WRITE | do_sync | do_fua; @@ -1368,19 +1458,19 @@ static void status(struct seq_file *seq, struct mddev *mddev) struct r10conf *conf = mddev->private; int i; - if (conf->near_copies < conf->raid_disks) + if (conf->geo.near_copies < conf->geo.raid_disks) seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); - if (conf->near_copies > 1) - seq_printf(seq, " %d near-copies", conf->near_copies); - if (conf->far_copies > 1) { - if (conf->far_offset) - seq_printf(seq, " %d offset-copies", conf->far_copies); + if (conf->geo.near_copies > 1) + seq_printf(seq, " %d near-copies", conf->geo.near_copies); + if (conf->geo.far_copies > 1) { + if (conf->geo.far_offset) + seq_printf(seq, " %d offset-copies", conf->geo.far_copies); else - seq_printf(seq, " %d far-copies", conf->far_copies); + seq_printf(seq, " %d far-copies", conf->geo.far_copies); } - seq_printf(seq, " [%d/%d] [", conf->raid_disks, - conf->raid_disks - mddev->degraded); - for (i = 0; i < conf->raid_disks; i++) + seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks, + conf->geo.raid_disks - mddev->degraded); + for (i = 0; i < conf->geo.raid_disks; i++) seq_printf(seq, "%s", conf->mirrors[i].rdev && test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); @@ -1392,7 +1482,7 @@ static void status(struct seq_file *seq, struct mddev *mddev) * Don't consider the device numbered 'ignore' * as we might be about to remove it. */ -static int enough(struct r10conf *conf, int ignore) +static int _enough(struct r10conf *conf, struct geom *geo, int ignore) { int first = 0; @@ -1403,7 +1493,7 @@ static int enough(struct r10conf *conf, int ignore) if (conf->mirrors[first].rdev && first != ignore) cnt++; - first = (first+1) % conf->raid_disks; + first = (first+1) % geo->raid_disks; } if (cnt == 0) return 0; @@ -1411,6 +1501,12 @@ static int enough(struct r10conf *conf, int ignore) return 1; } +static int enough(struct r10conf *conf, int ignore) +{ + return _enough(conf, &conf->geo, ignore) && + _enough(conf, &conf->prev, ignore); +} + static void error(struct mddev *mddev, struct md_rdev *rdev) { char b[BDEVNAME_SIZE]; @@ -1445,7 +1541,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) "md/raid10:%s: Disk failure on %s, disabling device.\n" "md/raid10:%s: Operation continuing on %d devices.\n", mdname(mddev), bdevname(rdev->bdev, b), - mdname(mddev), conf->raid_disks - mddev->degraded); + mdname(mddev), conf->geo.raid_disks - mddev->degraded); } static void print_conf(struct r10conf *conf) @@ -1458,10 +1554,10 @@ static void print_conf(struct r10conf *conf) printk(KERN_DEBUG "(!conf)\n"); return; } - printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, - conf->raid_disks); + printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded, + conf->geo.raid_disks); - for (i = 0; i < conf->raid_disks; i++) { + for (i = 0; i < conf->geo.raid_disks; i++) { char b[BDEVNAME_SIZE]; tmp = conf->mirrors + i; if (tmp->rdev) @@ -1493,7 +1589,7 @@ static int raid10_spare_active(struct mddev *mddev) * Find all non-in_sync disks within the RAID10 configuration * and mark them in_sync */ - for (i = 0; i < conf->raid_disks; i++) { + for (i = 0; i < conf->geo.raid_disks; i++) { tmp = conf->mirrors + i; if (tmp->replacement && tmp->replacement->recovery_offset == MaxSector @@ -1535,7 +1631,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) int err = -EEXIST; int mirror; int first = 0; - int last = conf->raid_disks - 1; + int last = conf->geo.raid_disks - 1; struct request_queue *q = bdev_get_queue(rdev->bdev); if (mddev->recovery_cp < MaxSector) @@ -1543,7 +1639,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) * very different from resync */ return -EBUSY; - if (rdev->saved_raid_disk < 0 && !enough(conf, -1)) + if (rdev->saved_raid_disk < 0 && !_enough(conf, &conf->prev, -1)) return -EINVAL; if (rdev->raid_disk >= 0) @@ -1635,6 +1731,7 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) if (!test_bit(Faulty, &rdev->flags) && mddev->recovery_disabled != p->recovery_disabled && (!p->replacement || p->replacement == rdev) && + number < conf->geo.raid_disks && enough(conf, -1)) { err = -EBUSY; goto abort; @@ -1676,7 +1773,11 @@ static void end_sync_read(struct bio *bio, int error) struct r10conf *conf = r10_bio->mddev->private; int d; - d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); + if (bio == r10_bio->master_bio) { + /* this is a reshape read */ + d = r10_bio->read_slot; /* really the read dev */ + } else + d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); if (test_bit(BIO_UPTODATE, &bio->bi_flags)) set_bit(R10BIO_Uptodate, &r10_bio->state); @@ -2218,7 +2319,9 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 " (%d sectors at %llu on %s)\n", mdname(mddev), s, (unsigned long long)( - sect + rdev->data_offset), + sect + + choose_data_offset(r10_bio, + rdev)), bdevname(rdev->bdev, b)); printk(KERN_NOTICE "md/raid10:%s: %s: failing " "drive\n", @@ -2256,7 +2359,8 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 " (%d sectors at %llu on %s)\n", mdname(mddev), s, (unsigned long long)( - sect + rdev->data_offset), + sect + + choose_data_offset(r10_bio, rdev)), bdevname(rdev->bdev, b)); printk(KERN_NOTICE "md/raid10:%s: %s: failing " "drive\n", @@ -2269,7 +2373,8 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 " (%d sectors at %llu on %s)\n", mdname(mddev), s, (unsigned long long)( - sect + rdev->data_offset), + sect + + choose_data_offset(r10_bio, rdev)), bdevname(rdev->bdev, b)); atomic_add(s, &rdev->corrected_errors); } @@ -2343,7 +2448,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i) wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); md_trim_bio(wbio, sector - bio->bi_sector, sectors); wbio->bi_sector = (r10_bio->devs[i].addr+ - rdev->data_offset+ + choose_data_offset(r10_bio, rdev) + (sector - r10_bio->sector)); wbio->bi_bdev = rdev->bdev; if (submit_bio_wait(WRITE, wbio) == 0) @@ -2420,7 +2525,7 @@ read_more: r10_bio->devs[slot].bio = bio; r10_bio->devs[slot].rdev = rdev; bio->bi_sector = r10_bio->devs[slot].addr - + rdev->data_offset; + + choose_data_offset(r10_bio, rdev); bio->bi_bdev = rdev->bdev; bio->bi_rw = READ | do_sync; bio->bi_private = r10_bio; @@ -2480,7 +2585,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) rdev_clear_badblocks( rdev, r10_bio->devs[m].addr, - r10_bio->sectors); + r10_bio->sectors, 0); } else { if (!rdev_set_badblocks( rdev, @@ -2496,7 +2601,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) rdev_clear_badblocks( rdev, r10_bio->devs[m].addr, - r10_bio->sectors); + r10_bio->sectors, 0); } else { if (!rdev_set_badblocks( rdev, @@ -2515,7 +2620,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) rdev_clear_badblocks( rdev, r10_bio->devs[m].addr, - r10_bio->sectors); + r10_bio->sectors, 0); rdev_dec_pending(rdev, conf->mddev); } else if (bio != NULL && !test_bit(BIO_UPTODATE, &bio->bi_flags)) { @@ -2532,7 +2637,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) rdev_clear_badblocks( rdev, r10_bio->devs[m].addr, - r10_bio->sectors); + r10_bio->sectors, 0); rdev_dec_pending(rdev, conf->mddev); } } @@ -2573,6 +2678,8 @@ static void raid10d(struct mddev *mddev) if (test_bit(R10BIO_MadeGood, &r10_bio->state) || test_bit(R10BIO_WriteError, &r10_bio->state)) handle_write_completed(conf, r10_bio); + else if (test_bit(R10BIO_IsReshape, &r10_bio->state)) + reshape_request_write(mddev, r10_bio); else if (test_bit(R10BIO_IsSync, &r10_bio->state)) sync_request_write(mddev, r10_bio); else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) @@ -2603,7 +2710,7 @@ static int init_resync(struct r10conf *conf) buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; BUG_ON(conf->r10buf_pool); conf->have_replacement = 0; - for (i = 0; i < conf->raid_disks; i++) + for (i = 0; i < conf->geo.raid_disks; i++) if (conf->mirrors[i].replacement) conf->have_replacement = 1; conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); @@ -2657,6 +2764,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, sector_t sync_blocks; sector_t sectors_skipped = 0; int chunks_skipped = 0; + sector_t chunk_mask = conf->geo.chunk_mask; if (!conf->r10buf_pool) if (init_resync(conf)) @@ -2664,7 +2772,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, skipped: max_sector = mddev->dev_sectors; - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || + test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) max_sector = mddev->resync_max_sectors; if (sector_nr >= max_sector) { /* If we aborted, we need to abort the @@ -2676,11 +2785,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, * we need to convert that to several * virtual addresses. */ + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { + end_reshape(conf); + return 0; + } + if (mddev->curr_resync < max_sector) { /* aborted */ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) bitmap_end_sync(mddev->bitmap, mddev->curr_resync, &sync_blocks, 1); - else for (i=0; i<conf->raid_disks; i++) { + else for (i = 0; i < conf->geo.raid_disks; i++) { sector_t sect = raid10_find_virt(conf, mddev->curr_resync, i); bitmap_end_sync(mddev->bitmap, sect, @@ -2694,7 +2808,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, /* Completed a full sync so the replacements * are now fully recovered. */ - for (i = 0; i < conf->raid_disks; i++) + for (i = 0; i < conf->geo.raid_disks; i++) if (conf->mirrors[i].replacement) conf->mirrors[i].replacement ->recovery_offset @@ -2707,7 +2821,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, *skipped = 1; return sectors_skipped; } - if (chunks_skipped >= conf->raid_disks) { + + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + return reshape_request(mddev, sector_nr, skipped); + + if (chunks_skipped >= conf->geo.raid_disks) { /* if there has been nothing to do on any drive, * then there is nothing to do at all.. */ @@ -2721,9 +2839,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, /* make sure whole request will fit in a chunk - if chunks * are meaningful */ - if (conf->near_copies < conf->raid_disks && - max_sector > (sector_nr | conf->chunk_mask)) - max_sector = (sector_nr | conf->chunk_mask) + 1; + if (conf->geo.near_copies < conf->geo.raid_disks && + max_sector > (sector_nr | chunk_mask)) + max_sector = (sector_nr | chunk_mask) + 1; /* * If there is non-resync activity waiting for us then * put in a delay to throttle resync. @@ -2752,7 +2870,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int j; r10_bio = NULL; - for (i=0 ; i<conf->raid_disks; i++) { + for (i = 0 ; i < conf->geo.raid_disks; i++) { int still_degraded; struct r10bio *rb2; sector_t sect; @@ -2806,7 +2924,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, /* Need to check if the array will still be * degraded */ - for (j=0; j<conf->raid_disks; j++) + for (j = 0; j < conf->geo.raid_disks; j++) if (conf->mirrors[j].rdev == NULL || test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { still_degraded = 1; @@ -2984,9 +3102,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, r10_bio->sector = sector_nr; set_bit(R10BIO_IsSync, &r10_bio->state); raid10_find_phys(conf, r10_bio); - r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1; + r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1; - for (i=0; i<conf->copies; i++) { + for (i = 0; i < conf->copies; i++) { int d = r10_bio->devs[i].devnum; sector_t first_bad, sector; int bad_sectors; @@ -3152,16 +3270,17 @@ raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks) struct r10conf *conf = mddev->private; if (!raid_disks) - raid_disks = conf->raid_disks; + raid_disks = min(conf->geo.raid_disks, + conf->prev.raid_disks); if (!sectors) sectors = conf->dev_sectors; - size = sectors >> conf->chunk_shift; - sector_div(size, conf->far_copies); + size = sectors >> conf->geo.chunk_shift; + sector_div(size, conf->geo.far_copies); size = size * raid_disks; - sector_div(size, conf->near_copies); + sector_div(size, conf->geo.near_copies); - return size << conf->chunk_shift; + return size << conf->geo.chunk_shift; } static void calc_sectors(struct r10conf *conf, sector_t size) @@ -3171,10 +3290,10 @@ static void calc_sectors(struct r10conf *conf, sector_t size) * conf->stride */ - size = size >> conf->chunk_shift; - sector_div(size, conf->far_copies); - size = size * conf->raid_disks; - sector_div(size, conf->near_copies); + size = size >> conf->geo.chunk_shift; + sector_div(size, conf->geo.far_copies); + size = size * conf->geo.raid_disks; + sector_div(size, conf->geo.near_copies); /* 'size' is now the number of chunks in the array */ /* calculate "used chunks per device" */ size = size * conf->copies; @@ -3182,38 +3301,76 @@ static void calc_sectors(struct r10conf *conf, sector_t size) /* We need to round up when dividing by raid_disks to * get the stride size. */ - size = DIV_ROUND_UP_SECTOR_T(size, conf->raid_disks); + size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks); - conf->dev_sectors = size << conf->chunk_shift; + conf->dev_sectors = size << conf->geo.chunk_shift; - if (conf->far_offset) - conf->stride = 1 << conf->chunk_shift; + if (conf->geo.far_offset) + conf->geo.stride = 1 << conf->geo.chunk_shift; else { - sector_div(size, conf->far_copies); - conf->stride = size << conf->chunk_shift; + sector_div(size, conf->geo.far_copies); + conf->geo.stride = size << conf->geo.chunk_shift; } } +enum geo_type {geo_new, geo_old, geo_start}; +static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new) +{ + int nc, fc, fo; + int layout, chunk, disks; + switch (new) { + case geo_old: + layout = mddev->layout; + chunk = mddev->chunk_sectors; + disks = mddev->raid_disks - mddev->delta_disks; + break; + case geo_new: + layout = mddev->new_layout; + chunk = mddev->new_chunk_sectors; + disks = mddev->raid_disks; + break; + default: /* avoid 'may be unused' warnings */ + case geo_start: /* new when starting reshape - raid_disks not + * updated yet. */ + layout = mddev->new_layout; + chunk = mddev->new_chunk_sectors; + disks = mddev->raid_disks + mddev->delta_disks; + break; + } + if (layout >> 17) + return -1; + if (chunk < (PAGE_SIZE >> 9) || + !is_power_of_2(chunk)) + return -2; + nc = layout & 255; + fc = (layout >> 8) & 255; + fo = layout & (1<<16); + geo->raid_disks = disks; + geo->near_copies = nc; + geo->far_copies = fc; + geo->far_offset = fo; + geo->chunk_mask = chunk - 1; + geo->chunk_shift = ffz(~chunk); + return nc*fc; +} + static struct r10conf *setup_conf(struct mddev *mddev) { struct r10conf *conf = NULL; - int nc, fc, fo; int err = -EINVAL; + struct geom geo; + int copies; + + copies = setup_geo(&geo, mddev, geo_new); - if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) || - !is_power_of_2(mddev->new_chunk_sectors)) { + if (copies == -2) { printk(KERN_ERR "md/raid10:%s: chunk size must be " "at least PAGE_SIZE(%ld) and be a power of 2.\n", mdname(mddev), PAGE_SIZE); goto out; } - nc = mddev->new_layout & 255; - fc = (mddev->new_layout >> 8) & 255; - fo = mddev->new_layout & (1<<16); - - if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || - (mddev->new_layout >> 17)) { + if (copies < 2 || copies > mddev->raid_disks) { printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", mdname(mddev), mddev->new_layout); goto out; @@ -3224,7 +3381,9 @@ static struct r10conf *setup_conf(struct mddev *mddev) if (!conf) goto out; - conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, + /* FIXME calc properly */ + conf->mirrors = kzalloc(sizeof(struct mirror_info)*(mddev->raid_disks + + max(0,mddev->delta_disks)), GFP_KERNEL); if (!conf->mirrors) goto out; @@ -3233,22 +3392,29 @@ static struct r10conf *setup_conf(struct mddev *mddev) if (!conf->tmppage) goto out; - - conf->raid_disks = mddev->raid_disks; - conf->near_copies = nc; - conf->far_copies = fc; - conf->copies = nc*fc; - conf->far_offset = fo; - conf->chunk_mask = mddev->new_chunk_sectors - 1; - conf->chunk_shift = ffz(~mddev->new_chunk_sectors); - + conf->geo = geo; + conf->copies = copies; conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, r10bio_pool_free, conf); if (!conf->r10bio_pool) goto out; calc_sectors(conf, mddev->dev_sectors); - + if (mddev->reshape_position == MaxSector) { + conf->prev = conf->geo; + conf->reshape_progress = MaxSector; + } else { + if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) { + err = -EINVAL; + goto out; + } + conf->reshape_progress = mddev->reshape_position; + if (conf->prev.far_offset) + conf->prev.stride = 1 << conf->prev.chunk_shift; + else + /* far_copies must be 1 */ + conf->prev.stride = conf->dev_sectors; + } spin_lock_init(&conf->device_lock); INIT_LIST_HEAD(&conf->retry_list); @@ -3263,8 +3429,9 @@ static struct r10conf *setup_conf(struct mddev *mddev) return conf; out: - printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", - mdname(mddev)); + if (err == -ENOMEM) + printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", + mdname(mddev)); if (conf) { if (conf->r10bio_pool) mempool_destroy(conf->r10bio_pool); @@ -3282,12 +3449,8 @@ static int run(struct mddev *mddev) struct mirror_info *disk; struct md_rdev *rdev; sector_t size; - - /* - * copy the already verified devices into our private RAID10 - * bookkeeping area. [whatever we allocate in run(), - * should be freed in stop()] - */ + sector_t min_offset_diff = 0; + int first = 1; if (mddev->private == NULL) { conf = setup_conf(mddev); @@ -3304,17 +3467,20 @@ static int run(struct mddev *mddev) chunk_size = mddev->chunk_sectors << 9; blk_queue_io_min(mddev->queue, chunk_size); - if (conf->raid_disks % conf->near_copies) - blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks); + if (conf->geo.raid_disks % conf->geo.near_copies) + blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); else blk_queue_io_opt(mddev->queue, chunk_size * - (conf->raid_disks / conf->near_copies)); + (conf->geo.raid_disks / conf->geo.near_copies)); rdev_for_each(rdev, mddev) { + long long diff; disk_idx = rdev->raid_disk; - if (disk_idx >= conf->raid_disks - || disk_idx < 0) + if (disk_idx < 0) + continue; + if (disk_idx >= conf->geo.raid_disks && + disk_idx >= conf->prev.raid_disks) continue; disk = conf->mirrors + disk_idx; @@ -3327,12 +3493,20 @@ static int run(struct mddev *mddev) goto out_free_conf; disk->rdev = rdev; } + diff = (rdev->new_data_offset - rdev->data_offset); + if (!mddev->reshape_backwards) + diff = -diff; + if (diff < 0) + diff = 0; + if (first || diff < min_offset_diff) + min_offset_diff = diff; disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); disk->head_position = 0; } + /* need to check that every block has at least one working mirror */ if (!enough(conf, -1)) { printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", @@ -3340,8 +3514,21 @@ static int run(struct mddev *mddev) goto out_free_conf; } + if (conf->reshape_progress != MaxSector) { + /* must ensure that shape change is supported */ + if (conf->geo.far_copies != 1 && + conf->geo.far_offset == 0) + goto out_free_conf; + if (conf->prev.far_copies != 1 && + conf->geo.far_offset == 0) + goto out_free_conf; + } + mddev->degraded = 0; - for (i = 0; i < conf->raid_disks; i++) { + for (i = 0; + i < conf->geo.raid_disks + || i < conf->prev.raid_disks; + i++) { disk = conf->mirrors + i; @@ -3368,8 +3555,8 @@ static int run(struct mddev *mddev) mdname(mddev)); printk(KERN_INFO "md/raid10:%s: active with %d out of %d devices\n", - mdname(mddev), conf->raid_disks - mddev->degraded, - conf->raid_disks); + mdname(mddev), conf->geo.raid_disks - mddev->degraded, + conf->geo.raid_disks); /* * Ok, everything is just fine now */ @@ -3386,11 +3573,11 @@ static int run(struct mddev *mddev) * maybe... */ { - int stripe = conf->raid_disks * + int stripe = conf->geo.raid_disks * ((mddev->chunk_sectors << 9) / PAGE_SIZE); - stripe /= conf->near_copies; - if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) - mddev->queue->backing_dev_info.ra_pages = 2* stripe; + stripe /= conf->geo.near_copies; + if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) + mddev->queue->backing_dev_info.ra_pages = 2 * stripe; } blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); @@ -3398,6 +3585,30 @@ static int run(struct mddev *mddev) if (md_integrity_register(mddev)) goto out_free_conf; + if (conf->reshape_progress != MaxSector) { + unsigned long before_length, after_length; + + before_length = ((1 << conf->prev.chunk_shift) * + conf->prev.far_copies); + after_length = ((1 << conf->geo.chunk_shift) * + conf->geo.far_copies); + + if (max(before_length, after_length) > min_offset_diff) { + /* This cannot work */ + printk("md/raid10: offset difference not enough to continue reshape\n"); + goto out_free_conf; + } + conf->offset_diff = min_offset_diff; + + conf->reshape_safe = conf->reshape_progress; + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + mddev->sync_thread = md_register_thread(md_do_sync, mddev, + "reshape"); + } + return 0; out_free_conf: @@ -3460,14 +3671,23 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) struct r10conf *conf = mddev->private; sector_t oldsize, size; - if (conf->far_copies > 1 && !conf->far_offset) + if (mddev->reshape_position != MaxSector) + return -EBUSY; + + if (conf->geo.far_copies > 1 && !conf->geo.far_offset) return -EINVAL; oldsize = raid10_size(mddev, 0, 0); size = raid10_size(mddev, sectors, 0); - md_set_array_sectors(mddev, size); - if (mddev->array_sectors > size) + if (mddev->external_size && + mddev->array_sectors > size) return -EINVAL; + if (mddev->bitmap) { + int ret = bitmap_resize(mddev->bitmap, size, 0, 0); + if (ret) + return ret; + } + md_set_array_sectors(mddev, size); set_capacity(mddev->gendisk, mddev->array_sectors); revalidate_disk(mddev->gendisk); if (sectors > mddev->dev_sectors && @@ -3534,6 +3754,758 @@ static void *raid10_takeover(struct mddev *mddev) return ERR_PTR(-EINVAL); } +static int raid10_check_reshape(struct mddev *mddev) +{ + /* Called when there is a request to change + * - layout (to ->new_layout) + * - chunk size (to ->new_chunk_sectors) + * - raid_disks (by delta_disks) + * or when trying to restart a reshape that was ongoing. + * + * We need to validate the request and possibly allocate + * space if that might be an issue later. + * + * Currently we reject any reshape of a 'far' mode array, + * allow chunk size to change if new is generally acceptable, + * allow raid_disks to increase, and allow + * a switch between 'near' mode and 'offset' mode. + */ + struct r10conf *conf = mddev->private; + struct geom geo; + + if (conf->geo.far_copies != 1 && !conf->geo.far_offset) + return -EINVAL; + + if (setup_geo(&geo, mddev, geo_start) != conf->copies) + /* mustn't change number of copies */ + return -EINVAL; + if (geo.far_copies > 1 && !geo.far_offset) + /* Cannot switch to 'far' mode */ + return -EINVAL; + + if (mddev->array_sectors & geo.chunk_mask) + /* not factor of array size */ + return -EINVAL; + + if (!enough(conf, -1)) + return -EINVAL; + + kfree(conf->mirrors_new); + conf->mirrors_new = NULL; + if (mddev->delta_disks > 0) { + /* allocate new 'mirrors' list */ + conf->mirrors_new = kzalloc( + sizeof(struct mirror_info) + *(mddev->raid_disks + + mddev->delta_disks), + GFP_KERNEL); + if (!conf->mirrors_new) + return -ENOMEM; + } + return 0; +} + +/* + * Need to check if array has failed when deciding whether to: + * - start an array + * - remove non-faulty devices + * - add a spare + * - allow a reshape + * This determination is simple when no reshape is happening. + * However if there is a reshape, we need to carefully check + * both the before and after sections. + * This is because some failed devices may only affect one + * of the two sections, and some non-in_sync devices may + * be insync in the section most affected by failed devices. + */ +static int calc_degraded(struct r10conf *conf) +{ + int degraded, degraded2; + int i; + + rcu_read_lock(); + degraded = 0; + /* 'prev' section first */ + for (i = 0; i < conf->prev.raid_disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + if (!rdev || test_bit(Faulty, &rdev->flags)) + degraded++; + else if (!test_bit(In_sync, &rdev->flags)) + /* When we can reduce the number of devices in + * an array, this might not contribute to + * 'degraded'. It does now. + */ + degraded++; + } + rcu_read_unlock(); + if (conf->geo.raid_disks == conf->prev.raid_disks) + return degraded; + rcu_read_lock(); + degraded2 = 0; + for (i = 0; i < conf->geo.raid_disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + if (!rdev || test_bit(Faulty, &rdev->flags)) + degraded2++; + else if (!test_bit(In_sync, &rdev->flags)) { + /* If reshape is increasing the number of devices, + * this section has already been recovered, so + * it doesn't contribute to degraded. + * else it does. + */ + if (conf->geo.raid_disks <= conf->prev.raid_disks) + degraded2++; + } + } + rcu_read_unlock(); + if (degraded2 > degraded) + return degraded2; + return degraded; +} + +static int raid10_start_reshape(struct mddev *mddev) +{ + /* A 'reshape' has been requested. This commits + * the various 'new' fields and sets MD_RECOVER_RESHAPE + * This also checks if there are enough spares and adds them + * to the array. + * We currently require enough spares to make the final + * array non-degraded. We also require that the difference + * between old and new data_offset - on each device - is + * enough that we never risk over-writing. + */ + + unsigned long before_length, after_length; + sector_t min_offset_diff = 0; + int first = 1; + struct geom new; + struct r10conf *conf = mddev->private; + struct md_rdev *rdev; + int spares = 0; + int ret; + + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return -EBUSY; + + if (setup_geo(&new, mddev, geo_start) != conf->copies) + return -EINVAL; + + before_length = ((1 << conf->prev.chunk_shift) * + conf->prev.far_copies); + after_length = ((1 << conf->geo.chunk_shift) * + conf->geo.far_copies); + + rdev_for_each(rdev, mddev) { + if (!test_bit(In_sync, &rdev->flags) + && !test_bit(Faulty, &rdev->flags)) + spares++; + if (rdev->raid_disk >= 0) { + long long diff = (rdev->new_data_offset + - rdev->data_offset); + if (!mddev->reshape_backwards) + diff = -diff; + if (diff < 0) + diff = 0; + if (first || diff < min_offset_diff) + min_offset_diff = diff; + } + } + + if (max(before_length, after_length) > min_offset_diff) + return -EINVAL; + + if (spares < mddev->delta_disks) + return -EINVAL; + + conf->offset_diff = min_offset_diff; + spin_lock_irq(&conf->device_lock); + if (conf->mirrors_new) { + memcpy(conf->mirrors_new, conf->mirrors, + sizeof(struct mirror_info)*conf->prev.raid_disks); + smp_mb(); + kfree(conf->mirrors_old); /* FIXME and elsewhere */ + conf->mirrors_old = conf->mirrors; + conf->mirrors = conf->mirrors_new; + conf->mirrors_new = NULL; + } + setup_geo(&conf->geo, mddev, geo_start); + smp_mb(); + if (mddev->reshape_backwards) { + sector_t size = raid10_size(mddev, 0, 0); + if (size < mddev->array_sectors) { + spin_unlock_irq(&conf->device_lock); + printk(KERN_ERR "md/raid10:%s: array size must be reduce before number of disks\n", + mdname(mddev)); + return -EINVAL; + } + mddev->resync_max_sectors = size; + conf->reshape_progress = size; + } else + conf->reshape_progress = 0; + spin_unlock_irq(&conf->device_lock); + + if (mddev->delta_disks && mddev->bitmap) { + ret = bitmap_resize(mddev->bitmap, + raid10_size(mddev, 0, + conf->geo.raid_disks), + 0, 0); + if (ret) + goto abort; + } + if (mddev->delta_disks > 0) { + rdev_for_each(rdev, mddev) + if (rdev->raid_disk < 0 && + !test_bit(Faulty, &rdev->flags)) { + if (raid10_add_disk(mddev, rdev) == 0) { + if (rdev->raid_disk >= + conf->prev.raid_disks) + set_bit(In_sync, &rdev->flags); + else + rdev->recovery_offset = 0; + + if (sysfs_link_rdev(mddev, rdev)) + /* Failure here is OK */; + } + } else if (rdev->raid_disk >= conf->prev.raid_disks + && !test_bit(Faulty, &rdev->flags)) { + /* This is a spare that was manually added */ + set_bit(In_sync, &rdev->flags); + } + } + /* When a reshape changes the number of devices, + * ->degraded is measured against the larger of the + * pre and post numbers. + */ + spin_lock_irq(&conf->device_lock); + mddev->degraded = calc_degraded(conf); + spin_unlock_irq(&conf->device_lock); + mddev->raid_disks = conf->geo.raid_disks; + mddev->reshape_position = conf->reshape_progress; + set_bit(MD_CHANGE_DEVS, &mddev->flags); + + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + + mddev->sync_thread = md_register_thread(md_do_sync, mddev, + "reshape"); + if (!mddev->sync_thread) { + ret = -EAGAIN; + goto abort; + } + conf->reshape_checkpoint = jiffies; + md_wakeup_thread(mddev->sync_thread); + md_new_event(mddev); + return 0; + +abort: + mddev->recovery = 0; + spin_lock_irq(&conf->device_lock); + conf->geo = conf->prev; + mddev->raid_disks = conf->geo.raid_disks; + rdev_for_each(rdev, mddev) + rdev->new_data_offset = rdev->data_offset; + smp_wmb(); + conf->reshape_progress = MaxSector; + mddev->reshape_position = MaxSector; + spin_unlock_irq(&conf->device_lock); + return ret; +} + +/* Calculate the last device-address that could contain + * any block from the chunk that includes the array-address 's' + * and report the next address. + * i.e. the address returned will be chunk-aligned and after + * any data that is in the chunk containing 's'. + */ +static sector_t last_dev_address(sector_t s, struct geom *geo) +{ + s = (s | geo->chunk_mask) + 1; + s >>= geo->chunk_shift; + s *= geo->near_copies; + s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks); + s *= geo->far_copies; + s <<= geo->chunk_shift; + return s; +} + +/* Calculate the first device-address that could contain + * any block from the chunk that includes the array-address 's'. + * This too will be the start of a chunk + */ +static sector_t first_dev_address(sector_t s, struct geom *geo) +{ + s >>= geo->chunk_shift; + s *= geo->near_copies; + sector_div(s, geo->raid_disks); + s *= geo->far_copies; + s <<= geo->chunk_shift; + return s; +} + +static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, + int *skipped) +{ + /* We simply copy at most one chunk (smallest of old and new) + * at a time, possibly less if that exceeds RESYNC_PAGES, + * or we hit a bad block or something. + * This might mean we pause for normal IO in the middle of + * a chunk, but that is not a problem was mddev->reshape_position + * can record any location. + * + * If we will want to write to a location that isn't + * yet recorded as 'safe' (i.e. in metadata on disk) then + * we need to flush all reshape requests and update the metadata. + * + * When reshaping forwards (e.g. to more devices), we interpret + * 'safe' as the earliest block which might not have been copied + * down yet. We divide this by previous stripe size and multiply + * by previous stripe length to get lowest device offset that we + * cannot write to yet. + * We interpret 'sector_nr' as an address that we want to write to. + * From this we use last_device_address() to find where we might + * write to, and first_device_address on the 'safe' position. + * If this 'next' write position is after the 'safe' position, + * we must update the metadata to increase the 'safe' position. + * + * When reshaping backwards, we round in the opposite direction + * and perform the reverse test: next write position must not be + * less than current safe position. + * + * In all this the minimum difference in data offsets + * (conf->offset_diff - always positive) allows a bit of slack, + * so next can be after 'safe', but not by more than offset_disk + * + * We need to prepare all the bios here before we start any IO + * to ensure the size we choose is acceptable to all devices. + * The means one for each copy for write-out and an extra one for + * read-in. + * We store the read-in bio in ->master_bio and the others in + * ->devs[x].bio and ->devs[x].repl_bio. + */ + struct r10conf *conf = mddev->private; + struct r10bio *r10_bio; + sector_t next, safe, last; + int max_sectors; + int nr_sectors; + int s; + struct md_rdev *rdev; + int need_flush = 0; + struct bio *blist; + struct bio *bio, *read_bio; + int sectors_done = 0; + + if (sector_nr == 0) { + /* If restarting in the middle, skip the initial sectors */ + if (mddev->reshape_backwards && + conf->reshape_progress < raid10_size(mddev, 0, 0)) { + sector_nr = (raid10_size(mddev, 0, 0) + - conf->reshape_progress); + } else if (!mddev->reshape_backwards && + conf->reshape_progress > 0) + sector_nr = conf->reshape_progress; + if (sector_nr) { + mddev->curr_resync_completed = sector_nr; + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + *skipped = 1; + return sector_nr; + } + } + + /* We don't use sector_nr to track where we are up to + * as that doesn't work well for ->reshape_backwards. + * So just use ->reshape_progress. + */ + if (mddev->reshape_backwards) { + /* 'next' is the earliest device address that we might + * write to for this chunk in the new layout + */ + next = first_dev_address(conf->reshape_progress - 1, + &conf->geo); + + /* 'safe' is the last device address that we might read from + * in the old layout after a restart + */ + safe = last_dev_address(conf->reshape_safe - 1, + &conf->prev); + + if (next + conf->offset_diff < safe) + need_flush = 1; + + last = conf->reshape_progress - 1; + sector_nr = last & ~(sector_t)(conf->geo.chunk_mask + & conf->prev.chunk_mask); + if (sector_nr + RESYNC_BLOCK_SIZE/512 < last) + sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512; + } else { + /* 'next' is after the last device address that we + * might write to for this chunk in the new layout + */ + next = last_dev_address(conf->reshape_progress, &conf->geo); + + /* 'safe' is the earliest device address that we might + * read from in the old layout after a restart + */ + safe = first_dev_address(conf->reshape_safe, &conf->prev); + + /* Need to update metadata if 'next' might be beyond 'safe' + * as that would possibly corrupt data + */ + if (next > safe + conf->offset_diff) + need_flush = 1; + + sector_nr = conf->reshape_progress; + last = sector_nr | (conf->geo.chunk_mask + & conf->prev.chunk_mask); + + if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last) + last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1; + } + + if (need_flush || + time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { + /* Need to update reshape_position in metadata */ + wait_barrier(conf); + mddev->reshape_position = conf->reshape_progress; + if (mddev->reshape_backwards) + mddev->curr_resync_completed = raid10_size(mddev, 0, 0) + - conf->reshape_progress; + else + mddev->curr_resync_completed = conf->reshape_progress; + conf->reshape_checkpoint = jiffies; + set_bit(MD_CHANGE_DEVS, &mddev->flags); + md_wakeup_thread(mddev->thread); + wait_event(mddev->sb_wait, mddev->flags == 0 || + kthread_should_stop()); + conf->reshape_safe = mddev->reshape_position; + allow_barrier(conf); + } + +read_more: + /* Now schedule reads for blocks from sector_nr to last */ + r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); + raise_barrier(conf, sectors_done != 0); + atomic_set(&r10_bio->remaining, 0); + r10_bio->mddev = mddev; + r10_bio->sector = sector_nr; + set_bit(R10BIO_IsReshape, &r10_bio->state); + r10_bio->sectors = last - sector_nr + 1; + rdev = read_balance(conf, r10_bio, &max_sectors); + BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state)); + + if (!rdev) { + /* Cannot read from here, so need to record bad blocks + * on all the target devices. + */ + // FIXME + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + return sectors_done; + } + + read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev); + + read_bio->bi_bdev = rdev->bdev; + read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr + + rdev->data_offset); + read_bio->bi_private = r10_bio; + read_bio->bi_end_io = end_sync_read; + read_bio->bi_rw = READ; + read_bio->bi_flags &= ~(BIO_POOL_MASK - 1); + read_bio->bi_flags |= 1 << BIO_UPTODATE; + read_bio->bi_vcnt = 0; + read_bio->bi_idx = 0; + read_bio->bi_size = 0; + r10_bio->master_bio = read_bio; + r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; + + /* Now find the locations in the new layout */ + __raid10_find_phys(&conf->geo, r10_bio); + + blist = read_bio; + read_bio->bi_next = NULL; + + for (s = 0; s < conf->copies*2; s++) { + struct bio *b; + int d = r10_bio->devs[s/2].devnum; + struct md_rdev *rdev2; + if (s&1) { + rdev2 = conf->mirrors[d].replacement; + b = r10_bio->devs[s/2].repl_bio; + } else { + rdev2 = conf->mirrors[d].rdev; + b = r10_bio->devs[s/2].bio; + } + if (!rdev2 || test_bit(Faulty, &rdev2->flags)) + continue; + b->bi_bdev = rdev2->bdev; + b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset; + b->bi_private = r10_bio; + b->bi_end_io = end_reshape_write; + b->bi_rw = WRITE; + b->bi_flags &= ~(BIO_POOL_MASK - 1); + b->bi_flags |= 1 << BIO_UPTODATE; + b->bi_next = blist; + b->bi_vcnt = 0; + b->bi_idx = 0; + b->bi_size = 0; + blist = b; + } + + /* Now add as many pages as possible to all of these bios. */ + + nr_sectors = 0; + for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) { + struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page; + int len = (max_sectors - s) << 9; + if (len > PAGE_SIZE) + len = PAGE_SIZE; + for (bio = blist; bio ; bio = bio->bi_next) { + struct bio *bio2; + if (bio_add_page(bio, page, len, 0)) + continue; + + /* Didn't fit, must stop */ + for (bio2 = blist; + bio2 && bio2 != bio; + bio2 = bio2->bi_next) { + /* Remove last page from this bio */ + bio2->bi_vcnt--; + bio2->bi_size -= len; + bio2->bi_flags &= ~(1<<BIO_SEG_VALID); + } + goto bio_full; + } + sector_nr += len >> 9; + nr_sectors += len >> 9; + } +bio_full: + r10_bio->sectors = nr_sectors; + + /* Now submit the read */ + md_sync_acct(read_bio->bi_bdev, r10_bio->sectors); + atomic_inc(&r10_bio->remaining); + read_bio->bi_next = NULL; + generic_make_request(read_bio); + sector_nr += nr_sectors; + sectors_done += nr_sectors; + if (sector_nr <= last) + goto read_more; + + /* Now that we have done the whole section we can + * update reshape_progress + */ + if (mddev->reshape_backwards) + conf->reshape_progress -= sectors_done; + else + conf->reshape_progress += sectors_done; + + return sectors_done; +} + +static void end_reshape_request(struct r10bio *r10_bio); +static int handle_reshape_read_error(struct mddev *mddev, + struct r10bio *r10_bio); +static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio) +{ + /* Reshape read completed. Hopefully we have a block + * to write out. + * If we got a read error then we do sync 1-page reads from + * elsewhere until we find the data - or give up. + */ + struct r10conf *conf = mddev->private; + int s; + + if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) + if (handle_reshape_read_error(mddev, r10_bio) < 0) { + /* Reshape has been aborted */ + md_done_sync(mddev, r10_bio->sectors, 0); + return; + } + + /* We definitely have the data in the pages, schedule the + * writes. + */ + atomic_set(&r10_bio->remaining, 1); + for (s = 0; s < conf->copies*2; s++) { + struct bio *b; + int d = r10_bio->devs[s/2].devnum; + struct md_rdev *rdev; + if (s&1) { + rdev = conf->mirrors[d].replacement; + b = r10_bio->devs[s/2].repl_bio; + } else { + rdev = conf->mirrors[d].rdev; + b = r10_bio->devs[s/2].bio; + } + if (!rdev || test_bit(Faulty, &rdev->flags)) + continue; + atomic_inc(&rdev->nr_pending); + md_sync_acct(b->bi_bdev, r10_bio->sectors); + atomic_inc(&r10_bio->remaining); + b->bi_next = NULL; + generic_make_request(b); + } + end_reshape_request(r10_bio); +} + +static void end_reshape(struct r10conf *conf) +{ + if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) + return; + + spin_lock_irq(&conf->device_lock); + conf->prev = conf->geo; + md_finish_reshape(conf->mddev); + smp_wmb(); + conf->reshape_progress = MaxSector; + spin_unlock_irq(&conf->device_lock); + + /* read-ahead size must cover two whole stripes, which is + * 2 * (datadisks) * chunksize where 'n' is the number of raid devices + */ + if (conf->mddev->queue) { + int stripe = conf->geo.raid_disks * + ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE); + stripe /= conf->geo.near_copies; + if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) + conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; + } + conf->fullsync = 0; +} + + +static int handle_reshape_read_error(struct mddev *mddev, + struct r10bio *r10_bio) +{ + /* Use sync reads to get the blocks from somewhere else */ + int sectors = r10_bio->sectors; + struct r10bio r10b; + struct r10conf *conf = mddev->private; + int slot = 0; + int idx = 0; + struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec; + + r10b.sector = r10_bio->sector; + __raid10_find_phys(&conf->prev, &r10b); + + while (sectors) { + int s = sectors; + int success = 0; + int first_slot = slot; + + if (s > (PAGE_SIZE >> 9)) + s = PAGE_SIZE >> 9; + + while (!success) { + int d = r10b.devs[slot].devnum; + struct md_rdev *rdev = conf->mirrors[d].rdev; + sector_t addr; + if (rdev == NULL || + test_bit(Faulty, &rdev->flags) || + !test_bit(In_sync, &rdev->flags)) + goto failed; + + addr = r10b.devs[slot].addr + idx * PAGE_SIZE; + success = sync_page_io(rdev, + addr, + s << 9, + bvec[idx].bv_page, + READ, false); + if (success) + break; + failed: + slot++; + if (slot >= conf->copies) + slot = 0; + if (slot == first_slot) + break; + } + if (!success) { + /* couldn't read this block, must give up */ + set_bit(MD_RECOVERY_INTR, + &mddev->recovery); + return -EIO; + } + sectors -= s; + idx++; + } + return 0; +} + +static void end_reshape_write(struct bio *bio, int error) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct r10bio *r10_bio = bio->bi_private; + struct mddev *mddev = r10_bio->mddev; + struct r10conf *conf = mddev->private; + int d; + int slot; + int repl; + struct md_rdev *rdev = NULL; + + d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); + if (repl) + rdev = conf->mirrors[d].replacement; + if (!rdev) { + smp_mb(); + rdev = conf->mirrors[d].rdev; + } + + if (!uptodate) { + /* FIXME should record badblock */ + md_error(mddev, rdev); + } + + rdev_dec_pending(rdev, mddev); + end_reshape_request(r10_bio); +} + +static void end_reshape_request(struct r10bio *r10_bio) +{ + if (!atomic_dec_and_test(&r10_bio->remaining)) + return; + md_done_sync(r10_bio->mddev, r10_bio->sectors, 1); + bio_put(r10_bio->master_bio); + put_buf(r10_bio); +} + +static void raid10_finish_reshape(struct mddev *mddev) +{ + struct r10conf *conf = mddev->private; + + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + return; + + if (mddev->delta_disks > 0) { + sector_t size = raid10_size(mddev, 0, 0); + md_set_array_sectors(mddev, size); + if (mddev->recovery_cp > mddev->resync_max_sectors) { + mddev->recovery_cp = mddev->resync_max_sectors; + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + } + mddev->resync_max_sectors = size; + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + } else { + int d; + for (d = conf->geo.raid_disks ; + d < conf->geo.raid_disks - mddev->delta_disks; + d++) { + struct md_rdev *rdev = conf->mirrors[d].rdev; + if (rdev) + clear_bit(In_sync, &rdev->flags); + rdev = conf->mirrors[d].replacement; + if (rdev) + clear_bit(In_sync, &rdev->flags); + } + } + mddev->layout = mddev->new_layout; + mddev->chunk_sectors = 1 << conf->geo.chunk_shift; + mddev->reshape_position = MaxSector; + mddev->delta_disks = 0; + mddev->reshape_backwards = 0; +} + static struct md_personality raid10_personality = { .name = "raid10", @@ -3552,6 +4524,9 @@ static struct md_personality raid10_personality = .size = raid10_size, .resize = raid10_resize, .takeover = raid10_takeover, + .check_reshape = raid10_check_reshape, + .start_reshape = raid10_start_reshape, + .finish_reshape = raid10_finish_reshape, }; static int __init raid_init(void) diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 7c615613..135b1b0 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -14,32 +14,38 @@ struct mirror_info { struct r10conf { struct mddev *mddev; struct mirror_info *mirrors; - int raid_disks; + struct mirror_info *mirrors_new, *mirrors_old; spinlock_t device_lock; /* geometry */ - int near_copies; /* number of copies laid out + struct geom { + int raid_disks; + int near_copies; /* number of copies laid out * raid0 style */ - int far_copies; /* number of copies laid out + int far_copies; /* number of copies laid out * at large strides across drives */ - int far_offset; /* far_copies are offset by 1 + int far_offset; /* far_copies are offset by 1 * stripe instead of many */ - int copies; /* near_copies * far_copies. - * must be <= raid_disks - */ - sector_t stride; /* distance between far copies. + sector_t stride; /* distance between far copies. * This is size / far_copies unless * far_offset, in which case it is * 1 stripe. */ + int chunk_shift; /* shift from chunks to sectors */ + sector_t chunk_mask; + } prev, geo; + int copies; /* near_copies * far_copies. + * must be <= raid_disks + */ sector_t dev_sectors; /* temp copy of * mddev->dev_sectors */ - - int chunk_shift; /* shift from chunks to sectors */ - sector_t chunk_mask; + sector_t reshape_progress; + sector_t reshape_safe; + unsigned long reshape_checkpoint; + sector_t offset_diff; struct list_head retry_list; /* queue pending writes and submit them on unplug */ @@ -136,6 +142,7 @@ enum r10bio_state { R10BIO_Uptodate, R10BIO_IsSync, R10BIO_IsRecover, + R10BIO_IsReshape, R10BIO_Degraded, /* Set ReadError on bios that experience a read error * so that raid10d knows what to do with them. @@ -146,5 +153,10 @@ enum r10bio_state { */ R10BIO_MadeGood, R10BIO_WriteError, +/* During a reshape we might be performing IO on the + * 'previous' part of the array, in which case this + * flag is set + */ + R10BIO_Previous, }; #endif diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f351422..d267672 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -488,6 +488,27 @@ get_active_stripe(struct r5conf *conf, sector_t sector, return sh; } +/* Determine if 'data_offset' or 'new_data_offset' should be used + * in this stripe_head. + */ +static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) +{ + sector_t progress = conf->reshape_progress; + /* Need a memory barrier to make sure we see the value + * of conf->generation, or ->data_offset that was set before + * reshape_progress was updated. + */ + smp_rmb(); + if (progress == MaxSector) + return 0; + if (sh->generation == conf->generation - 1) + return 0; + /* We are in a reshape, and this is a new-generation stripe, + * so use new_data_offset. + */ + return 1; +} + static void raid5_end_read_request(struct bio *bi, int error); static void @@ -518,6 +539,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) replace_only = 1; } else continue; + if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) + rw |= REQ_SYNC; bi = &sh->dev[i].req; rbi = &sh->dev[i].rreq; /* For writing to replacement */ @@ -603,7 +626,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) __func__, (unsigned long long)sh->sector, bi->bi_rw, i); atomic_inc(&sh->count); - bi->bi_sector = sh->sector + rdev->data_offset; + if (use_new_offset(conf, sh)) + bi->bi_sector = (sh->sector + + rdev->new_data_offset); + else + bi->bi_sector = (sh->sector + + rdev->data_offset); bi->bi_flags = 1 << BIO_UPTODATE; bi->bi_idx = 0; bi->bi_io_vec[0].bv_len = STRIPE_SIZE; @@ -627,7 +655,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) __func__, (unsigned long long)sh->sector, rbi->bi_rw, i); atomic_inc(&sh->count); - rbi->bi_sector = sh->sector + rrdev->data_offset; + if (use_new_offset(conf, sh)) + rbi->bi_sector = (sh->sector + + rrdev->new_data_offset); + else + rbi->bi_sector = (sh->sector + + rrdev->data_offset); rbi->bi_flags = 1 << BIO_UPTODATE; rbi->bi_idx = 0; rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; @@ -1114,6 +1147,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) dev->sector + STRIPE_SECTORS) { if (wbi->bi_rw & REQ_FUA) set_bit(R5_WantFUA, &dev->flags); + if (wbi->bi_rw & REQ_SYNC) + set_bit(R5_SyncIO, &dev->flags); tx = async_copy_data(1, wbi, dev->page, dev->sector, tx); wbi = r5_next_bio(wbi, dev->sector); @@ -1131,13 +1166,15 @@ static void ops_complete_reconstruct(void *stripe_head_ref) int pd_idx = sh->pd_idx; int qd_idx = sh->qd_idx; int i; - bool fua = false; + bool fua = false, sync = false; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); - for (i = disks; i--; ) + for (i = disks; i--; ) { fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); + sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); + } for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -1146,6 +1183,8 @@ static void ops_complete_reconstruct(void *stripe_head_ref) set_bit(R5_UPTODATE, &dev->flags); if (fua) set_bit(R5_WantFUA, &dev->flags); + if (sync) + set_bit(R5_SyncIO, &dev->flags); } } @@ -1648,7 +1687,7 @@ static void raid5_end_read_request(struct bio * bi, int error) int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); char b[BDEVNAME_SIZE]; struct md_rdev *rdev = NULL; - + sector_t s; for (i=0 ; i<disks; i++) if (bi == &sh->dev[i].req) @@ -1671,6 +1710,10 @@ static void raid5_end_read_request(struct bio * bi, int error) if (!rdev) rdev = conf->disks[i].rdev; + if (use_new_offset(conf, sh)) + s = sh->sector + rdev->new_data_offset; + else + s = sh->sector + rdev->data_offset; if (uptodate) { set_bit(R5_UPTODATE, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) { @@ -1683,8 +1726,7 @@ static void raid5_end_read_request(struct bio * bi, int error) "md/raid:%s: read error corrected" " (%lu sectors at %llu on %s)\n", mdname(conf->mddev), STRIPE_SECTORS, - (unsigned long long)(sh->sector - + rdev->data_offset), + (unsigned long long)s, bdevname(rdev->bdev, b)); atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); clear_bit(R5_ReadError, &sh->dev[i].flags); @@ -1704,8 +1746,7 @@ static void raid5_end_read_request(struct bio * bi, int error) "md/raid:%s: read error on replacement device " "(sector %llu on %s).\n", mdname(conf->mddev), - (unsigned long long)(sh->sector - + rdev->data_offset), + (unsigned long long)s, bdn); else if (conf->mddev->degraded >= conf->max_degraded) printk_ratelimited( @@ -1713,8 +1754,7 @@ static void raid5_end_read_request(struct bio * bi, int error) "md/raid:%s: read error not correctable " "(sector %llu on %s).\n", mdname(conf->mddev), - (unsigned long long)(sh->sector - + rdev->data_offset), + (unsigned long long)s, bdn); else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) /* Oh, no!!! */ @@ -1723,8 +1763,7 @@ static void raid5_end_read_request(struct bio * bi, int error) "md/raid:%s: read error NOT corrected!! " "(sector %llu on %s).\n", mdname(conf->mddev), - (unsigned long long)(sh->sector - + rdev->data_offset), + (unsigned long long)s, bdn); else if (atomic_read(&rdev->read_errors) > conf->max_nr_stripes) @@ -3561,7 +3600,7 @@ finish: if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { rdev = conf->disks[i].rdev; rdev_clear_badblocks(rdev, sh->sector, - STRIPE_SECTORS); + STRIPE_SECTORS, 0); rdev_dec_pending(rdev, conf->mddev); } if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { @@ -3570,7 +3609,7 @@ finish: /* rdev have been moved down */ rdev = conf->disks[i].rdev; rdev_clear_badblocks(rdev, sh->sector, - STRIPE_SECTORS); + STRIPE_SECTORS, 0); rdev_dec_pending(rdev, conf->mddev); } } @@ -3842,6 +3881,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) raid_bio->bi_next = (void*)rdev; align_bi->bi_bdev = rdev->bdev; align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); + /* No reshape active, so we can trust rdev->data_offset */ align_bi->bi_sector += rdev->data_offset; if (!bio_fits_rdev(align_bi) || @@ -3953,12 +3993,10 @@ static void make_request(struct mddev *mddev, struct bio * bi) plugged = mddev_check_plugged(mddev); for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { DEFINE_WAIT(w); - int disks, data_disks; int previous; retry: previous = 0; - disks = conf->raid_disks; prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); if (unlikely(conf->reshape_progress != MaxSector)) { /* spinlock is needed as reshape_progress may be @@ -3970,13 +4008,12 @@ static void make_request(struct mddev *mddev, struct bio * bi) * to check again. */ spin_lock_irq(&conf->device_lock); - if (mddev->delta_disks < 0 + if (mddev->reshape_backwards ? logical_sector < conf->reshape_progress : logical_sector >= conf->reshape_progress) { - disks = conf->previous_raid_disks; previous = 1; } else { - if (mddev->delta_disks < 0 + if (mddev->reshape_backwards ? logical_sector < conf->reshape_safe : logical_sector >= conf->reshape_safe) { spin_unlock_irq(&conf->device_lock); @@ -3986,7 +4023,6 @@ static void make_request(struct mddev *mddev, struct bio * bi) } spin_unlock_irq(&conf->device_lock); } - data_disks = disks - conf->max_degraded; new_sector = raid5_compute_sector(conf, logical_sector, previous, @@ -4009,7 +4045,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) */ int must_retry = 0; spin_lock_irq(&conf->device_lock); - if (mddev->delta_disks < 0 + if (mddev->reshape_backwards ? logical_sector >= conf->reshape_progress : logical_sector < conf->reshape_progress) /* mismatch, need to try again */ @@ -4108,11 +4144,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk if (sector_nr == 0) { /* If restarting in the middle, skip the initial sectors */ - if (mddev->delta_disks < 0 && + if (mddev->reshape_backwards && conf->reshape_progress < raid5_size(mddev, 0, 0)) { sector_nr = raid5_size(mddev, 0, 0) - conf->reshape_progress; - } else if (mddev->delta_disks >= 0 && + } else if (!mddev->reshape_backwards && conf->reshape_progress > 0) sector_nr = conf->reshape_progress; sector_div(sector_nr, new_data_disks); @@ -4133,13 +4169,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk else reshape_sectors = mddev->chunk_sectors; - /* we update the metadata when there is more than 3Meg - * in the block range (that is rather arbitrary, should - * probably be time based) or when the data about to be - * copied would over-write the source of the data at - * the front of the range. - * i.e. one new_stripe along from reshape_progress new_maps - * to after where reshape_safe old_maps to + /* We update the metadata at least every 10 seconds, or when + * the data about to be copied would over-write the source of + * the data at the front of the range. i.e. one new_stripe + * along from reshape_progress new_maps to after where + * reshape_safe old_maps to */ writepos = conf->reshape_progress; sector_div(writepos, new_data_disks); @@ -4147,7 +4181,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk sector_div(readpos, data_disks); safepos = conf->reshape_safe; sector_div(safepos, data_disks); - if (mddev->delta_disks < 0) { + if (mddev->reshape_backwards) { writepos -= min_t(sector_t, reshape_sectors, writepos); readpos += reshape_sectors; safepos += reshape_sectors; @@ -4157,11 +4191,29 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk safepos -= min_t(sector_t, reshape_sectors, safepos); } + /* Having calculated the 'writepos' possibly use it + * to set 'stripe_addr' which is where we will write to. + */ + if (mddev->reshape_backwards) { + BUG_ON(conf->reshape_progress == 0); + stripe_addr = writepos; + BUG_ON((mddev->dev_sectors & + ~((sector_t)reshape_sectors - 1)) + - reshape_sectors - stripe_addr + != sector_nr); + } else { + BUG_ON(writepos != sector_nr + reshape_sectors); + stripe_addr = sector_nr; + } + /* 'writepos' is the most advanced device address we might write. * 'readpos' is the least advanced device address we might read. * 'safepos' is the least address recorded in the metadata as having * been reshaped. - * If 'readpos' is behind 'writepos', then there is no way that we can + * If there is a min_offset_diff, these are adjusted either by + * increasing the safepos/readpos if diff is negative, or + * increasing writepos if diff is positive. + * If 'readpos' is then behind 'writepos', there is no way that we can * ensure safety in the face of a crash - that must be done by userspace * making a backup of the data. So in that case there is no particular * rush to update metadata. @@ -4174,7 +4226,13 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk * Maybe that number should be configurable, but I'm not sure it is * worth it.... maybe it could be a multiple of safemode_delay??? */ - if ((mddev->delta_disks < 0 + if (conf->min_offset_diff < 0) { + safepos += -conf->min_offset_diff; + readpos += -conf->min_offset_diff; + } else + writepos += conf->min_offset_diff; + + if ((mddev->reshape_backwards ? (safepos > writepos && readpos < writepos) : (safepos < writepos && readpos > writepos)) || time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { @@ -4195,17 +4253,6 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } - if (mddev->delta_disks < 0) { - BUG_ON(conf->reshape_progress == 0); - stripe_addr = writepos; - BUG_ON((mddev->dev_sectors & - ~((sector_t)reshape_sectors - 1)) - - reshape_sectors - stripe_addr - != sector_nr); - } else { - BUG_ON(writepos != sector_nr + reshape_sectors); - stripe_addr = sector_nr; - } INIT_LIST_HEAD(&stripes); for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { int j; @@ -4239,7 +4286,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk list_add(&sh->lru, &stripes); } spin_lock_irq(&conf->device_lock); - if (mddev->delta_disks < 0) + if (mddev->reshape_backwards) conf->reshape_progress -= reshape_sectors * new_data_disks; else conf->reshape_progress += reshape_sectors * new_data_disks; @@ -4952,16 +4999,42 @@ static int run(struct mddev *mddev) struct md_rdev *rdev; sector_t reshape_offset = 0; int i; + long long min_offset_diff = 0; + int first = 1; if (mddev->recovery_cp != MaxSector) printk(KERN_NOTICE "md/raid:%s: not clean" " -- starting background reconstruction\n", mdname(mddev)); + + rdev_for_each(rdev, mddev) { + long long diff; + if (rdev->raid_disk < 0) + continue; + diff = (rdev->new_data_offset - rdev->data_offset); + if (first) { + min_offset_diff = diff; + first = 0; + } else if (mddev->reshape_backwards && + diff < min_offset_diff) + min_offset_diff = diff; + else if (!mddev->reshape_backwards && + diff > min_offset_diff) + min_offset_diff = diff; + } + if (mddev->reshape_position != MaxSector) { /* Check that we can continue the reshape. - * Currently only disks can change, it must - * increase, and we must be past the point where - * a stripe over-writes itself + * Difficulties arise if the stripe we would write to + * next is at or after the stripe we would read from next. + * For a reshape that changes the number of devices, this + * is only possible for a very short time, and mdadm makes + * sure that time appears to have past before assembling + * the array. So we fail if that time hasn't passed. + * For a reshape that keeps the number of devices the same + * mdadm must be monitoring the reshape can keeping the + * critical areas read-only and backed up. It will start + * the array in read-only mode, so we check for that. */ sector_t here_new, here_old; int old_disks; @@ -4993,26 +5066,34 @@ static int run(struct mddev *mddev) /* here_old is the first stripe that we might need to read * from */ if (mddev->delta_disks == 0) { + if ((here_new * mddev->new_chunk_sectors != + here_old * mddev->chunk_sectors)) { + printk(KERN_ERR "md/raid:%s: reshape position is" + " confused - aborting\n", mdname(mddev)); + return -EINVAL; + } /* We cannot be sure it is safe to start an in-place - * reshape. It is only safe if user-space if monitoring + * reshape. It is only safe if user-space is monitoring * and taking constant backups. * mdadm always starts a situation like this in * readonly mode so it can take control before * allowing any writes. So just check for that. */ - if ((here_new * mddev->new_chunk_sectors != - here_old * mddev->chunk_sectors) || - mddev->ro == 0) { - printk(KERN_ERR "md/raid:%s: in-place reshape must be started" - " in read-only mode - aborting\n", + if (abs(min_offset_diff) >= mddev->chunk_sectors && + abs(min_offset_diff) >= mddev->new_chunk_sectors) + /* not really in-place - so OK */; + else if (mddev->ro == 0) { + printk(KERN_ERR "md/raid:%s: in-place reshape " + "must be started in read-only mode " + "- aborting\n", mdname(mddev)); return -EINVAL; } - } else if (mddev->delta_disks < 0 - ? (here_new * mddev->new_chunk_sectors <= + } else if (mddev->reshape_backwards + ? (here_new * mddev->new_chunk_sectors + min_offset_diff <= here_old * mddev->chunk_sectors) : (here_new * mddev->new_chunk_sectors >= - here_old * mddev->chunk_sectors)) { + here_old * mddev->chunk_sectors + (-min_offset_diff))) { /* Reading from the same stripe as writing to - bad */ printk(KERN_ERR "md/raid:%s: reshape_position too early for " "auto-recovery - aborting.\n", @@ -5037,6 +5118,7 @@ static int run(struct mddev *mddev) if (IS_ERR(conf)) return PTR_ERR(conf); + conf->min_offset_diff = min_offset_diff; mddev->thread = conf->thread; conf->thread = NULL; mddev->private = conf; @@ -5182,9 +5264,12 @@ static int run(struct mddev *mddev) blk_queue_io_opt(mddev->queue, chunk_size * (conf->raid_disks - conf->max_degraded)); - rdev_for_each(rdev, mddev) + rdev_for_each(rdev, mddev) { disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->new_data_offset << 9); + } } return 0; @@ -5418,12 +5503,18 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) * any io in the removed space completes, but it hardly seems * worth it. */ + sector_t newsize; sectors &= ~((sector_t)mddev->chunk_sectors - 1); - md_set_array_sectors(mddev, raid5_size(mddev, sectors, - mddev->raid_disks)); - if (mddev->array_sectors > - raid5_size(mddev, sectors, mddev->raid_disks)) + newsize = raid5_size(mddev, sectors, mddev->raid_disks); + if (mddev->external_size && + mddev->array_sectors > newsize) return -EINVAL; + if (mddev->bitmap) { + int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0); + if (ret) + return ret; + } + md_set_array_sectors(mddev, newsize); set_capacity(mddev->gendisk, mddev->array_sectors); revalidate_disk(mddev->gendisk); if (sectors > mddev->dev_sectors && @@ -5468,9 +5559,6 @@ static int check_reshape(struct mddev *mddev) mddev->new_layout == mddev->layout && mddev->new_chunk_sectors == mddev->chunk_sectors) return 0; /* nothing to do */ - if (mddev->bitmap) - /* Cannot grow a bitmap yet */ - return -EBUSY; if (has_failed(conf)) return -EINVAL; if (mddev->delta_disks < 0) { @@ -5505,10 +5593,14 @@ static int raid5_start_reshape(struct mddev *mddev) if (!check_stripe_cache(mddev)) return -ENOSPC; - rdev_for_each(rdev, mddev) + if (has_failed(conf)) + return -EINVAL; + + rdev_for_each(rdev, mddev) { if (!test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) spares++; + } if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) /* Not enough devices even to make a degraded array @@ -5535,12 +5627,16 @@ static int raid5_start_reshape(struct mddev *mddev) conf->chunk_sectors = mddev->new_chunk_sectors; conf->prev_algo = conf->algorithm; conf->algorithm = mddev->new_layout; - if (mddev->delta_disks < 0) + conf->generation++; + /* Code that selects data_offset needs to see the generation update + * if reshape_progress has been set - so a memory barrier needed. + */ + smp_mb(); + if (mddev->reshape_backwards) conf->reshape_progress = raid5_size(mddev, 0, 0); else conf->reshape_progress = 0; conf->reshape_safe = conf->reshape_progress; - conf->generation++; spin_unlock_irq(&conf->device_lock); /* Add some new drives, as many as will fit. @@ -5592,6 +5688,9 @@ static int raid5_start_reshape(struct mddev *mddev) mddev->recovery = 0; spin_lock_irq(&conf->device_lock); mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; + rdev_for_each(rdev, mddev) + rdev->new_data_offset = rdev->data_offset; + smp_wmb(); conf->reshape_progress = MaxSector; mddev->reshape_position = MaxSector; spin_unlock_irq(&conf->device_lock); @@ -5610,9 +5709,13 @@ static void end_reshape(struct r5conf *conf) { if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { + struct md_rdev *rdev; spin_lock_irq(&conf->device_lock); conf->previous_raid_disks = conf->raid_disks; + rdev_for_each(rdev, conf->mddev) + rdev->data_offset = rdev->new_data_offset; + smp_wmb(); conf->reshape_progress = MaxSector; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); @@ -5652,17 +5755,18 @@ static void raid5_finish_reshape(struct mddev *mddev) d < conf->raid_disks - mddev->delta_disks; d++) { struct md_rdev *rdev = conf->disks[d].rdev; - if (rdev && - raid5_remove_disk(mddev, rdev) == 0) { - sysfs_unlink_rdev(mddev, rdev); - rdev->raid_disk = -1; - } + if (rdev) + clear_bit(In_sync, &rdev->flags); + rdev = conf->disks[d].replacement; + if (rdev) + clear_bit(In_sync, &rdev->flags); } } mddev->layout = conf->algorithm; mddev->chunk_sectors = conf->chunk_sectors; mddev->reshape_position = MaxSector; mddev->delta_disks = 0; + mddev->reshape_backwards = 0; } } diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 8d8e139..2164021 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -285,6 +285,7 @@ enum r5dev_flags { */ R5_Wantdrain, /* dev->towrite needs to be drained */ R5_WantFUA, /* Write should be FUA */ + R5_SyncIO, /* The IO is sync */ R5_WriteError, /* got a write error - need to record it */ R5_MadeGood, /* A bad block has been fixed by writing to it */ R5_ReadRepl, /* Will/did read from replacement rather than orig */ @@ -385,6 +386,12 @@ struct r5conf { short generation; /* increments with every reshape */ unsigned long reshape_checkpoint; /* Time we last updated * metadata */ + long long min_offset_diff; /* minimum difference between + * data_offset and + * new_data_offset across all + * devices. May be negative, + * but is closest to zero. + */ struct list_head handle_list; /* stripes needing handling */ struct list_head hold_list; /* preread ready stripes */ diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index 8c0a3ad..ee75353 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h @@ -233,7 +233,10 @@ struct mdp_superblock_1 { __le32 delta_disks; /* change in number of raid_disks */ __le32 new_layout; /* new layout */ __le32 new_chunk; /* new chunk size (512byte sectors) */ - __u8 pad1[128-124]; /* set to 0 when written */ + __le32 new_offset; /* signed number to add to data_offset in new + * layout. 0 == no-change. This can be + * different on each device in the array. + */ /* constant this-device information - 64 bytes */ __le64 data_offset; /* sector start of data, often 0 */ @@ -281,10 +284,18 @@ struct mdp_superblock_1 { * active device with same 'role'. * 'recovery_offset' is also set. */ +#define MD_FEATURE_RESHAPE_BACKWARDS 32 /* Reshape doesn't change number + * of devices, but is going + * backwards anyway. + */ +#define MD_FEATURE_NEW_OFFSET 64 /* new_offset must be honoured */ #define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ |MD_FEATURE_RECOVERY_OFFSET \ |MD_FEATURE_RESHAPE_ACTIVE \ |MD_FEATURE_BAD_BLOCKS \ - |MD_FEATURE_REPLACEMENT) + |MD_FEATURE_REPLACEMENT \ + |MD_FEATURE_RESHAPE_BACKWARDS \ + |MD_FEATURE_NEW_OFFSET \ + ) #endif diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index 53272e9..640c69c 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -99,8 +99,20 @@ extern const struct raid6_calls raid6_altivec2; extern const struct raid6_calls raid6_altivec4; extern const struct raid6_calls raid6_altivec8; +struct raid6_recov_calls { + void (*data2)(int, size_t, int, int, void **); + void (*datap)(int, size_t, int, void **); + int (*valid)(void); + const char *name; + int priority; +}; + +extern const struct raid6_recov_calls raid6_recov_intx1; +extern const struct raid6_recov_calls raid6_recov_ssse3; + /* Algorithm list */ extern const struct raid6_calls * const raid6_algos[]; +extern const struct raid6_recov_calls *const raid6_recov_algos[]; int raid6_select_algo(void); /* Return values from chk_syndrome */ @@ -111,14 +123,16 @@ int raid6_select_algo(void); /* Galois field tables */ extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256))); +extern const u8 raid6_vgfmul[256][32] __attribute__((aligned(256))); extern const u8 raid6_gfexp[256] __attribute__((aligned(256))); extern const u8 raid6_gfinv[256] __attribute__((aligned(256))); extern const u8 raid6_gfexi[256] __attribute__((aligned(256))); /* Recovery routines */ -void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, +extern void (*raid6_2data_recov)(int disks, size_t bytes, int faila, int failb, void **ptrs); -void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs); +extern void (*raid6_datap_recov)(int disks, size_t bytes, int faila, + void **ptrs); void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs); diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 8a38102..de06dfe 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile @@ -1,6 +1,6 @@ obj-$(CONFIG_RAID6_PQ) += raid6_pq.o -raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \ +raid6_pq-y += algos.o recov.o recov_ssse3.o tables.o int1.o int2.o int4.o \ int8.o int16.o int32.o altivec1.o altivec2.o altivec4.o \ altivec8.o mmx.o sse1.o sse2.o hostprogs-y += mktables diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 8b02f60..589f5f5 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -17,11 +17,11 @@ */ #include <linux/raid/pq.h> -#include <linux/module.h> #ifndef __KERNEL__ #include <sys/mman.h> #include <stdio.h> #else +#include <linux/module.h> #include <linux/gfp.h> #if !RAID6_USE_EMPTY_ZERO_PAGE /* In .bss so it's zeroed */ @@ -34,10 +34,6 @@ struct raid6_calls raid6_call; EXPORT_SYMBOL_GPL(raid6_call); const struct raid6_calls * const raid6_algos[] = { - &raid6_intx1, - &raid6_intx2, - &raid6_intx4, - &raid6_intx8, #if defined(__ia64__) &raid6_intx16, &raid6_intx32, @@ -61,6 +57,24 @@ const struct raid6_calls * const raid6_algos[] = { &raid6_altivec4, &raid6_altivec8, #endif + &raid6_intx1, + &raid6_intx2, + &raid6_intx4, + &raid6_intx8, + NULL +}; + +void (*raid6_2data_recov)(int, size_t, int, int, void **); +EXPORT_SYMBOL_GPL(raid6_2data_recov); + +void (*raid6_datap_recov)(int, size_t, int, void **); +EXPORT_SYMBOL_GPL(raid6_datap_recov); + +const struct raid6_recov_calls *const raid6_recov_algos[] = { +#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) + &raid6_recov_ssse3, +#endif + &raid6_recov_intx1, NULL }; @@ -72,59 +86,55 @@ const struct raid6_calls * const raid6_algos[] = { #define time_before(x, y) ((x) < (y)) #endif -/* Try to pick the best algorithm */ -/* This code uses the gfmul table as convenient data set to abuse */ - -int __init raid6_select_algo(void) +static inline const struct raid6_recov_calls *raid6_choose_recov(void) { - const struct raid6_calls * const * algo; - const struct raid6_calls * best; - char *syndromes; - void *dptrs[(65536/PAGE_SIZE)+2]; - int i, disks; - unsigned long perf, bestperf; - int bestprefer; - unsigned long j0, j1; + const struct raid6_recov_calls *const *algo; + const struct raid6_recov_calls *best; - disks = (65536/PAGE_SIZE)+2; - for ( i = 0 ; i < disks-2 ; i++ ) { - dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i; - } + for (best = NULL, algo = raid6_recov_algos; *algo; algo++) + if (!best || (*algo)->priority > best->priority) + if (!(*algo)->valid || (*algo)->valid()) + best = *algo; - /* Normal code - use a 2-page allocation to avoid D$ conflict */ - syndromes = (void *) __get_free_pages(GFP_KERNEL, 1); + if (best) { + raid6_2data_recov = best->data2; + raid6_datap_recov = best->datap; - if ( !syndromes ) { - printk("raid6: Yikes! No memory available.\n"); - return -ENOMEM; - } + printk("raid6: using %s recovery algorithm\n", best->name); + } else + printk("raid6: Yikes! No recovery algorithm found!\n"); - dptrs[disks-2] = syndromes; - dptrs[disks-1] = syndromes + PAGE_SIZE; + return best; +} + +static inline const struct raid6_calls *raid6_choose_gen( + void *(*const dptrs)[(65536/PAGE_SIZE)+2], const int disks) +{ + unsigned long perf, bestperf, j0, j1; + const struct raid6_calls *const *algo; + const struct raid6_calls *best; - bestperf = 0; bestprefer = 0; best = NULL; + for (bestperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) { + if (!best || (*algo)->prefer >= best->prefer) { + if ((*algo)->valid && !(*algo)->valid()) + continue; - for ( algo = raid6_algos ; *algo ; algo++ ) { - if ( !(*algo)->valid || (*algo)->valid() ) { perf = 0; preempt_disable(); j0 = jiffies; - while ( (j1 = jiffies) == j0 ) + while ((j1 = jiffies) == j0) cpu_relax(); while (time_before(jiffies, j1 + (1<<RAID6_TIME_JIFFIES_LG2))) { - (*algo)->gen_syndrome(disks, PAGE_SIZE, dptrs); + (*algo)->gen_syndrome(disks, PAGE_SIZE, *dptrs); perf++; } preempt_enable(); - if ( (*algo)->prefer > bestprefer || - ((*algo)->prefer == bestprefer && - perf > bestperf) ) { - best = *algo; - bestprefer = best->prefer; + if (perf > bestperf) { bestperf = perf; + best = *algo; } printk("raid6: %-8s %5ld MB/s\n", (*algo)->name, (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); @@ -139,9 +149,46 @@ int __init raid6_select_algo(void) } else printk("raid6: Yikes! No algorithm found!\n"); + return best; +} + + +/* Try to pick the best algorithm */ +/* This code uses the gfmul table as convenient data set to abuse */ + +int __init raid6_select_algo(void) +{ + const int disks = (65536/PAGE_SIZE)+2; + + const struct raid6_calls *gen_best; + const struct raid6_recov_calls *rec_best; + char *syndromes; + void *dptrs[(65536/PAGE_SIZE)+2]; + int i; + + for (i = 0; i < disks-2; i++) + dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i; + + /* Normal code - use a 2-page allocation to avoid D$ conflict */ + syndromes = (void *) __get_free_pages(GFP_KERNEL, 1); + + if (!syndromes) { + printk("raid6: Yikes! No memory available.\n"); + return -ENOMEM; + } + + dptrs[disks-2] = syndromes; + dptrs[disks-1] = syndromes + PAGE_SIZE; + + /* select raid gen_syndrome function */ + gen_best = raid6_choose_gen(&dptrs, disks); + + /* select raid recover functions */ + rec_best = raid6_choose_recov(); + free_pages((unsigned long)syndromes, 1); - return best ? 0 : -EINVAL; + return gen_best && rec_best ? 0 : -EINVAL; } static void raid6_exit(void) diff --git a/lib/raid6/mktables.c b/lib/raid6/mktables.c index 8a37809..39787db 100644 --- a/lib/raid6/mktables.c +++ b/lib/raid6/mktables.c @@ -81,6 +81,31 @@ int main(int argc, char *argv[]) printf("EXPORT_SYMBOL(raid6_gfmul);\n"); printf("#endif\n"); + /* Compute vector multiplication table */ + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_vgfmul[256][32] =\n" + "{\n"); + for (i = 0; i < 256; i++) { + printf("\t{\n"); + for (j = 0; j < 16; j += 8) { + printf("\t\t"); + for (k = 0; k < 8; k++) + printf("0x%02x,%c", gfmul(i, j + k), + (k == 7) ? '\n' : ' '); + } + for (j = 0; j < 16; j += 8) { + printf("\t\t"); + for (k = 0; k < 8; k++) + printf("0x%02x,%c", gfmul(i, (j + k) << 4), + (k == 7) ? '\n' : ' '); + } + printf("\t},\n"); + } + printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_vgfmul);\n"); + printf("#endif\n"); + /* Compute power-of-2 table (exponent) */ v = 1; printf("\nconst u8 __attribute__((aligned(256)))\n" diff --git a/lib/raid6/recov.c b/lib/raid6/recov.c index fe275d7..1805a5c 100644 --- a/lib/raid6/recov.c +++ b/lib/raid6/recov.c @@ -22,7 +22,7 @@ #include <linux/raid/pq.h> /* Recover two failed data blocks. */ -void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, +void raid6_2data_recov_intx1(int disks, size_t bytes, int faila, int failb, void **ptrs) { u8 *p, *q, *dp, *dq; @@ -64,10 +64,9 @@ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, p++; q++; } } -EXPORT_SYMBOL_GPL(raid6_2data_recov); /* Recover failure of one data block plus the P block */ -void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) +void raid6_datap_recov_intx1(int disks, size_t bytes, int faila, void **ptrs) { u8 *p, *q, *dq; const u8 *qmul; /* Q multiplier table */ @@ -96,7 +95,15 @@ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) q++; dq++; } } -EXPORT_SYMBOL_GPL(raid6_datap_recov); + + +const struct raid6_recov_calls raid6_recov_intx1 = { + .data2 = raid6_2data_recov_intx1, + .datap = raid6_datap_recov_intx1, + .valid = NULL, + .name = "intx1", + .priority = 0, +}; #ifndef __KERNEL__ /* Testing only */ diff --git a/lib/raid6/recov_ssse3.c b/lib/raid6/recov_ssse3.c new file mode 100644 index 0000000..37ae619 --- /dev/null +++ b/lib/raid6/recov_ssse3.c @@ -0,0 +1,335 @@ +/* + * Copyright (C) 2012 Intel Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) + +#include <linux/raid/pq.h> +#include "x86.h" + +static int raid6_has_ssse3(void) +{ + return boot_cpu_has(X86_FEATURE_XMM) && + boot_cpu_has(X86_FEATURE_XMM2) && + boot_cpu_has(X86_FEATURE_SSSE3); +} + +void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila, int failb, + void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + static const u8 __aligned(16) x0f[16] = { + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f}; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data pages + Use the dead data pages as temporary storage for + delta p and delta q */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks-2] = p; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ + raid6_gfexp[failb]]]; + + kernel_fpu_begin(); + + asm volatile("movdqa %0,%%xmm7" : : "m" (x0f[0])); + +#ifdef CONFIG_X86_64 + asm volatile("movdqa %0,%%xmm6" : : "m" (qmul[0])); + asm volatile("movdqa %0,%%xmm14" : : "m" (pbmul[0])); + asm volatile("movdqa %0,%%xmm15" : : "m" (pbmul[16])); +#endif + + /* Now do it... */ + while (bytes) { +#ifdef CONFIG_X86_64 + /* xmm6, xmm14, xmm15 */ + + asm volatile("movdqa %0,%%xmm1" : : "m" (q[0])); + asm volatile("movdqa %0,%%xmm9" : : "m" (q[16])); + asm volatile("movdqa %0,%%xmm0" : : "m" (p[0])); + asm volatile("movdqa %0,%%xmm8" : : "m" (p[16])); + asm volatile("pxor %0,%%xmm1" : : "m" (dq[0])); + asm volatile("pxor %0,%%xmm9" : : "m" (dq[16])); + asm volatile("pxor %0,%%xmm0" : : "m" (dp[0])); + asm volatile("pxor %0,%%xmm8" : : "m" (dp[16])); + + /* xmm0/8 = px */ + + asm volatile("movdqa %xmm6,%xmm4"); + asm volatile("movdqa %0,%%xmm5" : : "m" (qmul[16])); + asm volatile("movdqa %xmm6,%xmm12"); + asm volatile("movdqa %xmm5,%xmm13"); + asm volatile("movdqa %xmm1,%xmm3"); + asm volatile("movdqa %xmm9,%xmm11"); + asm volatile("movdqa %xmm0,%xmm2"); /* xmm2/10 = px */ + asm volatile("movdqa %xmm8,%xmm10"); + asm volatile("psraw $4,%xmm1"); + asm volatile("psraw $4,%xmm9"); + asm volatile("pand %xmm7,%xmm3"); + asm volatile("pand %xmm7,%xmm11"); + asm volatile("pand %xmm7,%xmm1"); + asm volatile("pand %xmm7,%xmm9"); + asm volatile("pshufb %xmm3,%xmm4"); + asm volatile("pshufb %xmm11,%xmm12"); + asm volatile("pshufb %xmm1,%xmm5"); + asm volatile("pshufb %xmm9,%xmm13"); + asm volatile("pxor %xmm4,%xmm5"); + asm volatile("pxor %xmm12,%xmm13"); + + /* xmm5/13 = qx */ + + asm volatile("movdqa %xmm14,%xmm4"); + asm volatile("movdqa %xmm15,%xmm1"); + asm volatile("movdqa %xmm14,%xmm12"); + asm volatile("movdqa %xmm15,%xmm9"); + asm volatile("movdqa %xmm2,%xmm3"); + asm volatile("movdqa %xmm10,%xmm11"); + asm volatile("psraw $4,%xmm2"); + asm volatile("psraw $4,%xmm10"); + asm volatile("pand %xmm7,%xmm3"); + asm volatile("pand %xmm7,%xmm11"); + asm volatile("pand %xmm7,%xmm2"); + asm volatile("pand %xmm7,%xmm10"); + asm volatile("pshufb %xmm3,%xmm4"); + asm volatile("pshufb %xmm11,%xmm12"); + asm volatile("pshufb %xmm2,%xmm1"); + asm volatile("pshufb %xmm10,%xmm9"); + asm volatile("pxor %xmm4,%xmm1"); + asm volatile("pxor %xmm12,%xmm9"); + + /* xmm1/9 = pbmul[px] */ + asm volatile("pxor %xmm5,%xmm1"); + asm volatile("pxor %xmm13,%xmm9"); + /* xmm1/9 = db = DQ */ + asm volatile("movdqa %%xmm1,%0" : "=m" (dq[0])); + asm volatile("movdqa %%xmm9,%0" : "=m" (dq[16])); + + asm volatile("pxor %xmm1,%xmm0"); + asm volatile("pxor %xmm9,%xmm8"); + asm volatile("movdqa %%xmm0,%0" : "=m" (dp[0])); + asm volatile("movdqa %%xmm8,%0" : "=m" (dp[16])); + + bytes -= 32; + p += 32; + q += 32; + dp += 32; + dq += 32; +#else + asm volatile("movdqa %0,%%xmm1" : : "m" (*q)); + asm volatile("movdqa %0,%%xmm0" : : "m" (*p)); + asm volatile("pxor %0,%%xmm1" : : "m" (*dq)); + asm volatile("pxor %0,%%xmm0" : : "m" (*dp)); + + /* 1 = dq ^ q + * 0 = dp ^ p + */ + asm volatile("movdqa %0,%%xmm4" : : "m" (qmul[0])); + asm volatile("movdqa %0,%%xmm5" : : "m" (qmul[16])); + + asm volatile("movdqa %xmm1,%xmm3"); + asm volatile("psraw $4,%xmm1"); + asm volatile("pand %xmm7,%xmm3"); + asm volatile("pand %xmm7,%xmm1"); + asm volatile("pshufb %xmm3,%xmm4"); + asm volatile("pshufb %xmm1,%xmm5"); + asm volatile("pxor %xmm4,%xmm5"); + + asm volatile("movdqa %xmm0,%xmm2"); /* xmm2 = px */ + + /* xmm5 = qx */ + + asm volatile("movdqa %0,%%xmm4" : : "m" (pbmul[0])); + asm volatile("movdqa %0,%%xmm1" : : "m" (pbmul[16])); + asm volatile("movdqa %xmm2,%xmm3"); + asm volatile("psraw $4,%xmm2"); + asm volatile("pand %xmm7,%xmm3"); + asm volatile("pand %xmm7,%xmm2"); + asm volatile("pshufb %xmm3,%xmm4"); + asm volatile("pshufb %xmm2,%xmm1"); + asm volatile("pxor %xmm4,%xmm1"); + + /* xmm1 = pbmul[px] */ + asm volatile("pxor %xmm5,%xmm1"); + /* xmm1 = db = DQ */ + asm volatile("movdqa %%xmm1,%0" : "=m" (*dq)); + + asm volatile("pxor %xmm1,%xmm0"); + asm volatile("movdqa %%xmm0,%0" : "=m" (*dp)); + + bytes -= 16; + p += 16; + q += 16; + dp += 16; + dq += 16; +#endif + } + + kernel_fpu_end(); +} + + +void raid6_datap_recov_ssse3(int disks, size_t bytes, int faila, void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + static const u8 __aligned(16) x0f[16] = { + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f}; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data page + Use the dead data page as temporary storage for delta q */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_fpu_begin(); + + asm volatile("movdqa %0, %%xmm7" : : "m" (x0f[0])); + + while (bytes) { +#ifdef CONFIG_X86_64 + asm volatile("movdqa %0, %%xmm3" : : "m" (dq[0])); + asm volatile("movdqa %0, %%xmm4" : : "m" (dq[16])); + asm volatile("pxor %0, %%xmm3" : : "m" (q[0])); + asm volatile("movdqa %0, %%xmm0" : : "m" (qmul[0])); + + /* xmm3 = q[0] ^ dq[0] */ + + asm volatile("pxor %0, %%xmm4" : : "m" (q[16])); + asm volatile("movdqa %0, %%xmm1" : : "m" (qmul[16])); + + /* xmm4 = q[16] ^ dq[16] */ + + asm volatile("movdqa %xmm3, %xmm6"); + asm volatile("movdqa %xmm4, %xmm8"); + + /* xmm4 = xmm8 = q[16] ^ dq[16] */ + + asm volatile("psraw $4, %xmm3"); + asm volatile("pand %xmm7, %xmm6"); + asm volatile("pand %xmm7, %xmm3"); + asm volatile("pshufb %xmm6, %xmm0"); + asm volatile("pshufb %xmm3, %xmm1"); + asm volatile("movdqa %0, %%xmm10" : : "m" (qmul[0])); + asm volatile("pxor %xmm0, %xmm1"); + asm volatile("movdqa %0, %%xmm11" : : "m" (qmul[16])); + + /* xmm1 = qmul[q[0] ^ dq[0]] */ + + asm volatile("psraw $4, %xmm4"); + asm volatile("pand %xmm7, %xmm8"); + asm volatile("pand %xmm7, %xmm4"); + asm volatile("pshufb %xmm8, %xmm10"); + asm volatile("pshufb %xmm4, %xmm11"); + asm volatile("movdqa %0, %%xmm2" : : "m" (p[0])); + asm volatile("pxor %xmm10, %xmm11"); + asm volatile("movdqa %0, %%xmm12" : : "m" (p[16])); + + /* xmm11 = qmul[q[16] ^ dq[16]] */ + + asm volatile("pxor %xmm1, %xmm2"); + + /* xmm2 = p[0] ^ qmul[q[0] ^ dq[0]] */ + + asm volatile("pxor %xmm11, %xmm12"); + + /* xmm12 = p[16] ^ qmul[q[16] ^ dq[16]] */ + + asm volatile("movdqa %%xmm1, %0" : "=m" (dq[0])); + asm volatile("movdqa %%xmm11, %0" : "=m" (dq[16])); + + asm volatile("movdqa %%xmm2, %0" : "=m" (p[0])); + asm volatile("movdqa %%xmm12, %0" : "=m" (p[16])); + + bytes -= 32; + p += 32; + q += 32; + dq += 32; + +#else + asm volatile("movdqa %0, %%xmm3" : : "m" (dq[0])); + asm volatile("movdqa %0, %%xmm0" : : "m" (qmul[0])); + asm volatile("pxor %0, %%xmm3" : : "m" (q[0])); + asm volatile("movdqa %0, %%xmm1" : : "m" (qmul[16])); + + /* xmm3 = *q ^ *dq */ + + asm volatile("movdqa %xmm3, %xmm6"); + asm volatile("movdqa %0, %%xmm2" : : "m" (p[0])); + asm volatile("psraw $4, %xmm3"); + asm volatile("pand %xmm7, %xmm6"); + asm volatile("pand %xmm7, %xmm3"); + asm volatile("pshufb %xmm6, %xmm0"); + asm volatile("pshufb %xmm3, %xmm1"); + asm volatile("pxor %xmm0, %xmm1"); + + /* xmm1 = qmul[*q ^ *dq */ + + asm volatile("pxor %xmm1, %xmm2"); + + /* xmm2 = *p ^ qmul[*q ^ *dq] */ + + asm volatile("movdqa %%xmm1, %0" : "=m" (dq[0])); + asm volatile("movdqa %%xmm2, %0" : "=m" (p[0])); + + bytes -= 16; + p += 16; + q += 16; + dq += 16; +#endif + } + + kernel_fpu_end(); +} + +const struct raid6_recov_calls raid6_recov_ssse3 = { + .data2 = raid6_2data_recov_ssse3, + .datap = raid6_datap_recov_ssse3, + .valid = raid6_has_ssse3, +#ifdef CONFIG_X86_64 + .name = "ssse3x2", +#else + .name = "ssse3x1", +#endif + .priority = 1, +}; + +#endif diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile index aa65169..c76151d 100644 --- a/lib/raid6/test/Makefile +++ b/lib/raid6/test/Makefile @@ -23,7 +23,7 @@ RANLIB = ranlib all: raid6.a raid6test raid6.a: int1.o int2.o int4.o int8.o int16.o int32.o mmx.o sse1.o sse2.o \ - altivec1.o altivec2.o altivec4.o altivec8.o recov.o algos.o \ + altivec1.o altivec2.o altivec4.o altivec8.o recov.o recov_ssse3.o algos.o \ tables.o rm -f $@ $(AR) cq $@ $^ diff --git a/lib/raid6/test/test.c b/lib/raid6/test/test.c index 7a93031..5a485b7 100644 --- a/lib/raid6/test/test.c +++ b/lib/raid6/test/test.c @@ -90,25 +90,35 @@ static int test_disks(int i, int j) int main(int argc, char *argv[]) { const struct raid6_calls *const *algo; + const struct raid6_recov_calls *const *ra; int i, j; int err = 0; makedata(); - for (algo = raid6_algos; *algo; algo++) { - if (!(*algo)->valid || (*algo)->valid()) { - raid6_call = **algo; + for (ra = raid6_recov_algos; *ra; ra++) { + if ((*ra)->valid && !(*ra)->valid()) + continue; + raid6_2data_recov = (*ra)->data2; + raid6_datap_recov = (*ra)->datap; - /* Nuke syndromes */ - memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE); + printf("using recovery %s\n", (*ra)->name); - /* Generate assumed good syndrome */ - raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, - (void **)&dataptrs); + for (algo = raid6_algos; *algo; algo++) { + if (!(*algo)->valid || (*algo)->valid()) { + raid6_call = **algo; - for (i = 0; i < NDISKS-1; i++) - for (j = i+1; j < NDISKS; j++) - err += test_disks(i, j); + /* Nuke syndromes */ + memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE); + + /* Generate assumed good syndrome */ + raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, + (void **)&dataptrs); + + for (i = 0; i < NDISKS-1; i++) + for (j = i+1; j < NDISKS; j++) + err += test_disks(i, j); + } } printf("\n"); } diff --git a/lib/raid6/x86.h b/lib/raid6/x86.h index cb2a8c9..d55d632 100644 --- a/lib/raid6/x86.h +++ b/lib/raid6/x86.h @@ -35,24 +35,29 @@ static inline void kernel_fpu_end(void) { } +#define __aligned(x) __attribute__((aligned(x))) + #define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */ #define X86_FEATURE_FXSR (0*32+24) /* FXSAVE and FXRSTOR instructions * (fast save and restore) */ #define X86_FEATURE_XMM (0*32+25) /* Streaming SIMD Extensions */ #define X86_FEATURE_XMM2 (0*32+26) /* Streaming SIMD Extensions-2 */ +#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ +#define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */ +#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ #define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ /* Should work well enough on modern CPUs for testing */ static inline int boot_cpu_has(int flag) { - u32 eax = (flag >> 5) ? 0x80000001 : 1; - u32 edx; + u32 eax = (flag & 0x20) ? 0x80000001 : 1; + u32 ecx, edx; asm volatile("cpuid" - : "+a" (eax), "=d" (edx) - : : "ecx", "ebx"); + : "+a" (eax), "=d" (edx), "=c" (ecx) + : : "ebx"); - return (edx >> (flag & 31)) & 1; + return ((flag & 0x80 ? ecx : edx) >> (flag & 31)) & 1; } #endif /* ndef __KERNEL__ */ |