diff options
165 files changed, 2562 insertions, 2524 deletions
diff --git a/Documentation/filesystems/xip.txt b/Documentation/filesystems/xip.txt index 0466ee5..b774729 100644 --- a/Documentation/filesystems/xip.txt +++ b/Documentation/filesystems/xip.txt @@ -28,12 +28,15 @@ Implementation Execute-in-place is implemented in three steps: block device operation, address space operation, and file operations. -A block device operation named direct_access is used to retrieve a -reference (pointer) to a block on-disk. The reference is supposed to be -cpu-addressable, physical address and remain valid until the release operation -is performed. A struct block_device reference is used to address the device, -and a sector_t argument is used to identify the individual block. As an -alternative, memory technology devices can be used for this. +A block device operation named direct_access is used to translate the +block device sector number to a page frame number (pfn) that identifies +the physical page for the memory. It also returns a kernel virtual +address that can be used to access the memory. + +The direct_access method takes a 'size' parameter that indicates the +number of bytes being requested. The function should return the number +of bytes that can be contiguously accessed at that offset. It may also +return a negative errno if an error occurs. The block device operation is optional, these block devices support it as of today: diff --git a/Documentation/nommu-mmap.txt b/Documentation/nommu-mmap.txt index 8e1ddec..ae57b9e 100644 --- a/Documentation/nommu-mmap.txt +++ b/Documentation/nommu-mmap.txt @@ -43,12 +43,12 @@ and it's also much more restricted in the latter case: even if this was created by another process. - If possible, the file mapping will be directly on the backing device - if the backing device has the BDI_CAP_MAP_DIRECT capability and + if the backing device has the NOMMU_MAP_DIRECT capability and appropriate mapping protection capabilities. Ramfs, romfs, cramfs and mtd might all permit this. 
- If the backing device device can't or won't permit direct sharing, - but does have the BDI_CAP_MAP_COPY capability, then a copy of the + but does have the NOMMU_MAP_COPY capability, then a copy of the appropriate bit of the file will be read into a contiguous bit of memory and any extraneous space beyond the EOF will be cleared @@ -220,7 +220,7 @@ directly (can't be copied). The file->f_op->mmap() operation will be called to actually inaugurate the mapping. It can be rejected at that point. Returning the ENOSYS error will -cause the mapping to be copied instead if BDI_CAP_MAP_COPY is specified. +cause the mapping to be copied instead if NOMMU_MAP_COPY is specified. The vm_ops->close() routine will be invoked when the last mapping on a chardev is removed. An existing mapping will be shared, partially or not, if possible @@ -232,7 +232,7 @@ want to handle it, despite the fact it's got an operation. For instance, it might try directing the call to a secondary driver which turns out not to implement it. Such is the case for the framebuffer driver which attempts to direct the call to the device-specific driver. Under such circumstances, the -mapping request will be rejected if BDI_CAP_MAP_COPY is not specified, and a +mapping request will be rejected if NOMMU_MAP_COPY is not specified, and a copy mapped otherwise. 
IMPORTANT NOTE: diff --git a/MAINTAINERS b/MAINTAINERS index 2299965..482bfc5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6642,9 +6642,10 @@ F: include/uapi/linux/netrom.h F: net/netrom/ NETWORK BLOCK DEVICE (NBD) -M: Paul Clements <Paul.Clements@steeleye.com> +M: Markus Pargmann <mpa@pengutronix.de> S: Maintained L: nbd-general@lists.sourceforge.net +T: git git://git.pengutronix.de/git/mpa/linux-nbd.git F: Documentation/blockdev/nbd.txt F: drivers/block/nbd.c F: include/linux/nbd.h @@ -10690,6 +10691,7 @@ F: drivers/pci/*xen* XEN BLOCK SUBSYSTEM M: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> +M: Roger Pau Monné <roger.pau@citrix.com> L: xen-devel@lists.xenproject.org (moderated for non-subscribers) S: Supported F: drivers/block/xen-blkback/* diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index f532c92..ee90db1 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -139,26 +139,17 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio) * axon_ram_direct_access - direct_access() method for block device * @device, @sector, @data: see block_device_operations method */ -static int +static long axon_ram_direct_access(struct block_device *device, sector_t sector, - void **kaddr, unsigned long *pfn) + void **kaddr, unsigned long *pfn, long size) { struct axon_ram_bank *bank = device->bd_disk->private_data; - loff_t offset; - - offset = sector; - if (device->bd_part != NULL) - offset += device->bd_part->start_sect; - offset <<= AXON_RAM_SECTOR_SHIFT; - if (offset >= bank->size) { - dev_err(&bank->device->dev, "Access outside of address space\n"); - return -ERANGE; - } + loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT; *kaddr = (void *)(bank->ph_addr + offset); - *pfn = virt_to_phys(kaddr) >> PAGE_SHIFT; + *pfn = virt_to_phys(*kaddr) >> PAGE_SHIFT; - return 0; + return bank->size - offset; } static const struct block_device_operations axon_ram_devops = { diff --git a/arch/x86/Makefile 
b/arch/x86/Makefile index 920e616..5ba2d9c 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -148,6 +148,7 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI # does binutils support specific instructions? asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1) +asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1) asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1) avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1) avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1) diff --git a/block/bio.c b/block/bio.c index 471d738..f66a4ea 100644 --- a/block/bio.c +++ b/block/bio.c @@ -28,7 +28,6 @@ #include <linux/mempool.h> #include <linux/workqueue.h> #include <linux/cgroup.h> -#include <scsi/sg.h> /* for struct sg_iovec */ #include <trace/events/block.h> @@ -1022,21 +1021,11 @@ void bio_copy_data(struct bio *dst, struct bio *src) EXPORT_SYMBOL(bio_copy_data); struct bio_map_data { - int nr_sgvecs; int is_our_pages; - struct sg_iovec sgvecs[]; + struct iov_iter iter; + struct iovec iov[]; }; -static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, - const struct sg_iovec *iov, int iov_count, - int is_our_pages) -{ - memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); - bmd->nr_sgvecs = iov_count; - bmd->is_our_pages = is_our_pages; - bio->bi_private = bmd; -} - static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count, gfp_t gfp_mask) { @@ -1044,85 +1033,101 @@ static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count, return NULL; return kmalloc(sizeof(struct bio_map_data) + - sizeof(struct sg_iovec) * iov_count, gfp_mask); + sizeof(struct iovec) * iov_count, gfp_mask); } -static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_count, - int to_user, int from_user, int do_free_page) +/** + * bio_copy_from_iter - copy all pages from iov_iter to bio + 
* @bio: The &struct bio which describes the I/O as destination + * @iter: iov_iter as source + * + * Copy all pages from iov_iter to bio. + * Returns 0 on success, or error on failure. + */ +static int bio_copy_from_iter(struct bio *bio, struct iov_iter iter) { - int ret = 0, i; + int i; struct bio_vec *bvec; - int iov_idx = 0; - unsigned int iov_off = 0; bio_for_each_segment_all(bvec, bio, i) { - char *bv_addr = page_address(bvec->bv_page); - unsigned int bv_len = bvec->bv_len; + ssize_t ret; - while (bv_len && iov_idx < iov_count) { - unsigned int bytes; - char __user *iov_addr; + ret = copy_page_from_iter(bvec->bv_page, + bvec->bv_offset, + bvec->bv_len, + &iter); - bytes = min_t(unsigned int, - iov[iov_idx].iov_len - iov_off, bv_len); - iov_addr = iov[iov_idx].iov_base + iov_off; + if (!iov_iter_count(&iter)) + break; - if (!ret) { - if (to_user) - ret = copy_to_user(iov_addr, bv_addr, - bytes); + if (ret < bvec->bv_len) + return -EFAULT; + } - if (from_user) - ret = copy_from_user(bv_addr, iov_addr, - bytes); + return 0; +} - if (ret) - ret = -EFAULT; - } +/** + * bio_copy_to_iter - copy all pages from bio to iov_iter + * @bio: The &struct bio which describes the I/O as source + * @iter: iov_iter as destination + * + * Copy all pages from bio to iov_iter. + * Returns 0 on success, or error on failure. 
+ */ +static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter) +{ + int i; + struct bio_vec *bvec; - bv_len -= bytes; - bv_addr += bytes; - iov_addr += bytes; - iov_off += bytes; + bio_for_each_segment_all(bvec, bio, i) { + ssize_t ret; - if (iov[iov_idx].iov_len == iov_off) { - iov_idx++; - iov_off = 0; - } - } + ret = copy_page_to_iter(bvec->bv_page, + bvec->bv_offset, + bvec->bv_len, + &iter); + + if (!iov_iter_count(&iter)) + break; - if (do_free_page) - __free_page(bvec->bv_page); + if (ret < bvec->bv_len) + return -EFAULT; } - return ret; + return 0; +} + +static void bio_free_pages(struct bio *bio) +{ + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) + __free_page(bvec->bv_page); } /** * bio_uncopy_user - finish previously mapped bio * @bio: bio being terminated * - * Free pages allocated from bio_copy_user() and write back data + * Free pages allocated from bio_copy_user_iov() and write back data * to user space in case of a read. */ int bio_uncopy_user(struct bio *bio) { struct bio_map_data *bmd = bio->bi_private; - struct bio_vec *bvec; - int ret = 0, i; + int ret = 0; if (!bio_flagged(bio, BIO_NULL_MAPPED)) { /* * if we're in a workqueue, the request is orphaned, so * don't copy into a random user address space, just free. */ - if (current->mm) - ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, - bio_data_dir(bio) == READ, - 0, bmd->is_our_pages); - else if (bmd->is_our_pages) - bio_for_each_segment_all(bvec, bio, i) - __free_page(bvec->bv_page); + if (current->mm && bio_data_dir(bio) == READ) + ret = bio_copy_to_iter(bio, bmd->iter); + if (bmd->is_our_pages) + bio_free_pages(bio); } kfree(bmd); bio_put(bio); @@ -1132,12 +1137,10 @@ EXPORT_SYMBOL(bio_uncopy_user); /** * bio_copy_user_iov - copy user data to bio - * @q: destination block queue - * @map_data: pointer to the rq_map_data holding pages (if necessary) - * @iov: the iovec. 
- * @iov_count: number of elements in the iovec - * @write_to_vm: bool indicating writing to pages or not - * @gfp_mask: memory allocation flags + * @q: destination block queue + * @map_data: pointer to the rq_map_data holding pages (if necessary) + * @iter: iovec iterator + * @gfp_mask: memory allocation flags * * Prepares and returns a bio for indirect user io, bouncing data * to/from kernel pages as necessary. Must be paired with @@ -1145,25 +1148,25 @@ EXPORT_SYMBOL(bio_uncopy_user); */ struct bio *bio_copy_user_iov(struct request_queue *q, struct rq_map_data *map_data, - const struct sg_iovec *iov, int iov_count, - int write_to_vm, gfp_t gfp_mask) + const struct iov_iter *iter, + gfp_t gfp_mask) { struct bio_map_data *bmd; - struct bio_vec *bvec; struct page *page; struct bio *bio; int i, ret; int nr_pages = 0; - unsigned int len = 0; + unsigned int len = iter->count; unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0; - for (i = 0; i < iov_count; i++) { + for (i = 0; i < iter->nr_segs; i++) { unsigned long uaddr; unsigned long end; unsigned long start; - uaddr = (unsigned long)iov[i].iov_base; - end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; + uaddr = (unsigned long) iter->iov[i].iov_base; + end = (uaddr + iter->iov[i].iov_len + PAGE_SIZE - 1) + >> PAGE_SHIFT; start = uaddr >> PAGE_SHIFT; /* @@ -1173,22 +1176,31 @@ struct bio *bio_copy_user_iov(struct request_queue *q, return ERR_PTR(-EINVAL); nr_pages += end - start; - len += iov[i].iov_len; } if (offset) nr_pages++; - bmd = bio_alloc_map_data(iov_count, gfp_mask); + bmd = bio_alloc_map_data(iter->nr_segs, gfp_mask); if (!bmd) return ERR_PTR(-ENOMEM); + /* + * We need to do a deep copy of the iov_iter including the iovecs. + * The caller provided iov might point to an on-stack or otherwise + * shortlived one. + */ + bmd->is_our_pages = map_data ? 
0 : 1; + memcpy(bmd->iov, iter->iov, sizeof(struct iovec) * iter->nr_segs); + iov_iter_init(&bmd->iter, iter->type, bmd->iov, + iter->nr_segs, iter->count); + ret = -ENOMEM; bio = bio_kmalloc(gfp_mask, nr_pages); if (!bio) goto out_bmd; - if (!write_to_vm) + if (iter->type & WRITE) bio->bi_rw |= REQ_WRITE; ret = 0; @@ -1236,20 +1248,18 @@ struct bio *bio_copy_user_iov(struct request_queue *q, /* * success */ - if ((!write_to_vm && (!map_data || !map_data->null_mapped)) || + if (((iter->type & WRITE) && (!map_data || !map_data->null_mapped)) || (map_data && map_data->from_user)) { - ret = __bio_copy_iov(bio, iov, iov_count, 0, 1, 0); + ret = bio_copy_from_iter(bio, *iter); if (ret) goto cleanup; } - bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1); + bio->bi_private = bmd; return bio; cleanup: if (!map_data) - bio_for_each_segment_all(bvec, bio, i) - __free_page(bvec->bv_page); - + bio_free_pages(bio); bio_put(bio); out_bmd: kfree(bmd); @@ -1257,46 +1267,30 @@ out_bmd: } /** - * bio_copy_user - copy user data to bio - * @q: destination block queue - * @map_data: pointer to the rq_map_data holding pages (if necessary) - * @uaddr: start of user address - * @len: length in bytes - * @write_to_vm: bool indicating writing to pages or not - * @gfp_mask: memory allocation flags + * bio_map_user_iov - map user iovec into bio + * @q: the struct request_queue for the bio + * @iter: iovec iterator + * @gfp_mask: memory allocation flags * - * Prepares and returns a bio for indirect user io, bouncing data - * to/from kernel pages as necessary. Must be paired with - * call bio_uncopy_user() on io completion. + * Map the user space address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. 
*/ -struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data, - unsigned long uaddr, unsigned int len, - int write_to_vm, gfp_t gfp_mask) +struct bio *bio_map_user_iov(struct request_queue *q, + const struct iov_iter *iter, + gfp_t gfp_mask) { - struct sg_iovec iov; - - iov.iov_base = (void __user *)uaddr; - iov.iov_len = len; - - return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask); -} -EXPORT_SYMBOL(bio_copy_user); - -static struct bio *__bio_map_user_iov(struct request_queue *q, - struct block_device *bdev, - const struct sg_iovec *iov, int iov_count, - int write_to_vm, gfp_t gfp_mask) -{ - int i, j; + int j; int nr_pages = 0; struct page **pages; struct bio *bio; int cur_page = 0; int ret, offset; + struct iov_iter i; + struct iovec iov; - for (i = 0; i < iov_count; i++) { - unsigned long uaddr = (unsigned long)iov[i].iov_base; - unsigned long len = iov[i].iov_len; + iov_for_each(iov, i, *iter) { + unsigned long uaddr = (unsigned long) iov.iov_base; + unsigned long len = iov.iov_len; unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; unsigned long start = uaddr >> PAGE_SHIFT; @@ -1326,16 +1320,17 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, if (!pages) goto out; - for (i = 0; i < iov_count; i++) { - unsigned long uaddr = (unsigned long)iov[i].iov_base; - unsigned long len = iov[i].iov_len; + iov_for_each(iov, i, *iter) { + unsigned long uaddr = (unsigned long) iov.iov_base; + unsigned long len = iov.iov_len; unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; unsigned long start = uaddr >> PAGE_SHIFT; const int local_nr_pages = end - start; const int page_limit = cur_page + local_nr_pages; ret = get_user_pages_fast(uaddr, local_nr_pages, - write_to_vm, &pages[cur_page]); + (iter->type & WRITE) != WRITE, + &pages[cur_page]); if (ret < local_nr_pages) { ret = -EFAULT; goto out_unmap; @@ -1375,72 +1370,10 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, /* * 
set data direction, and check if mapped pages need bouncing */ - if (!write_to_vm) + if (iter->type & WRITE) bio->bi_rw |= REQ_WRITE; - bio->bi_bdev = bdev; bio->bi_flags |= (1 << BIO_USER_MAPPED); - return bio; - - out_unmap: - for (i = 0; i < nr_pages; i++) { - if(!pages[i]) - break; - page_cache_release(pages[i]); - } - out: - kfree(pages); - bio_put(bio); - return ERR_PTR(ret); -} - -/** - * bio_map_user - map user address into bio - * @q: the struct request_queue for the bio - * @bdev: destination block device - * @uaddr: start of user address - * @len: length in bytes - * @write_to_vm: bool indicating writing to pages or not - * @gfp_mask: memory allocation flags - * - * Map the user space address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev, - unsigned long uaddr, unsigned int len, int write_to_vm, - gfp_t gfp_mask) -{ - struct sg_iovec iov; - - iov.iov_base = (void __user *)uaddr; - iov.iov_len = len; - - return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask); -} -EXPORT_SYMBOL(bio_map_user); - -/** - * bio_map_user_iov - map user sg_iovec table into bio - * @q: the struct request_queue for the bio - * @bdev: destination block device - * @iov: the iovec. - * @iov_count: number of elements in the iovec - * @write_to_vm: bool indicating writing to pages or not - * @gfp_mask: memory allocation flags - * - * Map the user space address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. 
- */ -struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, - const struct sg_iovec *iov, int iov_count, - int write_to_vm, gfp_t gfp_mask) -{ - struct bio *bio; - - bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm, - gfp_mask); - if (IS_ERR(bio)) - return bio; /* * subtle -- if __bio_map_user() ended up bouncing a bio, @@ -1449,8 +1382,18 @@ struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, * reference to it */ bio_get(bio); - return bio; + + out_unmap: + for (j = 0; j < nr_pages; j++) { + if (!pages[j]) + break; + page_cache_release(pages[j]); + } + out: + kfree(pages); + bio_put(bio); + return ERR_PTR(ret); } static void __bio_unmap_user(struct bio *bio) @@ -1492,8 +1435,18 @@ static void bio_map_kern_endio(struct bio *bio, int err) bio_put(bio); } -static struct bio *__bio_map_kern(struct request_queue *q, void *data, - unsigned int len, gfp_t gfp_mask) +/** + * bio_map_kern - map kernel address into bio + * @q: the struct request_queue for the bio + * @data: pointer to buffer to map + * @len: length in bytes + * @gfp_mask: allocation flags for bio allocation + * + * Map the kernel address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. 
+ */ +struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, + gfp_t gfp_mask) { unsigned long kaddr = (unsigned long)data; unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -1517,8 +1470,11 @@ static struct bio *__bio_map_kern(struct request_queue *q, void *data, bytes = len; if (bio_add_pc_page(q, bio, virt_to_page(data), bytes, - offset) < bytes) - break; + offset) < bytes) { + /* we don't support partial mappings */ + bio_put(bio); + return ERR_PTR(-EINVAL); + } data += bytes; len -= bytes; @@ -1528,57 +1484,26 @@ static struct bio *__bio_map_kern(struct request_queue *q, void *data, bio->bi_end_io = bio_map_kern_endio; return bio; } +EXPORT_SYMBOL(bio_map_kern); -/** - * bio_map_kern - map kernel address into bio - * @q: the struct request_queue for the bio - * @data: pointer to buffer to map - * @len: length in bytes - * @gfp_mask: allocation flags for bio allocation - * - * Map the kernel address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, - gfp_t gfp_mask) +static void bio_copy_kern_endio(struct bio *bio, int err) { - struct bio *bio; - - bio = __bio_map_kern(q, data, len, gfp_mask); - if (IS_ERR(bio)) - return bio; - - if (bio->bi_iter.bi_size == len) - return bio; - - /* - * Don't support partial mappings. 
- */ + bio_free_pages(bio); bio_put(bio); - return ERR_PTR(-EINVAL); } -EXPORT_SYMBOL(bio_map_kern); -static void bio_copy_kern_endio(struct bio *bio, int err) +static void bio_copy_kern_endio_read(struct bio *bio, int err) { + char *p = bio->bi_private; struct bio_vec *bvec; - const int read = bio_data_dir(bio) == READ; - struct bio_map_data *bmd = bio->bi_private; int i; - char *p = bmd->sgvecs[0].iov_base; bio_for_each_segment_all(bvec, bio, i) { - char *addr = page_address(bvec->bv_page); - - if (read) - memcpy(p, addr, bvec->bv_len); - - __free_page(bvec->bv_page); + memcpy(p, page_address(bvec->bv_page), bvec->bv_len); p += bvec->bv_len; } - kfree(bmd); - bio_put(bio); + bio_copy_kern_endio(bio, err); } /** @@ -1595,28 +1520,59 @@ static void bio_copy_kern_endio(struct bio *bio, int err) struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask, int reading) { + unsigned long kaddr = (unsigned long)data; + unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = kaddr >> PAGE_SHIFT; struct bio *bio; - struct bio_vec *bvec; - int i; + void *p = data; + int nr_pages = 0; + + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); - bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask); - if (IS_ERR(bio)) - return bio; + nr_pages = end - start; + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + return ERR_PTR(-ENOMEM); - if (!reading) { - void *p = data; + while (len) { + struct page *page; + unsigned int bytes = PAGE_SIZE; - bio_for_each_segment_all(bvec, bio, i) { - char *addr = page_address(bvec->bv_page); + if (bytes > len) + bytes = len; - memcpy(addr, p, bvec->bv_len); - p += bvec->bv_len; - } + page = alloc_page(q->bounce_gfp | gfp_mask); + if (!page) + goto cleanup; + + if (!reading) + memcpy(page_address(page), p, bytes); + + if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) + break; + + len -= bytes; + p += bytes; } - bio->bi_end_io = 
bio_copy_kern_endio; + if (reading) { + bio->bi_end_io = bio_copy_kern_endio_read; + bio->bi_private = data; + } else { + bio->bi_end_io = bio_copy_kern_endio; + bio->bi_rw |= REQ_WRITE; + } return bio; + +cleanup: + bio_free_pages(bio); + bio_put(bio); + return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(bio_copy_kern); diff --git a/block/blk-core.c b/block/blk-core.c index 3ad4055..794c3e7 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -607,7 +607,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; q->backing_dev_info.state = 0; - q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; + q->backing_dev_info.capabilities = 0; q->backing_dev_info.name = "block"; q->node = node_id; @@ -2048,6 +2048,13 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) return -EIO; + if (q->mq_ops) { + if (blk_queue_io_stat(q)) + blk_account_io_start(rq, true); + blk_mq_insert_request(rq, false, true, true); + return 0; + } + spin_lock_irqsave(q->queue_lock, flags); if (unlikely(blk_queue_dying(q))) { spin_unlock_irqrestore(q->queue_lock, flags); @@ -2907,7 +2914,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); static void __blk_rq_prep_clone(struct request *dst, struct request *src) { dst->cpu = src->cpu; - dst->cmd_flags = (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE; + dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE; dst->cmd_type = src->cmd_type; dst->__sector = blk_rq_pos(src); dst->__data_len = blk_rq_bytes(src); @@ -2945,8 +2952,6 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, if (!bs) bs = fs_bio_set; - blk_rq_init(NULL, rq); - __rq_for_each_bio(bio_src, rq_src) { bio = bio_clone_fast(bio_src, gfp_mask, bs); if (!bio) diff --git a/block/blk-lib.c b/block/blk-lib.c index 8411be3..7688ee3 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -283,24 +283,34 
@@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, * @sector: start sector * @nr_sects: number of sectors to write * @gfp_mask: memory allocation flags (for bio_alloc) + * @discard: whether to discard the block range * * Description: - * Generate and issue number of bios with zerofiled pages. + * Zero-fill a block range. If the discard flag is set and the block + * device guarantees that subsequent READ operations to the block range + * in question will return zeroes, the blocks will be discarded. Should + * the discard request fail, if the discard flag is not set, or if + * discard_zeroes_data is not supported, this function will resort to + * zeroing the blocks manually, thus provisioning (allocating, + * anchoring) them. If the block device supports the WRITE SAME command + * blkdev_issue_zeroout() will use it to optimize the process of + * clearing the block range. Otherwise the zeroing will be performed + * using regular WRITE calls. */ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask) + sector_t nr_sects, gfp_t gfp_mask, bool discard) { - if (bdev_write_same(bdev)) { - unsigned char bdn[BDEVNAME_SIZE]; + struct request_queue *q = bdev_get_queue(bdev); - if (!blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, - ZERO_PAGE(0))) - return 0; + if (discard && blk_queue_discard(q) && q->limits.discard_zeroes_data && + blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, 0) == 0) + return 0; - bdevname(bdev, bdn); - pr_err("%s: WRITE SAME failed. 
Manually zeroing.\n", bdn); - } + if (bdev_write_same(bdev) && + blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, + ZERO_PAGE(0)) == 0) + return 0; return __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask); } diff --git a/block/blk-map.c b/block/blk-map.c index f890d43..b8d2725 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -5,7 +5,7 @@ #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> -#include <scsi/sg.h> /* for struct sg_iovec */ +#include <linux/uio.h> #include "blk.h" @@ -39,138 +39,12 @@ static int __blk_rq_unmap_user(struct bio *bio) return ret; } -static int __blk_rq_map_user(struct request_queue *q, struct request *rq, - struct rq_map_data *map_data, void __user *ubuf, - unsigned int len, gfp_t gfp_mask) -{ - unsigned long uaddr; - struct bio *bio, *orig_bio; - int reading, ret; - - reading = rq_data_dir(rq) == READ; - - /* - * if alignment requirement is satisfied, map in user pages for - * direct dma. else, set up kernel bounce buffers - */ - uaddr = (unsigned long) ubuf; - if (blk_rq_aligned(q, uaddr, len) && !map_data) - bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask); - else - bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask); - - if (IS_ERR(bio)) - return PTR_ERR(bio); - - if (map_data && map_data->null_mapped) - bio->bi_flags |= (1 << BIO_NULL_MAPPED); - - orig_bio = bio; - blk_queue_bounce(q, &bio); - - /* - * We link the bounce buffer in and could have to traverse it - * later so we have to get a ref to prevent it from being freed - */ - bio_get(bio); - - ret = blk_rq_append_bio(q, rq, bio); - if (!ret) - return bio->bi_iter.bi_size; - - /* if it was boucned we must call the end io function */ - bio_endio(bio, 0); - __blk_rq_unmap_user(orig_bio); - bio_put(bio); - return ret; -} - -/** - * blk_rq_map_user - map user data to a request, for REQ_TYPE_BLOCK_PC usage - * @q: request queue where request should be inserted - * @rq: request structure to fill - * @map_data: pointer to 
the rq_map_data holding pages (if necessary) - * @ubuf: the user buffer - * @len: length of user data - * @gfp_mask: memory allocation flags - * - * Description: - * Data will be mapped directly for zero copy I/O, if possible. Otherwise - * a kernel bounce buffer is used. - * - * A matching blk_rq_unmap_user() must be issued at the end of I/O, while - * still in process context. - * - * Note: The mapped bio may need to be bounced through blk_queue_bounce() - * before being submitted to the device, as pages mapped may be out of - * reach. It's the callers responsibility to make sure this happens. The - * original bio must be passed back in to blk_rq_unmap_user() for proper - * unmapping. - */ -int blk_rq_map_user(struct request_queue *q, struct request *rq, - struct rq_map_data *map_data, void __user *ubuf, - unsigned long len, gfp_t gfp_mask) -{ - unsigned long bytes_read = 0; - struct bio *bio = NULL; - int ret; - - if (len > (queue_max_hw_sectors(q) << 9)) - return -EINVAL; - if (!len) - return -EINVAL; - - if (!ubuf && (!map_data || !map_data->null_mapped)) - return -EINVAL; - - while (bytes_read != len) { - unsigned long map_len, end, start; - - map_len = min_t(unsigned long, len - bytes_read, BIO_MAX_SIZE); - end = ((unsigned long)ubuf + map_len + PAGE_SIZE - 1) - >> PAGE_SHIFT; - start = (unsigned long)ubuf >> PAGE_SHIFT; - - /* - * A bad offset could cause us to require BIO_MAX_PAGES + 1 - * pages. 
If this happens we just lower the requested - * mapping len by a page so that we can fit - */ - if (end - start > BIO_MAX_PAGES) - map_len -= PAGE_SIZE; - - ret = __blk_rq_map_user(q, rq, map_data, ubuf, map_len, - gfp_mask); - if (ret < 0) - goto unmap_rq; - if (!bio) - bio = rq->bio; - bytes_read += ret; - ubuf += ret; - - if (map_data) - map_data->offset += ret; - } - - if (!bio_flagged(bio, BIO_USER_MAPPED)) - rq->cmd_flags |= REQ_COPY_USER; - - return 0; -unmap_rq: - blk_rq_unmap_user(bio); - rq->bio = NULL; - return ret; -} -EXPORT_SYMBOL(blk_rq_map_user); - /** * blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage * @q: request queue where request should be inserted * @rq: request to map data to * @map_data: pointer to the rq_map_data holding pages (if necessary) - * @iov: pointer to the iovec - * @iov_count: number of elements in the iovec - * @len: I/O byte count + * @iter: iovec iterator * @gfp_mask: memory allocation flags * * Description: @@ -187,20 +61,21 @@ EXPORT_SYMBOL(blk_rq_map_user); * unmapping. 
*/ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, - struct rq_map_data *map_data, const struct sg_iovec *iov, - int iov_count, unsigned int len, gfp_t gfp_mask) + struct rq_map_data *map_data, + const struct iov_iter *iter, gfp_t gfp_mask) { struct bio *bio; - int i, read = rq_data_dir(rq) == READ; int unaligned = 0; + struct iov_iter i; + struct iovec iov; - if (!iov || iov_count <= 0) + if (!iter || !iter->count) return -EINVAL; - for (i = 0; i < iov_count; i++) { - unsigned long uaddr = (unsigned long)iov[i].iov_base; + iov_for_each(iov, i, *iter) { + unsigned long uaddr = (unsigned long) iov.iov_base; - if (!iov[i].iov_len) + if (!iov.iov_len) return -EINVAL; /* @@ -210,16 +85,18 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, unaligned = 1; } - if (unaligned || (q->dma_pad_mask & len) || map_data) - bio = bio_copy_user_iov(q, map_data, iov, iov_count, read, - gfp_mask); + if (unaligned || (q->dma_pad_mask & iter->count) || map_data) + bio = bio_copy_user_iov(q, map_data, iter, gfp_mask); else - bio = bio_map_user_iov(q, NULL, iov, iov_count, read, gfp_mask); + bio = bio_map_user_iov(q, iter, gfp_mask); if (IS_ERR(bio)) return PTR_ERR(bio); - if (bio->bi_iter.bi_size != len) { + if (map_data && map_data->null_mapped) + bio->bi_flags |= (1 << BIO_NULL_MAPPED); + + if (bio->bi_iter.bi_size != iter->count) { /* * Grab an extra reference to this bio, as bio_unmap_user() * expects to be able to drop it twice as it happens on the @@ -241,6 +118,21 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, } EXPORT_SYMBOL(blk_rq_map_user_iov); +int blk_rq_map_user(struct request_queue *q, struct request *rq, + struct rq_map_data *map_data, void __user *ubuf, + unsigned long len, gfp_t gfp_mask) +{ + struct iovec iov; + struct iov_iter i; + + iov.iov_base = ubuf; + iov.iov_len = len; + iov_iter_init(&i, rq_data_dir(rq), &iov, 1, len); + + return blk_rq_map_user_iov(q, rq, map_data, &i, gfp_mask); +} 
+EXPORT_SYMBOL(blk_rq_map_user); + /** * blk_rq_unmap_user - unmap a request with user data * @bio: start of bio list diff --git a/block/blk-merge.c b/block/blk-merge.c index 89b97b5..fc1ff3b 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -283,35 +283,6 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, } EXPORT_SYMBOL(blk_rq_map_sg); -/** - * blk_bio_map_sg - map a bio to a scatterlist - * @q: request_queue in question - * @bio: bio being mapped - * @sglist: scatterlist being mapped - * - * Note: - * Caller must make sure sg can hold bio->bi_phys_segments entries - * - * Will return the number of sg entries setup - */ -int blk_bio_map_sg(struct request_queue *q, struct bio *bio, - struct scatterlist *sglist) -{ - struct scatterlist *sg = NULL; - int nsegs; - struct bio *next = bio->bi_next; - bio->bi_next = NULL; - - nsegs = __blk_bios_map_sg(q, bio, sglist, &sg); - bio->bi_next = next; - if (sg) - sg_mark_end(sg); - - BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments); - return nsegs; -} -EXPORT_SYMBOL(blk_bio_map_sg); - static inline int ll_new_hw_segment(struct request_queue *q, struct request *req, struct bio *bio) @@ -385,6 +356,14 @@ static bool req_no_special_merge(struct request *req) return !q->mq_ops && req->special; } +static int req_gap_to_prev(struct request *req, struct request *next) +{ + struct bio *prev = req->biotail; + + return bvec_gap_to_prev(&prev->bi_io_vec[prev->bi_vcnt - 1], + next->bio->bi_io_vec[0].bv_offset); +} + static int ll_merge_requests_fn(struct request_queue *q, struct request *req, struct request *next) { @@ -399,6 +378,10 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, if (req_no_special_merge(req) || req_no_special_merge(next)) return 0; + if (test_bit(QUEUE_FLAG_SG_GAPS, &q->queue_flags) && + req_gap_to_prev(req, next)) + return 0; + /* * Will it become too large? 
*/ diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 60c9d4a..d53a764 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -140,35 +140,39 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, return atomic_read(&hctx->nr_active) < depth; } -static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag) +static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag, + bool nowrap) { - int tag, org_last_tag, end; - bool wrap = last_tag != 0; + int tag, org_last_tag = last_tag; - org_last_tag = last_tag; - end = bm->depth; - do { -restart: - tag = find_next_zero_bit(&bm->word, end, last_tag); - if (unlikely(tag >= end)) { + while (1) { + tag = find_next_zero_bit(&bm->word, bm->depth, last_tag); + if (unlikely(tag >= bm->depth)) { /* - * We started with an offset, start from 0 to + * We started with an offset, and we didn't reset the + * offset to 0 in a failure case, so start from 0 to * exhaust the map. */ - if (wrap) { - wrap = false; - end = org_last_tag; - last_tag = 0; - goto restart; + if (org_last_tag && last_tag && !nowrap) { + last_tag = org_last_tag = 0; + continue; } return -1; } + + if (!test_and_set_bit(tag, &bm->word)) + break; + last_tag = tag + 1; - } while (test_and_set_bit(tag, &bm->word)); + if (last_tag >= bm->depth - 1) + last_tag = 0; + } return tag; } +#define BT_ALLOC_RR(tags) (tags->alloc_policy == BLK_TAG_ALLOC_RR) + /* * Straight forward bitmap tag implementation, where each bit is a tag * (cleared == free, and set == busy). The small twist is using per-cpu @@ -181,7 +185,7 @@ restart: * until the map is exhausted. 
*/ static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt, - unsigned int *tag_cache) + unsigned int *tag_cache, struct blk_mq_tags *tags) { unsigned int last_tag, org_last_tag; int index, i, tag; @@ -193,15 +197,24 @@ static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt, index = TAG_TO_INDEX(bt, last_tag); for (i = 0; i < bt->map_nr; i++) { - tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag)); + tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag), + BT_ALLOC_RR(tags)); if (tag != -1) { tag += (index << bt->bits_per_word); goto done; } - last_tag = 0; - if (++index >= bt->map_nr) + /* + * Jump to next index, and reset the last tag to be the + * first tag of that index + */ + index++; + last_tag = (index << bt->bits_per_word); + + if (index >= bt->map_nr) { index = 0; + last_tag = 0; + } } *tag_cache = 0; @@ -212,7 +225,7 @@ static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt, * up using the specific cached tag. 
*/ done: - if (tag == org_last_tag) { + if (tag == org_last_tag || unlikely(BT_ALLOC_RR(tags))) { last_tag = tag + 1; if (last_tag >= bt->depth - 1) last_tag = 0; @@ -241,13 +254,13 @@ static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt, static int bt_get(struct blk_mq_alloc_data *data, struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx, - unsigned int *last_tag) + unsigned int *last_tag, struct blk_mq_tags *tags) { struct bt_wait_state *bs; DEFINE_WAIT(wait); int tag; - tag = __bt_get(hctx, bt, last_tag); + tag = __bt_get(hctx, bt, last_tag, tags); if (tag != -1) return tag; @@ -258,7 +271,7 @@ static int bt_get(struct blk_mq_alloc_data *data, do { prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE); - tag = __bt_get(hctx, bt, last_tag); + tag = __bt_get(hctx, bt, last_tag, tags); if (tag != -1) break; @@ -273,7 +286,7 @@ static int bt_get(struct blk_mq_alloc_data *data, * Retry tag allocation after running the hardware queue, * as running the queue may also have found completions. 
*/ - tag = __bt_get(hctx, bt, last_tag); + tag = __bt_get(hctx, bt, last_tag, tags); if (tag != -1) break; @@ -304,7 +317,7 @@ static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data) int tag; tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx, - &data->ctx->last_tag); + &data->ctx->last_tag, data->hctx->tags); if (tag >= 0) return tag + data->hctx->tags->nr_reserved_tags; @@ -320,7 +333,8 @@ static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data) return BLK_MQ_TAG_FAIL; } - tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero); + tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero, + data->hctx->tags); if (tag < 0) return BLK_MQ_TAG_FAIL; @@ -392,7 +406,8 @@ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, BUG_ON(real_tag >= tags->nr_tags); bt_clear_tag(&tags->bitmap_tags, real_tag); - *last_tag = real_tag; + if (likely(tags->alloc_policy == BLK_TAG_ALLOC_FIFO)) + *last_tag = real_tag; } else { BUG_ON(tag >= tags->nr_reserved_tags); bt_clear_tag(&tags->breserved_tags, tag); @@ -509,6 +524,7 @@ static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth, bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL); if (!bt->bs) { kfree(bt->map); + bt->map = NULL; return -ENOMEM; } @@ -529,10 +545,12 @@ static void bt_free(struct blk_mq_bitmap_tags *bt) } static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, - int node) + int node, int alloc_policy) { unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; + tags->alloc_policy = alloc_policy; + if (bt_alloc(&tags->bitmap_tags, depth, node, false)) goto enomem; if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true)) @@ -546,7 +564,8 @@ enomem: } struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, - unsigned int reserved_tags, int node) + unsigned int reserved_tags, + int node, int alloc_policy) { struct blk_mq_tags *tags; @@ -562,7 +581,7 @@ struct blk_mq_tags 
*blk_mq_init_tags(unsigned int total_tags, tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; - return blk_mq_init_bitmap_tags(tags, node); + return blk_mq_init_bitmap_tags(tags, node, alloc_policy); } void blk_mq_free_tags(struct blk_mq_tags *tags) diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index a6fa0fc..90767b3 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -42,10 +42,12 @@ struct blk_mq_tags { struct request **rqs; struct list_head page_list; + + int alloc_policy; }; -extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node); +extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, int alloc_policy); extern void blk_mq_free_tags(struct blk_mq_tags *tags); extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); diff --git a/block/blk-mq.c b/block/blk-mq.c index 2390c55..4f4bea2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -33,6 +33,7 @@ static DEFINE_MUTEX(all_q_mutex); static LIST_HEAD(all_q_list); static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); +static void blk_mq_run_queues(struct request_queue *q); /* * Check if any of the ctx's have pending work in this hardware queue @@ -117,7 +118,7 @@ void blk_mq_freeze_queue_start(struct request_queue *q) if (freeze) { percpu_ref_kill(&q->mq_usage_counter); - blk_mq_run_queues(q, false); + blk_mq_run_queues(q); } } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start); @@ -136,6 +137,7 @@ void blk_mq_freeze_queue(struct request_queue *q) blk_mq_freeze_queue_start(q); blk_mq_freeze_queue_wait(q); } +EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); void blk_mq_unfreeze_queue(struct request_queue *q) { @@ -902,7 +904,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) &hctx->run_work, 0); } -void blk_mq_run_queues(struct request_queue *q, bool async) +static void blk_mq_run_queues(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; int i; @@ -913,10 
+915,9 @@ void blk_mq_run_queues(struct request_queue *q, bool async) test_bit(BLK_MQ_S_STOPPED, &hctx->state)) continue; - blk_mq_run_hw_queue(hctx, async); + blk_mq_run_hw_queue(hctx, false); } } -EXPORT_SYMBOL(blk_mq_run_queues); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) { @@ -954,7 +955,6 @@ void blk_mq_start_hw_queues(struct request_queue *q) } EXPORT_SYMBOL(blk_mq_start_hw_queues); - void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx; @@ -1423,7 +1423,8 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, size_t rq_size, left; tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags, - set->numa_node); + set->numa_node, + BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); if (!tags) return NULL; diff --git a/block/blk-tag.c b/block/blk-tag.c index a185b86..f0344e6 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -119,7 +119,7 @@ fail: } static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q, - int depth) + int depth, int alloc_policy) { struct blk_queue_tag *tags; @@ -131,6 +131,8 @@ static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q, goto fail; atomic_set(&tags->refcnt, 1); + tags->alloc_policy = alloc_policy; + tags->next_tag = 0; return tags; fail: kfree(tags); @@ -140,10 +142,11 @@ fail: /** * blk_init_tags - initialize the tag info for an external tag map * @depth: the maximum queue depth supported + * @alloc_policy: tag allocation policy **/ -struct blk_queue_tag *blk_init_tags(int depth) +struct blk_queue_tag *blk_init_tags(int depth, int alloc_policy) { - return __blk_queue_init_tags(NULL, depth); + return __blk_queue_init_tags(NULL, depth, alloc_policy); } EXPORT_SYMBOL(blk_init_tags); @@ -152,19 +155,20 @@ EXPORT_SYMBOL(blk_init_tags); * @q: the request queue for the device * @depth: the maximum queue depth supported * @tags: the tag to use + * @alloc_policy: tag allocation policy * * Queue lock must be held here if the 
function is called to resize an * existing map. **/ int blk_queue_init_tags(struct request_queue *q, int depth, - struct blk_queue_tag *tags) + struct blk_queue_tag *tags, int alloc_policy) { int rc; BUG_ON(tags && q->queue_tags && tags != q->queue_tags); if (!tags && !q->queue_tags) { - tags = __blk_queue_init_tags(q, depth); + tags = __blk_queue_init_tags(q, depth, alloc_policy); if (!tags) return -ENOMEM; @@ -344,9 +348,21 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq) } do { - tag = find_first_zero_bit(bqt->tag_map, max_depth); - if (tag >= max_depth) - return 1; + if (bqt->alloc_policy == BLK_TAG_ALLOC_FIFO) { + tag = find_first_zero_bit(bqt->tag_map, max_depth); + if (tag >= max_depth) + return 1; + } else { + int start = bqt->next_tag; + int size = min_t(int, bqt->max_depth, max_depth + start); + tag = find_next_zero_bit(bqt->tag_map, size, start); + if (tag >= size && start + size > bqt->max_depth) { + size = start + size - bqt->max_depth; + tag = find_first_zero_bit(bqt->tag_map, size); + } + if (tag >= size) + return 1; + } } while (test_and_set_bit_lock(tag, bqt->tag_map)); /* @@ -354,6 +370,7 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq) * See blk_queue_end_tag for details. */ + bqt->next_tag = (tag + 1) % bqt->max_depth; rq->cmd_flags |= REQ_QUEUED; rq->tag = tag; bqt->tag_index[tag] = rq; diff --git a/block/bsg.c b/block/bsg.c index 276e869..d214e92 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -136,42 +136,6 @@ static inline struct hlist_head *bsg_dev_idx_hash(int index) return &bsg_device_list[index & (BSG_LIST_ARRAY_SIZE - 1)]; } -static int bsg_io_schedule(struct bsg_device *bd) -{ - DEFINE_WAIT(wait); - int ret = 0; - - spin_lock_irq(&bd->lock); - - BUG_ON(bd->done_cmds > bd->queued_cmds); - - /* - * -ENOSPC or -ENODATA? 
I'm going for -ENODATA, meaning "I have no - * work to do", even though we return -ENOSPC after this same test - * during bsg_write() -- there, it means our buffer can't have more - * bsg_commands added to it, thus has no space left. - */ - if (bd->done_cmds == bd->queued_cmds) { - ret = -ENODATA; - goto unlock; - } - - if (!test_bit(BSG_F_BLOCK, &bd->flags)) { - ret = -EAGAIN; - goto unlock; - } - - prepare_to_wait(&bd->wq_done, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&bd->lock); - io_schedule(); - finish_wait(&bd->wq_done, &wait); - - return ret; -unlock: - spin_unlock_irq(&bd->lock); - return ret; -} - static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, struct sg_io_v4 *hdr, struct bsg_device *bd, fmode_t has_write_perm) @@ -482,6 +446,30 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, return ret; } +static bool bsg_complete(struct bsg_device *bd) +{ + bool ret = false; + bool spin; + + do { + spin_lock_irq(&bd->lock); + + BUG_ON(bd->done_cmds > bd->queued_cmds); + + /* + * All commands consumed. + */ + if (bd->done_cmds == bd->queued_cmds) + ret = true; + + spin = !test_bit(BSG_F_BLOCK, &bd->flags); + + spin_unlock_irq(&bd->lock); + } while (!ret && spin); + + return ret; +} + static int bsg_complete_all_commands(struct bsg_device *bd) { struct bsg_command *bc; @@ -492,17 +480,7 @@ static int bsg_complete_all_commands(struct bsg_device *bd) /* * wait for all commands to complete */ - ret = 0; - do { - ret = bsg_io_schedule(bd); - /* - * look for -ENODATA specifically -- we'll sometimes get - * -ERESTARTSYS when we've taken a signal, but we can't - * return until we're done freeing the queue, so ignore - * it. The signal will get handled when we're done freeing - * the bsg_device. 
- */ - } while (ret != -ENODATA); + io_wait_event(bd->wq_done, bsg_complete(bd)); /* * discard done commands diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 6f2751d..5da8e6e 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3590,6 +3590,11 @@ retry: blkcg = bio_blkcg(bio); cfqg = cfq_lookup_create_cfqg(cfqd, blkcg); + if (!cfqg) { + cfqq = &cfqd->oom_cfqq; + goto out; + } + cfqq = cic_to_cfqq(cic, is_sync); /* @@ -3626,7 +3631,7 @@ retry: } else cfqq = &cfqd->oom_cfqq; } - +out: if (new_cfqq) kmem_cache_free(cfq_pool, new_cfqq); @@ -3656,12 +3661,17 @@ static struct cfq_queue * cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, struct bio *bio, gfp_t gfp_mask) { - const int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); - const int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); + int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); + int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); struct cfq_queue **async_cfqq = NULL; struct cfq_queue *cfqq = NULL; if (!is_sync) { + if (!ioprio_valid(cic->ioprio)) { + struct task_struct *tsk = current; + ioprio = task_nice_ioprio(tsk); + ioprio_class = task_nice_ioclass(tsk); + } async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio); cfqq = *async_cfqq; } diff --git a/block/ioctl.c b/block/ioctl.c index 6c7bf90..7d8befd 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -198,7 +198,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, uint64_t start, if (start + len > (i_size_read(bdev->bd_inode) >> 9)) return -EINVAL; - return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL); + return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL, false); } static int put_ushort(unsigned long arg, unsigned short val) diff --git a/block/partitions/check.c b/block/partitions/check.c index 9ac1df7..16118d1 100644 --- a/block/partitions/check.c +++ b/block/partitions/check.c @@ -184,12 +184,12 @@ check_partition(struct gendisk *hd, struct block_device *bdev) if (err) /* The partition is 
unrecognized. So report I/O errors if there were any */ res = err; - if (!res) - strlcat(state->pp_buf, " unknown partition table\n", PAGE_SIZE); - else if (warn_no_part) - strlcat(state->pp_buf, " unable to read partition table\n", PAGE_SIZE); - - printk(KERN_INFO "%s", state->pp_buf); + if (res) { + if (warn_no_part) + strlcat(state->pp_buf, + " unable to read partition table\n", PAGE_SIZE); + printk(KERN_INFO "%s", state->pp_buf); + } free_page((unsigned long)state->pp_buf); free_partitions(state); diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 28163fa..e1f71c3 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -332,7 +332,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, ret = 0; if (hdr->iovec_count) { - size_t iov_data_len; + struct iov_iter i; struct iovec *iov = NULL; ret = rw_copy_check_uvector(-1, hdr->dxferp, hdr->iovec_count, @@ -342,20 +342,11 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, goto out_free_cdb; } - iov_data_len = ret; - ret = 0; - /* SG_IO howto says that the shorter of the two wins */ - if (hdr->dxfer_len < iov_data_len) { - hdr->iovec_count = iov_shorten(iov, - hdr->iovec_count, - hdr->dxfer_len); - iov_data_len = hdr->dxfer_len; - } + iov_iter_init(&i, rq_data_dir(rq), iov, hdr->iovec_count, + min_t(unsigned, ret, hdr->dxfer_len)); - ret = blk_rq_map_user_iov(q, rq, NULL, (struct sg_iovec *) iov, - hdr->iovec_count, - iov_data_len, GFP_KERNEL); + ret = blk_rq_map_user_iov(q, rq, NULL, &i, GFP_KERNEL); kfree(iov); } else if (hdr->dxfer_len) ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len, diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 4b0d5e7..4c35f08 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -1585,8 +1585,6 @@ unsigned ata_exec_internal_sg(struct ata_device *dev, else tag = 0; - if (test_and_set_bit(tag, &ap->qc_allocated)) - BUG(); qc = __ata_qc_from_tag(ap, tag); qc->tag = tag; @@ -4722,69 
+4720,36 @@ void swap_buf_le16(u16 *buf, unsigned int buf_words) } /** - * ata_qc_new - Request an available ATA command, for queueing - * @ap: target port - * - * Some ATA host controllers may implement a queue depth which is less - * than ATA_MAX_QUEUE. So we shouldn't allocate a tag which is beyond - * the hardware limitation. + * ata_qc_new_init - Request an available ATA command, and initialize it + * @dev: Device from whom we request an available command structure * * LOCKING: * None. */ -static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap) +struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int tag) { - struct ata_queued_cmd *qc = NULL; - unsigned int max_queue = ap->host->n_tags; - unsigned int i, tag; + struct ata_port *ap = dev->link->ap; + struct ata_queued_cmd *qc; /* no command while frozen */ if (unlikely(ap->pflags & ATA_PFLAG_FROZEN)) return NULL; - for (i = 0, tag = ap->last_tag + 1; i < max_queue; i++, tag++) { - if (ap->flags & ATA_FLAG_LOWTAG) - tag = i; - else - tag = tag < max_queue ? tag : 0; - - /* the last tag is reserved for internal command. */ - if (tag == ATA_TAG_INTERNAL) - continue; - - if (!test_and_set_bit(tag, &ap->qc_allocated)) { - qc = __ata_qc_from_tag(ap, tag); - qc->tag = tag; - ap->last_tag = tag; - break; - } + /* libsas case */ + if (!ap->scsi_host) { + tag = ata_sas_allocate_tag(ap); + if (tag < 0) + return NULL; } - return qc; -} - -/** - * ata_qc_new_init - Request an available ATA command, and initialize it - * @dev: Device from whom we request an available command structure - * - * LOCKING: - * None. 
- */ - -struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev) -{ - struct ata_port *ap = dev->link->ap; - struct ata_queued_cmd *qc; - - qc = ata_qc_new(ap); - if (qc) { - qc->scsicmd = NULL; - qc->ap = ap; - qc->dev = dev; + qc = __ata_qc_from_tag(ap, tag); + qc->tag = tag; + qc->scsicmd = NULL; + qc->ap = ap; + qc->dev = dev; - ata_qc_reinit(qc); - } + ata_qc_reinit(qc); return qc; } @@ -4811,7 +4776,8 @@ void ata_qc_free(struct ata_queued_cmd *qc) tag = qc->tag; if (likely(ata_tag_valid(tag))) { qc->tag = ATA_TAG_POISON; - clear_bit(tag, &ap->qc_allocated); + if (!ap->scsi_host) + ata_sas_free_tag(tag, ap); } } diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 2807293..b061ba2 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -756,7 +756,7 @@ static struct ata_queued_cmd *ata_scsi_qc_new(struct ata_device *dev, { struct ata_queued_cmd *qc; - qc = ata_qc_new_init(dev); + qc = ata_qc_new_init(dev, cmd->request->tag); if (qc) { qc->scsicmd = cmd; qc->scsidone = cmd->scsi_done; @@ -3668,6 +3668,9 @@ int ata_scsi_add_hosts(struct ata_host *host, struct scsi_host_template *sht) */ shost->max_host_blocked = 1; + if (scsi_init_shared_tag_map(shost, host->n_tags)) + goto err_add; + rc = scsi_add_host_with_dma(ap->scsi_host, &ap->tdev, ap->host->dev); if (rc) @@ -4230,3 +4233,31 @@ int ata_sas_queuecmd(struct scsi_cmnd *cmd, struct ata_port *ap) return rc; } EXPORT_SYMBOL_GPL(ata_sas_queuecmd); + +int ata_sas_allocate_tag(struct ata_port *ap) +{ + unsigned int max_queue = ap->host->n_tags; + unsigned int i, tag; + + for (i = 0, tag = ap->sas_last_tag + 1; i < max_queue; i++, tag++) { + if (ap->flags & ATA_FLAG_LOWTAG) + tag = 1; + else + tag = tag < max_queue ? tag : 0; + + /* the last tag is reserved for internal command. 
*/ + if (tag == ATA_TAG_INTERNAL) + continue; + + if (!test_and_set_bit(tag, &ap->sas_tag_allocated)) { + ap->sas_last_tag = tag; + return tag; + } + } + return -1; +} + +void ata_sas_free_tag(unsigned int tag, struct ata_port *ap) +{ + clear_bit(tag, &ap->sas_tag_allocated); +} diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h index 82ebe26..f840ca1 100644 --- a/drivers/ata/libata.h +++ b/drivers/ata/libata.h @@ -63,7 +63,7 @@ extern struct ata_link *ata_dev_phys_link(struct ata_device *dev); extern void ata_force_cbl(struct ata_port *ap); extern u64 ata_tf_to_lba(const struct ata_taskfile *tf); extern u64 ata_tf_to_lba48(const struct ata_taskfile *tf); -extern struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev); +extern struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int tag); extern int ata_build_rw_tf(struct ata_taskfile *tf, struct ata_device *dev, u64 block, u32 n_block, unsigned int tf_flags, unsigned int tag); @@ -144,6 +144,8 @@ extern void ata_scsi_dev_rescan(struct work_struct *work); extern int ata_bus_probe(struct ata_port *ap); extern int ata_scsi_user_scan(struct Scsi_Host *shost, unsigned int channel, unsigned int id, u64 lun); +int ata_sas_allocate_tag(struct ata_port *ap); +void ata_sas_free_tag(unsigned int tag, struct ata_port *ap); /* libata-eh.c */ diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c index ea65594..ba2667f 100644 --- a/drivers/ata/sata_sil24.c +++ b/drivers/ata/sata_sil24.c @@ -388,6 +388,7 @@ static struct scsi_host_template sil24_sht = { .can_queue = SIL24_MAX_CMDS, .sg_tablesize = SIL24_MAX_SGE, .dma_boundary = ATA_DMA_BOUNDARY, + .tag_alloc_policy = BLK_TAG_ALLOC_FIFO, }; static struct ata_port_operations sil24_ops = { diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 3598110..c01b921 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -370,25 +370,25 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector, } #ifdef CONFIG_BLK_DEV_XIP -static 
int brd_direct_access(struct block_device *bdev, sector_t sector, - void **kaddr, unsigned long *pfn) +static long brd_direct_access(struct block_device *bdev, sector_t sector, + void **kaddr, unsigned long *pfn, long size) { struct brd_device *brd = bdev->bd_disk->private_data; struct page *page; if (!brd) return -ENODEV; - if (sector & (PAGE_SECTORS-1)) - return -EINVAL; - if (sector + PAGE_SECTORS > get_capacity(bdev->bd_disk)) - return -ERANGE; page = brd_insert_page(brd, sector); if (!page) return -ENOSPC; *kaddr = page_address(page); *pfn = page_to_pfn(page); - return 0; + /* + * TODO: If size > PAGE_SIZE, we could look to see if the next page in + * the file happens to be mapped to the next page of physical RAM. + */ + return PAGE_SIZE; } #endif @@ -438,19 +438,18 @@ static const struct block_device_operations brd_fops = { /* * And now the modules code and kernel interface. */ -static int rd_nr; -int rd_size = CONFIG_BLK_DEV_RAM_SIZE; -static int max_part; -static int part_shift; -static int part_show = 0; +static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT; module_param(rd_nr, int, S_IRUGO); MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices"); + +int rd_size = CONFIG_BLK_DEV_RAM_SIZE; module_param(rd_size, int, S_IRUGO); MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes."); + +static int max_part = 1; module_param(max_part, int, S_IRUGO); -MODULE_PARM_DESC(max_part, "Maximum number of partitions per RAM disk"); -module_param(part_show, int, S_IRUGO); -MODULE_PARM_DESC(part_show, "Control RAM disk visibility in /proc/partitions"); +MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices"); + MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR); MODULE_ALIAS("rd"); @@ -487,25 +486,33 @@ static struct brd_device *brd_alloc(int i) brd->brd_queue = blk_alloc_queue(GFP_KERNEL); if (!brd->brd_queue) goto out_free_dev; + blk_queue_make_request(brd->brd_queue, brd_make_request); blk_queue_max_hw_sectors(brd->brd_queue, 1024); 
blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY); + /* This is so fdisk will align partitions on 4k, because of + * direct_access API needing 4k alignment, returning a PFN + * (This is only a problem on very small devices <= 4M, + * otherwise fdisk will align on 1M. Regardless this call + * is harmless) + */ + blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE); + brd->brd_queue->limits.discard_granularity = PAGE_SIZE; brd->brd_queue->limits.max_discard_sectors = UINT_MAX; brd->brd_queue->limits.discard_zeroes_data = 1; queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue); - disk = brd->brd_disk = alloc_disk(1 << part_shift); + disk = brd->brd_disk = alloc_disk(max_part); if (!disk) goto out_free_queue; disk->major = RAMDISK_MAJOR; - disk->first_minor = i << part_shift; + disk->first_minor = i * max_part; disk->fops = &brd_fops; disk->private_data = brd; disk->queue = brd->brd_queue; - if (!part_show) - disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; + disk->flags = GENHD_FL_EXT_DEVT; sprintf(disk->disk_name, "ram%d", i); set_capacity(disk, rd_size * 2); @@ -527,10 +534,11 @@ static void brd_free(struct brd_device *brd) kfree(brd); } -static struct brd_device *brd_init_one(int i) +static struct brd_device *brd_init_one(int i, bool *new) { struct brd_device *brd; + *new = false; list_for_each_entry(brd, &brd_devices, brd_list) { if (brd->brd_number == i) goto out; @@ -541,6 +549,7 @@ static struct brd_device *brd_init_one(int i) add_disk(brd->brd_disk); list_add_tail(&brd->brd_list, &brd_devices); } + *new = true; out: return brd; } @@ -556,70 +565,46 @@ static struct kobject *brd_probe(dev_t dev, int *part, void *data) { struct brd_device *brd; struct kobject *kobj; + bool new; mutex_lock(&brd_devices_mutex); - brd = brd_init_one(MINOR(dev) >> part_shift); + brd = brd_init_one(MINOR(dev) / max_part, &new); kobj = brd ? 
get_disk(brd->brd_disk) : NULL; mutex_unlock(&brd_devices_mutex); - *part = 0; + if (new) + *part = 0; + return kobj; } static int __init brd_init(void) { - int i, nr; - unsigned long range; struct brd_device *brd, *next; + int i; /* * brd module now has a feature to instantiate underlying device * structure on-demand, provided that there is an access dev node. - * However, this will not work well with user space tool that doesn't - * know about such "feature". In order to not break any existing - * tool, we do the following: * - * (1) if rd_nr is specified, create that many upfront, and this - * also becomes a hard limit. - * (2) if rd_nr is not specified, create CONFIG_BLK_DEV_RAM_COUNT - * (default 16) rd device on module load, user can further - * extend brd device by create dev node themselves and have - * kernel automatically instantiate actual device on-demand. + * (1) if rd_nr is specified, create that many upfront. else + * it defaults to CONFIG_BLK_DEV_RAM_COUNT + * (2) User can further extend brd devices by create dev node themselves + * and have kernel automatically instantiate actual device + * on-demand. Example: + * mknod /path/devnod_name b 1 X # 1 is the rd major + * fdisk -l /path/devnod_name + * If (X / max_part) was not already created it will be created + * dynamically. */ - part_shift = 0; - if (max_part > 0) { - part_shift = fls(max_part); - - /* - * Adjust max_part according to part_shift as it is exported - * to user space so that user can decide correct minor number - * if [s]he want to create more devices. - * - * Note that -1 is required because partition 0 is reserved - * for the whole disk. 
- */ - max_part = (1UL << part_shift) - 1; - } - - if ((1UL << part_shift) > DISK_MAX_PARTS) - return -EINVAL; - - if (rd_nr > 1UL << (MINORBITS - part_shift)) - return -EINVAL; - - if (rd_nr) { - nr = rd_nr; - range = rd_nr << part_shift; - } else { - nr = CONFIG_BLK_DEV_RAM_COUNT; - range = 1UL << MINORBITS; - } - if (register_blkdev(RAMDISK_MAJOR, "ramdisk")) return -EIO; - for (i = 0; i < nr; i++) { + if (unlikely(!max_part)) + max_part = 1; + + for (i = 0; i < rd_nr; i++) { brd = brd_alloc(i); if (!brd) goto out_free; @@ -631,10 +616,10 @@ static int __init brd_init(void) list_for_each_entry(brd, &brd_devices, brd_list) add_disk(brd->brd_disk); - blk_register_region(MKDEV(RAMDISK_MAJOR, 0), range, + blk_register_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS, THIS_MODULE, brd_probe, NULL, NULL); - printk(KERN_INFO "brd: module loaded\n"); + pr_info("brd: module loaded\n"); return 0; out_free: @@ -644,21 +629,21 @@ out_free: } unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); + pr_info("brd: module NOT loaded !!!\n"); return -ENOMEM; } static void __exit brd_exit(void) { - unsigned long range; struct brd_device *brd, *next; - range = rd_nr ? 
rd_nr << part_shift : 1UL << MINORBITS; - list_for_each_entry_safe(brd, next, &brd_devices, brd_list) brd_del_one(brd); - blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), range); + blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS); unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); + + pr_info("brd: module unloaded\n"); } module_init(brd_init); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index d169b4a..cee2035 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1388,7 +1388,7 @@ int drbd_submit_peer_request(struct drbd_device *device, list_add_tail(&peer_req->w.list, &device->active_ee); spin_unlock_irq(&device->resource->req_lock); if (blkdev_issue_zeroout(device->ldev->backing_bdev, - sector, data_size >> 9, GFP_NOIO)) + sector, data_size >> 9, GFP_NOIO, false)) peer_req->flags |= EE_WAS_ERROR; drbd_endio_write_sec_final(peer_req); return 0; diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 56d46ff..a08cda9 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4112,6 +4112,13 @@ static ssize_t floppy_cmos_show(struct device *dev, static DEVICE_ATTR(cmos, S_IRUGO, floppy_cmos_show, NULL); +static struct attribute *floppy_dev_attrs[] = { + &dev_attr_cmos.attr, + NULL +}; + +ATTRIBUTE_GROUPS(floppy_dev); + static void floppy_device_release(struct device *dev) { } @@ -4324,16 +4331,12 @@ static int __init do_floppy_init(void) floppy_device[drive].name = floppy_device_name; floppy_device[drive].id = drive; floppy_device[drive].dev.release = floppy_device_release; + floppy_device[drive].dev.groups = floppy_dev_groups; err = platform_device_register(&floppy_device[drive]); if (err) goto out_remove_drives; - err = device_create_file(&floppy_device[drive].dev, - &dev_attr_cmos); - if (err) - goto out_unreg_platform_dev; - /* to be cleaned up... 
*/ disks[drive]->private_data = (void *)(long)drive; disks[drive]->flags |= GENHD_FL_REMOVABLE; @@ -4343,13 +4346,10 @@ static int __init do_floppy_init(void) return 0; -out_unreg_platform_dev: - platform_device_unregister(&floppy_device[drive]); out_remove_drives: while (drive--) { if (floppy_available(drive)) { del_gendisk(disks[drive]); - device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos); platform_device_unregister(&floppy_device[drive]); } } @@ -4594,7 +4594,6 @@ static void __exit floppy_module_exit(void) if (floppy_available(drive)) { del_gendisk(disks[drive]); - device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos); platform_device_unregister(&floppy_device[drive]); } blk_cleanup_queue(disks[drive]->queue); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 6cb1beb..d1f168b 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -85,6 +85,8 @@ static DEFINE_MUTEX(loop_index_mutex); static int max_part; static int part_shift; +static struct workqueue_struct *loop_wq; + /* * Transfer functions */ @@ -284,12 +286,12 @@ static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec, return ret; } -static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos) +static int lo_send(struct loop_device *lo, struct request *rq, loff_t pos) { int (*do_lo_send)(struct loop_device *, struct bio_vec *, loff_t, struct page *page); struct bio_vec bvec; - struct bvec_iter iter; + struct req_iterator iter; struct page *page = NULL; int ret = 0; @@ -303,7 +305,7 @@ static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos) do_lo_send = do_lo_send_direct_write; } - bio_for_each_segment(bvec, bio, iter) { + rq_for_each_segment(bvec, rq, iter) { ret = do_lo_send(lo, &bvec, pos, page); if (ret < 0) break; @@ -391,19 +393,22 @@ do_lo_receive(struct loop_device *lo, } static int -lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos) +lo_receive(struct loop_device *lo, struct request *rq, int 
bsize, loff_t pos) { struct bio_vec bvec; - struct bvec_iter iter; + struct req_iterator iter; ssize_t s; - bio_for_each_segment(bvec, bio, iter) { + rq_for_each_segment(bvec, rq, iter) { s = do_lo_receive(lo, &bvec, bsize, pos); if (s < 0) return s; if (s != bvec.bv_len) { - zero_fill_bio(bio); + struct bio *bio; + + __rq_for_each_bio(bio, rq) + zero_fill_bio(bio); break; } pos += bvec.bv_len; @@ -411,106 +416,58 @@ lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos) return 0; } -static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) +static int lo_discard(struct loop_device *lo, struct request *rq, loff_t pos) { - loff_t pos; + /* + * We use punch hole to reclaim the free space used by the + * image a.k.a. discard. However we do not support discard if + * encryption is enabled, because it may give an attacker + * useful information. + */ + struct file *file = lo->lo_backing_file; + int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; int ret; - pos = ((loff_t) bio->bi_iter.bi_sector << 9) + lo->lo_offset; - - if (bio_rw(bio) == WRITE) { - struct file *file = lo->lo_backing_file; - - if (bio->bi_rw & REQ_FLUSH) { - ret = vfs_fsync(file, 0); - if (unlikely(ret && ret != -EINVAL)) { - ret = -EIO; - goto out; - } - } - - /* - * We use punch hole to reclaim the free space used by the - * image a.k.a. discard. However we do not support discard if - * encryption is enabled, because it may give an attacker - * useful information. 
- */ - if (bio->bi_rw & REQ_DISCARD) { - struct file *file = lo->lo_backing_file; - int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; - - if ((!file->f_op->fallocate) || - lo->lo_encrypt_key_size) { - ret = -EOPNOTSUPP; - goto out; - } - ret = file->f_op->fallocate(file, mode, pos, - bio->bi_iter.bi_size); - if (unlikely(ret && ret != -EINVAL && - ret != -EOPNOTSUPP)) - ret = -EIO; - goto out; - } - - ret = lo_send(lo, bio, pos); - - if ((bio->bi_rw & REQ_FUA) && !ret) { - ret = vfs_fsync(file, 0); - if (unlikely(ret && ret != -EINVAL)) - ret = -EIO; - } - } else - ret = lo_receive(lo, bio, lo->lo_blocksize, pos); + if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) { + ret = -EOPNOTSUPP; + goto out; + } -out: + ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq)); + if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP)) + ret = -EIO; + out: return ret; } -/* - * Add bio to back of pending list - */ -static void loop_add_bio(struct loop_device *lo, struct bio *bio) +static int lo_req_flush(struct loop_device *lo, struct request *rq) { - lo->lo_bio_count++; - bio_list_add(&lo->lo_bio_list, bio); -} + struct file *file = lo->lo_backing_file; + int ret = vfs_fsync(file, 0); + if (unlikely(ret && ret != -EINVAL)) + ret = -EIO; -/* - * Grab first pending buffer - */ -static struct bio *loop_get_bio(struct loop_device *lo) -{ - lo->lo_bio_count--; - return bio_list_pop(&lo->lo_bio_list); + return ret; } -static void loop_make_request(struct request_queue *q, struct bio *old_bio) +static int do_req_filebacked(struct loop_device *lo, struct request *rq) { - struct loop_device *lo = q->queuedata; - int rw = bio_rw(old_bio); - - if (rw == READA) - rw = READ; + loff_t pos; + int ret; - BUG_ON(!lo || (rw != READ && rw != WRITE)); + pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; - spin_lock_irq(&lo->lo_lock); - if (lo->lo_state != Lo_bound) - goto out; - if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY))) - goto out; - if 
(lo->lo_bio_count >= q->nr_congestion_on) - wait_event_lock_irq(lo->lo_req_wait, - lo->lo_bio_count < q->nr_congestion_off, - lo->lo_lock); - loop_add_bio(lo, old_bio); - wake_up(&lo->lo_event); - spin_unlock_irq(&lo->lo_lock); - return; + if (rq->cmd_flags & REQ_WRITE) { + if (rq->cmd_flags & REQ_FLUSH) + ret = lo_req_flush(lo, rq); + else if (rq->cmd_flags & REQ_DISCARD) + ret = lo_discard(lo, rq, pos); + else + ret = lo_send(lo, rq, pos); + } else + ret = lo_receive(lo, rq, lo->lo_blocksize, pos); -out: - spin_unlock_irq(&lo->lo_lock); - bio_io_error(old_bio); + return ret; } struct switch_request { @@ -518,57 +475,26 @@ struct switch_request { struct completion wait; }; -static void do_loop_switch(struct loop_device *, struct switch_request *); - -static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio) -{ - if (unlikely(!bio->bi_bdev)) { - do_loop_switch(lo, bio->bi_private); - bio_put(bio); - } else { - int ret = do_bio_filebacked(lo, bio); - bio_endio(bio, ret); - } -} - /* - * worker thread that handles reads/writes to file backed loop devices, - * to avoid blocking in our make_request_fn. it also does loop decrypting - * on reads for block backed loop, as that is too heavy to do from - * b_end_io context where irqs may be disabled. - * - * Loop explanation: loop_clr_fd() sets lo_state to Lo_rundown before - * calling kthread_stop(). Therefore once kthread_should_stop() is - * true, make_request will not place any more requests. Therefore - * once kthread_should_stop() is true and lo_bio is NULL, we are - * done with the loop. 
+ * Do the actual switch; called from the BIO completion routine */ -static int loop_thread(void *data) +static void do_loop_switch(struct loop_device *lo, struct switch_request *p) { - struct loop_device *lo = data; - struct bio *bio; - - set_user_nice(current, MIN_NICE); - - while (!kthread_should_stop() || !bio_list_empty(&lo->lo_bio_list)) { - - wait_event_interruptible(lo->lo_event, - !bio_list_empty(&lo->lo_bio_list) || - kthread_should_stop()); - - if (bio_list_empty(&lo->lo_bio_list)) - continue; - spin_lock_irq(&lo->lo_lock); - bio = loop_get_bio(lo); - if (lo->lo_bio_count < lo->lo_queue->nr_congestion_off) - wake_up(&lo->lo_req_wait); - spin_unlock_irq(&lo->lo_lock); + struct file *file = p->file; + struct file *old_file = lo->lo_backing_file; + struct address_space *mapping; - BUG_ON(!bio); - loop_handle_bio(lo, bio); - } + /* if no new file, only flush of queued bios requested */ + if (!file) + return; - return 0; + mapping = file->f_mapping; + mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); + lo->lo_backing_file = file; + lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ? 
+ mapping->host->i_bdev->bd_block_size : PAGE_SIZE; + lo->old_gfp_mask = mapping_gfp_mask(mapping); + mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); } /* @@ -579,15 +505,18 @@ static int loop_thread(void *data) static int loop_switch(struct loop_device *lo, struct file *file) { struct switch_request w; - struct bio *bio = bio_alloc(GFP_KERNEL, 0); - if (!bio) - return -ENOMEM; - init_completion(&w.wait); + w.file = file; - bio->bi_private = &w; - bio->bi_bdev = NULL; - loop_make_request(lo->lo_queue, bio); - wait_for_completion(&w.wait); + + /* freeze queue and wait for completion of scheduled requests */ + blk_mq_freeze_queue(lo->lo_queue); + + /* do the switch action */ + do_loop_switch(lo, &w); + + /* unfreeze */ + blk_mq_unfreeze_queue(lo->lo_queue); + return 0; } @@ -596,39 +525,10 @@ static int loop_switch(struct loop_device *lo, struct file *file) */ static int loop_flush(struct loop_device *lo) { - /* loop not yet configured, no running thread, nothing to flush */ - if (!lo->lo_thread) - return 0; - return loop_switch(lo, NULL); } /* - * Do the actual switch; called from the BIO completion routine - */ -static void do_loop_switch(struct loop_device *lo, struct switch_request *p) -{ - struct file *file = p->file; - struct file *old_file = lo->lo_backing_file; - struct address_space *mapping; - - /* if no new file, only flush of queued bios requested */ - if (!file) - goto out; - - mapping = file->f_mapping; - mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); - lo->lo_backing_file = file; - lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ? - mapping->host->i_bdev->bd_block_size : PAGE_SIZE; - lo->old_gfp_mask = mapping_gfp_mask(mapping); - mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); -out: - complete(&p->wait); -} - - -/* * loop_change_fd switched the backing store of a loopback device to * a new file. 
This is useful for operating system installers to free up * the original file and in High Availability environments to switch to @@ -889,12 +789,9 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, lo->transfer = transfer_none; lo->ioctl = NULL; lo->lo_sizelimit = 0; - lo->lo_bio_count = 0; lo->old_gfp_mask = mapping_gfp_mask(mapping); mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); - bio_list_init(&lo->lo_bio_list); - if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) blk_queue_flush(lo->lo_queue, REQ_FLUSH); @@ -906,14 +803,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, set_blocksize(bdev, lo_blocksize); - lo->lo_thread = kthread_create(loop_thread, lo, "loop%d", - lo->lo_number); - if (IS_ERR(lo->lo_thread)) { - error = PTR_ERR(lo->lo_thread); - goto out_clr; - } lo->lo_state = Lo_bound; - wake_up_process(lo->lo_thread); if (part_shift) lo->lo_flags |= LO_FLAGS_PARTSCAN; if (lo->lo_flags & LO_FLAGS_PARTSCAN) @@ -925,18 +815,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, bdgrab(bdev); return 0; -out_clr: - loop_sysfs_exit(lo); - lo->lo_thread = NULL; - lo->lo_device = NULL; - lo->lo_backing_file = NULL; - lo->lo_flags = 0; - set_capacity(lo->lo_disk, 0); - invalidate_bdev(bdev); - bd_set_size(bdev, 0); - kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); - mapping_set_gfp_mask(mapping, lo->old_gfp_mask); - lo->lo_state = Lo_unbound; out_putf: fput(file); out: @@ -1012,11 +890,6 @@ static int loop_clr_fd(struct loop_device *lo) spin_lock_irq(&lo->lo_lock); lo->lo_state = Lo_rundown; - spin_unlock_irq(&lo->lo_lock); - - kthread_stop(lo->lo_thread); - - spin_lock_irq(&lo->lo_lock); lo->lo_backing_file = NULL; spin_unlock_irq(&lo->lo_lock); @@ -1028,7 +901,6 @@ static int loop_clr_fd(struct loop_device *lo) lo->lo_offset = 0; lo->lo_sizelimit = 0; lo->lo_encrypt_key_size = 0; - lo->lo_thread = NULL; memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE); memset(lo->lo_crypt_name, 0, 
LO_NAME_SIZE); memset(lo->lo_file_name, 0, LO_NAME_SIZE); @@ -1601,6 +1473,105 @@ int loop_unregister_transfer(int number) EXPORT_SYMBOL(loop_register_transfer); EXPORT_SYMBOL(loop_unregister_transfer); +static int loop_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct loop_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); + + blk_mq_start_request(bd->rq); + + if (cmd->rq->cmd_flags & REQ_WRITE) { + struct loop_device *lo = cmd->rq->q->queuedata; + bool need_sched = true; + + spin_lock_irq(&lo->lo_lock); + if (lo->write_started) + need_sched = false; + else + lo->write_started = true; + list_add_tail(&cmd->list, &lo->write_cmd_head); + spin_unlock_irq(&lo->lo_lock); + + if (need_sched) + queue_work(loop_wq, &lo->write_work); + } else { + queue_work(loop_wq, &cmd->read_work); + } + + return BLK_MQ_RQ_QUEUE_OK; +} + +static void loop_handle_cmd(struct loop_cmd *cmd) +{ + const bool write = cmd->rq->cmd_flags & REQ_WRITE; + struct loop_device *lo = cmd->rq->q->queuedata; + int ret = -EIO; + + if (lo->lo_state != Lo_bound) + goto failed; + + if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) + goto failed; + + ret = do_req_filebacked(lo, cmd->rq); + + failed: + if (ret) + cmd->rq->errors = -EIO; + blk_mq_complete_request(cmd->rq); +} + +static void loop_queue_write_work(struct work_struct *work) +{ + struct loop_device *lo = + container_of(work, struct loop_device, write_work); + LIST_HEAD(cmd_list); + + spin_lock_irq(&lo->lo_lock); + repeat: + list_splice_init(&lo->write_cmd_head, &cmd_list); + spin_unlock_irq(&lo->lo_lock); + + while (!list_empty(&cmd_list)) { + struct loop_cmd *cmd = list_first_entry(&cmd_list, + struct loop_cmd, list); + list_del_init(&cmd->list); + loop_handle_cmd(cmd); + } + + spin_lock_irq(&lo->lo_lock); + if (!list_empty(&lo->write_cmd_head)) + goto repeat; + lo->write_started = false; + spin_unlock_irq(&lo->lo_lock); +} + +static void loop_queue_read_work(struct work_struct *work) +{ + struct loop_cmd *cmd = + 
container_of(work, struct loop_cmd, read_work); + + loop_handle_cmd(cmd); +} + +static int loop_init_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int request_idx, + unsigned int numa_node) +{ + struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); + + cmd->rq = rq; + INIT_WORK(&cmd->read_work, loop_queue_read_work); + + return 0; +} + +static struct blk_mq_ops loop_mq_ops = { + .queue_rq = loop_queue_rq, + .map_queue = blk_mq_map_queue, + .init_request = loop_init_request, +}; + static int loop_add(struct loop_device **l, int i) { struct loop_device *lo; @@ -1627,16 +1598,28 @@ static int loop_add(struct loop_device **l, int i) i = err; err = -ENOMEM; - lo->lo_queue = blk_alloc_queue(GFP_KERNEL); - if (!lo->lo_queue) + lo->tag_set.ops = &loop_mq_ops; + lo->tag_set.nr_hw_queues = 1; + lo->tag_set.queue_depth = 128; + lo->tag_set.numa_node = NUMA_NO_NODE; + lo->tag_set.cmd_size = sizeof(struct loop_cmd); + lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + lo->tag_set.driver_data = lo; + + err = blk_mq_alloc_tag_set(&lo->tag_set); + if (err) goto out_free_idr; - /* - * set queue make_request_fn - */ - blk_queue_make_request(lo->lo_queue, loop_make_request); + lo->lo_queue = blk_mq_init_queue(&lo->tag_set); + if (IS_ERR_OR_NULL(lo->lo_queue)) { + err = PTR_ERR(lo->lo_queue); + goto out_cleanup_tags; + } lo->lo_queue->queuedata = lo; + INIT_LIST_HEAD(&lo->write_cmd_head); + INIT_WORK(&lo->write_work, loop_queue_write_work); + disk = lo->lo_disk = alloc_disk(1 << part_shift); if (!disk) goto out_free_queue; @@ -1664,9 +1647,6 @@ static int loop_add(struct loop_device **l, int i) disk->flags |= GENHD_FL_EXT_DEVT; mutex_init(&lo->lo_ctl_mutex); lo->lo_number = i; - lo->lo_thread = NULL; - init_waitqueue_head(&lo->lo_event); - init_waitqueue_head(&lo->lo_req_wait); spin_lock_init(&lo->lo_lock); disk->major = LOOP_MAJOR; disk->first_minor = i << part_shift; @@ -1680,6 +1660,8 @@ static int loop_add(struct loop_device **l, int i) 
out_free_queue: blk_cleanup_queue(lo->lo_queue); +out_cleanup_tags: + blk_mq_free_tag_set(&lo->tag_set); out_free_idr: idr_remove(&loop_index_idr, i); out_free_dev: @@ -1692,6 +1674,7 @@ static void loop_remove(struct loop_device *lo) { del_gendisk(lo->lo_disk); blk_cleanup_queue(lo->lo_queue); + blk_mq_free_tag_set(&lo->tag_set); put_disk(lo->lo_disk); kfree(lo); } @@ -1875,6 +1858,13 @@ static int __init loop_init(void) goto misc_out; } + loop_wq = alloc_workqueue("kloopd", + WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 0); + if (!loop_wq) { + err = -ENOMEM; + goto misc_out; + } + blk_register_region(MKDEV(LOOP_MAJOR, 0), range, THIS_MODULE, loop_probe, NULL, NULL); @@ -1912,6 +1902,8 @@ static void __exit loop_exit(void) blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range); unregister_blkdev(LOOP_MAJOR, "loop"); + destroy_workqueue(loop_wq); + misc_deregister(&loop_misc); } diff --git a/drivers/block/loop.h b/drivers/block/loop.h index 90df5d6..301c27f 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -11,8 +11,10 @@ #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/blk-mq.h> #include <linux/spinlock.h> #include <linux/mutex.h> +#include <linux/workqueue.h> #include <uapi/linux/loop.h> /* Possible states of device */ @@ -52,19 +54,23 @@ struct loop_device { gfp_t old_gfp_mask; spinlock_t lo_lock; - struct bio_list lo_bio_list; - unsigned int lo_bio_count; + struct list_head write_cmd_head; + struct work_struct write_work; + bool write_started; int lo_state; struct mutex lo_ctl_mutex; - struct task_struct *lo_thread; - wait_queue_head_t lo_event; - /* wait queue for incoming requests */ - wait_queue_head_t lo_req_wait; struct request_queue *lo_queue; + struct blk_mq_tag_set tag_set; struct gendisk *lo_disk; }; +struct loop_cmd { + struct work_struct read_work; + struct request *rq; + struct list_head list; +}; + /* Support for loadable transfer modules */ struct loop_func_table { int number; /* filter type */ diff --git 
a/drivers/block/null_blk.c b/drivers/block/null_blk.c index aa2224a..65cd61a 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -579,7 +579,7 @@ static int null_add_dev(void) sector_div(size, bs); set_capacity(disk, size); - disk->flags |= GENHD_FL_EXT_DEVT; + disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; disk->major = null_major; disk->first_minor = nullb->index; disk->fops = &null_fops; diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index d826bf3..cbdfbbf 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -144,8 +144,37 @@ struct nvme_cmd_info { void *ctx; int aborted; struct nvme_queue *nvmeq; + struct nvme_iod iod[0]; }; +/* + * Max size of iod being embedded in the request payload + */ +#define NVME_INT_PAGES 2 +#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->page_size) + +/* + * Will slightly overestimate the number of pages needed. This is OK + * as it only leads to a small amount of wasted memory for the lifetime of + * the I/O. + */ +static int nvme_npages(unsigned size, struct nvme_dev *dev) +{ + unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size); + return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); +} + +static unsigned int nvme_cmd_size(struct nvme_dev *dev) +{ + unsigned int ret = sizeof(struct nvme_cmd_info); + + ret += sizeof(struct nvme_iod); + ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev); + ret += sizeof(struct scatterlist) * NVME_INT_PAGES; + + return ret; +} + static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { @@ -218,6 +247,19 @@ static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx, blk_mq_start_request(blk_mq_rq_from_pdu(cmd)); } +static void *iod_get_private(struct nvme_iod *iod) +{ + return (void *) (iod->private & ~0x1UL); +} + +/* + * If bit 0 is set, the iod is embedded in the request payload. 
+ */ +static bool iod_should_kfree(struct nvme_iod *iod) +{ + return (iod->private & 0x01) == 0; +} + /* Special values must be less than 0x1000 */ #define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA) #define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) @@ -361,35 +403,53 @@ static __le64 **iod_list(struct nvme_iod *iod) return ((void *)iod) + iod->offset; } -/* - * Will slightly overestimate the number of pages needed. This is OK - * as it only leads to a small amount of wasted memory for the lifetime of - * the I/O. - */ -static int nvme_npages(unsigned size, struct nvme_dev *dev) +static inline void iod_init(struct nvme_iod *iod, unsigned nbytes, + unsigned nseg, unsigned long private) { - unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size); - return DIV_ROUND_UP(8 * nprps, dev->page_size - 8); + iod->private = private; + iod->offset = offsetof(struct nvme_iod, sg[nseg]); + iod->npages = -1; + iod->length = nbytes; + iod->nents = 0; } static struct nvme_iod * -nvme_alloc_iod(unsigned nseg, unsigned nbytes, struct nvme_dev *dev, gfp_t gfp) +__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev, + unsigned long priv, gfp_t gfp) { struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) + - sizeof(__le64 *) * nvme_npages(nbytes, dev) + + sizeof(__le64 *) * nvme_npages(bytes, dev) + sizeof(struct scatterlist) * nseg, gfp); - if (iod) { - iod->offset = offsetof(struct nvme_iod, sg[nseg]); - iod->npages = -1; - iod->length = nbytes; - iod->nents = 0; - iod->first_dma = 0ULL; - } + if (iod) + iod_init(iod, bytes, nseg, priv); return iod; } +static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev, + gfp_t gfp) +{ + unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? 
blk_rq_bytes(rq) : + sizeof(struct nvme_dsm_range); + unsigned long mask = 0; + struct nvme_iod *iod; + + if (rq->nr_phys_segments <= NVME_INT_PAGES && + size <= NVME_INT_BYTES(dev)) { + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq); + + iod = cmd->iod; + mask = 0x01; + iod_init(iod, size, rq->nr_phys_segments, + (unsigned long) rq | 0x01); + return iod; + } + + return __nvme_alloc_iod(rq->nr_phys_segments, size, dev, + (unsigned long) rq, gfp); +} + void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) { const int last_prp = dev->page_size / 8 - 1; @@ -405,7 +465,9 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) dma_pool_free(dev->prp_page_pool, prp_list, prp_dma); prp_dma = next_prp_dma; } - kfree(iod); + + if (iod_should_kfree(iod)) + kfree(iod); } static int nvme_error_status(u16 status) @@ -424,7 +486,7 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { struct nvme_iod *iod = ctx; - struct request *req = iod->private; + struct request *req = iod_get_private(iod); struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); u16 status = le16_to_cpup(&cqe->status) >> 1; @@ -585,7 +647,7 @@ static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, struct nvme_ns *ns) { - struct request *req = iod->private; + struct request *req = iod_get_private(iod); struct nvme_command *cmnd; u16 control = 0; u32 dsmgmt = 0; @@ -626,17 +688,12 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req = bd->rq; struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); struct nvme_iod *iod; - int psegs = req->nr_phys_segments; enum dma_data_direction dma_dir; - unsigned size = !(req->cmd_flags & REQ_DISCARD) ? 
blk_rq_bytes(req) : - sizeof(struct nvme_dsm_range); - iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC); + iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC); if (!iod) return BLK_MQ_RQ_QUEUE_BUSY; - iod->private = req; - if (req->cmd_flags & REQ_DISCARD) { void *range; /* @@ -651,10 +708,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, goto retry_cmd; iod_list(iod)[0] = (__le64 *)range; iod->npages = 0; - } else if (psegs) { + } else if (req->nr_phys_segments) { dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; - sg_init_table(iod->sg, psegs); + sg_init_table(iod->sg, req->nr_phys_segments); iod->nents = blk_rq_map_sg(req->q, req, iod->sg); if (!iod->nents) goto error_cmd; @@ -1137,21 +1194,14 @@ static void nvme_free_queue(struct nvme_queue *nvmeq) static void nvme_free_queues(struct nvme_dev *dev, int lowest) { - LLIST_HEAD(q_list); - struct nvme_queue *nvmeq, *next; - struct llist_node *entry; int i; for (i = dev->queue_count - 1; i >= lowest; i--) { struct nvme_queue *nvmeq = dev->queues[i]; - llist_add(&nvmeq->node, &q_list); dev->queue_count--; dev->queues[i] = NULL; - } - synchronize_rcu(); - entry = llist_del_all(&q_list); - llist_for_each_entry_safe(nvmeq, next, entry, node) nvme_free_queue(nvmeq); + } } /** @@ -1408,7 +1458,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1; dev->admin_tagset.timeout = ADMIN_TIMEOUT; dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev); - dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_info); + dev->admin_tagset.cmd_size = nvme_cmd_size(dev); dev->admin_tagset.driver_data = dev; if (blk_mq_alloc_tag_set(&dev->admin_tagset)) @@ -1522,7 +1572,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, } err = -ENOMEM; - iod = nvme_alloc_iod(count, length, dev, GFP_KERNEL); + iod = __nvme_alloc_iod(count, length, dev, 0, GFP_KERNEL); if (!iod) goto put_pages; @@ -2148,7 +2198,7 @@ static int nvme_dev_add(struct 
nvme_dev *dev) dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev); dev->tagset.queue_depth = min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; - dev->tagset.cmd_size = sizeof(struct nvme_cmd_info); + dev->tagset.cmd_size = nvme_cmd_size(dev); dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; dev->tagset.driver_data = dev; diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c index 79aa179..e229425 100644 --- a/drivers/block/osdblk.c +++ b/drivers/block/osdblk.c @@ -423,7 +423,7 @@ static int osdblk_init_disk(struct osdblk_device *osdev) } /* switch queue to TCQ mode; allocate tag map */ - rc = blk_queue_init_tags(q, OSDBLK_MAX_REQ, NULL); + rc = blk_queue_init_tags(q, OSDBLK_MAX_REQ, NULL, BLK_TAG_ALLOC_FIFO); if (rc) { blk_cleanup_queue(q); put_disk(disk); diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index cc90a84..375d288 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -214,6 +214,15 @@ enum blkif_protocol { BLKIF_PROTOCOL_X86_64 = 3, }; +/* + * Default protocol if the frontend doesn't specify one. + */ +#ifdef CONFIG_X86 +# define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_X86_32 +#else +# define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_NATIVE +#endif + struct xen_vbd { /* What the domain refers to this vbd as. 
*/ blkif_vdev_t handle; diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 630a489..e3afe97 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -868,11 +868,11 @@ static int connect_ring(struct backend_info *be) return err; } - be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; + be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT; err = xenbus_gather(XBT_NIL, dev->otherend, "protocol", "%63s", protocol, NULL); if (err) - strcpy(protocol, "unspecified, assuming native"); + strcpy(protocol, "unspecified, assuming default"); else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE)) be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32)) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index d2cae5f..37779e4 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1511,7 +1511,7 @@ static int blkif_recover(struct blkfront_info *info) merge_bio.tail = copy[i].request->biotail; bio_list_merge(&bio_list, &merge_bio); copy[i].request->bio = NULL; - blk_put_request(copy[i].request); + blk_end_request_all(copy[i].request, 0); } kfree(copy); @@ -1534,7 +1534,7 @@ static int blkif_recover(struct blkfront_info *info) req->bio = NULL; if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) pr_alert("diskcache flush request found!\n"); - __blk_put_request(info->rq, req); + __blk_end_request_all(req, 0); } spin_unlock_irq(&info->io_lock); diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 4c58333..9a6b637 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -287,13 +287,24 @@ static unsigned long get_unmapped_area_mem(struct file *file, return pgoff << PAGE_SHIFT; } +/* permit direct mmap, for read, write or exec */ +static unsigned memory_mmap_capabilities(struct file *file) +{ + return NOMMU_MAP_DIRECT | + NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC; +} + +static unsigned 
zero_mmap_capabilities(struct file *file) +{ + return NOMMU_MAP_COPY; +} + /* can't do an in-place private mapping if there's no MMU */ static inline int private_mapping_ok(struct vm_area_struct *vma) { return vma->vm_flags & VM_MAYSHARE; } #else -#define get_unmapped_area_mem NULL static inline int private_mapping_ok(struct vm_area_struct *vma) { @@ -721,7 +732,10 @@ static const struct file_operations mem_fops = { .write = write_mem, .mmap = mmap_mem, .open = open_mem, +#ifndef CONFIG_MMU .get_unmapped_area = get_unmapped_area_mem, + .mmap_capabilities = memory_mmap_capabilities, +#endif }; #ifdef CONFIG_DEVKMEM @@ -731,7 +745,10 @@ static const struct file_operations kmem_fops = { .write = write_kmem, .mmap = mmap_kmem, .open = open_kmem, +#ifndef CONFIG_MMU .get_unmapped_area = get_unmapped_area_mem, + .mmap_capabilities = memory_mmap_capabilities, +#endif }; #endif @@ -760,16 +777,9 @@ static const struct file_operations zero_fops = { .read_iter = read_iter_zero, .aio_write = aio_write_zero, .mmap = mmap_zero, -}; - -/* - * capabilities for /dev/zero - * - permits private mappings, "copies" are taken of the source of zeros - * - no writeback happens - */ -static struct backing_dev_info zero_bdi = { - .name = "char/mem", - .capabilities = BDI_CAP_MAP_COPY | BDI_CAP_NO_ACCT_AND_WRITEBACK, +#ifndef CONFIG_MMU + .mmap_capabilities = zero_mmap_capabilities, +#endif }; static const struct file_operations full_fops = { @@ -783,22 +793,22 @@ static const struct memdev { const char *name; umode_t mode; const struct file_operations *fops; - struct backing_dev_info *dev_info; + fmode_t fmode; } devlist[] = { - [1] = { "mem", 0, &mem_fops, &directly_mappable_cdev_bdi }, + [1] = { "mem", 0, &mem_fops, FMODE_UNSIGNED_OFFSET }, #ifdef CONFIG_DEVKMEM - [2] = { "kmem", 0, &kmem_fops, &directly_mappable_cdev_bdi }, + [2] = { "kmem", 0, &kmem_fops, FMODE_UNSIGNED_OFFSET }, #endif - [3] = { "null", 0666, &null_fops, NULL }, + [3] = { "null", 0666, &null_fops, 0 }, #ifdef 
CONFIG_DEVPORT - [4] = { "port", 0, &port_fops, NULL }, + [4] = { "port", 0, &port_fops, 0 }, #endif - [5] = { "zero", 0666, &zero_fops, &zero_bdi }, - [7] = { "full", 0666, &full_fops, NULL }, - [8] = { "random", 0666, &random_fops, NULL }, - [9] = { "urandom", 0666, &urandom_fops, NULL }, + [5] = { "zero", 0666, &zero_fops, 0 }, + [7] = { "full", 0666, &full_fops, 0 }, + [8] = { "random", 0666, &random_fops, 0 }, + [9] = { "urandom", 0666, &urandom_fops, 0 }, #ifdef CONFIG_PRINTK - [11] = { "kmsg", 0644, &kmsg_fops, NULL }, + [11] = { "kmsg", 0644, &kmsg_fops, 0 }, #endif }; @@ -816,12 +826,7 @@ static int memory_open(struct inode *inode, struct file *filp) return -ENXIO; filp->f_op = dev->fops; - if (dev->dev_info) - filp->f_mapping->backing_dev_info = dev->dev_info; - - /* Is /dev/mem or /dev/kmem ? */ - if (dev->dev_info == &directly_mappable_cdev_bdi) - filp->f_mode |= FMODE_UNSIGNED_OFFSET; + filp->f_mode |= dev->fmode; if (dev->fops->open) return dev->fops->open(inode, filp); @@ -846,11 +851,6 @@ static struct class *mem_class; static int __init chr_dev_init(void) { int minor; - int err; - - err = bdi_init(&zero_bdi); - if (err) - return err; if (register_chrdev(MEM_MAJOR, "mem", &memory_fops)) printk("unable to get major %d for memory devs\n", MEM_MAJOR); diff --git a/drivers/char/raw.c b/drivers/char/raw.c index a24891b..6e29bf2 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -104,11 +104,9 @@ static int raw_release(struct inode *inode, struct file *filp) mutex_lock(&raw_mutex); bdev = raw_devices[minor].binding; - if (--raw_devices[minor].inuse == 0) { + if (--raw_devices[minor].inuse == 0) /* Here inode->i_mapping == bdev->bd_inode->i_mapping */ inode->i_mapping = &inode->i_data; - inode->i_mapping->backing_dev_info = &default_backing_dev_info; - } mutex_unlock(&raw_mutex); blkdev_put(bdev, filp->f_mode | FMODE_EXCL); diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index c355a22..c396444 100644 --- a/drivers/md/Kconfig +++ 
b/drivers/md/Kconfig @@ -231,9 +231,8 @@ config DM_CRYPT transparently encrypts the data on it. You'll need to activate the ciphers you're going to use in the cryptoapi configuration. - Information on how to use dm-crypt can be found on - - <http://www.saout.de/misc/dm-crypt/> + For further information on dm-crypt and userspace tools see: + <http://code.google.com/p/cryptsetup/wiki/DMCrypt> To compile this code as a module, choose M here: the module will be called dm-crypt. diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 1695ee5..3a57679 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1619,7 +1619,9 @@ void bitmap_destroy(struct mddev *mddev) return; mutex_lock(&mddev->bitmap_info.mutex); + spin_lock(&mddev->lock); mddev->bitmap = NULL; /* disconnect from the md device */ + spin_unlock(&mddev->lock); mutex_unlock(&mddev->bitmap_info.mutex); if (mddev->thread) mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; @@ -2209,11 +2211,13 @@ __ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store); static ssize_t can_clear_show(struct mddev *mddev, char *page) { int len; + spin_lock(&mddev->lock); if (mddev->bitmap) len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ? 
"false" : "true")); else len = sprintf(page, "\n"); + spin_unlock(&mddev->lock); return len; } @@ -2238,10 +2242,15 @@ __ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store); static ssize_t behind_writes_used_show(struct mddev *mddev, char *page) { + ssize_t ret; + spin_lock(&mddev->lock); if (mddev->bitmap == NULL) - return sprintf(page, "0\n"); - return sprintf(page, "%lu\n", - mddev->bitmap->behind_writes_used); + ret = sprintf(page, "0\n"); + else + ret = sprintf(page, "%lu\n", + mddev->bitmap->behind_writes_used); + spin_unlock(&mddev->lock); + return ret; } static ssize_t diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index c33b497..86dbbc7 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -11,6 +11,7 @@ #include <linux/device-mapper.h> #include <linux/dm-io.h> #include <linux/slab.h> +#include <linux/jiffies.h> #include <linux/vmalloc.h> #include <linux/shrinker.h> #include <linux/module.h> @@ -1739,7 +1740,7 @@ static unsigned get_max_age_hz(void) static bool older_than(struct dm_buffer *b, unsigned long age_hz) { - return (jiffies - b->last_accessed) >= age_hz; + return time_after_eq(jiffies, b->last_accessed + age_hz); } static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index e165053..7755af3 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -11,6 +11,7 @@ #include <linux/dm-io.h> #include <linux/dm-kcopyd.h> +#include <linux/jiffies.h> #include <linux/init.h> #include <linux/mempool.h> #include <linux/module.h> @@ -1562,8 +1563,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs, static int need_commit_due_to_time(struct cache *cache) { - return jiffies < cache->last_commit_jiffies || - jiffies > cache->last_commit_jiffies + COMMIT_PERIOD; + return !time_in_range(jiffies, cache->last_commit_jiffies, + cache->last_commit_jiffies + COMMIT_PERIOD); } static int 
commit_if_needed(struct cache *cache) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 73f791b..c8a18e4 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -639,8 +639,8 @@ static int check_name(const char *name) /* * On successful return, the caller must not attempt to acquire - * _hash_lock without first calling dm_table_put, because dm_table_destroy - * waits for this dm_table_put and could be called under this lock. + * _hash_lock without first calling dm_put_live_table, because dm_table_destroy + * waits for this dm_put_live_table and could be called under this lock. */ static struct dm_table *dm_get_inactive_table(struct mapped_device *md, int *srcu_idx) { diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index b953db6..03177ca 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c @@ -6,6 +6,7 @@ #include <linux/bio.h> #include <linux/slab.h> +#include <linux/jiffies.h> #include <linux/dm-dirty-log.h> #include <linux/device-mapper.h> #include <linux/dm-log-userspace.h> @@ -829,7 +830,7 @@ static int userspace_is_remote_recovering(struct dm_dirty_log *log, int r; uint64_t region64 = region; struct log_c *lc = log->context; - static unsigned long long limit; + static unsigned long limit; struct { int64_t is_recovering; uint64_t in_sync_hint; @@ -845,7 +846,7 @@ static int userspace_is_remote_recovering(struct dm_dirty_log *log, */ if (region < lc->in_sync_hint) return 0; - else if (jiffies < limit) + else if (time_after(limit, jiffies)) return 1; limit = jiffies + (HZ / 4); diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 7b6b0f0..d376dc8 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -11,6 +11,7 @@ #include "dm-path-selector.h" #include "dm-uevent.h" +#include <linux/blkdev.h> #include <linux/ctype.h> #include <linux/init.h> #include <linux/mempool.h> @@ -378,18 +379,18 @@ static int __must_push_back(struct multipath *m) /* * 
Map cloned requests */ -static int multipath_map(struct dm_target *ti, struct request *clone, - union map_info *map_context) +static int __multipath_map(struct dm_target *ti, struct request *clone, + union map_info *map_context, + struct request *rq, struct request **__clone) { struct multipath *m = (struct multipath *) ti->private; int r = DM_MAPIO_REQUEUE; - size_t nr_bytes = blk_rq_bytes(clone); - unsigned long flags; + size_t nr_bytes = clone ? blk_rq_bytes(clone) : blk_rq_bytes(rq); struct pgpath *pgpath; struct block_device *bdev; struct dm_mpath_io *mpio; - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); /* Do we need to select a new pgpath? */ if (!m->current_pgpath || @@ -411,25 +412,61 @@ static int multipath_map(struct dm_target *ti, struct request *clone, /* ENOMEM, requeue */ goto out_unlock; - bdev = pgpath->path.dev->bdev; - clone->q = bdev_get_queue(bdev); - clone->rq_disk = bdev->bd_disk; - clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; mpio = map_context->ptr; mpio->pgpath = pgpath; mpio->nr_bytes = nr_bytes; + + bdev = pgpath->path.dev->bdev; + + spin_unlock_irq(&m->lock); + + if (clone) { + /* Old request-based interface: allocated clone is passed in */ + clone->q = bdev_get_queue(bdev); + clone->rq_disk = bdev->bd_disk; + clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; + } else { + /* blk-mq request-based interface */ + *__clone = blk_get_request(bdev_get_queue(bdev), + rq_data_dir(rq), GFP_KERNEL); + if (IS_ERR(*__clone)) + /* ENOMEM, requeue */ + return r; + (*__clone)->bio = (*__clone)->biotail = NULL; + (*__clone)->rq_disk = bdev->bd_disk; + (*__clone)->cmd_flags |= REQ_FAILFAST_TRANSPORT; + } + if (pgpath->pg->ps.type->start_io) pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path, nr_bytes); - r = DM_MAPIO_REMAPPED; + return DM_MAPIO_REMAPPED; out_unlock: - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); return r; } +static int multipath_map(struct dm_target *ti, struct request *clone, + union 
map_info *map_context) +{ + return __multipath_map(ti, clone, map_context, NULL, NULL); +} + +static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, + union map_info *map_context, + struct request **clone) +{ + return __multipath_map(ti, NULL, map_context, rq, clone); +} + +static void multipath_release_clone(struct request *clone) +{ + blk_put_request(clone); +} + /* * If we run out of usable paths, should we queue I/O or error it? */ @@ -1666,11 +1703,13 @@ out: *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 7, 0}, + .version = {1, 8, 0}, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, .map_rq = multipath_map, + .clone_and_map_rq = multipath_clone_and_map, + .release_clone_rq = multipath_release_clone, .rq_end_io = multipath_end_io, .presuspend = multipath_presuspend, .postsuspend = multipath_postsuspend, @@ -1694,16 +1733,15 @@ static int __init dm_multipath_init(void) r = dm_register_target(&multipath_target); if (r < 0) { DMERR("register failed %d", r); - kmem_cache_destroy(_mpio_cache); - return -EINVAL; + r = -EINVAL; + goto bad_register_target; } kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0); if (!kmultipathd) { DMERR("failed to create workqueue kmpathd"); - dm_unregister_target(&multipath_target); - kmem_cache_destroy(_mpio_cache); - return -ENOMEM; + r = -ENOMEM; + goto bad_alloc_kmultipathd; } /* @@ -1716,16 +1754,23 @@ static int __init dm_multipath_init(void) WQ_MEM_RECLAIM); if (!kmpath_handlerd) { DMERR("failed to create workqueue kmpath_handlerd"); - destroy_workqueue(kmultipathd); - dm_unregister_target(&multipath_target); - kmem_cache_destroy(_mpio_cache); - return -ENOMEM; + r = -ENOMEM; + goto bad_alloc_kmpath_handlerd; } DMINFO("version %u.%u.%u loaded", multipath_target.version[0], multipath_target.version[1], multipath_target.version[2]); + return 0; + +bad_alloc_kmpath_handlerd: + 
destroy_workqueue(kmultipathd); +bad_alloc_kmultipathd: + dm_unregister_target(&multipath_target); +bad_register_target: + kmem_cache_destroy(_mpio_cache); + return r; } diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 07c0fa0..88e4c7f 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -746,13 +746,7 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits) { struct raid_set *rs = container_of(cb, struct raid_set, callbacks); - if (rs->raid_type->level == 1) - return md_raid1_congested(&rs->md, bits); - - if (rs->raid_type->level == 10) - return md_raid10_congested(&rs->md, bits); - - return md_raid5_congested(&rs->md, bits); + return mddev_congested(&rs->md, bits); } /* @@ -1243,7 +1237,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) argv++; /* Skip over RAID params for now and find out # of devices */ - if (num_raid_params + 1 > argc) { + if (num_raid_params >= argc) { ti->error = "Arguments do not agree with counts given"; return -EINVAL; } @@ -1254,6 +1248,12 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) return -EINVAL; } + argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */ + if (argc != (num_raid_devs * 2)) { + ti->error = "Supplied RAID devices does not match the count given"; + return -EINVAL; + } + rs = context_alloc(ti, rt, (unsigned)num_raid_devs); if (IS_ERR(rs)) return PTR_ERR(rs); @@ -1262,16 +1262,8 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) if (ret) goto bad; - ret = -EINVAL; - - argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */ argv += num_raid_params + 1; - if (argc != (num_raid_devs * 2)) { - ti->error = "Supplied RAID devices does not match the count given"; - goto bad; - } - ret = dev_parms(rs, argv); if (ret) goto bad; diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index d6e8817..808b841 100644 --- a/drivers/md/dm-snap-persistent.c +++ 
b/drivers/md/dm-snap-persistent.c @@ -200,16 +200,11 @@ err_area: static void free_area(struct pstore *ps) { - if (ps->area) - vfree(ps->area); + vfree(ps->area); ps->area = NULL; - - if (ps->zero_area) - vfree(ps->zero_area); + vfree(ps->zero_area); ps->zero_area = NULL; - - if (ps->header_area) - vfree(ps->header_area); + vfree(ps->header_area); ps->header_area = NULL; } @@ -605,8 +600,7 @@ static void persistent_dtr(struct dm_exception_store *store) free_area(ps); /* Allocated in persistent_read_metadata */ - if (ps->callbacks) - vfree(ps->callbacks); + vfree(ps->callbacks); kfree(ps); } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 3afae9e..6554d91 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -827,10 +827,11 @@ static int dm_table_set_type(struct dm_table *t) { unsigned i; unsigned bio_based = 0, request_based = 0, hybrid = 0; + bool use_blk_mq = false; struct dm_target *tgt; struct dm_dev_internal *dd; struct list_head *devices; - unsigned live_md_type; + unsigned live_md_type = dm_get_md_type(t->md); for (i = 0; i < t->num_targets; i++) { tgt = t->targets + i; @@ -854,8 +855,8 @@ static int dm_table_set_type(struct dm_table *t) * Determine the type from the live device. * Default to bio-based if device is new. 
*/ - live_md_type = dm_get_md_type(t->md); - if (live_md_type == DM_TYPE_REQUEST_BASED) + if (live_md_type == DM_TYPE_REQUEST_BASED || + live_md_type == DM_TYPE_MQ_REQUEST_BASED) request_based = 1; else bio_based = 1; @@ -869,16 +870,6 @@ static int dm_table_set_type(struct dm_table *t) BUG_ON(!request_based); /* No targets in this table */ - /* Non-request-stackable devices can't be used for request-based dm */ - devices = dm_table_get_devices(t); - list_for_each_entry(dd, devices, list) { - if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev->bdev))) { - DMWARN("table load rejected: including" - " non-request-stackable devices"); - return -EINVAL; - } - } - /* * Request-based dm supports only tables that have a single target now. * To support multiple targets, request splitting support is needed, @@ -890,7 +881,37 @@ static int dm_table_set_type(struct dm_table *t) return -EINVAL; } - t->type = DM_TYPE_REQUEST_BASED; + /* Non-request-stackable devices can't be used for request-based dm */ + devices = dm_table_get_devices(t); + list_for_each_entry(dd, devices, list) { + struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev); + + if (!blk_queue_stackable(q)) { + DMERR("table load rejected: including" + " non-request-stackable devices"); + return -EINVAL; + } + + if (q->mq_ops) + use_blk_mq = true; + } + + if (use_blk_mq) { + /* verify _all_ devices in the table are blk-mq devices */ + list_for_each_entry(dd, devices, list) + if (!bdev_get_queue(dd->dm_dev->bdev)->mq_ops) { + DMERR("table load rejected: not all devices" + " are blk-mq request-stackable"); + return -EINVAL; + } + t->type = DM_TYPE_MQ_REQUEST_BASED; + + } else if (hybrid && list_empty(devices) && live_md_type != DM_TYPE_NONE) { + /* inherit live MD type */ + t->type = live_md_type; + + } else + t->type = DM_TYPE_REQUEST_BASED; return 0; } @@ -907,7 +928,15 @@ struct target_type *dm_table_get_immutable_target_type(struct dm_table *t) bool dm_table_request_based(struct dm_table *t) { - return 
dm_table_get_type(t) == DM_TYPE_REQUEST_BASED; + unsigned table_type = dm_table_get_type(t); + + return (table_type == DM_TYPE_REQUEST_BASED || + table_type == DM_TYPE_MQ_REQUEST_BASED); +} + +bool dm_table_mq_request_based(struct dm_table *t) +{ + return dm_table_get_type(t) == DM_TYPE_MQ_REQUEST_BASED; } static int dm_table_alloc_md_mempools(struct dm_table *t) @@ -1360,6 +1389,14 @@ static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev, return q && !test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags); } +static int queue_supports_sg_gaps(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + + return q && !test_bit(QUEUE_FLAG_SG_GAPS, &q->queue_flags); +} + static bool dm_table_all_devices_attribute(struct dm_table *t, iterate_devices_callout_fn func) { @@ -1480,6 +1517,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, else queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q); + if (dm_table_all_devices_attribute(t, queue_supports_sg_gaps)) + queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS, q); + else + queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, q); + dm_table_set_integrity(t); /* diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 242e3ce..925ec1b 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -137,13 +137,26 @@ static int io_err_map_rq(struct dm_target *ti, struct request *clone, return -EIO; } +static int io_err_clone_and_map_rq(struct dm_target *ti, struct request *rq, + union map_info *map_context, + struct request **clone) +{ + return -EIO; +} + +static void io_err_release_clone_rq(struct request *clone) +{ +} + static struct target_type error_target = { .name = "error", - .version = {1, 2, 0}, + .version = {1, 3, 0}, .ctr = io_err_ctr, .dtr = io_err_dtr, .map = io_err_map, .map_rq = io_err_map_rq, + .clone_and_map_rq = io_err_clone_and_map_rq, + .release_clone_rq = 
io_err_release_clone_rq, }; int __init dm_target_init(void) diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 43adbb8..79f6941 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1635,15 +1635,6 @@ int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd, return r; } -int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result) -{ - down_read(&pmd->root_lock); - *result = pmd->data_block_size; - up_read(&pmd->root_lock); - - return 0; -} - int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result) { int r = -EINVAL; diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index 921d15e..fac01a9 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h @@ -182,8 +182,6 @@ int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd, int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result); -int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result); - int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result); int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result); diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 07705ee..654773c 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -11,6 +11,7 @@ #include <linux/device-mapper.h> #include <linux/dm-io.h> #include <linux/dm-kcopyd.h> +#include <linux/jiffies.h> #include <linux/log2.h> #include <linux/list.h> #include <linux/rculist.h> @@ -1700,8 +1701,8 @@ static void process_cell_fail(struct thin_c *tc, struct dm_bio_prison_cell *cell */ static int need_commit_due_to_time(struct pool *pool) { - return jiffies < pool->last_commit_jiffies || - jiffies > pool->last_commit_jiffies + COMMIT_PERIOD; + return !time_in_range(jiffies, pool->last_commit_jiffies, + pool->last_commit_jiffies + COMMIT_PERIOD); } #define thin_pbd(node) rb_entry((node), 
struct dm_thin_endio_hook, rb_node) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 2caf5b3..ec1444f 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -20,6 +20,7 @@ #include <linux/hdreg.h> #include <linux/delay.h> #include <linux/wait.h> +#include <linux/kthread.h> #include <trace/events/block.h> @@ -78,7 +79,8 @@ struct dm_io { struct dm_rq_target_io { struct mapped_device *md; struct dm_target *ti; - struct request *orig, clone; + struct request *orig, *clone; + struct kthread_work work; int error; union map_info info; }; @@ -179,6 +181,7 @@ struct mapped_device { * io objects are allocated from here. */ mempool_t *io_pool; + mempool_t *rq_pool; struct bio_set *bs; @@ -210,6 +213,9 @@ struct mapped_device { unsigned internal_suspend_count; struct dm_stats stats; + + struct kthread_worker kworker; + struct task_struct *kworker_task; }; /* @@ -217,6 +223,7 @@ struct mapped_device { */ struct dm_md_mempools { mempool_t *io_pool; + mempool_t *rq_pool; struct bio_set *bs; }; @@ -231,6 +238,7 @@ struct table_device { #define RESERVED_MAX_IOS 1024 static struct kmem_cache *_io_cache; static struct kmem_cache *_rq_tio_cache; +static struct kmem_cache *_rq_cache; /* * Bio-based DM's mempools' reserved IOs set by the user. 
@@ -288,9 +296,14 @@ static int __init local_init(void) if (!_rq_tio_cache) goto out_free_io_cache; + _rq_cache = kmem_cache_create("dm_clone_request", sizeof(struct request), + __alignof__(struct request), 0, NULL); + if (!_rq_cache) + goto out_free_rq_tio_cache; + r = dm_uevent_init(); if (r) - goto out_free_rq_tio_cache; + goto out_free_rq_cache; deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1); if (!deferred_remove_workqueue) { @@ -312,6 +325,8 @@ out_free_workqueue: destroy_workqueue(deferred_remove_workqueue); out_uevent_exit: dm_uevent_exit(); +out_free_rq_cache: + kmem_cache_destroy(_rq_cache); out_free_rq_tio_cache: kmem_cache_destroy(_rq_tio_cache); out_free_io_cache: @@ -325,6 +340,7 @@ static void local_exit(void) flush_scheduled_work(); destroy_workqueue(deferred_remove_workqueue); + kmem_cache_destroy(_rq_cache); kmem_cache_destroy(_rq_tio_cache); kmem_cache_destroy(_io_cache); unregister_blkdev(_major, _name); @@ -577,6 +593,17 @@ static void free_rq_tio(struct dm_rq_target_io *tio) mempool_free(tio, tio->md->io_pool); } +static struct request *alloc_clone_request(struct mapped_device *md, + gfp_t gfp_mask) +{ + return mempool_alloc(md->rq_pool, gfp_mask); +} + +static void free_clone_request(struct mapped_device *md, struct request *rq) +{ + mempool_free(rq, md->rq_pool); +} + static int md_in_flight(struct mapped_device *md) { return atomic_read(&md->pending[READ]) + @@ -992,7 +1019,7 @@ static void end_clone_bio(struct bio *clone, int error) * the md may be freed in dm_put() at the end of this function. * Or do dm_get() before calling this function and dm_put() later. 
*/ -static void rq_completed(struct mapped_device *md, int rw, int run_queue) +static void rq_completed(struct mapped_device *md, int rw, bool run_queue) { atomic_dec(&md->pending[rw]); @@ -1020,12 +1047,17 @@ static void free_rq_clone(struct request *clone) struct dm_rq_target_io *tio = clone->end_io_data; blk_rq_unprep_clone(clone); + if (clone->q && clone->q->mq_ops) + tio->ti->type->release_clone_rq(clone); + else + free_clone_request(tio->md, clone); free_rq_tio(tio); } /* * Complete the clone and the original request. - * Must be called without queue lock. + * Must be called without clone's queue lock held, + * see end_clone_request() for more details. */ static void dm_end_request(struct request *clone, int error) { @@ -1054,23 +1086,23 @@ static void dm_end_request(struct request *clone, int error) static void dm_unprep_request(struct request *rq) { - struct request *clone = rq->special; + struct dm_rq_target_io *tio = rq->special; + struct request *clone = tio->clone; rq->special = NULL; rq->cmd_flags &= ~REQ_DONTPREP; - free_rq_clone(clone); + if (clone) + free_rq_clone(clone); } /* * Requeue the original request of a clone. 
*/ -void dm_requeue_unmapped_request(struct request *clone) +static void dm_requeue_unmapped_original_request(struct mapped_device *md, + struct request *rq) { - int rw = rq_data_dir(clone); - struct dm_rq_target_io *tio = clone->end_io_data; - struct mapped_device *md = tio->md; - struct request *rq = tio->orig; + int rw = rq_data_dir(rq); struct request_queue *q = rq->q; unsigned long flags; @@ -1080,9 +1112,15 @@ void dm_requeue_unmapped_request(struct request *clone) blk_requeue_request(q, rq); spin_unlock_irqrestore(q->queue_lock, flags); - rq_completed(md, rw, 0); + rq_completed(md, rw, false); +} + +static void dm_requeue_unmapped_request(struct request *clone) +{ + struct dm_rq_target_io *tio = clone->end_io_data; + + dm_requeue_unmapped_original_request(tio->md, tio->orig); } -EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); static void __stop_queue(struct request_queue *q) { @@ -1151,8 +1189,15 @@ static void dm_done(struct request *clone, int error, bool mapped) static void dm_softirq_done(struct request *rq) { bool mapped = true; - struct request *clone = rq->completion_data; - struct dm_rq_target_io *tio = clone->end_io_data; + struct dm_rq_target_io *tio = rq->special; + struct request *clone = tio->clone; + + if (!clone) { + blk_end_request_all(rq, tio->error); + rq_completed(tio->md, rq_data_dir(rq), false); + free_rq_tio(tio); + return; + } if (rq->cmd_flags & REQ_FAILED) mapped = false; @@ -1164,13 +1209,11 @@ static void dm_softirq_done(struct request *rq) * Complete the clone and the original request with the error status * through softirq context. 
*/ -static void dm_complete_request(struct request *clone, int error) +static void dm_complete_request(struct request *rq, int error) { - struct dm_rq_target_io *tio = clone->end_io_data; - struct request *rq = tio->orig; + struct dm_rq_target_io *tio = rq->special; tio->error = error; - rq->completion_data = clone; blk_complete_request(rq); } @@ -1178,40 +1221,40 @@ static void dm_complete_request(struct request *clone, int error) * Complete the not-mapped clone and the original request with the error status * through softirq context. * Target's rq_end_io() function isn't called. - * This may be used when the target's map_rq() function fails. + * This may be used when the target's map_rq() or clone_and_map_rq() functions fail. */ -void dm_kill_unmapped_request(struct request *clone, int error) +static void dm_kill_unmapped_request(struct request *rq, int error) { - struct dm_rq_target_io *tio = clone->end_io_data; - struct request *rq = tio->orig; - rq->cmd_flags |= REQ_FAILED; - dm_complete_request(clone, error); + dm_complete_request(rq, error); } -EXPORT_SYMBOL_GPL(dm_kill_unmapped_request); /* - * Called with the queue lock held + * Called with the clone's queue lock held */ static void end_clone_request(struct request *clone, int error) { - /* - * For just cleaning up the information of the queue in which - * the clone was dispatched. - * The clone is *NOT* freed actually here because it is alloced from - * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags. - */ - __blk_put_request(clone->q, clone); + struct dm_rq_target_io *tio = clone->end_io_data; + + if (!clone->q->mq_ops) { + /* + * For just cleaning up the information of the queue in which + * the clone was dispatched. + * The clone is *NOT* freed actually here because it is alloced + * from dm own mempool (REQ_ALLOCED isn't set). + */ + __blk_put_request(clone->q, clone); + } /* * Actual request completion is done in a softirq context which doesn't - * hold the queue lock. 
Otherwise, deadlock could occur because: + * hold the clone's queue lock. Otherwise, deadlock could occur because: * - another request may be submitted by the upper level driver * of the stacking during the completion * - the submission which requires queue lock may be done - * against this queue + * against this clone's queue */ - dm_complete_request(clone, error); + dm_complete_request(tio->orig, error); } /* @@ -1689,19 +1732,19 @@ static void dm_request(struct request_queue *q, struct bio *bio) _dm_request(q, bio); } -void dm_dispatch_request(struct request *rq) +static void dm_dispatch_clone_request(struct request *clone, struct request *rq) { int r; - if (blk_queue_io_stat(rq->q)) - rq->cmd_flags |= REQ_IO_STAT; + if (blk_queue_io_stat(clone->q)) + clone->cmd_flags |= REQ_IO_STAT; - rq->start_time = jiffies; - r = blk_insert_cloned_request(rq->q, rq); + clone->start_time = jiffies; + r = blk_insert_cloned_request(clone->q, clone); if (r) + /* must complete clone in terms of original request */ dm_complete_request(rq, r); } -EXPORT_SYMBOL_GPL(dm_dispatch_request); static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, void *data) @@ -1718,11 +1761,11 @@ static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, } static int setup_clone(struct request *clone, struct request *rq, - struct dm_rq_target_io *tio) + struct dm_rq_target_io *tio, gfp_t gfp_mask) { int r; - r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, + r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask, dm_rq_bio_constructor, tio); if (r) return r; @@ -1733,14 +1776,37 @@ static int setup_clone(struct request *clone, struct request *rq, clone->end_io = end_clone_request; clone->end_io_data = tio; + tio->clone = clone; + return 0; } static struct request *clone_rq(struct request *rq, struct mapped_device *md, - gfp_t gfp_mask) + struct dm_rq_target_io *tio, gfp_t gfp_mask) +{ + struct request *clone = alloc_clone_request(md, gfp_mask); + + if (!clone) + 
return NULL; + + blk_rq_init(NULL, clone); + if (setup_clone(clone, rq, tio, gfp_mask)) { + /* -ENOMEM */ + free_clone_request(md, clone); + return NULL; + } + + return clone; +} + +static void map_tio_request(struct kthread_work *work); + +static struct dm_rq_target_io *prep_tio(struct request *rq, + struct mapped_device *md, gfp_t gfp_mask) { - struct request *clone; struct dm_rq_target_io *tio; + int srcu_idx; + struct dm_table *table; tio = alloc_rq_tio(md, gfp_mask); if (!tio) @@ -1748,18 +1814,23 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md, tio->md = md; tio->ti = NULL; + tio->clone = NULL; tio->orig = rq; tio->error = 0; memset(&tio->info, 0, sizeof(tio->info)); - - clone = &tio->clone; - if (setup_clone(clone, rq, tio)) { - /* -ENOMEM */ - free_rq_tio(tio); - return NULL; + init_kthread_work(&tio->work, map_tio_request); + + table = dm_get_live_table(md, &srcu_idx); + if (!dm_table_mq_request_based(table)) { + if (!clone_rq(rq, md, tio, gfp_mask)) { + dm_put_live_table(md, srcu_idx); + free_rq_tio(tio); + return NULL; + } } + dm_put_live_table(md, srcu_idx); - return clone; + return tio; } /* @@ -1768,18 +1839,18 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md, static int dm_prep_fn(struct request_queue *q, struct request *rq) { struct mapped_device *md = q->queuedata; - struct request *clone; + struct dm_rq_target_io *tio; if (unlikely(rq->special)) { DMWARN("Already has something in rq->special."); return BLKPREP_KILL; } - clone = clone_rq(rq, md, GFP_ATOMIC); - if (!clone) + tio = prep_tio(rq, md, GFP_ATOMIC); + if (!tio) return BLKPREP_DEFER; - rq->special = clone; + rq->special = tio; rq->cmd_flags |= REQ_DONTPREP; return BLKPREP_OK; @@ -1787,17 +1858,36 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq) /* * Returns: - * 0 : the request has been processed (not requeued) - * !0 : the request has been requeued + * 0 : the request has been processed + * 
DM_MAPIO_REQUEUE : the original request needs to be requeued + * < 0 : the request was completed due to failure */ -static int map_request(struct dm_target *ti, struct request *clone, +static int map_request(struct dm_target *ti, struct request *rq, struct mapped_device *md) { - int r, requeued = 0; - struct dm_rq_target_io *tio = clone->end_io_data; + int r; + struct dm_rq_target_io *tio = rq->special; + struct request *clone = NULL; + + if (tio->clone) { + clone = tio->clone; + r = ti->type->map_rq(ti, clone, &tio->info); + } else { + r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone); + if (r < 0) { + /* The target wants to complete the I/O */ + dm_kill_unmapped_request(rq, r); + return r; + } + if (IS_ERR(clone)) + return DM_MAPIO_REQUEUE; + if (setup_clone(clone, rq, tio, GFP_KERNEL)) { + /* -ENOMEM */ + ti->type->release_clone_rq(clone); + return DM_MAPIO_REQUEUE; + } + } - tio->ti = ti; - r = ti->type->map_rq(ti, clone, &tio->info); switch (r) { case DM_MAPIO_SUBMITTED: /* The target has taken the I/O to submit by itself later */ @@ -1805,13 +1895,12 @@ static int map_request(struct dm_target *ti, struct request *clone, case DM_MAPIO_REMAPPED: /* The target has remapped the I/O so dispatch it */ trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), - blk_rq_pos(tio->orig)); - dm_dispatch_request(clone); + blk_rq_pos(rq)); + dm_dispatch_clone_request(clone, rq); break; case DM_MAPIO_REQUEUE: /* The target wants to requeue the I/O */ dm_requeue_unmapped_request(clone); - requeued = 1; break; default: if (r > 0) { @@ -1820,20 +1909,27 @@ static int map_request(struct dm_target *ti, struct request *clone, } /* The target wants to complete the I/O */ - dm_kill_unmapped_request(clone, r); - break; + dm_kill_unmapped_request(rq, r); + return r; } - return requeued; + return 0; } -static struct request *dm_start_request(struct mapped_device *md, struct request *orig) +static void map_tio_request(struct kthread_work *work) { - struct request *clone; + 
struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work); + struct request *rq = tio->orig; + struct mapped_device *md = tio->md; + if (map_request(tio->ti, rq, md) == DM_MAPIO_REQUEUE) + dm_requeue_unmapped_original_request(md, rq); +} + +static void dm_start_request(struct mapped_device *md, struct request *orig) +{ blk_start_request(orig); - clone = orig->special; - atomic_inc(&md->pending[rq_data_dir(clone)]); + atomic_inc(&md->pending[rq_data_dir(orig)]); /* * Hold the md reference here for the in-flight I/O. @@ -1843,8 +1939,6 @@ static struct request *dm_start_request(struct mapped_device *md, struct request * See the comment in rq_completed() too. */ dm_get(md); - - return clone; } /* @@ -1857,7 +1951,8 @@ static void dm_request_fn(struct request_queue *q) int srcu_idx; struct dm_table *map = dm_get_live_table(md, &srcu_idx); struct dm_target *ti; - struct request *rq, *clone; + struct request *rq; + struct dm_rq_target_io *tio; sector_t pos; /* @@ -1879,34 +1974,29 @@ static void dm_request_fn(struct request_queue *q) ti = dm_table_find_target(map, pos); if (!dm_target_is_valid(ti)) { /* - * Must perform setup, that dm_done() requires, + * Must perform setup, that rq_completed() requires, * before calling dm_kill_unmapped_request */ DMERR_LIMIT("request attempted access beyond the end of device"); - clone = dm_start_request(md, rq); - dm_kill_unmapped_request(clone, -EIO); + dm_start_request(md, rq); + dm_kill_unmapped_request(rq, -EIO); continue; } if (ti->type->busy && ti->type->busy(ti)) goto delay_and_out; - clone = dm_start_request(md, rq); - - spin_unlock(q->queue_lock); - if (map_request(ti, clone, md)) - goto requeued; + dm_start_request(md, rq); + tio = rq->special; + /* Establish tio->ti before queuing work (map_tio_request) */ + tio->ti = ti; + queue_kthread_work(&md->kworker, &tio->work); BUG_ON(!irqs_disabled()); - spin_lock(q->queue_lock); } goto out; -requeued: - BUG_ON(!irqs_disabled()); - spin_lock(q->queue_lock); - 
delay_and_out: blk_delay_queue(q, HZ / 10); out: @@ -2092,6 +2182,7 @@ static struct mapped_device *alloc_dev(int minor) INIT_WORK(&md->work, dm_wq_work); init_waitqueue_head(&md->eventq); init_completion(&md->kobj_holder.completion); + md->kworker_task = NULL; md->disk->major = _major; md->disk->first_minor = minor; @@ -2152,8 +2243,13 @@ static void free_dev(struct mapped_device *md) unlock_fs(md); bdput(md->bdev); destroy_workqueue(md->wq); + + if (md->kworker_task) + kthread_stop(md->kworker_task); if (md->io_pool) mempool_destroy(md->io_pool); + if (md->rq_pool) + mempool_destroy(md->rq_pool); if (md->bs) bioset_free(md->bs); blk_integrity_unregister(md->disk); @@ -2187,23 +2283,24 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t) bioset_free(md->bs); md->bs = p->bs; p->bs = NULL; - } else if (dm_table_get_type(t) == DM_TYPE_REQUEST_BASED) { - /* - * There's no need to reload with request-based dm - * because the size of front_pad doesn't change. - * Note for future: If you are to reload bioset, - * prep-ed requests in the queue may refer - * to bio from the old bioset, so you must walk - * through the queue to unprep. - */ } + /* + * There's no need to reload with request-based dm + * because the size of front_pad doesn't change. + * Note for future: If you are to reload bioset, + * prep-ed requests in the queue may refer + * to bio from the old bioset, so you must walk + * through the queue to unprep. 
+ */ goto out; } - BUG_ON(!p || md->io_pool || md->bs); + BUG_ON(!p || md->io_pool || md->rq_pool || md->bs); md->io_pool = p->io_pool; p->io_pool = NULL; + md->rq_pool = p->rq_pool; + p->rq_pool = NULL; md->bs = p->bs; p->bs = NULL; @@ -2406,6 +2503,14 @@ unsigned dm_get_md_type(struct mapped_device *md) return md->type; } +static bool dm_md_type_request_based(struct mapped_device *md) +{ + unsigned table_type = dm_get_md_type(md); + + return (table_type == DM_TYPE_REQUEST_BASED || + table_type == DM_TYPE_MQ_REQUEST_BASED); +} + struct target_type *dm_get_immutable_target_type(struct mapped_device *md) { return md->immutable_target_type; @@ -2443,6 +2548,11 @@ static int dm_init_request_based_queue(struct mapped_device *md) blk_queue_prep_rq(md->queue, dm_prep_fn); blk_queue_lld_busy(md->queue, dm_lld_busy); + /* Also initialize the request-based DM worker thread */ + init_kthread_worker(&md->kworker); + md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker, + "kdmwork-%s", dm_device_name(md)); + elv_register_queue(md->queue); return 1; @@ -2453,8 +2563,7 @@ static int dm_init_request_based_queue(struct mapped_device *md) */ int dm_setup_md_queue(struct mapped_device *md) { - if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) && - !dm_init_request_based_queue(md)) { + if (dm_md_type_request_based(md) && !dm_init_request_based_queue(md)) { DMWARN("Cannot initialize queue for request-based mapped device"); return -EINVAL; } @@ -2533,6 +2642,9 @@ static void __dm_destroy(struct mapped_device *md, bool wait) set_bit(DMF_FREEING, &md->flags); spin_unlock(&_minor_lock); + if (dm_request_based(md)) + flush_kthread_worker(&md->kworker); + if (!dm_suspended_md(md)) { dm_table_presuspend_targets(map); dm_table_postsuspend_targets(map); @@ -2776,8 +2888,10 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, * Stop md->queue before flushing md->wq in case request-based * dm defers requests to md->wq from md->queue. 
*/ - if (dm_request_based(md)) + if (dm_request_based(md)) { stop_queue(md->queue); + flush_kthread_worker(&md->kworker); + } flush_workqueue(md->wq); @@ -3123,24 +3237,35 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u { struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL); struct kmem_cache *cachep; - unsigned int pool_size; + unsigned int pool_size = 0; unsigned int front_pad; if (!pools) return NULL; - if (type == DM_TYPE_BIO_BASED) { + switch (type) { + case DM_TYPE_BIO_BASED: cachep = _io_cache; pool_size = dm_get_reserved_bio_based_ios(); front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); - } else if (type == DM_TYPE_REQUEST_BASED) { - cachep = _rq_tio_cache; + break; + case DM_TYPE_REQUEST_BASED: pool_size = dm_get_reserved_rq_based_ios(); + pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache); + if (!pools->rq_pool) + goto out; + /* fall through to setup remaining rq-based pools */ + case DM_TYPE_MQ_REQUEST_BASED: + cachep = _rq_tio_cache; + if (!pool_size) + pool_size = dm_get_reserved_rq_based_ios(); front_pad = offsetof(struct dm_rq_clone_bio_info, clone); /* per_bio_data_size is not used. See __bind_mempools(). 
*/ WARN_ON(per_bio_data_size != 0); - } else + break; + default: goto out; + } pools->io_pool = mempool_create_slab_pool(pool_size, cachep); if (!pools->io_pool) @@ -3169,6 +3294,9 @@ void dm_free_md_mempools(struct dm_md_mempools *pools) if (pools->io_pool) mempool_destroy(pools->io_pool); + if (pools->rq_pool) + mempool_destroy(pools->rq_pool); + if (pools->bs) bioset_free(pools->bs); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 84b0f9e4..59f53e7 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -34,9 +34,10 @@ /* * Type of table and mapped_device's mempool */ -#define DM_TYPE_NONE 0 -#define DM_TYPE_BIO_BASED 1 -#define DM_TYPE_REQUEST_BASED 2 +#define DM_TYPE_NONE 0 +#define DM_TYPE_BIO_BASED 1 +#define DM_TYPE_REQUEST_BASED 2 +#define DM_TYPE_MQ_REQUEST_BASED 3 /* * List of devices that a metadevice uses and should open/close. @@ -73,6 +74,7 @@ int dm_table_any_busy_target(struct dm_table *t); unsigned dm_table_get_type(struct dm_table *t); struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); bool dm_table_request_based(struct dm_table *t); +bool dm_table_mq_request_based(struct dm_table *t); void dm_table_free_md_mempools(struct dm_table *t); struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); @@ -99,7 +101,8 @@ int dm_setup_md_queue(struct mapped_device *md); /* * To check whether the target type is request-based or not (bio-based). 
*/ -#define dm_target_request_based(t) ((t)->type->map_rq != NULL) +#define dm_target_request_based(t) (((t)->type->map_rq != NULL) || \ + ((t)->type->clone_and_map_rq != NULL)) /* * To check whether the target type is a hybrid (capable of being diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index e8b4574..1277eb2 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -332,13 +332,11 @@ static int run(struct mddev *mddev) return 0; } -static int stop(struct mddev *mddev) +static void faulty_free(struct mddev *mddev, void *priv) { - struct faulty_conf *conf = mddev->private; + struct faulty_conf *conf = priv; kfree(conf); - mddev->private = NULL; - return 0; } static struct md_personality faulty_personality = @@ -348,7 +346,7 @@ static struct md_personality faulty_personality = .owner = THIS_MODULE, .make_request = make_request, .run = run, - .stop = stop, + .free = faulty_free, .status = status, .check_reshape = reshape, .size = faulty_size, diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 64713b7..fa7d577 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -34,7 +34,7 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) lo = 0; hi = mddev->raid_disks - 1; - conf = rcu_dereference(mddev->private); + conf = mddev->private; /* * Binary Search @@ -60,18 +60,16 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) * * Return amount of bytes we can take at this offset */ -static int linear_mergeable_bvec(struct request_queue *q, +static int linear_mergeable_bvec(struct mddev *mddev, struct bvec_merge_data *bvm, struct bio_vec *biovec) { - struct mddev *mddev = q->queuedata; struct dev_info *dev0; unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int maxbytes = biovec->bv_len; struct request_queue *subq; - rcu_read_lock(); dev0 = which_dev(mddev, sector); maxsectors = dev0->end_sector - sector; subq = 
bdev_get_queue(dev0->rdev->bdev); @@ -81,7 +79,6 @@ static int linear_mergeable_bvec(struct request_queue *q, maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm, biovec)); } - rcu_read_unlock(); if (maxsectors < bio_sectors) maxsectors = 0; @@ -97,24 +94,18 @@ static int linear_mergeable_bvec(struct request_queue *q, return maxsectors << 9; } -static int linear_congested(void *data, int bits) +static int linear_congested(struct mddev *mddev, int bits) { - struct mddev *mddev = data; struct linear_conf *conf; int i, ret = 0; - if (mddev_congested(mddev, bits)) - return 1; - - rcu_read_lock(); - conf = rcu_dereference(mddev->private); + conf = mddev->private; for (i = 0; i < mddev->raid_disks && !ret ; i++) { struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); ret |= bdi_congested(&q->backing_dev_info, bits); } - rcu_read_unlock(); return ret; } @@ -123,12 +114,10 @@ static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disk struct linear_conf *conf; sector_t array_sectors; - rcu_read_lock(); - conf = rcu_dereference(mddev->private); + conf = mddev->private; WARN_ONCE(sectors || raid_disks, "%s does not support generic reshape\n", __func__); array_sectors = conf->array_sectors; - rcu_read_unlock(); return array_sectors; } @@ -217,10 +206,6 @@ static int linear_run (struct mddev *mddev) mddev->private = conf; md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); - blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); - mddev->queue->backing_dev_info.congested_fn = linear_congested; - mddev->queue->backing_dev_info.congested_data = mddev; - ret = md_integrity_register(mddev); if (ret) { kfree(conf); @@ -252,38 +237,23 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) if (!newconf) return -ENOMEM; - oldconf = rcu_dereference_protected(mddev->private, - lockdep_is_held( - &mddev->reconfig_mutex)); + mddev_suspend(mddev); + oldconf = mddev->private; mddev->raid_disks++; - rcu_assign_pointer(mddev->private, 
newconf); + mddev->private = newconf; md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); set_capacity(mddev->gendisk, mddev->array_sectors); + mddev_resume(mddev); revalidate_disk(mddev->gendisk); - kfree_rcu(oldconf, rcu); + kfree(oldconf); return 0; } -static int linear_stop (struct mddev *mddev) +static void linear_free(struct mddev *mddev, void *priv) { - struct linear_conf *conf = - rcu_dereference_protected(mddev->private, - lockdep_is_held( - &mddev->reconfig_mutex)); + struct linear_conf *conf = priv; - /* - * We do not require rcu protection here since - * we hold reconfig_mutex for both linear_add and - * linear_stop, so they cannot race. - * We should make sure any old 'conf's are properly - * freed though. - */ - rcu_barrier(); - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ kfree(conf); - mddev->private = NULL; - - return 0; } static void linear_make_request(struct mddev *mddev, struct bio *bio) @@ -299,16 +269,12 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) } do { - rcu_read_lock(); - tmp_dev = which_dev(mddev, bio->bi_iter.bi_sector); start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; end_sector = tmp_dev->end_sector; data_offset = tmp_dev->rdev->data_offset; bio->bi_bdev = tmp_dev->rdev->bdev; - rcu_read_unlock(); - if (unlikely(bio->bi_iter.bi_sector >= end_sector || bio->bi_iter.bi_sector < start_sector)) goto out_of_bounds; @@ -355,6 +321,10 @@ static void linear_status (struct seq_file *seq, struct mddev *mddev) seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); } +static void linear_quiesce(struct mddev *mddev, int state) +{ +} + static struct md_personality linear_personality = { .name = "linear", @@ -362,10 +332,13 @@ static struct md_personality linear_personality = .owner = THIS_MODULE, .make_request = linear_make_request, .run = linear_run, - .stop = linear_stop, + .free = linear_free, .status = linear_status, .hot_add_disk = linear_add, .size = linear_size, + .quiesce 
= linear_quiesce, + .congested = linear_congested, + .mergeable_bvec = linear_mergeable_bvec, }; static int __init linear_init (void) diff --git a/drivers/md/md.c b/drivers/md/md.c index 709755f..c8d2bac 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -72,6 +72,7 @@ static struct workqueue_struct *md_misc_wq; static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); +static void mddev_detach(struct mddev *mddev); /* * Default number of read corrections we'll attempt on an rdev @@ -292,8 +293,8 @@ static void md_make_request(struct request_queue *q, struct bio *bio) /* mddev_suspend makes sure no new requests are submitted * to the device, and that any requests that have been submitted * are completely handled. - * Once ->stop is called and completes, the module will be completely - * unused. + * Once mddev_detach() is called and completes, the module will be + * completely unused. */ void mddev_suspend(struct mddev *mddev) { @@ -321,10 +322,47 @@ EXPORT_SYMBOL_GPL(mddev_resume); int mddev_congested(struct mddev *mddev, int bits) { - return mddev->suspended; + struct md_personality *pers = mddev->pers; + int ret = 0; + + rcu_read_lock(); + if (mddev->suspended) + ret = 1; + else if (pers && pers->congested) + ret = pers->congested(mddev, bits); + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(mddev_congested); +static int md_congested(void *data, int bits) +{ + struct mddev *mddev = data; + return mddev_congested(mddev, bits); } -EXPORT_SYMBOL(mddev_congested); +static int md_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) +{ + struct mddev *mddev = q->queuedata; + int ret; + rcu_read_lock(); + if (mddev->suspended) { + /* Must always allow one vec */ + if (bvm->bi_size == 0) + ret = biovec->bv_len; + else + ret = 0; + } else { + struct md_personality *pers = mddev->pers; + if (pers && pers->mergeable_bvec) + ret = pers->mergeable_bvec(mddev, bvm, biovec); + else + ret = 
biovec->bv_len; + } + rcu_read_unlock(); + return ret; +} /* * Generic flush handling for md */ @@ -397,12 +435,12 @@ static void md_submit_flush_data(struct work_struct *ws) void md_flush_request(struct mddev *mddev, struct bio *bio) { - spin_lock_irq(&mddev->write_lock); + spin_lock_irq(&mddev->lock); wait_event_lock_irq(mddev->sb_wait, !mddev->flush_bio, - mddev->write_lock); + mddev->lock); mddev->flush_bio = bio; - spin_unlock_irq(&mddev->write_lock); + spin_unlock_irq(&mddev->lock); INIT_WORK(&mddev->flush_work, submit_flushes); queue_work(md_wq, &mddev->flush_work); @@ -465,7 +503,7 @@ void mddev_init(struct mddev *mddev) atomic_set(&mddev->active, 1); atomic_set(&mddev->openers, 0); atomic_set(&mddev->active_io, 0); - spin_lock_init(&mddev->write_lock); + spin_lock_init(&mddev->lock); atomic_set(&mddev->flush_pending, 0); init_waitqueue_head(&mddev->sb_wait); init_waitqueue_head(&mddev->recovery_wait); @@ -552,32 +590,9 @@ static struct mddev *mddev_find(dev_t unit) goto retry; } -static inline int __must_check mddev_lock(struct mddev *mddev) -{ - return mutex_lock_interruptible(&mddev->reconfig_mutex); -} - -/* Sometimes we need to take the lock in a situation where - * failure due to interrupts is not acceptable. 
- */ -static inline void mddev_lock_nointr(struct mddev *mddev) -{ - mutex_lock(&mddev->reconfig_mutex); -} - -static inline int mddev_is_locked(struct mddev *mddev) -{ - return mutex_is_locked(&mddev->reconfig_mutex); -} - -static inline int mddev_trylock(struct mddev *mddev) -{ - return mutex_trylock(&mddev->reconfig_mutex); -} - static struct attribute_group md_redundancy_group; -static void mddev_unlock(struct mddev *mddev) +void mddev_unlock(struct mddev *mddev) { if (mddev->to_remove) { /* These cannot be removed under reconfig_mutex as @@ -619,6 +634,7 @@ static void mddev_unlock(struct mddev *mddev) md_wakeup_thread(mddev->thread); spin_unlock(&pers_lock); } +EXPORT_SYMBOL_GPL(mddev_unlock); static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) { @@ -2230,7 +2246,7 @@ repeat: return; } - spin_lock_irq(&mddev->write_lock); + spin_lock(&mddev->lock); mddev->utime = get_seconds(); @@ -2287,7 +2303,7 @@ repeat: } sync_sbs(mddev, nospares); - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", mdname(mddev), mddev->in_sync); @@ -2326,15 +2342,15 @@ repeat: md_super_wait(mddev); /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ - spin_lock_irq(&mddev->write_lock); + spin_lock(&mddev->lock); if (mddev->in_sync != sync_req || test_bit(MD_CHANGE_DEVS, &mddev->flags)) { /* have to write it out again */ - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); goto repeat; } clear_bit(MD_CHANGE_PENDING, &mddev->flags); - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); wake_up(&mddev->sb_wait); if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) sysfs_notify(&mddev->kobj, NULL, "sync_completed"); @@ -2381,40 +2397,41 @@ state_show(struct md_rdev *rdev, char *page) { char *sep = ""; size_t len = 0; + unsigned long flags = ACCESS_ONCE(rdev->flags); - if (test_bit(Faulty, &rdev->flags) || + if (test_bit(Faulty, &flags) || 
rdev->badblocks.unacked_exist) { len+= sprintf(page+len, "%sfaulty",sep); sep = ","; } - if (test_bit(In_sync, &rdev->flags)) { + if (test_bit(In_sync, &flags)) { len += sprintf(page+len, "%sin_sync",sep); sep = ","; } - if (test_bit(WriteMostly, &rdev->flags)) { + if (test_bit(WriteMostly, &flags)) { len += sprintf(page+len, "%swrite_mostly",sep); sep = ","; } - if (test_bit(Blocked, &rdev->flags) || + if (test_bit(Blocked, &flags) || (rdev->badblocks.unacked_exist - && !test_bit(Faulty, &rdev->flags))) { + && !test_bit(Faulty, &flags))) { len += sprintf(page+len, "%sblocked", sep); sep = ","; } - if (!test_bit(Faulty, &rdev->flags) && - !test_bit(In_sync, &rdev->flags)) { + if (!test_bit(Faulty, &flags) && + !test_bit(In_sync, &flags)) { len += sprintf(page+len, "%sspare", sep); sep = ","; } - if (test_bit(WriteErrorSeen, &rdev->flags)) { + if (test_bit(WriteErrorSeen, &flags)) { len += sprintf(page+len, "%swrite_error", sep); sep = ","; } - if (test_bit(WantReplacement, &rdev->flags)) { + if (test_bit(WantReplacement, &flags)) { len += sprintf(page+len, "%swant_replacement", sep); sep = ","; } - if (test_bit(Replacement, &rdev->flags)) { + if (test_bit(Replacement, &flags)) { len += sprintf(page+len, "%sreplacement", sep); sep = ","; } @@ -2927,21 +2944,12 @@ rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); - struct mddev *mddev = rdev->mddev; - ssize_t rv; if (!entry->show) return -EIO; - - rv = mddev ? 
mddev_lock(mddev) : -EBUSY; - if (!rv) { - if (rdev->mddev == NULL) - rv = -EBUSY; - else - rv = entry->show(rdev, page); - mddev_unlock(mddev); - } - return rv; + if (!rdev->mddev) + return -EBUSY; + return entry->show(rdev, page); } static ssize_t @@ -3212,11 +3220,13 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) mddev->safemode_delay = 0; else { unsigned long old_delay = mddev->safemode_delay; - mddev->safemode_delay = (msec*HZ)/1000; - if (mddev->safemode_delay == 0) - mddev->safemode_delay = 1; - if (mddev->safemode_delay < old_delay || old_delay == 0) - md_safemode_timeout((unsigned long)mddev); + unsigned long new_delay = (msec*HZ)/1000; + + if (new_delay == 0) + new_delay = 1; + mddev->safemode_delay = new_delay; + if (new_delay < old_delay || old_delay == 0) + mod_timer(&mddev->safemode_timer, jiffies+1); } return len; } @@ -3226,41 +3236,52 @@ __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); static ssize_t level_show(struct mddev *mddev, char *page) { - struct md_personality *p = mddev->pers; + struct md_personality *p; + int ret; + spin_lock(&mddev->lock); + p = mddev->pers; if (p) - return sprintf(page, "%s\n", p->name); + ret = sprintf(page, "%s\n", p->name); else if (mddev->clevel[0]) - return sprintf(page, "%s\n", mddev->clevel); + ret = sprintf(page, "%s\n", mddev->clevel); else if (mddev->level != LEVEL_NONE) - return sprintf(page, "%d\n", mddev->level); + ret = sprintf(page, "%d\n", mddev->level); else - return 0; + ret = 0; + spin_unlock(&mddev->lock); + return ret; } static ssize_t level_store(struct mddev *mddev, const char *buf, size_t len) { char clevel[16]; - ssize_t rv = len; - struct md_personality *pers; + ssize_t rv; + size_t slen = len; + struct md_personality *pers, *oldpers; long level; - void *priv; + void *priv, *oldpriv; struct md_rdev *rdev; + if (slen == 0 || slen >= sizeof(clevel)) + return -EINVAL; + + rv = mddev_lock(mddev); + if (rv) + return rv; + if (mddev->pers == NULL) { 
- if (len == 0) - return 0; - if (len >= sizeof(mddev->clevel)) - return -ENOSPC; - strncpy(mddev->clevel, buf, len); - if (mddev->clevel[len-1] == '\n') - len--; - mddev->clevel[len] = 0; + strncpy(mddev->clevel, buf, slen); + if (mddev->clevel[slen-1] == '\n') + slen--; + mddev->clevel[slen] = 0; mddev->level = LEVEL_NONE; - return rv; + rv = len; + goto out_unlock; } + rv = -EROFS; if (mddev->ro) - return -EROFS; + goto out_unlock; /* request to change the personality. Need to ensure: * - array is not engaged in resync/recovery/reshape @@ -3268,25 +3289,25 @@ level_store(struct mddev *mddev, const char *buf, size_t len) * - new personality will access other array. */ + rv = -EBUSY; if (mddev->sync_thread || test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || mddev->reshape_position != MaxSector || mddev->sysfs_active) - return -EBUSY; + goto out_unlock; + rv = -EINVAL; if (!mddev->pers->quiesce) { printk(KERN_WARNING "md: %s: %s does not support online personality change\n", mdname(mddev), mddev->pers->name); - return -EINVAL; + goto out_unlock; } /* Now find the new personality */ - if (len == 0 || len >= sizeof(clevel)) - return -EINVAL; - strncpy(clevel, buf, len); - if (clevel[len-1] == '\n') - len--; - clevel[len] = 0; + strncpy(clevel, buf, slen); + if (clevel[slen-1] == '\n') + slen--; + clevel[slen] = 0; if (kstrtol(clevel, 10, &level)) level = LEVEL_NONE; @@ -3297,20 +3318,23 @@ level_store(struct mddev *mddev, const char *buf, size_t len) if (!pers || !try_module_get(pers->owner)) { spin_unlock(&pers_lock); printk(KERN_WARNING "md: personality %s not loaded\n", clevel); - return -EINVAL; + rv = -EINVAL; + goto out_unlock; } spin_unlock(&pers_lock); if (pers == mddev->pers) { /* Nothing to do! 
*/ module_put(pers->owner); - return rv; + rv = len; + goto out_unlock; } if (!pers->takeover) { module_put(pers->owner); printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", mdname(mddev), clevel); - return -EINVAL; + rv = -EINVAL; + goto out_unlock; } rdev_for_each(rdev, mddev) @@ -3330,30 +3354,29 @@ level_store(struct mddev *mddev, const char *buf, size_t len) module_put(pers->owner); printk(KERN_WARNING "md: %s: %s would not accept array\n", mdname(mddev), clevel); - return PTR_ERR(priv); + rv = PTR_ERR(priv); + goto out_unlock; } /* Looks like we have a winner */ mddev_suspend(mddev); - mddev->pers->stop(mddev); + mddev_detach(mddev); - if (mddev->pers->sync_request == NULL && - pers->sync_request != NULL) { - /* need to add the md_redundancy_group */ - if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) - printk(KERN_WARNING - "md: cannot register extra attributes for %s\n", - mdname(mddev)); - mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); - } - if (mddev->pers->sync_request != NULL && - pers->sync_request == NULL) { - /* need to remove the md_redundancy_group */ - if (mddev->to_remove == NULL) - mddev->to_remove = &md_redundancy_group; - } + spin_lock(&mddev->lock); + oldpers = mddev->pers; + oldpriv = mddev->private; + mddev->pers = pers; + mddev->private = priv; + strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + mddev->level = mddev->new_level; + mddev->layout = mddev->new_layout; + mddev->chunk_sectors = mddev->new_chunk_sectors; + mddev->delta_disks = 0; + mddev->reshape_backwards = 0; + mddev->degraded = 0; + spin_unlock(&mddev->lock); - if (mddev->pers->sync_request == NULL && + if (oldpers->sync_request == NULL && mddev->external) { /* We are converting from a no-redundancy array * to a redundancy array and metadata is managed @@ -3367,6 +3390,24 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->safemode = 0; } + oldpers->free(mddev, oldpriv); + + if 
(oldpers->sync_request == NULL && + pers->sync_request != NULL) { + /* need to add the md_redundancy_group */ + if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) + printk(KERN_WARNING + "md: cannot register extra attributes for %s\n", + mdname(mddev)); + mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); + } + if (oldpers->sync_request != NULL && + pers->sync_request == NULL) { + /* need to remove the md_redundancy_group */ + if (mddev->to_remove == NULL) + mddev->to_remove = &md_redundancy_group; + } + rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0) continue; @@ -3392,17 +3433,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) } } - module_put(mddev->pers->owner); - mddev->pers = pers; - mddev->private = priv; - strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); - mddev->level = mddev->new_level; - mddev->layout = mddev->new_layout; - mddev->chunk_sectors = mddev->new_chunk_sectors; - mddev->delta_disks = 0; - mddev->reshape_backwards = 0; - mddev->degraded = 0; - if (mddev->pers->sync_request == NULL) { + if (pers->sync_request == NULL) { /* this is now an array without redundancy, so * it must always be in_sync */ @@ -3417,6 +3448,9 @@ level_store(struct mddev *mddev, const char *buf, size_t len) md_update_sb(mddev, 1); sysfs_notify(&mddev->kobj, NULL, "level"); md_new_event(mddev); + rv = len; +out_unlock: + mddev_unlock(mddev); return rv; } @@ -3439,28 +3473,32 @@ layout_store(struct mddev *mddev, const char *buf, size_t len) { char *e; unsigned long n = simple_strtoul(buf, &e, 10); + int err; if (!*buf || (*e && *e != '\n')) return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; if (mddev->pers) { - int err; if (mddev->pers->check_reshape == NULL) - return -EBUSY; - if (mddev->ro) - return -EROFS; - mddev->new_layout = n; - err = mddev->pers->check_reshape(mddev); - if (err) { - mddev->new_layout = mddev->layout; - return err; + err = -EBUSY; + else if (mddev->ro) + err = -EROFS; + else 
{ + mddev->new_layout = n; + err = mddev->pers->check_reshape(mddev); + if (err) + mddev->new_layout = mddev->layout; } } else { mddev->new_layout = n; if (mddev->reshape_position == MaxSector) mddev->layout = n; } - return len; + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_layout = __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); @@ -3483,32 +3521,39 @@ static ssize_t raid_disks_store(struct mddev *mddev, const char *buf, size_t len) { char *e; - int rv = 0; + int err; unsigned long n = simple_strtoul(buf, &e, 10); if (!*buf || (*e && *e != '\n')) return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; if (mddev->pers) - rv = update_raid_disks(mddev, n); + err = update_raid_disks(mddev, n); else if (mddev->reshape_position != MaxSector) { struct md_rdev *rdev; int olddisks = mddev->raid_disks - mddev->delta_disks; + err = -EINVAL; rdev_for_each(rdev, mddev) { if (olddisks < n && rdev->data_offset < rdev->new_data_offset) - return -EINVAL; + goto out_unlock; if (olddisks > n && rdev->data_offset > rdev->new_data_offset) - return -EINVAL; + goto out_unlock; } + err = 0; mddev->delta_disks = n - olddisks; mddev->raid_disks = n; mddev->reshape_backwards = (mddev->delta_disks < 0); } else mddev->raid_disks = n; - return rv ? rv : len; +out_unlock: + mddev_unlock(mddev); + return err ? 
err : len; } static struct md_sysfs_entry md_raid_disks = __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); @@ -3527,30 +3572,34 @@ chunk_size_show(struct mddev *mddev, char *page) static ssize_t chunk_size_store(struct mddev *mddev, const char *buf, size_t len) { + int err; char *e; unsigned long n = simple_strtoul(buf, &e, 10); if (!*buf || (*e && *e != '\n')) return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; if (mddev->pers) { - int err; if (mddev->pers->check_reshape == NULL) - return -EBUSY; - if (mddev->ro) - return -EROFS; - mddev->new_chunk_sectors = n >> 9; - err = mddev->pers->check_reshape(mddev); - if (err) { - mddev->new_chunk_sectors = mddev->chunk_sectors; - return err; + err = -EBUSY; + else if (mddev->ro) + err = -EROFS; + else { + mddev->new_chunk_sectors = n >> 9; + err = mddev->pers->check_reshape(mddev); + if (err) + mddev->new_chunk_sectors = mddev->chunk_sectors; } } else { mddev->new_chunk_sectors = n >> 9; if (mddev->reshape_position == MaxSector) mddev->chunk_sectors = n >> 9; } - return len; + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_chunk_size = __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); @@ -3566,20 +3615,27 @@ resync_start_show(struct mddev *mddev, char *page) static ssize_t resync_start_store(struct mddev *mddev, const char *buf, size_t len) { + int err; char *e; unsigned long long n = simple_strtoull(buf, &e, 10); + err = mddev_lock(mddev); + if (err) + return err; if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) - return -EBUSY; - if (cmd_match(buf, "none")) + err = -EBUSY; + else if (cmd_match(buf, "none")) n = MaxSector; else if (!*buf || (*e && *e != '\n')) - return -EINVAL; + err = -EINVAL; - mddev->recovery_cp = n; - if (mddev->pers) - set_bit(MD_CHANGE_CLEAN, &mddev->flags); - return len; + if (!err) { + mddev->recovery_cp = n; + if (mddev->pers) + set_bit(MD_CHANGE_CLEAN, &mddev->flags); + } + 
mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_resync_start = __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); @@ -3677,8 +3733,39 @@ static int restart_array(struct mddev *mddev); static ssize_t array_state_store(struct mddev *mddev, const char *buf, size_t len) { - int err = -EINVAL; + int err; enum array_state st = match_word(buf, array_states); + + if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) { + /* don't take reconfig_mutex when toggling between + * clean and active + */ + spin_lock(&mddev->lock); + if (st == active) { + restart_array(mddev); + clear_bit(MD_CHANGE_PENDING, &mddev->flags); + wake_up(&mddev->sb_wait); + err = 0; + } else /* st == clean */ { + restart_array(mddev); + if (atomic_read(&mddev->writes_pending) == 0) { + if (mddev->in_sync == 0) { + mddev->in_sync = 1; + if (mddev->safemode == 1) + mddev->safemode = 0; + set_bit(MD_CHANGE_CLEAN, &mddev->flags); + } + err = 0; + } else + err = -EBUSY; + } + spin_unlock(&mddev->lock); + return err; + } + err = mddev_lock(mddev); + if (err) + return err; + err = -EINVAL; switch(st) { case bad_word: break; @@ -3722,7 +3809,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) case clean: if (mddev->pers) { restart_array(mddev); - spin_lock_irq(&mddev->write_lock); + spin_lock(&mddev->lock); if (atomic_read(&mddev->writes_pending) == 0) { if (mddev->in_sync == 0) { mddev->in_sync = 1; @@ -3733,7 +3820,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) err = 0; } else err = -EBUSY; - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); } else err = -EINVAL; break; @@ -3754,14 +3841,14 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) /* these cannot be set */ break; } - if (err) - return err; - else { + + if (!err) { if (mddev->hold_active == UNTIL_IOCTL) mddev->hold_active = 0; sysfs_notify_dirent_safe(mddev->sysfs_state); - return len; } + 
mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_array_state = __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); @@ -3822,6 +3909,11 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) minor != MINOR(dev)) return -EOVERFLOW; + flush_workqueue(md_misc_wq); + + err = mddev_lock(mddev); + if (err) + return err; if (mddev->persistent) { rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); @@ -3845,6 +3937,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) out: if (err) export_rdev(rdev); + mddev_unlock(mddev); return err ? err : len; } @@ -3856,7 +3949,11 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len) { char *end; unsigned long chunk, end_chunk; + int err; + err = mddev_lock(mddev); + if (err) + return err; if (!mddev->bitmap) goto out; /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ @@ -3874,6 +3971,7 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len) } bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ out: + mddev_unlock(mddev); return len; } @@ -3901,6 +3999,9 @@ size_store(struct mddev *mddev, const char *buf, size_t len) if (err < 0) return err; + err = mddev_lock(mddev); + if (err) + return err; if (mddev->pers) { err = update_size(mddev, sectors); md_update_sb(mddev, 1); @@ -3911,6 +4012,7 @@ size_store(struct mddev *mddev, const char *buf, size_t len) else err = -ENOSPC; } + mddev_unlock(mddev); return err ? err : len; } @@ -3940,21 +4042,28 @@ metadata_store(struct mddev *mddev, const char *buf, size_t len) { int major, minor; char *e; + int err; /* Changing the details of 'external' metadata is * always permitted. Otherwise there must be * no devices attached to the array. 
*/ + + err = mddev_lock(mddev); + if (err) + return err; + err = -EBUSY; if (mddev->external && strncmp(buf, "external:", 9) == 0) ; else if (!list_empty(&mddev->disks)) - return -EBUSY; + goto out_unlock; + err = 0; if (cmd_match(buf, "none")) { mddev->persistent = 0; mddev->external = 0; mddev->major_version = 0; mddev->minor_version = 90; - return len; + goto out_unlock; } if (strncmp(buf, "external:", 9) == 0) { size_t namelen = len-9; @@ -3968,22 +4077,27 @@ metadata_store(struct mddev *mddev, const char *buf, size_t len) mddev->external = 1; mddev->major_version = 0; mddev->minor_version = 90; - return len; + goto out_unlock; } major = simple_strtoul(buf, &e, 10); + err = -EINVAL; if (e==buf || *e != '.') - return -EINVAL; + goto out_unlock; buf = e+1; minor = simple_strtoul(buf, &e, 10); if (e==buf || (*e && *e != '\n') ) - return -EINVAL; + goto out_unlock; + err = -ENOENT; if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) - return -ENOENT; + goto out_unlock; mddev->major_version = major; mddev->minor_version = minor; mddev->persistent = 1; mddev->external = 0; - return len; + err = 0; +out_unlock: + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_metadata = @@ -3993,20 +4107,21 @@ static ssize_t action_show(struct mddev *mddev, char *page) { char *type = "idle"; - if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + unsigned long recovery = mddev->recovery; + if (test_bit(MD_RECOVERY_FROZEN, &recovery)) type = "frozen"; - else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || - (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + else if (test_bit(MD_RECOVERY_RUNNING, &recovery) || + (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) { + if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) type = "reshape"; - else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + else if 
(test_bit(MD_RECOVERY_SYNC, &recovery)) { + if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) type = "resync"; - else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + else if (test_bit(MD_RECOVERY_CHECK, &recovery)) type = "check"; else type = "repair"; - } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) + } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) type = "recover"; } return sprintf(page, "%s\n", type); @@ -4027,7 +4142,10 @@ action_store(struct mddev *mddev, const char *page, size_t len) flush_workqueue(md_misc_wq); if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_reap_sync_thread(mddev); + if (mddev_lock(mddev) == 0) { + md_reap_sync_thread(mddev); + mddev_unlock(mddev); + } } } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) @@ -4041,7 +4159,11 @@ action_store(struct mddev *mddev, const char *page, size_t len) int err; if (mddev->pers->start_reshape == NULL) return -EINVAL; - err = mddev->pers->start_reshape(mddev); + err = mddev_lock(mddev); + if (!err) { + err = mddev->pers->start_reshape(mddev); + mddev_unlock(mddev); + } if (err) return err; sysfs_notify(&mddev->kobj, NULL, "degraded"); @@ -4225,22 +4347,36 @@ static ssize_t min_sync_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long long min; + int err; + int chunk; + if (kstrtoull(buf, 10, &min)) return -EINVAL; + + spin_lock(&mddev->lock); + err = -EINVAL; if (min > mddev->resync_max) - return -EINVAL; + goto out_unlock; + + err = -EBUSY; if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - return -EBUSY; + goto out_unlock; /* Must be a multiple of chunk_size */ - if (mddev->chunk_sectors) { + chunk = mddev->chunk_sectors; + if (chunk) { sector_t temp = min; - if (sector_div(temp, mddev->chunk_sectors)) - return -EINVAL; + + err = -EINVAL; + if (sector_div(temp, chunk)) + goto out_unlock; } mddev->resync_min = min; + err = 0; - return len; +out_unlock: + 
spin_unlock(&mddev->lock); + return err ?: len; } static struct md_sysfs_entry md_min_sync = @@ -4258,29 +4394,42 @@ max_sync_show(struct mddev *mddev, char *page) static ssize_t max_sync_store(struct mddev *mddev, const char *buf, size_t len) { + int err; + spin_lock(&mddev->lock); if (strncmp(buf, "max", 3) == 0) mddev->resync_max = MaxSector; else { unsigned long long max; + int chunk; + + err = -EINVAL; if (kstrtoull(buf, 10, &max)) - return -EINVAL; + goto out_unlock; if (max < mddev->resync_min) - return -EINVAL; + goto out_unlock; + + err = -EBUSY; if (max < mddev->resync_max && mddev->ro == 0 && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - return -EBUSY; + goto out_unlock; /* Must be a multiple of chunk_size */ - if (mddev->chunk_sectors) { + chunk = mddev->chunk_sectors; + if (chunk) { sector_t temp = max; - if (sector_div(temp, mddev->chunk_sectors)) - return -EINVAL; + + err = -EINVAL; + if (sector_div(temp, chunk)) + goto out_unlock; } mddev->resync_max = max; } wake_up(&mddev->recovery_wait); - return len; + err = 0; +out_unlock: + spin_unlock(&mddev->lock); + return err ?: len; } static struct md_sysfs_entry md_max_sync = @@ -4297,14 +4446,20 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) { char *e; unsigned long long new = simple_strtoull(buf, &e, 10); - unsigned long long old = mddev->suspend_lo; + unsigned long long old; + int err; - if (mddev->pers == NULL || - mddev->pers->quiesce == NULL) - return -EINVAL; if (buf == e || (*e && *e != '\n')) return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; + err = -EINVAL; + if (mddev->pers == NULL || + mddev->pers->quiesce == NULL) + goto unlock; + old = mddev->suspend_lo; mddev->suspend_lo = new; if (new >= old) /* Shrinking suspended region */ @@ -4314,7 +4469,10 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } - return len; + err = 0; +unlock: + mddev_unlock(mddev); + return 
err ?: len; } static struct md_sysfs_entry md_suspend_lo = __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); @@ -4330,14 +4488,20 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) { char *e; unsigned long long new = simple_strtoull(buf, &e, 10); - unsigned long long old = mddev->suspend_hi; + unsigned long long old; + int err; - if (mddev->pers == NULL || - mddev->pers->quiesce == NULL) - return -EINVAL; if (buf == e || (*e && *e != '\n')) return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; + err = -EINVAL; + if (mddev->pers == NULL || + mddev->pers->quiesce == NULL) + goto unlock; + old = mddev->suspend_hi; mddev->suspend_hi = new; if (new <= old) /* Shrinking suspended region */ @@ -4347,7 +4511,10 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } - return len; + err = 0; +unlock: + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_suspend_hi = __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); @@ -4367,11 +4534,17 @@ reshape_position_store(struct mddev *mddev, const char *buf, size_t len) { struct md_rdev *rdev; char *e; + int err; unsigned long long new = simple_strtoull(buf, &e, 10); - if (mddev->pers) - return -EBUSY; + if (buf == e || (*e && *e != '\n')) return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; + err = -EBUSY; + if (mddev->pers) + goto unlock; mddev->reshape_position = new; mddev->delta_disks = 0; mddev->reshape_backwards = 0; @@ -4380,7 +4553,10 @@ reshape_position_store(struct mddev *mddev, const char *buf, size_t len) mddev->new_chunk_sectors = mddev->chunk_sectors; rdev_for_each(rdev, mddev) rdev->new_data_offset = rdev->data_offset; - return len; + err = 0; +unlock: + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_reshape_position = @@ -4398,6 +4574,8 @@ static ssize_t reshape_direction_store(struct mddev 
*mddev, const char *buf, size_t len) { int backwards = 0; + int err; + if (cmd_match(buf, "forwards")) backwards = 0; else if (cmd_match(buf, "backwards")) @@ -4407,16 +4585,19 @@ reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) if (mddev->reshape_backwards == backwards) return len; + err = mddev_lock(mddev); + if (err) + return err; /* check if we are allowed to change */ if (mddev->delta_disks) - return -EBUSY; - - if (mddev->persistent && + err = -EBUSY; + else if (mddev->persistent && mddev->major_version == 0) - return -EINVAL; - - mddev->reshape_backwards = backwards; - return len; + err = -EINVAL; + else + mddev->reshape_backwards = backwards; + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_reshape_direction = @@ -4437,6 +4618,11 @@ static ssize_t array_size_store(struct mddev *mddev, const char *buf, size_t len) { sector_t sectors; + int err; + + err = mddev_lock(mddev); + if (err) + return err; if (strncmp(buf, "default", 7) == 0) { if (mddev->pers) @@ -4447,19 +4633,22 @@ array_size_store(struct mddev *mddev, const char *buf, size_t len) mddev->external_size = 0; } else { if (strict_blocks_to_sectors(buf, &sectors) < 0) - return -EINVAL; - if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) - return -E2BIG; - - mddev->external_size = 1; + err = -EINVAL; + else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) + err = -E2BIG; + else + mddev->external_size = 1; } - mddev->array_sectors = sectors; - if (mddev->pers) { - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); + if (!err) { + mddev->array_sectors = sectors; + if (mddev->pers) { + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + } } - return len; + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_array_size = @@ -4523,11 +4712,7 @@ md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) mddev_get(mddev); 
spin_unlock(&all_mddevs_lock); - rv = mddev_lock(mddev); - if (!rv) { - rv = entry->show(mddev, page); - mddev_unlock(mddev); - } + rv = entry->show(mddev, page); mddev_put(mddev); return rv; } @@ -4551,13 +4736,7 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, } mddev_get(mddev); spin_unlock(&all_mddevs_lock); - if (entry->store == new_dev_store) - flush_workqueue(md_misc_wq); - rv = mddev_lock(mddev); - if (!rv) { - rv = entry->store(mddev, page, length); - mddev_unlock(mddev); - } + rv = entry->store(mddev, page, length); mddev_put(mddev); return rv; } @@ -4825,7 +5004,6 @@ int md_run(struct mddev *mddev) mddev->clevel); return -EINVAL; } - mddev->pers = pers; spin_unlock(&pers_lock); if (mddev->level != pers->level) { mddev->level = pers->level; @@ -4836,7 +5014,6 @@ int md_run(struct mddev *mddev) if (mddev->reshape_position != MaxSector && pers->start_reshape == NULL) { /* This personality cannot handle reshaping... */ - mddev->pers = NULL; module_put(pers->owner); return -EINVAL; } @@ -4880,35 +5057,38 @@ int md_run(struct mddev *mddev) if (start_readonly && mddev->ro == 0) mddev->ro = 2; /* read-only, but switch on first write */ - err = mddev->pers->run(mddev); + err = pers->run(mddev); if (err) printk(KERN_ERR "md: pers->run() failed ...\n"); - else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) { + else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { WARN_ONCE(!mddev->external_size, "%s: default size too small," " but 'external_size' not in effect?\n", __func__); printk(KERN_ERR "md: invalid array_size %llu > default size %llu\n", (unsigned long long)mddev->array_sectors / 2, - (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2); + (unsigned long long)pers->size(mddev, 0, 0) / 2); err = -EINVAL; - mddev->pers->stop(mddev); } - if (err == 0 && mddev->pers->sync_request && + if (err == 0 && pers->sync_request && (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { err = bitmap_create(mddev); - if (err) { + if (err) 
printk(KERN_ERR "%s: failed to create bitmap (%d)\n", mdname(mddev), err); - mddev->pers->stop(mddev); - } } if (err) { - module_put(mddev->pers->owner); - mddev->pers = NULL; + mddev_detach(mddev); + pers->free(mddev, mddev->private); + module_put(pers->owner); bitmap_destroy(mddev); return err; } - if (mddev->pers->sync_request) { + if (mddev->queue) { + mddev->queue->backing_dev_info.congested_data = mddev; + mddev->queue->backing_dev_info.congested_fn = md_congested; + blk_queue_merge_bvec(mddev->queue, md_mergeable_bvec); + } + if (pers->sync_request) { if (mddev->kobj.sd && sysfs_create_group(&mddev->kobj, &md_redundancy_group)) printk(KERN_WARNING @@ -4927,7 +5107,10 @@ int md_run(struct mddev *mddev) mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ mddev->in_sync = 1; smp_wmb(); + spin_lock(&mddev->lock); + mddev->pers = pers; mddev->ready = 1; + spin_unlock(&mddev->lock); rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0) if (sysfs_link_rdev(mddev, rdev)) @@ -5070,14 +5253,38 @@ void md_stop_writes(struct mddev *mddev) } EXPORT_SYMBOL_GPL(md_stop_writes); +static void mddev_detach(struct mddev *mddev) +{ + struct bitmap *bitmap = mddev->bitmap; + /* wait for behind writes to complete */ + if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { + printk(KERN_INFO "md:%s: behind writes in progress - waiting to stop.\n", + mdname(mddev)); + /* need to kick something here to make sure I/O goes? 
*/ + wait_event(bitmap->behind_wait, + atomic_read(&bitmap->behind_writes) == 0); + } + if (mddev->pers && mddev->pers->quiesce) { + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } + md_unregister_thread(&mddev->thread); + if (mddev->queue) + blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ +} + static void __md_stop(struct mddev *mddev) { + struct md_personality *pers = mddev->pers; + mddev_detach(mddev); + spin_lock(&mddev->lock); mddev->ready = 0; - mddev->pers->stop(mddev); - if (mddev->pers->sync_request && mddev->to_remove == NULL) - mddev->to_remove = &md_redundancy_group; - module_put(mddev->pers->owner); mddev->pers = NULL; + spin_unlock(&mddev->lock); + pers->free(mddev, mddev->private); + if (pers->sync_request && mddev->to_remove == NULL) + mddev->to_remove = &md_redundancy_group; + module_put(pers->owner); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } @@ -5226,8 +5433,11 @@ static int do_md_stop(struct mddev *mddev, int mode, bitmap_destroy(mddev); if (mddev->bitmap_info.file) { - fput(mddev->bitmap_info.file); + struct file *f = mddev->bitmap_info.file; + spin_lock(&mddev->lock); mddev->bitmap_info.file = NULL; + spin_unlock(&mddev->lock); + fput(f); } mddev->bitmap_info.offset = 0; @@ -5436,37 +5646,31 @@ static int get_array_info(struct mddev *mddev, void __user *arg) static int get_bitmap_file(struct mddev *mddev, void __user * arg) { mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ - char *ptr, *buf = NULL; - int err = -ENOMEM; + char *ptr; + int err; file = kmalloc(sizeof(*file), GFP_NOIO); - if (!file) - goto out; + return -ENOMEM; + err = 0; + spin_lock(&mddev->lock); /* bitmap disabled, zero the first byte and copy out */ - if (!mddev->bitmap || !mddev->bitmap->storage.file) { + if (!mddev->bitmap_info.file) file->pathname[0] = '\0'; - goto copy_out; - } - - buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); - if (!buf) - goto out; - - ptr = 
d_path(&mddev->bitmap->storage.file->f_path, - buf, sizeof(file->pathname)); - if (IS_ERR(ptr)) - goto out; - - strcpy(file->pathname, ptr); + else if ((ptr = d_path(&mddev->bitmap_info.file->f_path, + file->pathname, sizeof(file->pathname))), + IS_ERR(ptr)) + err = PTR_ERR(ptr); + else + memmove(file->pathname, ptr, + sizeof(file->pathname)-(ptr-file->pathname)); + spin_unlock(&mddev->lock); -copy_out: - err = 0; - if (copy_to_user(arg, file, sizeof(*file))) + if (err == 0 && + copy_to_user(arg, file, sizeof(*file))) err = -EFAULT; -out: - kfree(buf); + kfree(file); return err; } @@ -5789,22 +5993,24 @@ static int set_bitmap_file(struct mddev *mddev, int fd) if (fd >= 0) { struct inode *inode; - if (mddev->bitmap) + struct file *f; + + if (mddev->bitmap || mddev->bitmap_info.file) return -EEXIST; /* cannot add when bitmap is present */ - mddev->bitmap_info.file = fget(fd); + f = fget(fd); - if (mddev->bitmap_info.file == NULL) { + if (f == NULL) { printk(KERN_ERR "%s: error: failed to get bitmap file\n", mdname(mddev)); return -EBADF; } - inode = mddev->bitmap_info.file->f_mapping->host; + inode = f->f_mapping->host; if (!S_ISREG(inode->i_mode)) { printk(KERN_ERR "%s: error: bitmap file must be a regular file\n", mdname(mddev)); err = -EBADF; - } else if (!(mddev->bitmap_info.file->f_mode & FMODE_WRITE)) { + } else if (!(f->f_mode & FMODE_WRITE)) { printk(KERN_ERR "%s: error: bitmap file must open for write\n", mdname(mddev)); err = -EBADF; @@ -5814,10 +6020,10 @@ static int set_bitmap_file(struct mddev *mddev, int fd) err = -EBUSY; } if (err) { - fput(mddev->bitmap_info.file); - mddev->bitmap_info.file = NULL; + fput(f); return err; } + mddev->bitmap_info.file = f; mddev->bitmap_info.offset = 0; /* file overrides offset */ } else if (mddev->bitmap == NULL) return -ENOENT; /* cannot remove what isn't there */ @@ -5836,9 +6042,13 @@ static int set_bitmap_file(struct mddev *mddev, int fd) mddev->pers->quiesce(mddev, 0); } if (fd < 0) { - if (mddev->bitmap_info.file) 
- fput(mddev->bitmap_info.file); - mddev->bitmap_info.file = NULL; + struct file *f = mddev->bitmap_info.file; + if (f) { + spin_lock(&mddev->lock); + mddev->bitmap_info.file = NULL; + spin_unlock(&mddev->lock); + fput(f); + } } return err; @@ -6251,6 +6461,11 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, case SET_DISK_FAULTY: err = set_disk_faulty(mddev, new_decode_dev(arg)); goto out; + + case GET_BITMAP_FILE: + err = get_bitmap_file(mddev, argp); + goto out; + } if (cmd == ADD_NEW_DISK) @@ -6342,10 +6557,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, * Commands even a read-only array can execute: */ switch (cmd) { - case GET_BITMAP_FILE: - err = get_bitmap_file(mddev, argp); - goto unlock; - case RESTART_ARRAY_RW: err = restart_array(mddev); goto unlock; @@ -6873,9 +7084,7 @@ static int md_seq_show(struct seq_file *seq, void *v) return 0; } - if (mddev_lock(mddev) < 0) - return -EINTR; - + spin_lock(&mddev->lock); if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { seq_printf(seq, "%s : %sactive", mdname(mddev), mddev->pers ? 
"" : "in"); @@ -6888,7 +7097,8 @@ static int md_seq_show(struct seq_file *seq, void *v) } sectors = 0; - rdev_for_each(rdev, mddev) { + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { char b[BDEVNAME_SIZE]; seq_printf(seq, " %s[%d]", bdevname(rdev->bdev,b), rdev->desc_nr); @@ -6904,6 +7114,7 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "(R)"); sectors += rdev->sectors; } + rcu_read_unlock(); if (!list_empty(&mddev->disks)) { if (mddev->pers) @@ -6946,7 +7157,7 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\n"); } - mddev_unlock(mddev); + spin_unlock(&mddev->lock); return 0; } @@ -7102,7 +7313,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi) if (mddev->safemode == 1) mddev->safemode = 0; if (mddev->in_sync) { - spin_lock_irq(&mddev->write_lock); + spin_lock(&mddev->lock); if (mddev->in_sync) { mddev->in_sync = 0; set_bit(MD_CHANGE_CLEAN, &mddev->flags); @@ -7110,7 +7321,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi) md_wakeup_thread(mddev->thread); did_change = 1; } - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); } if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); @@ -7148,7 +7359,7 @@ int md_allow_write(struct mddev *mddev) if (!mddev->pers->sync_request) return 0; - spin_lock_irq(&mddev->write_lock); + spin_lock(&mddev->lock); if (mddev->in_sync) { mddev->in_sync = 0; set_bit(MD_CHANGE_CLEAN, &mddev->flags); @@ -7156,11 +7367,11 @@ int md_allow_write(struct mddev *mddev) if (mddev->safemode_delay && mddev->safemode == 0) mddev->safemode = 1; - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); md_update_sb(mddev, 0); sysfs_notify_dirent_safe(mddev->sysfs_state); } else - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) return -EAGAIN; @@ -7513,6 +7724,7 @@ void md_do_sync(struct md_thread *thread) skip: set_bit(MD_CHANGE_DEVS, &mddev->flags); + 
spin_lock(&mddev->lock); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* We completed so min/max setting can be forgotten if used. */ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) @@ -7521,6 +7733,8 @@ void md_do_sync(struct md_thread *thread) } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) mddev->resync_min = mddev->curr_resync_completed; mddev->curr_resync = 0; + spin_unlock(&mddev->lock); + wake_up(&resync_wait); set_bit(MD_RECOVERY_DONE, &mddev->recovery); md_wakeup_thread(mddev->thread); @@ -7688,7 +7902,7 @@ void md_check_recovery(struct mddev *mddev) if (!mddev->external) { int did_change = 0; - spin_lock_irq(&mddev->write_lock); + spin_lock(&mddev->lock); if (mddev->safemode && !atomic_read(&mddev->writes_pending) && !mddev->in_sync && @@ -7699,7 +7913,7 @@ void md_check_recovery(struct mddev *mddev) } if (mddev->safemode == 1) mddev->safemode = 0; - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); } @@ -7721,7 +7935,9 @@ void md_check_recovery(struct mddev *mddev) * any transients in the value of "sync_action". */ mddev->curr_resync_completed = 0; + spin_lock(&mddev->lock); set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + spin_unlock(&mddev->lock); /* Clear some bits that don't mean anything, but * might be left set */ diff --git a/drivers/md/md.h b/drivers/md/md.h index 03cec5b..318ca8f 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -386,7 +386,18 @@ struct mddev { struct work_struct del_work; /* used for delayed sysfs removal */ - spinlock_t write_lock; + /* "lock" protects: + * flush_bio transition from NULL to !NULL + * rdev superblocks, events + * clearing MD_CHANGE_* + * in_sync - and related safemode and MD_CHANGE changes + * pers (also protected by reconfig_mutex and pending IO). 
+ * clearing ->bitmap + * clearing ->bitmap_info.file + * changing ->resync_{min,max} + * setting MD_RECOVERY_RUNNING (which interacts with resync_{min,max}) + */ + spinlock_t lock; wait_queue_head_t sb_wait; /* for waiting on superblock updates */ atomic_t pending_writes; /* number of active superblock writes */ @@ -439,13 +450,30 @@ struct mddev { void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); }; -static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) +static inline int __must_check mddev_lock(struct mddev *mddev) { - int faulty = test_bit(Faulty, &rdev->flags); - if (atomic_dec_and_test(&rdev->nr_pending) && faulty) - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + return mutex_lock_interruptible(&mddev->reconfig_mutex); +} + +/* Sometimes we need to take the lock in a situation where + * failure due to interrupts is not acceptable. + */ +static inline void mddev_lock_nointr(struct mddev *mddev) +{ + mutex_lock(&mddev->reconfig_mutex); +} + +static inline int mddev_is_locked(struct mddev *mddev) +{ + return mutex_is_locked(&mddev->reconfig_mutex); } +static inline int mddev_trylock(struct mddev *mddev) +{ + return mutex_trylock(&mddev->reconfig_mutex); +} +extern void mddev_unlock(struct mddev *mddev); + static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) { atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); @@ -459,7 +487,7 @@ struct md_personality struct module *owner; void (*make_request)(struct mddev *mddev, struct bio *bio); int (*run)(struct mddev *mddev); - int (*stop)(struct mddev *mddev); + void (*free)(struct mddev *mddev, void *priv); void (*status)(struct seq_file *seq, struct mddev *mddev); /* error_handler must set ->faulty and clear ->in_sync * if appropriate, and should abort recovery if needed @@ -490,6 +518,13 @@ struct md_personality * array. */ void *(*takeover) (struct mddev *mddev); + /* congested implements bdi.congested_fn(). 
+ * Will not be called while array is 'suspended' */ + int (*congested)(struct mddev *mddev, int bits); + /* mergeable_bvec is use to implement ->merge_bvec_fn */ + int (*mergeable_bvec)(struct mddev *mddev, + struct bvec_merge_data *bvm, + struct bio_vec *biovec); }; struct md_sysfs_entry { @@ -624,4 +659,14 @@ static inline int mddev_check_plugged(struct mddev *mddev) return !!blk_check_plugged(md_unplug, mddev, sizeof(struct blk_plug_cb)); } + +static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) +{ + int faulty = test_bit(Faulty, &rdev->flags); + if (atomic_dec_and_test(&rdev->nr_pending) && faulty) { + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } +} + #endif /* _MD_MD_H */ diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 399272f..ac3ede2 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -153,15 +153,11 @@ static void multipath_status (struct seq_file *seq, struct mddev *mddev) seq_printf (seq, "]"); } -static int multipath_congested(void *data, int bits) +static int multipath_congested(struct mddev *mddev, int bits) { - struct mddev *mddev = data; struct mpconf *conf = mddev->private; int i, ret = 0; - if (mddev_congested(mddev, bits)) - return 1; - rcu_read_lock(); for (i = 0; i < mddev->raid_disks ; i++) { struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); @@ -403,7 +399,7 @@ static int multipath_run (struct mddev *mddev) /* * copy the already verified devices into our private MULTIPATH * bookkeeping area. 
[whatever we allocate in multipath_run(), - * should be freed in multipath_stop()] + * should be freed in multipath_free()] */ conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL); @@ -489,9 +485,6 @@ static int multipath_run (struct mddev *mddev) */ md_set_array_sectors(mddev, multipath_size(mddev, 0, 0)); - mddev->queue->backing_dev_info.congested_fn = multipath_congested; - mddev->queue->backing_dev_info.congested_data = mddev; - if (md_integrity_register(mddev)) goto out_free_conf; @@ -507,17 +500,13 @@ out: return -EIO; } -static int multipath_stop (struct mddev *mddev) +static void multipath_free(struct mddev *mddev, void *priv) { - struct mpconf *conf = mddev->private; + struct mpconf *conf = priv; - md_unregister_thread(&mddev->thread); - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ mempool_destroy(conf->pool); kfree(conf->multipaths); kfree(conf); - mddev->private = NULL; - return 0; } static struct md_personality multipath_personality = @@ -527,12 +516,13 @@ static struct md_personality multipath_personality = .owner = THIS_MODULE, .make_request = multipath_make_request, .run = multipath_run, - .stop = multipath_stop, + .free = multipath_free, .status = multipath_status, .error_handler = multipath_error, .hot_add_disk = multipath_add_disk, .hot_remove_disk= multipath_remove_disk, .size = multipath_size, + .congested = multipath_congested, }; static int __init multipath_init (void) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index ba6b85d..a13f738 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -25,17 +25,13 @@ #include "raid0.h" #include "raid5.h" -static int raid0_congested(void *data, int bits) +static int raid0_congested(struct mddev *mddev, int bits) { - struct mddev *mddev = data; struct r0conf *conf = mddev->private; struct md_rdev **devlist = conf->devlist; int raid_disks = conf->strip_zone[0].nb_dev; int i, ret = 0; - if (mddev_congested(mddev, bits)) - return 1; - for (i = 0; i < raid_disks && !ret ; i++) { 
struct request_queue *q = bdev_get_queue(devlist[i]->bdev); @@ -263,8 +259,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) mdname(mddev), (unsigned long long)smallest->sectors); } - mddev->queue->backing_dev_info.congested_fn = raid0_congested; - mddev->queue->backing_dev_info.congested_data = mddev; /* * now since we have the hard sector sizes, we can make sure @@ -356,17 +350,16 @@ static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone, /** * raid0_mergeable_bvec -- tell bio layer if two requests can be merged - * @q: request queue + * @mddev: the md device * @bvm: properties of new bio * @biovec: the request that could be merged to it. * * Return amount of bytes we can accept at this offset */ -static int raid0_mergeable_bvec(struct request_queue *q, +static int raid0_mergeable_bvec(struct mddev *mddev, struct bvec_merge_data *bvm, struct bio_vec *biovec) { - struct mddev *mddev = q->queuedata; struct r0conf *conf = mddev->private; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); sector_t sector_offset = sector; @@ -422,7 +415,7 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks return array_sectors; } -static int raid0_stop(struct mddev *mddev); +static void raid0_free(struct mddev *mddev, void *priv); static int raid0_run(struct mddev *mddev) { @@ -471,26 +464,22 @@ static int raid0_run(struct mddev *mddev) mddev->queue->backing_dev_info.ra_pages = 2* stripe; } - blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); dump_zones(mddev); ret = md_integrity_register(mddev); if (ret) - raid0_stop(mddev); + raid0_free(mddev, conf); return ret; } -static int raid0_stop(struct mddev *mddev) +static void raid0_free(struct mddev *mddev, void *priv) { - struct r0conf *conf = mddev->private; + struct r0conf *conf = priv; - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ kfree(conf->strip_zone); kfree(conf->devlist); kfree(conf); - 
mddev->private = NULL; - return 0; } /* @@ -724,11 +713,13 @@ static struct md_personality raid0_personality= .owner = THIS_MODULE, .make_request = raid0_make_request, .run = raid0_run, - .stop = raid0_stop, + .free = raid0_free, .status = raid0_status, .size = raid0_size, .takeover = raid0_takeover, .quiesce = raid0_quiesce, + .congested = raid0_congested, + .mergeable_bvec = raid0_mergeable_bvec, }; static int __init raid0_init (void) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 40b35be..5dd0c2e 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -701,11 +701,10 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect return best_disk; } -static int raid1_mergeable_bvec(struct request_queue *q, +static int raid1_mergeable_bvec(struct mddev *mddev, struct bvec_merge_data *bvm, struct bio_vec *biovec) { - struct mddev *mddev = q->queuedata; struct r1conf *conf = mddev->private; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max = biovec->bv_len; @@ -734,7 +733,7 @@ static int raid1_mergeable_bvec(struct request_queue *q, } -int md_raid1_congested(struct mddev *mddev, int bits) +static int raid1_congested(struct mddev *mddev, int bits) { struct r1conf *conf = mddev->private; int i, ret = 0; @@ -763,15 +762,6 @@ int md_raid1_congested(struct mddev *mddev, int bits) rcu_read_unlock(); return ret; } -EXPORT_SYMBOL_GPL(md_raid1_congested); - -static int raid1_congested(void *data, int bits) -{ - struct mddev *mddev = data; - - return mddev_congested(mddev, bits) || - md_raid1_congested(mddev, bits); -} static void flush_pending_writes(struct r1conf *conf) { @@ -2882,7 +2872,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) return ERR_PTR(err); } -static int stop(struct mddev *mddev); +static void raid1_free(struct mddev *mddev, void *priv); static int run(struct mddev *mddev) { struct r1conf *conf; @@ -2904,7 +2894,7 @@ static int run(struct mddev *mddev) /* * copy the already verified 
devices into our private RAID1 * bookkeeping area. [whatever we allocate in run(), - * should be freed in stop()] + * should be freed in raid1_free()] */ if (mddev->private == NULL) conf = setup_conf(mddev); @@ -2955,10 +2945,6 @@ static int run(struct mddev *mddev) md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); if (mddev->queue) { - mddev->queue->backing_dev_info.congested_fn = raid1_congested; - mddev->queue->backing_dev_info.congested_data = mddev; - blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec); - if (discard_supported) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); @@ -2968,37 +2954,23 @@ static int run(struct mddev *mddev) } ret = md_integrity_register(mddev); - if (ret) - stop(mddev); + if (ret) { + md_unregister_thread(&mddev->thread); + raid1_free(mddev, conf); + } return ret; } -static int stop(struct mddev *mddev) +static void raid1_free(struct mddev *mddev, void *priv) { - struct r1conf *conf = mddev->private; - struct bitmap *bitmap = mddev->bitmap; + struct r1conf *conf = priv; - /* wait for behind writes to complete */ - if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { - printk(KERN_INFO "md/raid1:%s: behind writes in progress - waiting to stop.\n", - mdname(mddev)); - /* need to kick something here to make sure I/O goes? 
*/ - wait_event(bitmap->behind_wait, - atomic_read(&bitmap->behind_writes) == 0); - } - - freeze_array(conf, 0); - unfreeze_array(conf); - - md_unregister_thread(&mddev->thread); if (conf->r1bio_pool) mempool_destroy(conf->r1bio_pool); kfree(conf->mirrors); safe_put_page(conf->tmppage); kfree(conf->poolinfo); kfree(conf); - mddev->private = NULL; - return 0; } static int raid1_resize(struct mddev *mddev, sector_t sectors) @@ -3181,7 +3153,7 @@ static struct md_personality raid1_personality = .owner = THIS_MODULE, .make_request = make_request, .run = run, - .stop = stop, + .free = raid1_free, .status = status, .error_handler = error, .hot_add_disk = raid1_add_disk, @@ -3193,6 +3165,8 @@ static struct md_personality raid1_personality = .check_reshape = raid1_reshape, .quiesce = raid1_quiesce, .takeover = raid1_takeover, + .congested = raid1_congested, + .mergeable_bvec = raid1_mergeable_bvec, }; static int __init raid_init(void) diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 33bda55..14ebb28 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -170,7 +170,4 @@ struct r1bio { */ #define R1BIO_MadeGood 7 #define R1BIO_WriteError 8 - -extern int md_raid1_congested(struct mddev *mddev, int bits); - #endif diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 32e282f..b8d76b1 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -674,7 +674,7 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) /** * raid10_mergeable_bvec -- tell bio layer if a two requests can be merged - * @q: request queue + * @mddev: the md device * @bvm: properties of new bio * @biovec: the request that could be merged to it. * @@ -682,11 +682,10 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) * This requires checking for end-of-chunk if near_copies != raid_disks, * and for subordinate merge_bvec_fns if merge_check_needed. 
*/ -static int raid10_mergeable_bvec(struct request_queue *q, +static int raid10_mergeable_bvec(struct mddev *mddev, struct bvec_merge_data *bvm, struct bio_vec *biovec) { - struct mddev *mddev = q->queuedata; struct r10conf *conf = mddev->private; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; @@ -910,7 +909,7 @@ retry: return rdev; } -int md_raid10_congested(struct mddev *mddev, int bits) +static int raid10_congested(struct mddev *mddev, int bits) { struct r10conf *conf = mddev->private; int i, ret = 0; @@ -934,15 +933,6 @@ int md_raid10_congested(struct mddev *mddev, int bits) rcu_read_unlock(); return ret; } -EXPORT_SYMBOL_GPL(md_raid10_congested); - -static int raid10_congested(void *data, int bits) -{ - struct mddev *mddev = data; - - return mddev_congested(mddev, bits) || - md_raid10_congested(mddev, bits); -} static void flush_pending_writes(struct r10conf *conf) { @@ -3757,8 +3747,6 @@ static int run(struct mddev *mddev) if (mddev->queue) { int stripe = conf->geo.raid_disks * ((mddev->chunk_sectors << 9) / PAGE_SIZE); - mddev->queue->backing_dev_info.congested_fn = raid10_congested; - mddev->queue->backing_dev_info.congested_data = mddev; /* Calculate max read-ahead size. * We need to readahead at least twice a whole stripe.... 
@@ -3767,7 +3755,6 @@ static int run(struct mddev *mddev) stripe /= conf->geo.near_copies; if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) mddev->queue->backing_dev_info.ra_pages = 2 * stripe; - blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); } if (md_integrity_register(mddev)) @@ -3811,17 +3798,9 @@ out: return -EIO; } -static int stop(struct mddev *mddev) +static void raid10_free(struct mddev *mddev, void *priv) { - struct r10conf *conf = mddev->private; - - raise_barrier(conf, 0); - lower_barrier(conf); - - md_unregister_thread(&mddev->thread); - if (mddev->queue) - /* the unplug fn references 'conf'*/ - blk_sync_queue(mddev->queue); + struct r10conf *conf = priv; if (conf->r10bio_pool) mempool_destroy(conf->r10bio_pool); @@ -3830,8 +3809,6 @@ static int stop(struct mddev *mddev) kfree(conf->mirrors_old); kfree(conf->mirrors_new); kfree(conf); - mddev->private = NULL; - return 0; } static void raid10_quiesce(struct mddev *mddev, int state) @@ -3895,7 +3872,7 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) return 0; } -static void *raid10_takeover_raid0(struct mddev *mddev) +static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs) { struct md_rdev *rdev; struct r10conf *conf; @@ -3905,6 +3882,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev) mdname(mddev)); return ERR_PTR(-EINVAL); } + sector_div(size, devs); /* Set new parameters */ mddev->new_level = 10; @@ -3915,12 +3893,15 @@ static void *raid10_takeover_raid0(struct mddev *mddev) mddev->raid_disks *= 2; /* make sure it will be not marked as dirty */ mddev->recovery_cp = MaxSector; + mddev->dev_sectors = size; conf = setup_conf(mddev); if (!IS_ERR(conf)) { rdev_for_each(rdev, mddev) - if (rdev->raid_disk >= 0) + if (rdev->raid_disk >= 0) { rdev->new_raid_disk = rdev->raid_disk * 2; + rdev->sectors = size; + } conf->barrier = 1; } @@ -3943,7 +3924,9 @@ static void *raid10_takeover(struct mddev *mddev) mdname(mddev)); return 
ERR_PTR(-EINVAL); } - return raid10_takeover_raid0(mddev); + return raid10_takeover_raid0(mddev, + raid0_conf->strip_zone->zone_end, + raid0_conf->strip_zone->nb_dev); } return ERR_PTR(-EINVAL); } @@ -4713,7 +4696,7 @@ static struct md_personality raid10_personality = .owner = THIS_MODULE, .make_request = make_request, .run = run, - .stop = stop, + .free = raid10_free, .status = status, .error_handler = error, .hot_add_disk = raid10_add_disk, @@ -4727,6 +4710,8 @@ static struct md_personality raid10_personality = .check_reshape = raid10_check_reshape, .start_reshape = raid10_start_reshape, .finish_reshape = raid10_finish_reshape, + .congested = raid10_congested, + .mergeable_bvec = raid10_mergeable_bvec, }; static int __init raid_init(void) diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 157d69e..5ee6473 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -150,7 +150,4 @@ enum r10bio_state { */ R10BIO_Previous, }; - -extern int md_raid10_congested(struct mddev *mddev, int bits); - #endif diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b98765f..aa76865 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -296,12 +296,9 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, BUG_ON(atomic_read(&conf->active_stripes)==0); if (test_bit(STRIPE_HANDLE, &sh->state)) { if (test_bit(STRIPE_DELAYED, &sh->state) && - !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) list_add_tail(&sh->lru, &conf->delayed_list); - if (atomic_read(&conf->preread_active_stripes) - < IO_THRESHOLD) - md_wakeup_thread(conf->mddev->thread); - } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && + else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && sh->bm_seq - conf->seq_write > 0) list_add_tail(&sh->lru, &conf->bitmap_list); else { @@ -2898,31 +2895,102 @@ static int want_replace(struct stripe_head *sh, int disk_idx) * Returns 1 when no more member devices need to be checked, otherwise returns 
* 0 to tell the loop in handle_stripe_fill to continue */ -static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, - int disk_idx, int disks) + +static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, + int disk_idx, int disks) { struct r5dev *dev = &sh->dev[disk_idx]; struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], &sh->dev[s->failed_num[1]] }; + int i; + + + if (test_bit(R5_LOCKED, &dev->flags) || + test_bit(R5_UPTODATE, &dev->flags)) + /* No point reading this as we already have it or have + * decided to get it. + */ + return 0; + + if (dev->toread || + (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags))) + /* We need this block to directly satisfy a request */ + return 1; + + if (s->syncing || s->expanding || + (s->replacing && want_replace(sh, disk_idx))) + /* When syncing, or expanding we read everything. + * When replacing, we need the replaced block. + */ + return 1; + + if ((s->failed >= 1 && fdev[0]->toread) || + (s->failed >= 2 && fdev[1]->toread)) + /* If we want to read from a failed device, then + * we need to actually read every other device. + */ + return 1; + + /* Sometimes neither read-modify-write nor reconstruct-write + * cycles can work. In those cases we read every block we + * can. Then the parity-update is certain to have enough to + * work with. + * This can only be a problem when we need to write something, + * and some device has failed. If either of those tests + * fail we need look no further. + */ + if (!s->failed || !s->to_write) + return 0; + + if (test_bit(R5_Insync, &dev->flags) && + !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + /* Pre-reads are not permitted until after short delay + * to gather multiple requests. However if this + * device is not Insync, the block could only be computed + * and there is no need to delay that.
+ */ + return 0; + + for (i = 0; i < s->failed; i++) { + if (fdev[i]->towrite && + !test_bit(R5_UPTODATE, &fdev[i]->flags) && + !test_bit(R5_OVERWRITE, &fdev[i]->flags)) + /* If we have a partial write to a failed + * device, then we will need to reconstruct + * the content of that device, so all other + * devices must be read. + */ + return 1; + } + + /* If we are forced to do a reconstruct-write, either because + * the current RAID6 implementation only supports that, or + * because parity cannot be trusted and we are currently + * recovering it, there is extra need to be careful. + * If one of the devices that we would need to read, because + * it is not being overwritten (and maybe not written at all) + * is missing/faulty, then we need to read everything we can. + */ + if (sh->raid_conf->level != 6 && + sh->sector < sh->raid_conf->mddev->recovery_cp) + /* reconstruct-write isn't being forced */ + return 0; + for (i = 0; i < s->failed; i++) { + if (!test_bit(R5_UPTODATE, &fdev[i]->flags) && + !test_bit(R5_OVERWRITE, &fdev[i]->flags)) + return 1; + } + + return 0; +} + +static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, + int disk_idx, int disks) +{ + struct r5dev *dev = &sh->dev[disk_idx]; /* is the data in this block needed, and can we get it?
*/ - if (!test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags) && - (dev->toread || - (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || - s->syncing || s->expanding || - (s->replacing && want_replace(sh, disk_idx)) || - (s->failed >= 1 && fdev[0]->toread) || - (s->failed >= 2 && fdev[1]->toread) || - (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && - (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) && - !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || - ((sh->raid_conf->level == 6 || - sh->sector >= sh->raid_conf->mddev->recovery_cp) - && s->failed && s->to_write && - (s->to_write - s->non_overwrite < - sh->raid_conf->raid_disks - sh->raid_conf->max_degraded) && - (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) { + if (need_this_block(sh, s, disk_idx, disks)) { /* we would like to get this block, possibly by computing it, * otherwise read it if the backing disk is insync */ @@ -4081,7 +4149,7 @@ static void activate_bit_delay(struct r5conf *conf, } } -int md_raid5_congested(struct mddev *mddev, int bits) +static int raid5_congested(struct mddev *mddev, int bits) { struct r5conf *conf = mddev->private; @@ -4098,24 +4166,14 @@ int md_raid5_congested(struct mddev *mddev, int bits) return 0; } -EXPORT_SYMBOL_GPL(md_raid5_congested); - -static int raid5_congested(void *data, int bits) -{ - struct mddev *mddev = data; - - return mddev_congested(mddev, bits) || - md_raid5_congested(mddev, bits); -} /* We want read requests to align with chunks where possible, * but write requests don't need to. 
*/ -static int raid5_mergeable_bvec(struct request_queue *q, +static int raid5_mergeable_bvec(struct mddev *mddev, struct bvec_merge_data *bvm, struct bio_vec *biovec) { - struct mddev *mddev = q->queuedata; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_sectors; @@ -5296,11 +5354,14 @@ static void raid5d(struct md_thread *thread) static ssize_t raid5_show_stripe_cache_size(struct mddev *mddev, char *page) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; + int ret = 0; + spin_lock(&mddev->lock); + conf = mddev->private; if (conf) - return sprintf(page, "%d\n", conf->max_nr_stripes); - else - return 0; + ret = sprintf(page, "%d\n", conf->max_nr_stripes); + spin_unlock(&mddev->lock); + return ret; } int @@ -5339,21 +5400,25 @@ EXPORT_SYMBOL(raid5_set_cache_size); static ssize_t raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; unsigned long new; int err; if (len >= PAGE_SIZE) return -EINVAL; - if (!conf) - return -ENODEV; - if (kstrtoul(page, 10, &new)) return -EINVAL; - err = raid5_set_cache_size(mddev, new); + err = mddev_lock(mddev); if (err) return err; - return len; + conf = mddev->private; + if (!conf) + err = -ENODEV; + else + err = raid5_set_cache_size(mddev, new); + mddev_unlock(mddev); + + return err ?: len; } static struct md_sysfs_entry @@ -5364,29 +5429,40 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, static ssize_t raid5_show_preread_threshold(struct mddev *mddev, char *page) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; + int ret = 0; + spin_lock(&mddev->lock); + conf = mddev->private; if (conf) - return sprintf(page, "%d\n", conf->bypass_threshold); - else - return 0; + ret = sprintf(page, "%d\n", conf->bypass_threshold); + spin_unlock(&mddev->lock); + return ret; } static ssize_t raid5_store_preread_threshold(struct mddev 
*mddev, const char *page, size_t len) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; unsigned long new; + int err; + if (len >= PAGE_SIZE) return -EINVAL; - if (!conf) - return -ENODEV; - if (kstrtoul(page, 10, &new)) return -EINVAL; - if (new > conf->max_nr_stripes) - return -EINVAL; - conf->bypass_threshold = new; - return len; + + err = mddev_lock(mddev); + if (err) + return err; + conf = mddev->private; + if (!conf) + err = -ENODEV; + else if (new > conf->max_nr_stripes) + err = -EINVAL; + else + conf->bypass_threshold = new; + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry @@ -5398,39 +5474,48 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, static ssize_t raid5_show_skip_copy(struct mddev *mddev, char *page) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; + int ret = 0; + spin_lock(&mddev->lock); + conf = mddev->private; if (conf) - return sprintf(page, "%d\n", conf->skip_copy); - else - return 0; + ret = sprintf(page, "%d\n", conf->skip_copy); + spin_unlock(&mddev->lock); + return ret; } static ssize_t raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; unsigned long new; + int err; + if (len >= PAGE_SIZE) return -EINVAL; - if (!conf) - return -ENODEV; - if (kstrtoul(page, 10, &new)) return -EINVAL; new = !!new; - if (new == conf->skip_copy) - return len; - mddev_suspend(mddev); - conf->skip_copy = new; - if (new) - mddev->queue->backing_dev_info.capabilities |= - BDI_CAP_STABLE_WRITES; - else - mddev->queue->backing_dev_info.capabilities &= - ~BDI_CAP_STABLE_WRITES; - mddev_resume(mddev); - return len; + err = mddev_lock(mddev); + if (err) + return err; + conf = mddev->private; + if (!conf) + err = -ENODEV; + else if (new != conf->skip_copy) { + mddev_suspend(mddev); + conf->skip_copy = new; + if (new) + mddev->queue->backing_dev_info.capabilities |= + BDI_CAP_STABLE_WRITES; + else + 
mddev->queue->backing_dev_info.capabilities &= + ~BDI_CAP_STABLE_WRITES; + mddev_resume(mddev); + } + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry @@ -5454,11 +5539,14 @@ raid5_stripecache_active = __ATTR_RO(stripe_cache_active); static ssize_t raid5_show_group_thread_cnt(struct mddev *mddev, char *page) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; + int ret = 0; + spin_lock(&mddev->lock); + conf = mddev->private; if (conf) - return sprintf(page, "%d\n", conf->worker_cnt_per_group); - else - return 0; + ret = sprintf(page, "%d\n", conf->worker_cnt_per_group); + spin_unlock(&mddev->lock); + return ret; } static int alloc_thread_groups(struct r5conf *conf, int cnt, @@ -5468,7 +5556,7 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt, static ssize_t raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; unsigned long new; int err; struct r5worker_group *new_groups, *old_groups; @@ -5476,41 +5564,41 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) if (len >= PAGE_SIZE) return -EINVAL; - if (!conf) - return -ENODEV; - if (kstrtoul(page, 10, &new)) return -EINVAL; - if (new == conf->worker_cnt_per_group) - return len; - - mddev_suspend(mddev); + err = mddev_lock(mddev); + if (err) + return err; + conf = mddev->private; + if (!conf) + err = -ENODEV; + else if (new != conf->worker_cnt_per_group) { + mddev_suspend(mddev); - old_groups = conf->worker_groups; - if (old_groups) - flush_workqueue(raid5_wq); + old_groups = conf->worker_groups; + if (old_groups) + flush_workqueue(raid5_wq); - err = alloc_thread_groups(conf, new, - &group_cnt, &worker_cnt_per_group, - &new_groups); - if (!err) { - spin_lock_irq(&conf->device_lock); - conf->group_cnt = group_cnt; - conf->worker_cnt_per_group = worker_cnt_per_group; - conf->worker_groups = new_groups; - spin_unlock_irq(&conf->device_lock); + err = 
alloc_thread_groups(conf, new, + &group_cnt, &worker_cnt_per_group, + &new_groups); + if (!err) { + spin_lock_irq(&conf->device_lock); + conf->group_cnt = group_cnt; + conf->worker_cnt_per_group = worker_cnt_per_group; + conf->worker_groups = new_groups; + spin_unlock_irq(&conf->device_lock); - if (old_groups) - kfree(old_groups[0].workers); - kfree(old_groups); + if (old_groups) + kfree(old_groups[0].workers); + kfree(old_groups); + } + mddev_resume(mddev); } + mddev_unlock(mddev); - mddev_resume(mddev); - - if (err) - return err; - return len; + return err ?: len; } static struct md_sysfs_entry @@ -6178,11 +6266,6 @@ static int run(struct mddev *mddev) if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) mddev->queue->backing_dev_info.ra_pages = 2 * stripe; - blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); - - mddev->queue->backing_dev_info.congested_data = mddev; - mddev->queue->backing_dev_info.congested_fn = raid5_congested; - chunk_size = mddev->chunk_sectors << 9; blk_queue_io_min(mddev->queue, chunk_size); blk_queue_io_opt(mddev->queue, chunk_size * @@ -6260,17 +6343,12 @@ abort: return -EIO; } -static int stop(struct mddev *mddev) +static void raid5_free(struct mddev *mddev, void *priv) { - struct r5conf *conf = mddev->private; + struct r5conf *conf = priv; - md_unregister_thread(&mddev->thread); - if (mddev->queue) - mddev->queue->backing_dev_info.congested_fn = NULL; free_conf(conf); - mddev->private = NULL; mddev->to_remove = &raid5_attrs_group; - return 0; } static void status(struct seq_file *seq, struct mddev *mddev) @@ -7044,7 +7122,7 @@ static struct md_personality raid6_personality = .owner = THIS_MODULE, .make_request = make_request, .run = run, - .stop = stop, + .free = raid5_free, .status = status, .error_handler = error, .hot_add_disk = raid5_add_disk, @@ -7058,6 +7136,8 @@ static struct md_personality raid6_personality = .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, .takeover = raid6_takeover, + .congested = 
raid5_congested, + .mergeable_bvec = raid5_mergeable_bvec, }; static struct md_personality raid5_personality = { @@ -7066,7 +7146,7 @@ static struct md_personality raid5_personality = .owner = THIS_MODULE, .make_request = make_request, .run = run, - .stop = stop, + .free = raid5_free, .status = status, .error_handler = error, .hot_add_disk = raid5_add_disk, @@ -7080,6 +7160,8 @@ static struct md_personality raid5_personality = .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, .takeover = raid5_takeover, + .congested = raid5_congested, + .mergeable_bvec = raid5_mergeable_bvec, }; static struct md_personality raid4_personality = @@ -7089,7 +7171,7 @@ static struct md_personality raid4_personality = .owner = THIS_MODULE, .make_request = make_request, .run = run, - .stop = stop, + .free = raid5_free, .status = status, .error_handler = error, .hot_add_disk = raid5_add_disk, @@ -7103,6 +7185,8 @@ static struct md_personality raid4_personality = .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, .takeover = raid4_takeover, + .congested = raid5_congested, + .mergeable_bvec = raid5_mergeable_bvec, }; static int __init raid5_init(void) diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index d59f5ca..983e18a 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -558,7 +558,6 @@ static inline int algorithm_is_DDF(int layout) return layout >= 8 && layout <= 10; } -extern int md_raid5_congested(struct mddev *mddev, int bits); extern void md_raid5_kick_device(struct r5conf *conf); extern int raid5_set_cache_size(struct mddev *mddev, int size); #endif diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index 5356395..55fa27e 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -49,7 +49,6 @@ static DEFINE_MUTEX(mtd_mutex); */ struct mtd_file_info { struct mtd_info *mtd; - struct inode *ino; enum mtd_file_modes mode; }; @@ -59,10 +58,6 @@ static loff_t mtdchar_lseek(struct file *file, loff_t offset, int orig) return 
fixed_size_llseek(file, offset, orig, mfi->mtd->size); } -static int count; -static struct vfsmount *mnt; -static struct file_system_type mtd_inodefs_type; - static int mtdchar_open(struct inode *inode, struct file *file) { int minor = iminor(inode); @@ -70,7 +65,6 @@ static int mtdchar_open(struct inode *inode, struct file *file) int ret = 0; struct mtd_info *mtd; struct mtd_file_info *mfi; - struct inode *mtd_ino; pr_debug("MTD_open\n"); @@ -78,10 +72,6 @@ static int mtdchar_open(struct inode *inode, struct file *file) if ((file->f_mode & FMODE_WRITE) && (minor & 1)) return -EACCES; - ret = simple_pin_fs(&mtd_inodefs_type, &mnt, &count); - if (ret) - return ret; - mutex_lock(&mtd_mutex); mtd = get_mtd_device(NULL, devnum); @@ -95,43 +85,26 @@ static int mtdchar_open(struct inode *inode, struct file *file) goto out1; } - mtd_ino = iget_locked(mnt->mnt_sb, devnum); - if (!mtd_ino) { - ret = -ENOMEM; - goto out1; - } - if (mtd_ino->i_state & I_NEW) { - mtd_ino->i_private = mtd; - mtd_ino->i_mode = S_IFCHR; - mtd_ino->i_data.backing_dev_info = mtd->backing_dev_info; - unlock_new_inode(mtd_ino); - } - file->f_mapping = mtd_ino->i_mapping; - /* You can't open it RW if it's not a writeable device */ if ((file->f_mode & FMODE_WRITE) && !(mtd->flags & MTD_WRITEABLE)) { ret = -EACCES; - goto out2; + goto out1; } mfi = kzalloc(sizeof(*mfi), GFP_KERNEL); if (!mfi) { ret = -ENOMEM; - goto out2; + goto out1; } - mfi->ino = mtd_ino; mfi->mtd = mtd; file->private_data = mfi; mutex_unlock(&mtd_mutex); return 0; -out2: - iput(mtd_ino); out1: put_mtd_device(mtd); out: mutex_unlock(&mtd_mutex); - simple_release_fs(&mnt, &count); return ret; } /* mtdchar_open */ @@ -148,12 +121,9 @@ static int mtdchar_close(struct inode *inode, struct file *file) if ((file->f_mode & FMODE_WRITE)) mtd_sync(mtd); - iput(mfi->ino); - put_mtd_device(mtd); file->private_data = NULL; kfree(mfi); - simple_release_fs(&mnt, &count); return 0; } /* mtdchar_close */ @@ -1117,6 +1087,13 @@ static unsigned long 
mtdchar_get_unmapped_area(struct file *file, ret = mtd_get_unmapped_area(mtd, len, offset, flags); return ret == -EOPNOTSUPP ? -ENODEV : ret; } + +static unsigned mtdchar_mmap_capabilities(struct file *file) +{ + struct mtd_file_info *mfi = file->private_data; + + return mtd_mmap_capabilities(mfi->mtd); +} #endif /* @@ -1160,27 +1137,10 @@ static const struct file_operations mtd_fops = { .mmap = mtdchar_mmap, #ifndef CONFIG_MMU .get_unmapped_area = mtdchar_get_unmapped_area, + .mmap_capabilities = mtdchar_mmap_capabilities, #endif }; -static const struct super_operations mtd_ops = { - .drop_inode = generic_delete_inode, - .statfs = simple_statfs, -}; - -static struct dentry *mtd_inodefs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_pseudo(fs_type, "mtd_inode:", &mtd_ops, NULL, MTD_INODE_FS_MAGIC); -} - -static struct file_system_type mtd_inodefs_type = { - .name = "mtd_inodefs", - .mount = mtd_inodefs_mount, - .kill_sb = kill_anon_super, -}; -MODULE_ALIAS_FS("mtd_inodefs"); - int __init init_mtdchar(void) { int ret; @@ -1193,23 +1153,11 @@ int __init init_mtdchar(void) return ret; } - ret = register_filesystem(&mtd_inodefs_type); - if (ret) { - pr_err("Can't register mtd_inodefs filesystem, error %d\n", - ret); - goto err_unregister_chdev; - } - - return ret; - -err_unregister_chdev: - __unregister_chrdev(MTD_CHAR_MAJOR, 0, 1 << MINORBITS, "mtd"); return ret; } void __exit cleanup_mtdchar(void) { - unregister_filesystem(&mtd_inodefs_type); __unregister_chrdev(MTD_CHAR_MAJOR, 0, 1 << MINORBITS, "mtd"); } diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c index b900056..eacc3aa 100644 --- a/drivers/mtd/mtdconcat.c +++ b/drivers/mtd/mtdconcat.c @@ -732,8 +732,6 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c concat->mtd.ecc_stats.badblocks = subdev[0]->ecc_stats.badblocks; - concat->mtd.backing_dev_info = subdev[0]->backing_dev_info; - concat->subdev[0] = 
subdev[0]; for (i = 1; i < num_devs; i++) { @@ -761,14 +759,6 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c subdev[i]->flags & MTD_WRITEABLE; } - /* only permit direct mapping if the BDIs are all the same - * - copy-mapping is still permitted - */ - if (concat->mtd.backing_dev_info != - subdev[i]->backing_dev_info) - concat->mtd.backing_dev_info = - &default_backing_dev_info; - concat->mtd.size += subdev[i]->size; concat->mtd.ecc_stats.badblocks += subdev[i]->ecc_stats.badblocks; diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 4c61187..0ec4d6e 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -43,33 +43,7 @@ #include "mtdcore.h" -/* - * backing device capabilities for non-mappable devices (such as NAND flash) - * - permits private mappings, copies are taken of the data - */ -static struct backing_dev_info mtd_bdi_unmappable = { - .capabilities = BDI_CAP_MAP_COPY, -}; - -/* - * backing device capabilities for R/O mappable devices (such as ROM) - * - permits private mappings, copies are taken of the data - * - permits non-writable shared mappings - */ -static struct backing_dev_info mtd_bdi_ro_mappable = { - .capabilities = (BDI_CAP_MAP_COPY | BDI_CAP_MAP_DIRECT | - BDI_CAP_EXEC_MAP | BDI_CAP_READ_MAP), -}; - -/* - * backing device capabilities for writable mappable devices (such as RAM) - * - permits private mappings, copies are taken of the data - * - permits non-writable shared mappings - */ -static struct backing_dev_info mtd_bdi_rw_mappable = { - .capabilities = (BDI_CAP_MAP_COPY | BDI_CAP_MAP_DIRECT | - BDI_CAP_EXEC_MAP | BDI_CAP_READ_MAP | - BDI_CAP_WRITE_MAP), +static struct backing_dev_info mtd_bdi = { }; static int mtd_cls_suspend(struct device *dev, pm_message_t state); @@ -365,6 +339,23 @@ static struct device_type mtd_devtype = { .release = mtd_release, }; +#ifndef CONFIG_MMU +unsigned mtd_mmap_capabilities(struct mtd_info *mtd) +{ + switch (mtd->type) { + case MTD_RAM: + return 
NOMMU_MAP_COPY | NOMMU_MAP_DIRECT | NOMMU_MAP_EXEC | + NOMMU_MAP_READ | NOMMU_MAP_WRITE; + case MTD_ROM: + return NOMMU_MAP_COPY | NOMMU_MAP_DIRECT | NOMMU_MAP_EXEC | + NOMMU_MAP_READ; + default: + return NOMMU_MAP_COPY; + } +} +EXPORT_SYMBOL_GPL(mtd_mmap_capabilities); +#endif + /** * add_mtd_device - register an MTD device * @mtd: pointer to new MTD device info structure @@ -380,19 +371,7 @@ int add_mtd_device(struct mtd_info *mtd) struct mtd_notifier *not; int i, error; - if (!mtd->backing_dev_info) { - switch (mtd->type) { - case MTD_RAM: - mtd->backing_dev_info = &mtd_bdi_rw_mappable; - break; - case MTD_ROM: - mtd->backing_dev_info = &mtd_bdi_ro_mappable; - break; - default: - mtd->backing_dev_info = &mtd_bdi_unmappable; - break; - } - } + mtd->backing_dev_info = &mtd_bdi; BUG_ON(mtd->writesize == 0); mutex_lock(&mtd_table_mutex); @@ -1237,17 +1216,9 @@ static int __init init_mtd(void) if (ret) goto err_reg; - ret = mtd_bdi_init(&mtd_bdi_unmappable, "mtd-unmap"); - if (ret) - goto err_bdi1; - - ret = mtd_bdi_init(&mtd_bdi_ro_mappable, "mtd-romap"); - if (ret) - goto err_bdi2; - - ret = mtd_bdi_init(&mtd_bdi_rw_mappable, "mtd-rwmap"); + ret = mtd_bdi_init(&mtd_bdi, "mtd"); if (ret) - goto err_bdi3; + goto err_bdi; proc_mtd = proc_create("mtd", 0, NULL, &mtd_proc_ops); @@ -1260,11 +1231,7 @@ static int __init init_mtd(void) out_procfs: if (proc_mtd) remove_proc_entry("mtd", NULL); -err_bdi3: - bdi_destroy(&mtd_bdi_ro_mappable); -err_bdi2: - bdi_destroy(&mtd_bdi_unmappable); -err_bdi1: +err_bdi: class_unregister(&mtd_class); err_reg: pr_err("Error registering mtd class or bdi: %d\n", ret); @@ -1277,9 +1244,7 @@ static void __exit cleanup_mtd(void) if (proc_mtd) remove_proc_entry("mtd", NULL); class_unregister(&mtd_class); - bdi_destroy(&mtd_bdi_unmappable); - bdi_destroy(&mtd_bdi_ro_mappable); - bdi_destroy(&mtd_bdi_rw_mappable); + bdi_destroy(&mtd_bdi); } module_init(init_mtd); diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index a3e3a7d..e779de3 
100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -378,7 +378,6 @@ static struct mtd_part *allocate_partition(struct mtd_info *master, slave->mtd.name = name; slave->mtd.owner = master->owner; - slave->mtd.backing_dev_info = master->backing_dev_info; /* NOTE: we don't arrange MTDs as a tree; it'd be error-prone * to have the same data be in two different partitions. diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 7f90022..96128cb 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -28,8 +28,8 @@ static int dcssblk_open(struct block_device *bdev, fmode_t mode); static void dcssblk_release(struct gendisk *disk, fmode_t mode); static void dcssblk_make_request(struct request_queue *q, struct bio *bio); -static int dcssblk_direct_access(struct block_device *bdev, sector_t secnum, - void **kaddr, unsigned long *pfn); +static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum, + void **kaddr, unsigned long *pfn, long size); static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0"; @@ -877,25 +877,22 @@ fail: bio_io_error(bio); } -static int +static long dcssblk_direct_access (struct block_device *bdev, sector_t secnum, - void **kaddr, unsigned long *pfn) + void **kaddr, unsigned long *pfn, long size) { struct dcssblk_dev_info *dev_info; - unsigned long pgoff; + unsigned long offset, dev_sz; dev_info = bdev->bd_disk->private_data; if (!dev_info) return -ENODEV; - if (secnum % (PAGE_SIZE/512)) - return -EINVAL; - pgoff = secnum / (PAGE_SIZE / 512); - if ((pgoff+1)*PAGE_SIZE-1 > dev_info->end - dev_info->start) - return -ERANGE; - *kaddr = (void *) (dev_info->start+pgoff*PAGE_SIZE); + dev_sz = dev_info->end - dev_info->start; + offset = secnum * 512; + *kaddr = (void *) (dev_info->start + offset); *pfn = virt_to_phys(*kaddr) >> PAGE_SHIFT; - return 0; + return dev_sz - offset; } static void diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 17bb541..54d7a6c 100644 --- 
a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -2197,6 +2197,8 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost) shost->tag_set.cmd_size = cmd_size; shost->tag_set.numa_node = NUMA_NO_NODE; shost->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + shost->tag_set.flags |= + BLK_ALLOC_POLICY_TO_MQ_FLAG(shost->hostt->tag_alloc_policy); shost->tag_set.driver_data = shost; return blk_mq_alloc_tag_set(&shost->tag_set); diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index 0deb385..9c0a520 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -277,7 +277,8 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget, if (!shost_use_blk_mq(sdev->host) && (shost->bqt || shost->hostt->use_blk_tags)) { blk_queue_init_tags(sdev->request_queue, - sdev->host->cmd_per_lun, shost->bqt); + sdev->host->cmd_per_lun, shost->bqt, + shost->hostt->tag_alloc_policy); } scsi_change_queue_depth(sdev, sdev->host->cmd_per_lun); diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index a668c88..0cbc1fb 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1719,22 +1719,19 @@ sg_start_req(Sg_request *srp, unsigned char *cmd) } if (iov_count) { - int len, size = sizeof(struct sg_iovec) * iov_count; + int size = sizeof(struct iovec) * iov_count; struct iovec *iov; + struct iov_iter i; iov = memdup_user(hp->dxferp, size); if (IS_ERR(iov)) return PTR_ERR(iov); - len = iov_length(iov, iov_count); - if (hp->dxfer_len < len) { - iov_count = iov_shorten(iov, iov_count, hp->dxfer_len); - len = hp->dxfer_len; - } + iov_iter_init(&i, rw, iov, iov_count, + min_t(size_t, hp->dxfer_len, + iov_length(iov, iov_count))); - res = blk_rq_map_user_iov(q, rq, md, (struct sg_iovec *)iov, - iov_count, - len, GFP_ATOMIC); + res = blk_rq_map_user_iov(q, rq, md, &i, GFP_ATOMIC); kfree(iov); } else res = blk_rq_map_user(q, rq, md, hp->dxferp, diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c 
b/drivers/staging/lustre/lustre/llite/llite_lib.c index a3367bf..45aaa1c 100644 --- a/drivers/staging/lustre/lustre/llite/llite_lib.c +++ b/drivers/staging/lustre/lustre/llite/llite_lib.c @@ -987,7 +987,7 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt) if (err) goto out_free; lsi->lsi_flags |= LSI_BDI_INITIALIZED; - lsi->lsi_bdi.capabilities = BDI_CAP_MAP_COPY; + lsi->lsi_bdi.capabilities = 0; err = ll_bdi_register(&lsi->lsi_bdi); if (err) goto out_free; @@ -1812,10 +1812,6 @@ void ll_read_inode2(struct inode *inode, void *opaque) /* OIDEBUG(inode); */ - /* initializing backing dev info. */ - inode->i_mapping->backing_dev_info = &s2lsi(inode->i_sb)->lsi_bdi; - - if (S_ISREG(inode->i_mode)) { struct ll_sb_info *sbi = ll_i2sbi(inode); diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 6894b08..620d934 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -335,7 +335,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, } init_rwsem(&v9ses->rename_sem); - rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY); + rc = bdi_setup_and_register(&v9ses->bdi, "9p"); if (rc) { kfree(v9ses->aname); kfree(v9ses->uname); diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 2b60725..d142a24 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -106,7 +106,7 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params) volume->cell = params->cell; volume->vid = vlocation->vldb.vid[params->type]; - ret = bdi_setup_and_register(&volume->bdi, "afs", BDI_CAP_MAP_COPY); + ret = bdi_setup_and_register(&volume->bdi, "afs"); if (ret) goto error_bdi; @@ -165,15 +165,6 @@ static struct vfsmount *aio_mnt; static const struct file_operations aio_ring_fops; static const struct address_space_operations aio_ctx_aops; -/* Backing dev info for aio fs. 
- * -no dirty page accounting or writeback happens - */ -static struct backing_dev_info aio_fs_backing_dev_info = { - .name = "aiofs", - .state = 0, - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_COPY, -}; - static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) { struct qstr this = QSTR_INIT("[aio]", 5); @@ -185,7 +176,6 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) inode->i_mapping->a_ops = &aio_ctx_aops; inode->i_mapping->private_data = ctx; - inode->i_mapping->backing_dev_info = &aio_fs_backing_dev_info; inode->i_size = PAGE_SIZE * nr_pages; path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this); @@ -230,9 +220,6 @@ static int __init aio_setup(void) if (IS_ERR(aio_mnt)) panic("Failed to create aio fs mount."); - if (bdi_init(&aio_fs_backing_dev_info)) - panic("Failed to init aio fs backing dev info."); - kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); diff --git a/fs/block_dev.c b/fs/block_dev.c index b48c41b..975266b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -49,23 +49,15 @@ inline struct block_device *I_BDEV(struct inode *inode) } EXPORT_SYMBOL(I_BDEV); -/* - * Move the inode from its current bdi to a new bdi. Make sure the inode - * is clean before moving so that it doesn't linger on the old bdi. - */ -static void bdev_inode_switch_bdi(struct inode *inode, - struct backing_dev_info *dst) +static void bdev_write_inode(struct inode *inode) { - while (true) { - spin_lock(&inode->i_lock); - if (!(inode->i_state & I_DIRTY)) { - inode->i_data.backing_dev_info = dst; - spin_unlock(&inode->i_lock); - return; - } + spin_lock(&inode->i_lock); + while (inode->i_state & I_DIRTY) { spin_unlock(&inode->i_lock); WARN_ON_ONCE(write_inode_now(inode, true)); + spin_lock(&inode->i_lock); } + spin_unlock(&inode->i_lock); } /* Kill _all_ buffers and pagecache , dirty or not.. 
*/ @@ -429,6 +421,46 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL_GPL(bdev_write_page); +/** + * bdev_direct_access() - Get the address for directly-accessibly memory + * @bdev: The device containing the memory + * @sector: The offset within the device + * @addr: Where to put the address of the memory + * @pfn: The Page Frame Number for the memory + * @size: The number of bytes requested + * + * If a block device is made up of directly addressable memory, this function + * will tell the caller the PFN and the address of the memory. The address + * may be directly dereferenced within the kernel without the need to call + * ioremap(), kmap() or similar. The PFN is suitable for inserting into + * page tables. + * + * Return: negative errno if an error occurs, otherwise the number of bytes + * accessible at this address. + */ +long bdev_direct_access(struct block_device *bdev, sector_t sector, + void **addr, unsigned long *pfn, long size) +{ + long avail; + const struct block_device_operations *ops = bdev->bd_disk->fops; + + if (size < 0) + return size; + if (!ops->direct_access) + return -EOPNOTSUPP; + if ((sector + DIV_ROUND_UP(size, 512)) > + part_nr_sects_read(bdev->bd_part)) + return -ERANGE; + sector += get_start_sect(bdev); + if (sector % (PAGE_SIZE / 512)) + return -EINVAL; + avail = ops->direct_access(bdev, sector, addr, pfn, size); + if (!avail) + return -ERANGE; + return min(avail, size); +} +EXPORT_SYMBOL_GPL(bdev_direct_access); + /* * pseudo-fs */ @@ -584,7 +616,6 @@ struct block_device *bdget(dev_t dev) inode->i_bdev = bdev; inode->i_data.a_ops = &def_blk_aops; mapping_set_gfp_mask(&inode->i_data, GFP_USER); - inode->i_data.backing_dev_info = &default_backing_dev_info; spin_lock(&bdev_lock); list_add(&bdev->bd_list, &all_bdevs); spin_unlock(&bdev_lock); @@ -1145,8 +1176,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; if 
(!partno) { - struct backing_dev_info *bdi; - ret = -ENXIO; bdev->bd_part = disk_get_part(disk, partno); if (!bdev->bd_part) @@ -1172,11 +1201,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } } - if (!ret) { + if (!ret) bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); - bdi = blk_get_backing_dev_info(bdev); - bdev_inode_switch_bdi(bdev->bd_inode, bdi); - } /* * If the device is invalidated, rescan partition @@ -1203,8 +1229,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (ret) goto out_clear; bdev->bd_contains = whole; - bdev_inode_switch_bdi(bdev->bd_inode, - whole->bd_inode->i_data.backing_dev_info); bdev->bd_part = disk_get_part(disk, partno); if (!(disk->flags & GENHD_FL_UP) || !bdev->bd_part || !bdev->bd_part->nr_sects) { @@ -1244,7 +1268,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = NULL; bdev->bd_part = NULL; bdev->bd_queue = NULL; - bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info); if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, mode, 1); bdev->bd_contains = NULL; @@ -1464,11 +1487,11 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) WARN_ON_ONCE(bdev->bd_holders); sync_blockdev(bdev); kill_bdev(bdev); - /* ->release can cause the old bdi to disappear, - * so must switch it out first + /* + * ->release can cause the queue to disappear, so flush all + * dirty data before. 
*/ - bdev_inode_switch_bdi(bdev->bd_inode, - &default_backing_dev_info); + bdev_write_inode(bdev->bd_inode); } if (bdev->bd_contains == bdev) { if (disk->fops->release) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8c63419..1afb182 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1715,12 +1715,11 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) { int err; - bdi->capabilities = BDI_CAP_MAP_COPY; - err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY); + err = bdi_setup_and_register(bdi, "btrfs"); if (err) return err; - bdi->ra_pages = default_backing_dev_info.ra_pages; + bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE; bdi->congested_fn = btrfs_congested_fn; bdi->congested_data = info; return 0; @@ -2319,7 +2318,6 @@ int open_ctree(struct super_block *sb, */ fs_info->btree_inode->i_size = OFFSET_MAX; fs_info->btree_inode->i_mapping->a_ops = &btree_aops; - fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi; RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a606ab5..b78bbba 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1746,7 +1746,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, mutex_lock(&inode->i_mutex); - current->backing_dev_info = inode->i_mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) { mutex_unlock(&inode->i_mutex); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8bf326a..54bcf63 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3608,7 +3608,6 @@ cache_acl: switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; inode->i_fop = &btrfs_file_operations; inode->i_op = 
&btrfs_file_inode_operations; @@ -3623,7 +3622,6 @@ cache_acl: case S_IFLNK: inode->i_op = &btrfs_symlink_inode_operations; inode->i_mapping->a_ops = &btrfs_symlink_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; break; default: inode->i_op = &btrfs_special_inode_operations; @@ -6088,7 +6086,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) @@ -9203,7 +9200,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); @@ -9247,7 +9243,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &btrfs_symlink_inode_operations; inode->i_mapping->a_ops = &btrfs_symlink_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; inode_set_bytes(inode, name_len); btrfs_i_size_write(inode, name_len); err = btrfs_update_inode(trans, root, inode); @@ -9459,7 +9454,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; ret = btrfs_init_inode_security(trans, inode, dir, NULL); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ce74b39..905986d 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -945,7 +945,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) mutex_lock(&inode->i_mutex); /* We can 
write back this queue in page reclaim */ - current->backing_dev_info = file->f_mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index f61a741..6b51736 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -783,8 +783,6 @@ static int fill_inode(struct inode *inode, struct page *locked_page, } inode->i_mapping->a_ops = &ceph_aops; - inode->i_mapping->backing_dev_info = - &ceph_sb_to_client(inode->i_sb)->backing_dev_info; switch (inode->i_mode & S_IFMT) { case S_IFIFO: diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 50f06cd..5ae6258 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -40,17 +40,6 @@ static void ceph_put_super(struct super_block *s) dout("put_super\n"); ceph_mdsc_close_sessions(fsc->mdsc); - - /* - * ensure we release the bdi before put_anon_super releases - * the device name. - */ - if (s->s_bdi == &fsc->backing_dev_info) { - bdi_unregister(&fsc->backing_dev_info); - s->s_bdi = NULL; - } - - return; } static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -910,7 +899,7 @@ static int ceph_register_bdi(struct super_block *sb, >> PAGE_SHIFT; else fsc->backing_dev_info.ra_pages = - default_backing_dev_info.ra_pages; + VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE; err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld", atomic_long_inc_return(&bdi_seq)); @@ -1002,11 +991,16 @@ out_final: static void ceph_kill_sb(struct super_block *s) { struct ceph_fs_client *fsc = ceph_sb_to_client(s); + dev_t dev = s->s_dev; + dout("kill_sb %p\n", s); + ceph_mdsc_pre_umount(fsc->mdsc); - kill_anon_super(s); /* will call put_super after sb is r/o */ + generic_shutdown_super(s); ceph_mdsc_destroy(fsc); + destroy_fs_client(fsc); + free_anon_bdev(dev); } static struct file_system_type ceph_fs_type = { diff --git a/fs/char_dev.c b/fs/char_dev.c index 67b2007..ea06a3d 100644 --- a/fs/char_dev.c +++ 
b/fs/char_dev.c @@ -24,27 +24,6 @@ #include "internal.h" -/* - * capabilities for /dev/mem, /dev/kmem and similar directly mappable character - * devices - * - permits shared-mmap for read, write and/or exec - * - does not permit private mmap in NOMMU mode (can't do COW) - * - no readahead or I/O queue unplugging required - */ -struct backing_dev_info directly_mappable_cdev_bdi = { - .name = "char", - .capabilities = ( -#ifdef CONFIG_MMU - /* permit private copies of the data to be taken */ - BDI_CAP_MAP_COPY | -#endif - /* permit direct mmap, for read, write or exec */ - BDI_CAP_MAP_DIRECT | - BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP | - /* no writeback happens */ - BDI_CAP_NO_ACCT_AND_WRITEBACK), -}; - static struct kobj_map *cdev_map; static DEFINE_MUTEX(chrdevs_lock); @@ -575,8 +554,6 @@ static struct kobject *base_probe(dev_t dev, int *part, void *data) void __init chrdev_init(void) { cdev_map = kobj_map_init(base_probe, &chrdevs_lock); - if (bdi_init(&directly_mappable_cdev_bdi)) - panic("Failed to init directly mappable cdev bdi"); } @@ -590,4 +567,3 @@ EXPORT_SYMBOL(cdev_del); EXPORT_SYMBOL(cdev_add); EXPORT_SYMBOL(__register_chrdev); EXPORT_SYMBOL(__unregister_chrdev); -EXPORT_SYMBOL(directly_mappable_cdev_bdi); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 2a772da..d3aa999 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3446,7 +3446,7 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) int referral_walks_count = 0; #endif - rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); + rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs"); if (rc) return rc; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 0c3ce464..2d4f372 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -937,8 +937,6 @@ retry_iget5_locked: inode->i_flags |= S_NOATIME | S_NOCMTIME; if (inode->i_state & I_NEW) { inode->i_ino = hash; - if (S_ISREG(inode->i_mode)) - inode->i_data.backing_dev_info = sb->s_bdi; #ifdef 
CONFIG_CIFS_FSCACHE /* initialize per-inode cache cookie pointer */ CIFS_I(inode)->fscache = NULL; diff --git a/fs/coda/inode.c b/fs/coda/inode.c index b945410..82ec68b 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -183,7 +183,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) goto unlock_out; } - error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY); + error = bdi_setup_and_register(&vc->bdi, "coda"); if (error) goto unlock_out; diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index bd4a3c1..a315677 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -70,8 +70,6 @@ extern int configfs_is_root(struct config_item *item); extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, struct super_block *); extern int configfs_create(struct dentry *, umode_t mode, int (*init)(struct inode *)); -extern int configfs_inode_init(void); -extern void configfs_inode_exit(void); extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); extern int configfs_make_dirent(struct configfs_dirent *, diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 5946ad9..65af861 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -50,12 +50,6 @@ static const struct address_space_operations configfs_aops = { .write_end = simple_write_end, }; -static struct backing_dev_info configfs_backing_dev_info = { - .name = "configfs", - .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, -}; - static const struct inode_operations configfs_inode_operations ={ .setattr = configfs_setattr, }; @@ -137,7 +131,6 @@ struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd, if (inode) { inode->i_ino = get_next_ino(); inode->i_mapping->a_ops = &configfs_aops; - inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; inode->i_op = &configfs_inode_operations; if (sd->s_iattr) { @@ 
-283,13 +276,3 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name) } mutex_unlock(&dir->d_inode->i_mutex); } - -int __init configfs_inode_init(void) -{ - return bdi_init(&configfs_backing_dev_info); -} - -void configfs_inode_exit(void) -{ - bdi_destroy(&configfs_backing_dev_info); -} diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index f6c2858..da94e41 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -145,19 +145,13 @@ static int __init configfs_init(void) if (!config_kobj) goto out2; - err = configfs_inode_init(); - if (err) - goto out3; - err = register_filesystem(&configfs_fs_type); if (err) - goto out4; + goto out3; return 0; -out4: - pr_err("Unable to register filesystem!\n"); - configfs_inode_exit(); out3: + pr_err("Unable to register filesystem!\n"); kobject_put(config_kobj); out2: kmem_cache_destroy(configfs_dir_cachep); @@ -172,7 +166,6 @@ static void __exit configfs_exit(void) kobject_put(config_kobj); kmem_cache_destroy(configfs_dir_cachep); configfs_dir_cachep = NULL; - configfs_inode_exit(); } MODULE_AUTHOR("Oracle"); diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 1686dc2..34b36a5 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -67,7 +67,6 @@ static int ecryptfs_inode_set(struct inode *inode, void *opaque) inode->i_ino = lower_inode->i_ino; inode->i_version++; inode->i_mapping->a_ops = &ecryptfs_aops; - inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi; if (S_ISLNK(inode->i_mode)) inode->i_op = &ecryptfs_symlink_iops; diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index d9eb84b..1895d60 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -520,7 +520,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags goto out; } - rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY); + rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs"); if (rc) goto out1; diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index f1d3d4e..6fc91df 
100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1214,7 +1214,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data)); } - inode->i_mapping->backing_dev_info = sb->s_bdi; if (S_ISREG(inode->i_mode)) { inode->i_op = &exofs_file_inode_operations; inode->i_fop = &exofs_file_operations; @@ -1314,7 +1313,6 @@ struct inode *exofs_new_inode(struct inode *dir, umode_t mode) set_obj_2bcreated(oi); - inode->i_mapping->backing_dev_info = sb->s_bdi; inode_init_owner(inode, dir, mode); inode->i_ino = sbi->s_nextid++; inode->i_blkbits = EXOFS_BLKSHIFT; diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 9596550..fcc2e56 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -836,7 +836,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY); + ret = bdi_setup_and_register(&sbi->bdi, "exofs"); if (ret) { EXOFS_DBGMSG("Failed to bdi_setup_and_register\n"); dput(sb->s_root); diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 7d66fb0..6c14bb8 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -170,7 +170,7 @@ static void ext2_preread_inode(struct inode *inode) struct ext2_group_desc * gdp; struct backing_dev_info *bdi; - bdi = inode->i_mapping->backing_dev_info; + bdi = inode_to_bdi(inode); if (bdi_read_congested(bdi)) return; if (bdi_write_congested(bdi)) diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c index e98171a..bbc5fec 100644 --- a/fs/ext2/xip.c +++ b/fs/ext2/xip.c @@ -13,18 +13,12 @@ #include "ext2.h" #include "xip.h" -static inline int -__inode_direct_access(struct inode *inode, sector_t block, - void **kaddr, unsigned long *pfn) +static inline long __inode_direct_access(struct inode *inode, sector_t block, + void **kaddr, unsigned long *pfn, long size) { struct block_device *bdev = inode->i_sb->s_bdev; - const struct block_device_operations *ops = bdev->bd_disk->fops; - sector_t 
sector; - - sector = block * (PAGE_SIZE / 512); /* ext2 block to bdev sector */ - - BUG_ON(!ops->direct_access); - return ops->direct_access(bdev, sector, kaddr, pfn); + sector_t sector = block * (PAGE_SIZE / 512); + return bdev_direct_access(bdev, sector, kaddr, pfn, size); } static inline int @@ -53,12 +47,13 @@ ext2_clear_xip_target(struct inode *inode, sector_t block) { void *kaddr; unsigned long pfn; - int rc; + long size; - rc = __inode_direct_access(inode, block, &kaddr, &pfn); - if (!rc) - clear_page(kaddr); - return rc; + size = __inode_direct_access(inode, block, &kaddr, &pfn, PAGE_SIZE); + if (size < 0) + return size; + clear_page(kaddr); + return 0; } void ext2_xip_verify_sb(struct super_block *sb) @@ -77,7 +72,7 @@ void ext2_xip_verify_sb(struct super_block *sb) int ext2_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create, void **kmem, unsigned long *pfn) { - int rc; + long rc; sector_t block; /* first, retrieve the sector number */ @@ -86,6 +81,6 @@ int ext2_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create, return rc; /* retrieve address of the target data */ - rc = __inode_direct_access(mapping->host, block, kmem, pfn); - return rc; + rc = __inode_direct_access(mapping->host, block, kmem, pfn, PAGE_SIZE); + return (rc < 0) ? 
rc : 0; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ac64edb..64c39c7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -334,7 +334,7 @@ static void save_error_info(struct super_block *sb, const char *func, static int block_device_ejected(struct super_block *sb) { struct inode *bd_inode = sb->s_bdev->bd_inode; - struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(bd_inode); return bdi->dev == NULL; } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 2d609a5..c399152 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -66,15 +66,21 @@ int writeback_in_progress(struct backing_dev_info *bdi) } EXPORT_SYMBOL(writeback_in_progress); -static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) +struct backing_dev_info *inode_to_bdi(struct inode *inode) { - struct super_block *sb = inode->i_sb; + struct super_block *sb; - if (sb_is_blkdev_sb(sb)) - return inode->i_mapping->backing_dev_info; + if (!inode) + return &noop_backing_dev_info; + sb = inode->i_sb; +#ifdef CONFIG_BLOCK + if (sb_is_blkdev_sb(sb)) + return blk_get_backing_dev_info(I_BDEV(inode)); +#endif return sb->s_bdi; } +EXPORT_SYMBOL_GPL(inode_to_bdi); static inline struct inode *wb_inode(struct list_head *head) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d769e59..c01ec3b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1159,7 +1159,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) mutex_lock(&inode->i_mutex); /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) @@ -1464,7 +1464,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) { struct inode *inode = req->inode; struct fuse_inode *fi = get_fuse_inode(inode); - struct backing_dev_info *bdi = 
inode->i_mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(inode); int i; list_del(&req->writepages_entry); @@ -1658,7 +1658,7 @@ static int fuse_writepage_locked(struct page *page) req->end = fuse_writepage_end; req->inode = inode; - inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK); + inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); spin_lock(&fc->lock); @@ -1768,7 +1768,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req, if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT || old_req->state == FUSE_REQ_PENDING)) { - struct backing_dev_info *bdi = page->mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host); copy_highpage(old_req->pages[0], page); spin_unlock(&fc->lock); @@ -1872,7 +1872,7 @@ static int fuse_writepages_fill(struct page *page, req->page_descs[req->num_pages].offset = 0; req->page_descs[req->num_pages].length = PAGE_SIZE; - inc_bdi_stat(page->mapping->backing_dev_info, BDI_WRITEBACK); + inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); err = 0; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index f38256e..e8799c1 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -308,7 +308,6 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, if (!fc->writeback_cache || !S_ISREG(attr->mode)) inode->i_flags |= S_NOCMTIME; inode->i_generation = generation; - inode->i_data.backing_dev_info = &fc->bdi; fuse_init_inode(inode, attr); unlock_new_inode(inode); } else if ((inode->i_mode ^ attr->mode) & S_IFMT) { diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 805b37f..4ad4f94 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -289,7 +289,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - trace_wbc_writepage(wbc, mapping->backing_dev_info); + trace_wbc_writepage(wbc, inode_to_bdi(inode)); ret = __gfs2_jdata_writepage(page, wbc); if 
(unlikely(ret)) { diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index aeb7bc9..f42dffb 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -768,7 +768,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); mapping->private_data = NULL; - mapping->backing_dev_info = s->s_bdi; mapping->writeback_index = 0; } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 8633ad3..efc8e25 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -112,7 +112,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); mapping->private_data = NULL; - mapping->backing_dev_info = sb->s_bdi; mapping->writeback_index = 0; spin_lock_init(&sdp->sd_log_lock); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 5b327f8..1666382 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -743,7 +743,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); - struct backing_dev_info *bdi = metamapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(metamapping->host); int ret = 0; if (wbc->sync_mode == WB_SYNC_ALL) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 5eba47f..c274aca 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -62,12 +62,6 @@ static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); } -static struct backing_dev_info hugetlbfs_backing_dev_info = { - .name = "hugetlbfs", - .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, -}; - int sysctl_hugetlb_shm_group; enum { @@ -498,7 +492,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, 
&hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; - inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_mapping->private_data = resv_map; info = HUGETLBFS_I(inode); @@ -1032,10 +1025,6 @@ static int __init init_hugetlbfs_fs(void) return -ENOTSUPP; } - error = bdi_init(&hugetlbfs_backing_dev_info); - if (error) - return error; - error = -ENOMEM; hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", sizeof(struct hugetlbfs_inode_info), @@ -1071,7 +1060,6 @@ static int __init init_hugetlbfs_fs(void) out: kmem_cache_destroy(hugetlbfs_inode_cachep); out2: - bdi_destroy(&hugetlbfs_backing_dev_info); return error; } @@ -1091,7 +1079,6 @@ static void __exit exit_hugetlbfs_fs(void) for_each_hstate(h) kern_unmount(hugetlbfs_vfsmount[i++]); unregister_filesystem(&hugetlbfs_fs_type); - bdi_destroy(&hugetlbfs_backing_dev_info); } module_init(init_hugetlbfs_fs) @@ -170,20 +170,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) atomic_set(&mapping->i_mmap_writable, 0); mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->private_data = NULL; - mapping->backing_dev_info = &default_backing_dev_info; mapping->writeback_index = 0; - - /* - * If the block_device provides a backing_dev_info for client - * inodes then use that. Otherwise the inode share the bdev's - * backing_dev_info. 
- */ - if (sb->s_bdev) { - struct backing_dev_info *bdi; - - bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; - mapping->backing_dev_info = bdi; - } inode->i_private = NULL; inode->i_mapping = mapping; INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 9852176..9000874 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -24,12 +24,6 @@ static const struct address_space_operations kernfs_aops = { .write_end = simple_write_end, }; -static struct backing_dev_info kernfs_bdi = { - .name = "kernfs", - .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, -}; - static const struct inode_operations kernfs_iops = { .permission = kernfs_iop_permission, .setattr = kernfs_iop_setattr, @@ -40,12 +34,6 @@ static const struct inode_operations kernfs_iops = { .listxattr = kernfs_iop_listxattr, }; -void __init kernfs_inode_init(void) -{ - if (bdi_init(&kernfs_bdi)) - panic("failed to init kernfs_bdi"); -} - static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) { static DEFINE_MUTEX(iattr_mutex); @@ -298,7 +286,6 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) kernfs_get(kn); inode->i_private = kn; inode->i_mapping->a_ops = &kernfs_aops; - inode->i_mapping->backing_dev_info = &kernfs_bdi; inode->i_op = &kernfs_iops; set_default_inode_attr(inode, kn->mode); diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index dc84a3e..af9fa74 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -88,7 +88,6 @@ int kernfs_iop_removexattr(struct dentry *dentry, const char *name); ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf, size_t size); ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); -void kernfs_inode_init(void); /* * dir.c diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index f973ae9..8eaf417 100644 --- a/fs/kernfs/mount.c +++ 
b/fs/kernfs/mount.c @@ -246,5 +246,4 @@ void __init kernfs_init(void) kernfs_node_cache = kmem_cache_create("kernfs_node_cache", sizeof(struct kernfs_node), 0, SLAB_PANIC, NULL); - kernfs_inode_init(); } diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index e31e589..01a9e16 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -267,7 +267,6 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info) if (inode) { atomic_set(&NCP_FINFO(inode)->opened, info->opened); - inode->i_mapping->backing_dev_info = sb->s_bdi; inode->i_ino = info->ino; ncp_set_attr(inode, info); if (S_ISREG(inode->i_mode)) { @@ -560,7 +559,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) server = NCP_SBP(sb); memset(server, 0, sizeof(*server)); - error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY); + error = bdi_setup_and_register(&server->bdi, "ncpfs"); if (error) goto out_fput; diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 3c97694..7ae1c26 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -1002,7 +1002,7 @@ mds_commit: spin_unlock(cinfo->lock); if (!cinfo->dreq) { inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, + inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host), BDI_RECLAIMABLE); __mark_inode_dirty(req->wb_context->dentry->d_inode, I_DIRTY_DATASYNC); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index f29fb7d..c22ecaa 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1366,7 +1366,7 @@ ff_layout_mark_request_commit(struct nfs_page *req, spin_unlock(cinfo->lock); if (!cinfo->dreq) { inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, + inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host), BDI_RECLAIMABLE); 
__mark_inode_dirty(req->wb_context->dentry->d_inode, I_DIRTY_DATASYNC); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index d2398c1..e4f0dce 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -388,7 +388,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st if (S_ISREG(inode->i_mode)) { inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops; inode->i_data.a_ops = &nfs_file_aops; - inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; } else if (S_ISDIR(inode->i_mode)) { inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; inode->i_fop = &nfs_dir_operations; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 21469e6..212b8c8 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -430,7 +430,6 @@ int nfs_show_options(struct seq_file *, struct dentry *); int nfs_show_devname(struct seq_file *, struct dentry *); int nfs_show_path(struct seq_file *, struct dentry *); int nfs_show_stats(struct seq_file *, struct dentry *); -void nfs_put_super(struct super_block *); int nfs_remount(struct super_block *sb, int *flags, char *raw_data); /* write.c */ diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 48cea3c..75090fe 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -53,7 +53,6 @@ static const struct super_operations nfs4_sops = { .destroy_inode = nfs_destroy_inode, .write_inode = nfs4_write_inode, .drop_inode = nfs_drop_inode, - .put_super = nfs_put_super, .statfs = nfs_statfs, .evict_inode = nfs4_evict_inode, .umount_begin = nfs_umount_begin, diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 368d939..322b2de02 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -311,7 +311,6 @@ const struct super_operations nfs_sops = { .destroy_inode = nfs_destroy_inode, .write_inode = nfs_write_inode, .drop_inode = nfs_drop_inode, - .put_super = nfs_put_super, .statfs = nfs_statfs, .evict_inode = nfs_evict_inode, .umount_begin = nfs_umount_begin, @@ -2572,7 +2571,7 @@ struct dentry 
*nfs_fs_mount_common(struct nfs_server *server, error = nfs_bdi_register(server); if (error) { mntroot = ERR_PTR(error); - goto error_splat_bdi; + goto error_splat_super; } server->super = s; } @@ -2604,9 +2603,6 @@ error_splat_root: dput(mntroot); mntroot = ERR_PTR(error); error_splat_super: - if (server && !s->s_root) - bdi_unregister(&server->backing_dev_info); -error_splat_bdi: deactivate_locked_super(s); goto out; } @@ -2654,27 +2650,19 @@ out: EXPORT_SYMBOL_GPL(nfs_fs_mount); /* - * Ensure that we unregister the bdi before kill_anon_super - * releases the device name - */ -void nfs_put_super(struct super_block *s) -{ - struct nfs_server *server = NFS_SB(s); - - bdi_unregister(&server->backing_dev_info); -} -EXPORT_SYMBOL_GPL(nfs_put_super); - -/* * Destroy an NFS2/3 superblock */ void nfs_kill_super(struct super_block *s) { struct nfs_server *server = NFS_SB(s); + dev_t dev = s->s_dev; + + generic_shutdown_super(s); - kill_anon_super(s); nfs_fscache_release_super_cookie(s); + nfs_free_server(server); + free_anon_bdev(dev); } EXPORT_SYMBOL_GPL(nfs_kill_super); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index bcf83e5..88a6d21 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -791,7 +791,7 @@ nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, spin_unlock(cinfo->lock); if (!cinfo->dreq) { inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, + inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host), BDI_RECLAIMABLE); __mark_inode_dirty(req->wb_context->dentry->d_inode, I_DIRTY_DATASYNC); @@ -858,7 +858,7 @@ static void nfs_clear_page_commit(struct page *page) { dec_zone_page_state(page, NR_UNSTABLE_NFS); - dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE); + dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE); } /* Called holding inode (/cinfo) lock */ @@ -1607,7 +1607,7 @@ void nfs_retry_commit(struct list_head 
*page_list, nfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx); if (!cinfo->dreq) { dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - dec_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, + dec_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host), BDI_RECLAIMABLE); } nfs_unlock_and_release_request(req); diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 57ceaf3..748ca23 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -172,7 +172,6 @@ int nilfs_init_gcinode(struct inode *inode) inode->i_mode = S_IFREG; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); inode->i_mapping->a_ops = &empty_aops; - inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi; ii->i_flags = 0; nilfs_bmap_init_gc(ii->i_bmap); diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index c4dcd1d..892cf5f 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -429,7 +429,6 @@ int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz) inode->i_mode = S_IFREG; mapping_set_gfp_mask(inode->i_mapping, gfp_mask); - inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi; inode->i_op = &def_mdt_iops; inode->i_fop = &def_mdt_fops; @@ -457,13 +456,12 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode, struct nilfs_shadow_map *shadow) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); - struct backing_dev_info *bdi = inode->i_sb->s_bdi; INIT_LIST_HEAD(&shadow->frozen_buffers); address_space_init_once(&shadow->frozen_data); - nilfs_mapping_init(&shadow->frozen_data, inode, bdi); + nilfs_mapping_init(&shadow->frozen_data, inode); address_space_init_once(&shadow->frozen_btnodes); - nilfs_mapping_init(&shadow->frozen_btnodes, inode, bdi); + nilfs_mapping_init(&shadow->frozen_btnodes, inode); mi->mi_shadow = shadow; return 0; } diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index da27664..700ecbc 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -461,14 +461,12 @@ unsigned nilfs_page_count_clean_buffers(struct page *page, return nc; } -void 
nilfs_mapping_init(struct address_space *mapping, struct inode *inode, - struct backing_dev_info *bdi) +void nilfs_mapping_init(struct address_space *mapping, struct inode *inode) { mapping->host = inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); mapping->private_data = NULL; - mapping->backing_dev_info = bdi; mapping->a_ops = &empty_aops; } diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index ef30c5c..a43b8287 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -57,8 +57,7 @@ int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); void nilfs_copy_back_pages(struct address_space *, struct address_space *); void nilfs_clear_dirty_page(struct page *, bool); void nilfs_clear_dirty_pages(struct address_space *, bool); -void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, - struct backing_dev_info *bdi); +void nilfs_mapping_init(struct address_space *mapping, struct inode *inode); unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); unsigned long nilfs_find_uncommitted_extent(struct inode *inode, sector_t start_blk, diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 2e5b3ec..5bc2a1c 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -166,7 +166,7 @@ struct inode *nilfs_alloc_inode(struct super_block *sb) ii->i_state = 0; ii->i_cno = 0; ii->vfs_inode.i_version = 1; - nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode, sb->s_bdi); + nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode); return &ii->vfs_inode; } @@ -1057,7 +1057,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent) { struct the_nilfs *nilfs; struct nilfs_root *fsroot; - struct backing_dev_info *bdi; __u64 cno; int err; @@ -1077,8 +1076,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_gran = 1; sb->s_max_links = NILFS_LINK_MAX; - bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; - sb->s_bdi = bdi ? 
: &default_backing_dev_info; + sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info; err = load_nilfs(nilfs, sb); if (err) diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 643faa4..1da9b2d 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -19,6 +19,7 @@ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include <linux/backing-dev.h> #include <linux/buffer_head.h> #include <linux/gfp.h> #include <linux/pagemap.h> @@ -2091,7 +2092,7 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb, count = iov_length(iov, nr_segs); pos = *ppos; /* We can write back this queue in page reclaim. */ - current->backing_dev_info = mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); written = 0; err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 57c40e3..061ba6a 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -390,12 +390,6 @@ clear_fields: ip->ip_conn = NULL; } -static struct backing_dev_info dlmfs_backing_dev_info = { - .name = "ocfs2-dlmfs", - .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, -}; - static struct inode *dlmfs_get_root_inode(struct super_block *sb) { struct inode *inode = new_inode(sb); @@ -404,7 +398,6 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb) if (inode) { inode->i_ino = get_next_ino(); inode_init_owner(inode, NULL, mode); - inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inc_nlink(inode); @@ -428,7 +421,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent, inode->i_ino = get_next_ino(); inode_init_owner(inode, parent, mode); - inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; ip = DLMFS_I(inode); @@ -643,10 +635,6 @@ static int __init init_dlmfs_fs(void) int 
status; int cleanup_inode = 0, cleanup_worker = 0; - status = bdi_init(&dlmfs_backing_dev_info); - if (status) - return status; - dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache", sizeof(struct dlmfs_inode_private), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| @@ -673,7 +661,6 @@ bail: kmem_cache_destroy(dlmfs_inode_cache); if (cleanup_worker) destroy_workqueue(user_dlm_worker); - bdi_destroy(&dlmfs_backing_dev_info); } else printk("OCFS2 User DLM kernel interface loaded\n"); return status; @@ -693,7 +680,6 @@ static void __exit exit_dlmfs_fs(void) rcu_barrier(); kmem_cache_destroy(dlmfs_inode_cache); - bdi_destroy(&dlmfs_backing_dev_info); } MODULE_AUTHOR("Oracle"); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 245db4f..e0f04d5 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2363,7 +2363,7 @@ relock: goto out_dio; } } else { - current->backing_dev_info = file->f_mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); written = generic_perform_write(file, from, *ppos); if (likely(written >= 0)) iocb->ki_pos = *ppos + written; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index bbafbde..f6ab41b 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -34,7 +34,14 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, unsigned long flags); static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma); +static unsigned ramfs_mmap_capabilities(struct file *file) +{ + return NOMMU_MAP_DIRECT | NOMMU_MAP_COPY | NOMMU_MAP_READ | + NOMMU_MAP_WRITE | NOMMU_MAP_EXEC; +} + const struct file_operations ramfs_file_operations = { + .mmap_capabilities = ramfs_mmap_capabilities, .mmap = ramfs_nommu_mmap, .get_unmapped_area = ramfs_nommu_get_unmapped_area, .read = new_sync_read, diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index d365b1c..889d558 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -50,14 +50,6 @@ static const struct address_space_operations ramfs_aops = { 
.set_page_dirty = __set_page_dirty_no_writeback, }; -static struct backing_dev_info ramfs_backing_dev_info = { - .name = "ramfs", - .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | - BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY | - BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP, -}; - struct inode *ramfs_get_inode(struct super_block *sb, const struct inode *dir, umode_t mode, dev_t dev) { @@ -67,7 +59,6 @@ struct inode *ramfs_get_inode(struct super_block *sb, inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); inode->i_mapping->a_ops = &ramfs_aops; - inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; @@ -267,19 +258,9 @@ static struct file_system_type ramfs_fs_type = { int __init init_ramfs_fs(void) { static unsigned long once; - int err; if (test_and_set_bit(0, &once)) return 0; - - err = bdi_init(&ramfs_backing_dev_info); - if (err) - return err; - - err = register_filesystem(&ramfs_fs_type); - if (err) - bdi_destroy(&ramfs_backing_dev_info); - - return err; + return register_filesystem(&ramfs_fs_type); } fs_initcall(init_ramfs_fs); diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c index ea06c75..7da9e21 100644 --- a/fs/romfs/mmap-nommu.c +++ b/fs/romfs/mmap-nommu.c @@ -70,6 +70,15 @@ static int romfs_mmap(struct file *file, struct vm_area_struct *vma) return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 
0 : -ENOSYS; } +static unsigned romfs_mmap_capabilities(struct file *file) +{ + struct mtd_info *mtd = file_inode(file)->i_sb->s_mtd; + + if (!mtd) + return NOMMU_MAP_COPY; + return mtd_mmap_capabilities(mtd); +} + const struct file_operations romfs_ro_fops = { .llseek = generic_file_llseek, .read = new_sync_read, @@ -77,4 +86,5 @@ const struct file_operations romfs_ro_fops = { .splice_read = generic_file_splice_read, .mmap = romfs_mmap, .get_unmapped_area = romfs_get_unmapped_area, + .mmap_capabilities = romfs_mmap_capabilities, }; diff --git a/fs/romfs/super.c b/fs/romfs/super.c index e98dd88..268733c 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -355,9 +355,6 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) case ROMFH_REG: i->i_fop = &romfs_ro_fops; i->i_data.a_ops = &romfs_aops; - if (i->i_sb->s_mtd) - i->i_data.backing_dev_info = - i->i_sb->s_mtd->backing_dev_info; if (nextfh & ROMFH_EXEC) mode |= S_IXUGO; break; @@ -36,8 +36,8 @@ #include "internal.h" -LIST_HEAD(super_blocks); -DEFINE_SPINLOCK(sb_lock); +static LIST_HEAD(super_blocks); +static DEFINE_SPINLOCK(sb_lock); static char *sb_writers_name[SB_FREEZE_LEVELS] = { "sb_writers", @@ -186,8 +186,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) } init_waitqueue_head(&s->s_writers.wait); init_waitqueue_head(&s->s_writers.wait_unfrozen); + s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; - s->s_bdi = &default_backing_dev_info; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); @@ -872,10 +872,7 @@ EXPORT_SYMBOL(free_anon_bdev); int set_anon_super(struct super_block *s, void *data) { - int error = get_anon_bdev(&s->s_dev); - if (!error) - s->s_bdi = &noop_backing_dev_info; - return error; + return get_anon_bdev(&s->s_dev); } EXPORT_SYMBOL(set_anon_super); @@ -1120,7 +1117,6 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data) sb = root->d_sb; BUG_ON(!sb); 
WARN_ON(!sb->s_bdi); - WARN_ON(sb->s_bdi == &default_backing_dev_info); sb->s_flags |= MS_BORN; error = security_sb_kern_mount(sb, flags, secdata); diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index ea41649..c49b198 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -108,8 +108,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, inode->i_mtime = inode->i_atime = inode->i_ctime = ubifs_current_time(inode); inode->i_mapping->nrpages = 0; - /* Disable readahead */ - inode->i_mapping->backing_dev_info = &c->bdi; switch (mode & S_IFMT) { case S_IFREG: diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 106bf20..6197154 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -156,9 +156,6 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) if (err) goto out_invalid; - /* Disable read-ahead */ - inode->i_mapping->backing_dev_info = &c->bdi; - switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &ubifs_file_address_operations; @@ -2017,7 +2014,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) * Read-ahead will be disabled because @c->bdi.ra_pages is 0. 
*/ c->bdi.name = "ubifs", - c->bdi.capabilities = BDI_CAP_MAP_COPY; + c->bdi.capabilities = 0; err = bdi_init(&c->bdi); if (err) goto out_close; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f2d05a1..1cdba95 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -735,7 +735,7 @@ xfs_file_buffered_aio_write( iov_iter_truncate(from, count); /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); write_retry: trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 5da6012..d94077f 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -106,6 +106,8 @@ struct backing_dev_info { #endif }; +struct backing_dev_info *inode_to_bdi(struct inode *inode); + int __must_check bdi_init(struct backing_dev_info *bdi); void bdi_destroy(struct backing_dev_info *bdi); @@ -114,7 +116,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...); int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); void bdi_unregister(struct backing_dev_info *bdi); -int __must_check bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); +int __must_check bdi_setup_and_register(struct backing_dev_info *, char *); void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); @@ -228,46 +230,17 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); * BDI_CAP_NO_ACCT_DIRTY: Dirty pages shouldn't contribute to accounting * BDI_CAP_NO_WRITEBACK: Don't write pages back * BDI_CAP_NO_ACCT_WB: Don't automatically account writeback pages - * - * These flags let !MMU mmap() govern direct device mapping vs immediate - * copying more easily for MAP_PRIVATE, especially for ROM filesystems. 
- * - * BDI_CAP_MAP_COPY: Copy can be mapped (MAP_PRIVATE) - * BDI_CAP_MAP_DIRECT: Can be mapped directly (MAP_SHARED) - * BDI_CAP_READ_MAP: Can be mapped for reading - * BDI_CAP_WRITE_MAP: Can be mapped for writing - * BDI_CAP_EXEC_MAP: Can be mapped for execution - * - * BDI_CAP_SWAP_BACKED: Count shmem/tmpfs objects as swap-backed. - * * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold. */ #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 #define BDI_CAP_NO_WRITEBACK 0x00000002 -#define BDI_CAP_MAP_COPY 0x00000004 -#define BDI_CAP_MAP_DIRECT 0x00000008 -#define BDI_CAP_READ_MAP 0x00000010 -#define BDI_CAP_WRITE_MAP 0x00000020 -#define BDI_CAP_EXEC_MAP 0x00000040 -#define BDI_CAP_NO_ACCT_WB 0x00000080 -#define BDI_CAP_SWAP_BACKED 0x00000100 -#define BDI_CAP_STABLE_WRITES 0x00000200 -#define BDI_CAP_STRICTLIMIT 0x00000400 - -#define BDI_CAP_VMFLAGS \ - (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP) +#define BDI_CAP_NO_ACCT_WB 0x00000004 +#define BDI_CAP_STABLE_WRITES 0x00000008 +#define BDI_CAP_STRICTLIMIT 0x00000010 #define BDI_CAP_NO_ACCT_AND_WRITEBACK \ (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB) -#if defined(VM_MAYREAD) && \ - (BDI_CAP_READ_MAP != VM_MAYREAD || \ - BDI_CAP_WRITE_MAP != VM_MAYWRITE || \ - BDI_CAP_EXEC_MAP != VM_MAYEXEC) -#error please change backing_dev_info::capabilities flags -#endif - -extern struct backing_dev_info default_backing_dev_info; extern struct backing_dev_info noop_backing_dev_info; int writeback_in_progress(struct backing_dev_info *bdi); @@ -329,24 +302,14 @@ static inline bool bdi_cap_account_writeback(struct backing_dev_info *bdi) BDI_CAP_NO_WRITEBACK)); } -static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi) -{ - return bdi->capabilities & BDI_CAP_SWAP_BACKED; -} - static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) { - return bdi_cap_writeback_dirty(mapping->backing_dev_info); + return bdi_cap_writeback_dirty(inode_to_bdi(mapping->host)); 
} static inline bool mapping_cap_account_dirty(struct address_space *mapping) { - return bdi_cap_account_dirty(mapping->backing_dev_info); -} - -static inline bool mapping_cap_swap_backed(struct address_space *mapping) -{ - return bdi_cap_swap_backed(mapping->backing_dev_info); + return bdi_cap_account_dirty(inode_to_bdi(mapping->host)); } static inline int bdi_sched_wait(void *word) diff --git a/include/linux/bio.h b/include/linux/bio.h index efead0b..da3a127 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -428,13 +428,9 @@ extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int); extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, unsigned int, unsigned int); extern int bio_get_nr_vecs(struct block_device *); -extern struct bio *bio_map_user(struct request_queue *, struct block_device *, - unsigned long, unsigned int, int, gfp_t); -struct sg_iovec; struct rq_map_data; extern struct bio *bio_map_user_iov(struct request_queue *, - struct block_device *, - const struct sg_iovec *, int, int, gfp_t); + const struct iov_iter *, gfp_t); extern void bio_unmap_user(struct bio *); extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int, gfp_t); @@ -462,12 +458,10 @@ static inline void bio_flush_dcache_pages(struct bio *bi) extern void bio_copy_data(struct bio *dst, struct bio *src); extern int bio_alloc_pages(struct bio *bio, gfp_t gfp); -extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *, - unsigned long, unsigned int, int, gfp_t); extern struct bio *bio_copy_user_iov(struct request_queue *, struct rq_map_data *, - const struct sg_iovec *, - int, int, gfp_t); + const struct iov_iter *, + gfp_t); extern int bio_uncopy_user(struct bio *); void zero_fill_bio(struct bio *bio); extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 5735e71..7aec861 100644 --- 
a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -146,6 +146,8 @@ enum { BLK_MQ_F_SG_MERGE = 1 << 2, BLK_MQ_F_SYSFS_UP = 1 << 3, BLK_MQ_F_DEFER_ISSUE = 1 << 4, + BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, + BLK_MQ_F_ALLOC_POLICY_BITS = 1, BLK_MQ_S_STOPPED = 0, BLK_MQ_S_TAG_ACTIVE = 1, @@ -154,6 +156,12 @@ enum { BLK_MQ_CPU_WORK_BATCH = 8, }; +#define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \ + ((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \ + ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) +#define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \ + ((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \ + << BLK_MQ_F_ALLOC_POLICY_START_BIT) struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); void blk_mq_finish_init(struct request_queue *q); @@ -166,7 +174,6 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set); void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); void blk_mq_insert_request(struct request *, bool, bool, bool); -void blk_mq_run_queues(struct request_queue *q, bool async); void blk_mq_free_request(struct request *rq); void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); @@ -214,6 +221,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn, void *priv); +void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_unfreeze_queue(struct request_queue *q); void blk_mq_freeze_queue_start(struct request_queue *q); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 92f4b4b..7f9a516 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -272,7 +272,11 @@ struct blk_queue_tag { int max_depth; /* what we will send to device */ int real_max_depth; /* what the array can hold */ atomic_t refcnt; /* map can be shared */ + int alloc_policy; /* tag allocation policy */ + int next_tag; /* 
next tag */ }; +#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */ +#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */ #define BLK_SCSI_MAX_CMDS (256) #define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8)) @@ -516,6 +520,7 @@ struct request_queue { (1 << QUEUE_FLAG_ADD_RANDOM)) #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ + (1 << QUEUE_FLAG_STACKABLE) | \ (1 << QUEUE_FLAG_SAME_COMP)) static inline void queue_lockdep_assert_held(struct request_queue *q) @@ -850,8 +855,8 @@ extern int blk_rq_map_user(struct request_queue *, struct request *, extern int blk_rq_unmap_user(struct bio *); extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t); extern int blk_rq_map_user_iov(struct request_queue *, struct request *, - struct rq_map_data *, const struct sg_iovec *, - int, unsigned int, gfp_t); + struct rq_map_data *, const struct iov_iter *, + gfp_t); extern int blk_execute_rq(struct request_queue *, struct gendisk *, struct request *, int); extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, @@ -1044,8 +1049,6 @@ extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); -extern int blk_bio_map_sg(struct request_queue *q, struct bio *bio, - struct scatterlist *sglist); extern void blk_dump_rq_flags(struct request *, char *); extern long nr_blockdev_pages(void); @@ -1139,11 +1142,11 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk) extern int blk_queue_start_tag(struct request_queue *, struct request *); extern struct request *blk_queue_find_tag(struct request_queue *, int); extern void blk_queue_end_tag(struct request_queue *, struct request *); -extern int blk_queue_init_tags(struct request_queue *, int, struct blk_queue_tag *); 
+extern int blk_queue_init_tags(struct request_queue *, int, struct blk_queue_tag *, int); extern void blk_queue_free_tags(struct request_queue *); extern int blk_queue_resize_tags(struct request_queue *, int); extern void blk_queue_invalidate_tags(struct request_queue *); -extern struct blk_queue_tag *blk_init_tags(int); +extern struct blk_queue_tag *blk_init_tags(int, int); extern void blk_free_tags(struct blk_queue_tag *); static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, @@ -1162,7 +1165,7 @@ extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct page *page); extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask); + sector_t nr_sects, gfp_t gfp_mask, bool discard); static inline int sb_issue_discard(struct super_block *sb, sector_t block, sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags) { @@ -1176,7 +1179,7 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block, return blkdev_issue_zeroout(sb->s_bdev, block << (sb->s_blocksize_bits - 9), nr_blocks << (sb->s_blocksize_bits - 9), - gfp_mask); + gfp_mask, true); } extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); @@ -1601,8 +1604,8 @@ struct block_device_operations { int (*rw_page)(struct block_device *, sector_t, struct page *, int rw); int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); - int (*direct_access) (struct block_device *, sector_t, - void **, unsigned long *); + long (*direct_access)(struct block_device *, sector_t, + void **, unsigned long *pfn, long size); unsigned int (*check_events) (struct gendisk *disk, unsigned int clearing); /* ->media_changed() is DEPRECATED, use ->check_events() instead */ @@ -1620,6 +1623,8 @@ 
extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, extern int bdev_read_page(struct block_device *, sector_t, struct page *); extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); +extern long bdev_direct_access(struct block_device *, sector_t, void **addr, + unsigned long *pfn, long size); #else /* CONFIG_BLOCK */ struct block_device; diff --git a/include/linux/cdev.h b/include/linux/cdev.h index fb45919..f876361 100644 --- a/include/linux/cdev.h +++ b/include/linux/cdev.h @@ -30,6 +30,4 @@ void cdev_del(struct cdev *); void cd_forget(struct inode *); -extern struct backing_dev_info directly_mappable_cdev_bdi; - #endif diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index ca6d2ac..2646aed 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -48,6 +48,11 @@ typedef void (*dm_dtr_fn) (struct dm_target *ti); typedef int (*dm_map_fn) (struct dm_target *ti, struct bio *bio); typedef int (*dm_map_request_fn) (struct dm_target *ti, struct request *clone, union map_info *map_context); +typedef int (*dm_clone_and_map_request_fn) (struct dm_target *ti, + struct request *rq, + union map_info *map_context, + struct request **clone); +typedef void (*dm_release_clone_request_fn) (struct request *clone); /* * Returns: @@ -143,6 +148,8 @@ struct target_type { dm_dtr_fn dtr; dm_map_fn map; dm_map_request_fn map_rq; + dm_clone_and_map_request_fn clone_and_map_rq; + dm_release_clone_request_fn release_clone_rq; dm_endio_fn end_io; dm_request_endio_fn rq_end_io; dm_presuspend_fn presuspend; @@ -600,9 +607,6 @@ static inline unsigned long to_bytes(sector_t n) /*----------------------------------------------------------------- * Helper for block layer and dm core operations *---------------------------------------------------------------*/ -void dm_dispatch_request(struct request *rq); -void dm_requeue_unmapped_request(struct request *rq); -void 
dm_kill_unmapped_request(struct request *rq, int error); int dm_underlying_device_busy(struct request_queue *q); #endif /* _LINUX_DEVICE_MAPPER_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index a20d658..e49f10c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -34,6 +34,7 @@ #include <asm/byteorder.h> #include <uapi/linux/fs.h> +struct backing_dev_info; struct export_operations; struct hd_geometry; struct iovec; @@ -394,7 +395,6 @@ int pagecache_write_end(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); -struct backing_dev_info; struct address_space { struct inode *host; /* owner: inode, block_device */ struct radix_tree_root page_tree; /* radix tree of all pages */ @@ -408,7 +408,6 @@ struct address_space { pgoff_t writeback_index;/* writeback starts here */ const struct address_space_operations *a_ops; /* methods */ unsigned long flags; /* error bits/gfp mask */ - struct backing_dev_info *backing_dev_info; /* device readahead, etc */ spinlock_t private_lock; /* for use by the address_space */ struct list_head private_list; /* ditto */ void *private_data; /* ditto */ @@ -1201,8 +1200,6 @@ struct mm_struct; #define UMOUNT_NOFOLLOW 0x00000008 /* Don't follow symlink on umount */ #define UMOUNT_UNUSED 0x80000000 /* Flag guaranteed to be unused */ -extern struct list_head super_blocks; -extern spinlock_t sb_lock; /* Possible states of 'frozen' field */ enum { @@ -1519,6 +1516,26 @@ struct block_device_operations; #define HAVE_COMPAT_IOCTL 1 #define HAVE_UNLOCKED_IOCTL 1 +/* + * These flags let !MMU mmap() govern direct device mapping vs immediate + * copying more easily for MAP_PRIVATE, especially for ROM filesystems. 
+ * + * NOMMU_MAP_COPY: Copy can be mapped (MAP_PRIVATE) + * NOMMU_MAP_DIRECT: Can be mapped directly (MAP_SHARED) + * NOMMU_MAP_READ: Can be mapped for reading + * NOMMU_MAP_WRITE: Can be mapped for writing + * NOMMU_MAP_EXEC: Can be mapped for execution + */ +#define NOMMU_MAP_COPY 0x00000001 +#define NOMMU_MAP_DIRECT 0x00000008 +#define NOMMU_MAP_READ VM_MAYREAD +#define NOMMU_MAP_WRITE VM_MAYWRITE +#define NOMMU_MAP_EXEC VM_MAYEXEC + +#define NOMMU_VMFLAGS \ + (NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC) + + struct iov_iter; struct file_operations { @@ -1553,6 +1570,9 @@ struct file_operations { long (*fallocate)(struct file *file, int mode, loff_t offset, loff_t len); void (*show_fdinfo)(struct seq_file *m, struct file *f); +#ifndef CONFIG_MMU + unsigned (*mmap_capabilities)(struct file *); +#endif }; struct inode_operations { diff --git a/include/linux/libata.h b/include/linux/libata.h index 61df823..fc03efa 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -823,10 +823,10 @@ struct ata_port { unsigned int cbl; /* cable type; ATA_CBL_xxx */ struct ata_queued_cmd qcmd[ATA_MAX_QUEUE]; - unsigned long qc_allocated; + unsigned long sas_tag_allocated; /* for sas tag allocation only */ unsigned int qc_active; int nr_active_links; /* #links with active qcs */ - unsigned int last_tag; /* track next tag hw expects */ + unsigned int sas_last_tag; /* track next tag hw expects */ struct ata_link link; /* host default link */ struct ata_link *slave_link; /* see ata_slave_link_init() */ @@ -1352,6 +1352,7 @@ extern struct device_attribute *ata_common_sdev_attrs[]; .ioctl = ata_scsi_ioctl, \ .queuecommand = ata_scsi_queuecmd, \ .can_queue = ATA_DEF_QUEUE, \ + .tag_alloc_policy = BLK_TAG_ALLOC_RR, \ .this_id = ATA_SHT_THIS_ID, \ .cmd_per_lun = ATA_SHT_CMD_PER_LUN, \ .emulated = ATA_SHT_EMULATED, \ diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 031ff3a..3301c4c 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ 
-408,4 +408,6 @@ static inline int mtd_is_bitflip_or_eccerr(int err) { return mtd_is_bitflip(err) || mtd_is_eccerr(err); } +unsigned mtd_mmap_capabilities(struct mtd_info *mtd); + #endif /* __MTD_MTD_H__ */ diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 258945f..19a5d4b 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -132,13 +132,12 @@ struct nvme_ns { * allocated to store the PRP list. */ struct nvme_iod { - void *private; /* For the use of the submitter of the I/O */ + unsigned long private; /* For the use of the submitter of the I/O */ int npages; /* In the PRP list. 0 means small pool in use */ int offset; /* Of PRP list */ int nents; /* Used in scatterlist */ int length; /* Of data, in bytes */ dma_addr_t first_dma; - struct list_head node; struct scatterlist sg[0]; }; diff --git a/include/linux/wait.h b/include/linux/wait.h index 537d58e..2db8334 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -267,6 +267,21 @@ do { \ __wait_event(wq, condition); \ } while (0) +#define __io_wait_event(wq, condition) \ + (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ + io_schedule()) + +/* + * io_wait_event() -- like wait_event() but with io_schedule() + */ +#define io_wait_event(wq, condition) \ +do { \ + might_sleep(); \ + if (condition) \ + break; \ + __io_wait_event(wq, condition); \ +} while (0) + #define __wait_event_freezable(wq, condition) \ ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \ schedule(); try_to_freeze()) diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index 019e668..e113c75 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -402,6 +402,9 @@ struct scsi_host_template { */ unsigned char present; + /* If use block layer to manage tags, this is tag allocation policy */ + int tag_alloc_policy; + /* * Let the block layer assigns tags to all commands. 
*/ diff --git a/include/scsi/scsi_tcq.h b/include/scsi/scsi_tcq.h index 9708b28..b27977e 100644 --- a/include/scsi/scsi_tcq.h +++ b/include/scsi/scsi_tcq.h @@ -66,7 +66,8 @@ static inline int scsi_init_shared_tag_map(struct Scsi_Host *shost, int depth) * devices on the shared host (for libata) */ if (!shost->bqt) { - shost->bqt = blk_init_tags(depth); + shost->bqt = blk_init_tags(depth, + shost->hostt->tag_alloc_policy); if (!shost->bqt) return -ENOMEM; } diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index cee02d6..0e93109 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -47,7 +47,7 @@ TRACE_EVENT(writeback_dirty_page, TP_fast_assign( strncpy(__entry->name, - mapping ? dev_name(mapping->backing_dev_info->dev) : "(unknown)", 32); + mapping ? dev_name(inode_to_bdi(mapping->host)->dev) : "(unknown)", 32); __entry->ino = mapping ? mapping->host->i_ino : 0; __entry->index = page->index; ), @@ -72,7 +72,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, ), TP_fast_assign( - struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(inode); /* may be called for files on pseudo FSes w/ unregistered bdi */ strncpy(__entry->name, @@ -116,7 +116,7 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template, TP_fast_assign( strncpy(__entry->name, - dev_name(inode->i_mapping->backing_dev_info->dev), 32); + dev_name(inode_to_bdi(inode)->dev), 32); __entry->ino = inode->i_ino; __entry->sync_mode = wbc->sync_mode; ), @@ -156,10 +156,8 @@ DECLARE_EVENT_CLASS(writeback_work_class, __field(int, reason) ), TP_fast_assign( - struct device *dev = bdi->dev; - if (!dev) - dev = default_backing_dev_info.dev; - strncpy(__entry->name, dev_name(dev), 32); + strncpy(__entry->name, + bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32); __entry->nr_pages = work->nr_pages; __entry->sb_dev = work->sb ? 
work->sb->s_dev : 0; __entry->sync_mode = work->sync_mode; diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index a570d7b..889f3a5 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -267,9 +267,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 29 +#define DM_VERSION_MINOR 30 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2014-10-28)" +#define DM_VERSION_EXTRA "-ioctl (2014-12-22)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 7d0e5cd..dbef231 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -89,10 +89,10 @@ void (*raid6_datap_recov)(int, size_t, int, void **); EXPORT_SYMBOL_GPL(raid6_datap_recov); const struct raid6_recov_calls *const raid6_recov_algos[] = { -#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) #ifdef CONFIG_AS_AVX2 &raid6_recov_avx2, #endif +#ifdef CONFIG_AS_SSSE3 &raid6_recov_ssse3, #endif &raid6_recov_intx1, diff --git a/lib/raid6/recov_avx2.c b/lib/raid6/recov_avx2.c index e1eea43..53fe3d7 100644 --- a/lib/raid6/recov_avx2.c +++ b/lib/raid6/recov_avx2.c @@ -8,7 +8,7 @@ * of the License. */ -#if CONFIG_AS_AVX2 +#ifdef CONFIG_AS_AVX2 #include <linux/raid/pq.h> #include "x86.h" diff --git a/lib/raid6/recov_ssse3.c b/lib/raid6/recov_ssse3.c index a916832..cda33e5 100644 --- a/lib/raid6/recov_ssse3.c +++ b/lib/raid6/recov_ssse3.c @@ -7,6 +7,8 @@ * of the License. 
*/ +#ifdef CONFIG_AS_SSSE3 + #include <linux/raid/pq.h> #include "x86.h" @@ -330,3 +332,7 @@ const struct raid6_recov_calls raid6_recov_ssse3 = { #endif .priority = 1, }; + +#else +#warning "your version of binutils lacks SSSE3 support" +#endif diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 0ae0df5..7690ec7 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -14,19 +14,10 @@ static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); -struct backing_dev_info default_backing_dev_info = { - .name = "default", - .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, - .state = 0, - .capabilities = BDI_CAP_MAP_COPY, -}; -EXPORT_SYMBOL_GPL(default_backing_dev_info); - struct backing_dev_info noop_backing_dev_info = { .name = "noop", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; -EXPORT_SYMBOL_GPL(noop_backing_dev_info); static struct class *bdi_class; @@ -40,17 +31,6 @@ LIST_HEAD(bdi_list); /* bdi_wq serves all asynchronous writeback tasks */ struct workqueue_struct *bdi_wq; -static void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) -{ - if (wb1 < wb2) { - spin_lock(&wb1->list_lock); - spin_lock_nested(&wb2->list_lock, 1); - } else { - spin_lock(&wb2->list_lock); - spin_lock_nested(&wb1->list_lock, 1); - } -} - #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> #include <linux/seq_file.h> @@ -264,9 +244,6 @@ static int __init default_bdi_init(void) if (!bdi_wq) return -ENOMEM; - err = bdi_init(&default_backing_dev_info); - if (!err) - bdi_register(&default_backing_dev_info, NULL, "default"); err = bdi_init(&noop_backing_dev_info); return err; @@ -355,19 +332,19 @@ EXPORT_SYMBOL(bdi_register_dev); */ static void bdi_wb_shutdown(struct backing_dev_info *bdi) { - if (!bdi_cap_writeback_dirty(bdi)) + /* Make sure nobody queues further work */ + spin_lock_bh(&bdi->wb_lock); + if (!test_and_clear_bit(BDI_registered, &bdi->state)) { + spin_unlock_bh(&bdi->wb_lock); return; + } + spin_unlock_bh(&bdi->wb_lock); /* * Make sure nobody finds us on the 
bdi_list anymore */ bdi_remove_from_list(bdi); - /* Make sure nobody queues further work */ - spin_lock_bh(&bdi->wb_lock); - clear_bit(BDI_registered, &bdi->state); - spin_unlock_bh(&bdi->wb_lock); - /* * Drain work list and shutdown the delayed_work. At this point, * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi @@ -375,37 +352,22 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) */ mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); flush_delayed_work(&bdi->wb.dwork); - WARN_ON(!list_empty(&bdi->work_list)); - WARN_ON(delayed_work_pending(&bdi->wb.dwork)); } /* - * This bdi is going away now, make sure that no super_blocks point to it + * Called when the device behind @bdi has been removed or ejected. + * + * We can't really do much here except for reducing the dirty ratio at + * the moment. In the future we should be able to set a flag so that + * the filesystem can handle errors at mark_inode_dirty time instead + * of only at writeback time. */ -static void bdi_prune_sb(struct backing_dev_info *bdi) -{ - struct super_block *sb; - - spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { - if (sb->s_bdi == bdi) - sb->s_bdi = &default_backing_dev_info; - } - spin_unlock(&sb_lock); -} - void bdi_unregister(struct backing_dev_info *bdi) { - if (bdi->dev) { - bdi_set_min_ratio(bdi, 0); - trace_writeback_bdi_unregister(bdi); - bdi_prune_sb(bdi); + if (WARN_ON_ONCE(!bdi->dev)) + return; - bdi_wb_shutdown(bdi); - bdi_debug_unregister(bdi); - device_unregister(bdi->dev); - bdi->dev = NULL; - } + bdi_set_min_ratio(bdi, 0); } EXPORT_SYMBOL(bdi_unregister); @@ -474,37 +436,19 @@ void bdi_destroy(struct backing_dev_info *bdi) { int i; - /* - * Splice our entries to the default_backing_dev_info. This - * condition shouldn't happen. @wb must be empty at this point and - * dirty inodes on it might cause other issues. 
This workaround is - * added by ce5f8e779519 ("writeback: splice dirty inode entries to - * default bdi on bdi_destroy()") without root-causing the issue. - * - * http://lkml.kernel.org/g/1253038617-30204-11-git-send-email-jens.axboe@oracle.com - * http://thread.gmane.org/gmane.linux.file-systems/35341/focus=35350 - * - * We should probably add WARN_ON() to find out whether it still - * happens and track it down if so. - */ - if (bdi_has_dirty_io(bdi)) { - struct bdi_writeback *dst = &default_backing_dev_info.wb; - - bdi_lock_two(&bdi->wb, dst); - list_splice(&bdi->wb.b_dirty, &dst->b_dirty); - list_splice(&bdi->wb.b_io, &dst->b_io); - list_splice(&bdi->wb.b_more_io, &dst->b_more_io); - spin_unlock(&bdi->wb.list_lock); - spin_unlock(&dst->list_lock); - } - - bdi_unregister(bdi); + bdi_wb_shutdown(bdi); + WARN_ON(!list_empty(&bdi->work_list)); WARN_ON(delayed_work_pending(&bdi->wb.dwork)); + if (bdi->dev) { + bdi_debug_unregister(bdi); + device_unregister(bdi->dev); + bdi->dev = NULL; + } + for (i = 0; i < NR_BDI_STAT_ITEMS; i++) percpu_counter_destroy(&bdi->bdi_stat[i]); - fprop_local_destroy_percpu(&bdi->completions); } EXPORT_SYMBOL(bdi_destroy); @@ -513,13 +457,12 @@ EXPORT_SYMBOL(bdi_destroy); * For use from filesystems to quickly init and register a bdi associated * with dirty writeback */ -int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, - unsigned int cap) +int bdi_setup_and_register(struct backing_dev_info *bdi, char *name) { int err; bdi->name = name; - bdi->capabilities = cap; + bdi->capabilities = 0; err = bdi_init(bdi); if (err) return err; diff --git a/mm/fadvise.c b/mm/fadvise.c index 2ad7adf..fac23ec 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -73,7 +73,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) else endbyte--; /* inclusive */ - bdi = mapping->backing_dev_info; + bdi = inode_to_bdi(mapping->host); switch (advice) { case POSIX_FADV_NORMAL: @@ -113,7 +113,7 @@ 
SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) case POSIX_FADV_NOREUSE: break; case POSIX_FADV_DONTNEED: - if (!bdi_write_congested(mapping->backing_dev_info)) + if (!bdi_write_congested(bdi)) __filemap_fdatawrite_range(mapping, offset, endbyte, WB_SYNC_NONE); diff --git a/mm/filemap.c b/mm/filemap.c index bf7a271..d9f5336 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -211,7 +211,7 @@ void __delete_from_page_cache(struct page *page, void *shadow) */ if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); } } @@ -2564,7 +2564,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) size_t count = iov_iter_count(from); /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) goto out; diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 70c09da..c175f9f 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -9,6 +9,7 @@ */ #include <linux/fs.h> +#include <linux/backing-dev.h> #include <linux/pagemap.h> #include <linux/export.h> #include <linux/uio.h> @@ -409,7 +410,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, count = len; /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode)); if (ret) diff --git a/mm/madvise.c b/mm/madvise.c index d79fb5e..1077cbd 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -222,19 +222,22 @@ static long madvise_willneed(struct vm_area_struct *vma, struct file *file = vma->vm_file; #ifdef CONFIG_SWAP - if (!file || 
mapping_cap_swap_backed(file->f_mapping)) { + if (!file) { *prev = vma; - if (!file) - force_swapin_readahead(vma, start, end); - else - force_shm_swapin_readahead(vma, start, end, - file->f_mapping); + force_swapin_readahead(vma, start, end); return 0; } -#endif + if (shmem_mapping(file->f_mapping)) { + *prev = vma; + force_shm_swapin_readahead(vma, start, end, + file->f_mapping); + return 0; + } +#else if (!file) return -EBADF; +#endif if (file->f_mapping->a_ops->get_xip_mem) { /* no bad return value, but ignore advice */ @@ -980,9 +980,6 @@ static int validate_mmap_request(struct file *file, return -EOVERFLOW; if (file) { - /* validate file mapping requests */ - struct address_space *mapping; - /* files must support mmap */ if (!file->f_op->mmap) return -ENODEV; @@ -991,28 +988,22 @@ static int validate_mmap_request(struct file *file, * - we support chardevs that provide their own "memory" * - we support files/blockdevs that are memory backed */ - mapping = file->f_mapping; - if (!mapping) - mapping = file_inode(file)->i_mapping; - - capabilities = 0; - if (mapping && mapping->backing_dev_info) - capabilities = mapping->backing_dev_info->capabilities; - - if (!capabilities) { + if (file->f_op->mmap_capabilities) { + capabilities = file->f_op->mmap_capabilities(file); + } else { /* no explicit capabilities set, so assume some * defaults */ switch (file_inode(file)->i_mode & S_IFMT) { case S_IFREG: case S_IFBLK: - capabilities = BDI_CAP_MAP_COPY; + capabilities = NOMMU_MAP_COPY; break; case S_IFCHR: capabilities = - BDI_CAP_MAP_DIRECT | - BDI_CAP_READ_MAP | - BDI_CAP_WRITE_MAP; + NOMMU_MAP_DIRECT | + NOMMU_MAP_READ | + NOMMU_MAP_WRITE; break; default: @@ -1023,9 +1014,9 @@ static int validate_mmap_request(struct file *file, /* eliminate any capabilities that we can't support on this * device */ if (!file->f_op->get_unmapped_area) - capabilities &= ~BDI_CAP_MAP_DIRECT; + capabilities &= ~NOMMU_MAP_DIRECT; if (!file->f_op->read) - capabilities &= ~BDI_CAP_MAP_COPY; 
+ capabilities &= ~NOMMU_MAP_COPY; /* The file shall have been opened with read permission. */ if (!(file->f_mode & FMODE_READ)) @@ -1044,29 +1035,29 @@ static int validate_mmap_request(struct file *file, if (locks_verify_locked(file)) return -EAGAIN; - if (!(capabilities & BDI_CAP_MAP_DIRECT)) + if (!(capabilities & NOMMU_MAP_DIRECT)) return -ENODEV; /* we mustn't privatise shared mappings */ - capabilities &= ~BDI_CAP_MAP_COPY; + capabilities &= ~NOMMU_MAP_COPY; } else { /* we're going to read the file into private memory we * allocate */ - if (!(capabilities & BDI_CAP_MAP_COPY)) + if (!(capabilities & NOMMU_MAP_COPY)) return -ENODEV; /* we don't permit a private writable mapping to be * shared with the backing device */ if (prot & PROT_WRITE) - capabilities &= ~BDI_CAP_MAP_DIRECT; + capabilities &= ~NOMMU_MAP_DIRECT; } - if (capabilities & BDI_CAP_MAP_DIRECT) { - if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) || - ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) || - ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP)) + if (capabilities & NOMMU_MAP_DIRECT) { + if (((prot & PROT_READ) && !(capabilities & NOMMU_MAP_READ)) || + ((prot & PROT_WRITE) && !(capabilities & NOMMU_MAP_WRITE)) || + ((prot & PROT_EXEC) && !(capabilities & NOMMU_MAP_EXEC)) ) { - capabilities &= ~BDI_CAP_MAP_DIRECT; + capabilities &= ~NOMMU_MAP_DIRECT; if (flags & MAP_SHARED) { printk(KERN_WARNING "MAP_SHARED not completely supported on !MMU\n"); @@ -1083,21 +1074,21 @@ static int validate_mmap_request(struct file *file, } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { /* handle implication of PROT_EXEC by PROT_READ */ if (current->personality & READ_IMPLIES_EXEC) { - if (capabilities & BDI_CAP_EXEC_MAP) + if (capabilities & NOMMU_MAP_EXEC) prot |= PROT_EXEC; } } else if ((prot & PROT_READ) && (prot & PROT_EXEC) && - !(capabilities & BDI_CAP_EXEC_MAP) + !(capabilities & NOMMU_MAP_EXEC) ) { /* backing file is not executable, try to copy */ - 
capabilities &= ~BDI_CAP_MAP_DIRECT; + capabilities &= ~NOMMU_MAP_DIRECT; } } else { /* anonymous mappings are always memory backed and can be * privately mapped */ - capabilities = BDI_CAP_MAP_COPY; + capabilities = NOMMU_MAP_COPY; /* handle PROT_EXEC implication by PROT_READ */ if ((prot & PROT_READ) && @@ -1129,7 +1120,7 @@ static unsigned long determine_vm_flags(struct file *file, vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags); /* vm_flags |= mm->def_flags; */ - if (!(capabilities & BDI_CAP_MAP_DIRECT)) { + if (!(capabilities & NOMMU_MAP_DIRECT)) { /* attempt to share read-only copies of mapped file chunks */ vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (file && !(prot & PROT_WRITE)) @@ -1138,7 +1129,7 @@ static unsigned long determine_vm_flags(struct file *file, /* overlay a shareable mapping on the backing device or inode * if possible - used for chardevs, ramfs/tmpfs/shmfs and * romfs/cramfs */ - vm_flags |= VM_MAYSHARE | (capabilities & BDI_CAP_VMFLAGS); + vm_flags |= VM_MAYSHARE | (capabilities & NOMMU_VMFLAGS); if (flags & MAP_SHARED) vm_flags |= VM_SHARED; } @@ -1191,7 +1182,7 @@ static int do_mmap_private(struct vm_area_struct *vma, * shared mappings on devices or memory * - VM_MAYSHARE will be set if it may attempt to share */ - if (capabilities & BDI_CAP_MAP_DIRECT) { + if (capabilities & NOMMU_MAP_DIRECT) { ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); if (ret == 0) { /* shouldn't return success if we're not sharing */ @@ -1380,7 +1371,7 @@ unsigned long do_mmap_pgoff(struct file *file, if ((pregion->vm_pgoff != pgoff || rpglen != pglen) && !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) { /* new mapping is not a subset of the region */ - if (!(capabilities & BDI_CAP_MAP_DIRECT)) + if (!(capabilities & NOMMU_MAP_DIRECT)) goto sharing_violation; continue; } @@ -1419,7 +1410,7 @@ unsigned long do_mmap_pgoff(struct file *file, * - this is the hook for quasi-memory character devices to * tell us the location of a shared 
mapping */ - if (capabilities & BDI_CAP_MAP_DIRECT) { + if (capabilities & NOMMU_MAP_DIRECT) { addr = file->f_op->get_unmapped_area(file, addr, len, pgoff, flags); if (IS_ERR_VALUE(addr)) { @@ -1431,10 +1422,10 @@ unsigned long do_mmap_pgoff(struct file *file, * the mapping so we'll have to attempt to copy * it */ ret = -ENODEV; - if (!(capabilities & BDI_CAP_MAP_COPY)) + if (!(capabilities & NOMMU_MAP_COPY)) goto error_just_free; - capabilities &= ~BDI_CAP_MAP_DIRECT; + capabilities &= ~NOMMU_MAP_DIRECT; } else { vma->vm_start = region->vm_start = addr; vma->vm_end = region->vm_end = addr + len; @@ -1445,7 +1436,7 @@ unsigned long do_mmap_pgoff(struct file *file, vma->vm_region = region; /* set up the mapping - * - the region is filled in if BDI_CAP_MAP_DIRECT is still set + * - the region is filled in if NOMMU_MAP_DIRECT is still set */ if (file && vma->vm_flags & VM_SHARED) ret = do_mmap_shared_file(vma); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6a73e47..45e187b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1351,7 +1351,7 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long task_ratelimit; unsigned long dirty_ratelimit; unsigned long pos_ratio; - struct backing_dev_info *bdi = mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; unsigned long start_time = jiffies; @@ -1574,7 +1574,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; */ void balance_dirty_pages_ratelimited(struct address_space *mapping) { - struct backing_dev_info *bdi = mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); int ratelimit; int *p; @@ -1929,7 +1929,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - trace_wbc_writepage(wbc, mapping->backing_dev_info); + trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); ret = (*writepage)(page, wbc, data); if (unlikely(ret)) 
{ if (ret == AOP_WRITEPAGE_ACTIVATE) { @@ -2094,10 +2094,12 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) trace_writeback_dirty_page(page, mapping); if (mapping_cap_account_dirty(mapping)) { + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_DIRTIED); - __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); - __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); + __inc_bdi_stat(bdi, BDI_RECLAIMABLE); + __inc_bdi_stat(bdi, BDI_DIRTIED); task_io_account_write(PAGE_CACHE_SIZE); current->nr_dirtied++; this_cpu_inc(bdp_ratelimits); @@ -2156,7 +2158,7 @@ void account_page_redirty(struct page *page) if (mapping && mapping_cap_account_dirty(mapping)) { current->nr_dirtied--; dec_zone_page_state(page, NR_DIRTIED); - dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); + dec_bdi_stat(inode_to_bdi(mapping->host), BDI_DIRTIED); } } EXPORT_SYMBOL(account_page_redirty); @@ -2298,7 +2300,7 @@ int clear_page_dirty_for_io(struct page *page) */ if (TestClearPageDirty(page)) { dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(mapping->backing_dev_info, + dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); return 1; } @@ -2316,7 +2318,7 @@ int test_clear_page_writeback(struct page *page) memcg = mem_cgroup_begin_page_stat(page); if (mapping) { - struct backing_dev_info *bdi = mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long flags; spin_lock_irqsave(&mapping->tree_lock, flags); @@ -2351,7 +2353,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) memcg = mem_cgroup_begin_page_stat(page); if (mapping) { - struct backing_dev_info *bdi = mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long flags; spin_lock_irqsave(&mapping->tree_lock, flags); @@ -2405,12 +2407,7 @@ EXPORT_SYMBOL(mapping_tagged); */ void 
wait_for_stable_page(struct page *page) { - struct address_space *mapping = page_mapping(page); - struct backing_dev_info *bdi = mapping->backing_dev_info; - - if (!bdi_cap_stable_pages_required(bdi)) - return; - - wait_on_page_writeback(page); + if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host))) + wait_on_page_writeback(page); } EXPORT_SYMBOL_GPL(wait_for_stable_page); diff --git a/mm/readahead.c b/mm/readahead.c index 17b9172..9356758 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -27,7 +27,7 @@ void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) { - ra->ra_pages = mapping->backing_dev_info->ra_pages; + ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages; ra->prev_pos = -1; } EXPORT_SYMBOL_GPL(file_ra_state_init); @@ -541,7 +541,7 @@ page_cache_async_readahead(struct address_space *mapping, /* * Defer asynchronous read-ahead on IO congestion. */ - if (bdi_read_congested(mapping->backing_dev_info)) + if (bdi_read_congested(inode_to_bdi(mapping->host))) return; /* do read-ahead */ @@ -191,11 +191,6 @@ static const struct inode_operations shmem_dir_inode_operations; static const struct inode_operations shmem_special_inode_operations; static const struct vm_operations_struct shmem_vm_ops; -static struct backing_dev_info shmem_backing_dev_info __read_mostly = { - .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, -}; - static LIST_HEAD(shmem_swaplist); static DEFINE_MUTEX(shmem_swaplist_mutex); @@ -765,11 +760,11 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) goto redirty; /* - * shmem_backing_dev_info's capabilities prevent regular writeback or - * sync from ever calling shmem_writepage; but a stacking filesystem - * might use ->writepage of its underlying filesystem, in which case - * tmpfs should write out to swap only in response to memory pressure, - * and not for the writeback threads or sync. 
+ * Our capabilities prevent regular writeback or sync from ever calling + * shmem_writepage; but a stacking filesystem might use ->writepage of + * its underlying filesystem, in which case tmpfs should write out to + * swap only in response to memory pressure, and not for the writeback + * threads or sync. */ if (!wbc->for_reclaim) { WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ @@ -1415,7 +1410,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); inode->i_blocks = 0; - inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_generation = get_seconds(); info = SHMEM_I(inode); @@ -1461,7 +1455,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode bool shmem_mapping(struct address_space *mapping) { - return mapping->backing_dev_info == &shmem_backing_dev_info; + return mapping->host->i_sb->s_op == &shmem_ops; } #ifdef CONFIG_TMPFS @@ -3225,10 +3219,6 @@ int __init shmem_init(void) if (shmem_inode_cachep) return 0; - error = bdi_init(&shmem_backing_dev_info); - if (error) - goto out4; - error = shmem_init_inodecache(); if (error) goto out3; @@ -3252,8 +3242,6 @@ out1: out2: shmem_destroy_inodecache(); out3: - bdi_destroy(&shmem_backing_dev_info); -out4: shm_mnt = ERR_PTR(error); return error; } @@ -1138,8 +1138,6 @@ void __init swap_setup(void) #ifdef CONFIG_SWAP int i; - if (bdi_init(swapper_spaces[0].backing_dev_info)) - panic("Failed to init swap bdi"); for (i = 0; i < MAX_SWAPFILES; i++) spin_lock_init(&swapper_spaces[i].tree_lock); #endif diff --git a/mm/swap_state.c b/mm/swap_state.c index 9711342..405923f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -32,17 +32,11 @@ static const struct address_space_operations swap_aops = { #endif }; -static struct backing_dev_info swap_backing_dev_info = { - .name = "swap", - .capabilities = 
BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, -}; - struct address_space swapper_spaces[MAX_SWAPFILES] = { [0 ... MAX_SWAPFILES - 1] = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), .i_mmap_writable = ATOMIC_INIT(0), .a_ops = &swap_aops, - .backing_dev_info = &swap_backing_dev_info, } }; diff --git a/mm/truncate.c b/mm/truncate.c index f1e4d60..ddec5a5 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -112,7 +112,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size) struct address_space *mapping = page->mapping; if (mapping && mapping_cap_account_dirty(mapping)) { dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(mapping->backing_dev_info, + dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); if (account_size) task_io_account_cancelled_write(account_size); diff --git a/mm/vmscan.c b/mm/vmscan.c index 803886b..5e8eadd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -538,7 +538,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_queue(mapping->backing_dev_info, sc)) + if (!may_write_to_queue(inode_to_bdi(mapping->host), sc)) return PAGE_KEEP; if (clear_page_dirty_for_io(page)) { @@ -917,7 +917,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ mapping = page_mapping(page); if (((dirty || writeback) && mapping && - bdi_write_congested(mapping->backing_dev_info)) || + bdi_write_congested(inode_to_bdi(mapping->host))) || (writeback && PageReclaim(page))) nr_congested++; diff --git a/security/security.c b/security/security.c index 18b35c6..a0442b2 100644 --- a/security/security.c +++ b/security/security.c @@ -726,16 +726,15 @@ static inline unsigned long mmap_prot(struct file *file, unsigned long prot) return prot | PROT_EXEC; /* * ditto if it's not on noexec mount, except that on !MMU we need - * BDI_CAP_EXEC_MMAP (== VM_MAYEXEC) in this case + * NOMMU_MAP_EXEC (== VM_MAYEXEC) in this 
case */ if (!(file->f_path.mnt->mnt_flags & MNT_NOEXEC)) { #ifndef CONFIG_MMU - unsigned long caps = 0; - struct address_space *mapping = file->f_mapping; - if (mapping && mapping->backing_dev_info) - caps = mapping->backing_dev_info->capabilities; - if (!(caps & BDI_CAP_EXEC_MAP)) - return prot; + if (file->f_op->mmap_capabilities) { + unsigned caps = file->f_op->mmap_capabilities(file); + if (!(caps & NOMMU_MAP_EXEC)) + return prot; + } #endif return prot | PROT_EXEC; } |