diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-06-04 07:58:06 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-06-04 07:58:06 -0700 |
commit | f459c34538f57661e0fd1d3eaf7c0b17125ae011 (patch) | |
tree | 3addc82d7f792c4533501978798dad0095293933 | |
parent | 29dcea88779c856c7dc92040a0c01233263101d4 (diff) | |
parent | 32a50fabb334b2f0725de84bf248bd8c24c22b05 (diff) | |
download | op-kernel-dev-f459c34538f57661e0fd1d3eaf7c0b17125ae011.zip op-kernel-dev-f459c34538f57661e0fd1d3eaf7c0b17125ae011.tar.gz |
Merge tag 'for-4.18/block-20180603' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:
- clean up how we pass around gfp_t and
blk_mq_req_flags_t (Christoph)
- prepare us to defer scheduler attach (Christoph)
- clean up drivers handling of bounce buffers (Christoph)
- fix timeout handling corner cases (Christoph/Bart/Keith)
- bcache fixes (Coly)
- prep work for bcachefs and some block layer optimizations (Kent).
- convert users of bio_sets to using embedded structs (Kent).
- fixes for the BFQ io scheduler (Paolo/Davide/Filippo)
- lightnvm fixes and improvements (Matias, with contributions from Hans
and Javier)
- adding discard throttling to blk-wbt (me)
- sbitmap blk-mq-tag handling (me/Omar/Ming).
- remove the sparc jsflash block driver, acked by DaveM.
- Kyber scheduler improvement from Jianchao, making it more friendly
wrt merging.
- conversion of symbolic proc permissions to octal, from Joe Perches.
Previously the block parts were a mix of both.
- nbd fixes (Josef and Kevin Vigor)
- unify how we handle the various kinds of timestamps that the block
core and utility code uses (Omar)
- three NVMe pull requests from Keith and Christoph, bringing AEN to
feature completeness, file backed namespaces, cq/sq lock split, and
various fixes
- various little fixes and improvements all over the map
* tag 'for-4.18/block-20180603' of git://git.kernel.dk/linux-block: (196 commits)
blk-mq: update nr_requests when switching to 'none' scheduler
block: don't use blocking queue entered for recursive bio submits
dm-crypt: fix warning in shutdown path
lightnvm: pblk: take bitmap alloc. out of critical section
lightnvm: pblk: kick writer on new flush points
lightnvm: pblk: only try to recover lines with written smeta
lightnvm: pblk: remove unnecessary bio_get/put
lightnvm: pblk: add possibility to set write buffer size manually
lightnvm: fix partial read error path
lightnvm: proper error handling for pblk_bio_add_pages
lightnvm: pblk: fix smeta write error path
lightnvm: pblk: garbage collect lines with failed writes
lightnvm: pblk: rework write error recovery path
lightnvm: pblk: remove dead function
lightnvm: pass flag on graceful teardown to targets
lightnvm: pblk: check for chunk size before allocating it
lightnvm: pblk: remove unnecessary argument
lightnvm: pblk: remove unnecessary indirection
lightnvm: pblk: return NVM_ error on failed submission
lightnvm: pblk: warn in case of corrupted write buffer
...
212 files changed, 4020 insertions, 3984 deletions
diff --git a/Documentation/block/null_blk.txt b/Documentation/block/null_blk.txt index 733927a..07f1473 100644 --- a/Documentation/block/null_blk.txt +++ b/Documentation/block/null_blk.txt @@ -71,13 +71,16 @@ use_per_node_hctx=[0/1]: Default: 0 1: The multi-queue block layer is instantiated with a hardware dispatch queue for each CPU node in the system. -use_lightnvm=[0/1]: Default: 0 - Register device with LightNVM. Requires blk-mq and CONFIG_NVM to be enabled. - no_sched=[0/1]: Default: 0 0: nullb* use default blk-mq io scheduler. 1: nullb* doesn't use io scheduler. +blocking=[0/1]: Default: 0 + 0: Register as a non-blocking blk-mq driver device. + 1: Register as a blocking blk-mq driver device, null_blk will set + the BLK_MQ_F_BLOCKING flag, indicating that it sometimes/always + needs to block in its ->queue_rq() function. + shared_tags=[0/1]: Default: 0 0: Tag set is not shared. 1: Tag set shared between devices for blk-mq. Only makes sense with diff --git a/Documentation/scsi/scsi_eh.txt b/Documentation/scsi/scsi_eh.txt index 11e447b..1b74369 100644 --- a/Documentation/scsi/scsi_eh.txt +++ b/Documentation/scsi/scsi_eh.txt @@ -82,24 +82,13 @@ function 1. invokes optional hostt->eh_timed_out() callback. Return value can be one of - - BLK_EH_HANDLED - This indicates that eh_timed_out() dealt with the timeout. - The command is passed back to the block layer and completed - via __blk_complete_requests(). - - *NOTE* After returning BLK_EH_HANDLED the SCSI layer is - assumed to be finished with the command, and no other - functions from the SCSI layer will be called. So this - should typically only be returned if the eh_timed_out() - handler raced with normal completion. - - BLK_EH_RESET_TIMER This indicates that more time is required to finish the command. Timer is restarted. This action is counted as a retry and only allowed scmd->allowed + 1(!) times. Once the - limit is reached, action for BLK_EH_NOT_HANDLED is taken instead. + limit is reached, action for BLK_EH_DONE is taken instead. - - BLK_EH_NOT_HANDLED + - BLK_EH_DONE eh_timed_out() callback did not handle the command. Step #2 is taken. diff --git a/MAINTAINERS b/MAINTAINERS index 9c125f7..cbc6bbf 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9700,7 +9700,7 @@ S: Maintained F: drivers/net/ethernet/netronome/ NETWORK BLOCK DEVICE (NBD) -M: Josef Bacik <jbacik@fb.com> +M: Josef Bacik <josef@toxicpanda.com> S: Maintained L: linux-block@vger.kernel.org L: nbd@other.debian.org diff --git a/arch/sparc/include/uapi/asm/jsflash.h b/arch/sparc/include/uapi/asm/jsflash.h deleted file mode 100644 index 68c98a5..0000000 --- a/arch/sparc/include/uapi/asm/jsflash.h +++ /dev/null @@ -1,40 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * jsflash.h: OS Flash SIMM support for JavaStations. - * - * Copyright (C) 1999 Pete Zaitcev - */ - -#ifndef _SPARC_JSFLASH_H -#define _SPARC_JSFLASH_H - -#ifndef _SPARC_TYPES_H -#include <linux/types.h> -#endif - -/* - * Semantics of the offset is a full address. - * Hardcode it or get it from probe ioctl. - * - * We use full bus address, so that we would be - * automatically compatible with possible future systems. - */ - -#define JSFLASH_IDENT (('F'<<8)|54) -struct jsflash_ident_arg { - __u64 off; /* 0x20000000 is included */ - __u32 size; - char name[32]; /* With trailing zero */ -}; - -#define JSFLASH_ERASE (('F'<<8)|55) -/* Put 0 as argument, may be flags or sector number... */ - -#define JSFLASH_PROGRAM (('F'<<8)|56) -struct jsflash_program_arg { - __u64 data; /* char* for sparc and sparc64 */ - __u64 off; - __u32 size; -}; - -#endif /* _SPARC_JSFLASH_H */ diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index d819dc7..a9e8633 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -55,13 +55,13 @@ BFQG_FLAG_FNS(empty) /* This should be called with the scheduler lock held. */ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) { - unsigned long long now; + u64 now; if (!bfqg_stats_waiting(stats)) return; - now = sched_clock(); - if (time_after64(now, stats->start_group_wait_time)) + now = ktime_get_ns(); + if (now > stats->start_group_wait_time) blkg_stat_add(&stats->group_wait_time, now - stats->start_group_wait_time); bfqg_stats_clear_waiting(stats); @@ -77,20 +77,20 @@ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, return; if (bfqg == curr_bfqg) return; - stats->start_group_wait_time = sched_clock(); + stats->start_group_wait_time = ktime_get_ns(); bfqg_stats_mark_waiting(stats); } /* This should be called with the scheduler lock held. */ static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { - unsigned long long now; + u64 now; if (!bfqg_stats_empty(stats)) return; - now = sched_clock(); - if (time_after64(now, stats->start_empty_time)) + now = ktime_get_ns(); + if (now > stats->start_empty_time) blkg_stat_add(&stats->empty_time, now - stats->start_empty_time); bfqg_stats_clear_empty(stats); @@ -116,7 +116,7 @@ void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) if (bfqg_stats_empty(stats)) return; - stats->start_empty_time = sched_clock(); + stats->start_empty_time = ktime_get_ns(); bfqg_stats_mark_empty(stats); } @@ -125,9 +125,9 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg) struct bfqg_stats *stats = &bfqg->stats; if (bfqg_stats_idling(stats)) { - unsigned long long now = sched_clock(); + u64 now = ktime_get_ns(); - if (time_after64(now, stats->start_idle_time)) + if (now > stats->start_idle_time) blkg_stat_add(&stats->idle_time, now - stats->start_idle_time); bfqg_stats_clear_idling(stats); @@ -138,7 +138,7 @@ void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { struct bfqg_stats *stats = &bfqg->stats; - stats->start_idle_time = sched_clock(); + stats->start_idle_time = ktime_get_ns(); bfqg_stats_mark_idling(stats); } @@ -171,18 +171,18 @@ void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) blkg_rwstat_add(&bfqg->stats.merged, op, 1); } -void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time, - uint64_t io_start_time, unsigned int op) +void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, + u64 io_start_time_ns, unsigned int op) { struct bfqg_stats *stats = &bfqg->stats; - unsigned long long now = sched_clock(); + u64 now = ktime_get_ns(); - if (time_after64(now, io_start_time)) + if (now > io_start_time_ns) blkg_rwstat_add(&stats->service_time, op, - now - io_start_time); - if (time_after64(io_start_time, start_time)) + now - io_start_time_ns); + if (io_start_time_ns > start_time_ns) blkg_rwstat_add(&stats->wait_time, op, - io_start_time - start_time); + io_start_time_ns - start_time_ns); } #else /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ @@ -191,8 +191,8 @@ void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, unsigned int op) { } void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } -void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time, - uint64_t io_start_time, unsigned int op) { } +void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, + u64 io_start_time_ns, unsigned int op) { } void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 771ae97..495b9dd 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -49,9 +49,39 @@ * * In particular, to provide these low-latency guarantees, BFQ * explicitly privileges the I/O of two classes of time-sensitive - * applications: interactive and soft real-time. This feature enables - * BFQ to provide applications in these classes with a very low - * latency. Finally, BFQ also features additional heuristics for + * applications: interactive and soft real-time. In more detail, BFQ + * behaves this way if the low_latency parameter is set (default + * configuration). This feature enables BFQ to provide applications in + * these classes with a very low latency. + * + * To implement this feature, BFQ constantly tries to detect whether + * the I/O requests in a bfq_queue come from an interactive or a soft + * real-time application. For brevity, in these cases, the queue is + * said to be interactive or soft real-time. In both cases, BFQ + * privileges the service of the queue, over that of non-interactive + * and non-soft-real-time queues. This privileging is performed, + * mainly, by raising the weight of the queue. So, for brevity, we + * call just weight-raising periods the time periods during which a + * queue is privileged, because deemed interactive or soft real-time. + * + * The detection of soft real-time queues/applications is described in + * detail in the comments on the function + * bfq_bfqq_softrt_next_start. On the other hand, the detection of an + * interactive queue works as follows: a queue is deemed interactive + * if it is constantly non empty only for a limited time interval, + * after which it does become empty. The queue may be deemed + * interactive again (for a limited time), if it restarts being + * constantly non empty, provided that this happens only after the + * queue has remained empty for a given minimum idle time. + * + * By default, BFQ computes automatically the above maximum time + * interval, i.e., the time interval after which a constantly + * non-empty queue stops being deemed interactive. Since a queue is + * weight-raised while it is deemed interactive, this maximum time + * interval happens to coincide with the (maximum) duration of the + * weight-raising for interactive queues. + * + * Finally, BFQ also features additional heuristics for * preserving both a low latency and a high throughput on NCQ-capable, * rotational or flash-based devices, and to get the job done quickly * for applications consisting in many I/O-bound processes. @@ -61,14 +91,14 @@ * all low-latency heuristics for that device, by setting low_latency * to 0. * - * BFQ is described in [1], where also a reference to the initial, more - * theoretical paper on BFQ can be found. The interested reader can find - * in the latter paper full details on the main algorithm, as well as - * formulas of the guarantees and formal proofs of all the properties. - * With respect to the version of BFQ presented in these papers, this - * implementation adds a few more heuristics, such as the one that - * guarantees a low latency to soft real-time applications, and a - * hierarchical extension based on H-WF2Q+. + * BFQ is described in [1], where also a reference to the initial, + * more theoretical paper on BFQ can be found. The interested reader + * can find in the latter paper full details on the main algorithm, as + * well as formulas of the guarantees and formal proofs of all the + * properties. With respect to the version of BFQ presented in these + * papers, this implementation adds a few more heuristics, such as the + * ones that guarantee a low latency to interactive and soft real-time + * applications, and a hierarchical extension based on H-WF2Q+. * * B-WF2Q+ is based on WF2Q+, which is described in [2], together with * H-WF2Q+, while the augmented tree used here to implement B-WF2Q+ @@ -218,56 +248,46 @@ static struct kmem_cache *bfq_pool; #define BFQ_RATE_SHIFT 16 /* - * By default, BFQ computes the duration of the weight raising for - * interactive applications automatically, using the following formula: - * duration = (R / r) * T, where r is the peak rate of the device, and - * R and T are two reference parameters. - * In particular, R is the peak rate of the reference device (see - * below), and T is a reference time: given the systems that are - * likely to be installed on the reference device according to its - * speed class, T is about the maximum time needed, under BFQ and - * while reading two files in parallel, to load typical large - * applications on these systems (see the comments on - * max_service_from_wr below, for more details on how T is obtained). - * In practice, the slower/faster the device at hand is, the more/less - * it takes to load applications with respect to the reference device. - * Accordingly, the longer/shorter BFQ grants weight raising to - * interactive applications. - * - * BFQ uses four different reference pairs (R, T), depending on: - * . whether the device is rotational or non-rotational; - * . whether the device is slow, such as old or portable HDDs, as well as - * SD cards, or fast, such as newer HDDs and SSDs. + * When configured for computing the duration of the weight-raising + * for interactive queues automatically (see the comments at the + * beginning of this file), BFQ does it using the following formula: + * duration = (ref_rate / r) * ref_wr_duration, + * where r is the peak rate of the device, and ref_rate and + * ref_wr_duration are two reference parameters. In particular, + * ref_rate is the peak rate of the reference storage device (see + * below), and ref_wr_duration is about the maximum time needed, with + * BFQ and while reading two files in parallel, to load typical large + * applications on the reference device (see the comments on + * max_service_from_wr below, for more details on how ref_wr_duration + * is obtained). In practice, the slower/faster the device at hand + * is, the more/less it takes to load applications with respect to the + * reference device. Accordingly, the longer/shorter BFQ grants + * weight raising to interactive applications. * - * The device's speed class is dynamically (re)detected in - * bfq_update_peak_rate() every time the estimated peak rate is updated. + * BFQ uses two different reference pairs (ref_rate, ref_wr_duration), + * depending on whether the device is rotational or non-rotational. * - * In the following definitions, R_slow[0]/R_fast[0] and - * T_slow[0]/T_fast[0] are the reference values for a slow/fast - * rotational device, whereas R_slow[1]/R_fast[1] and - * T_slow[1]/T_fast[1] are the reference values for a slow/fast - * non-rotational device. Finally, device_speed_thresh are the - * thresholds used to switch between speed classes. The reference - * rates are not the actual peak rates of the devices used as a - * reference, but slightly lower values. The reason for using these - * slightly lower values is that the peak-rate estimator tends to - * yield slightly lower values than the actual peak rate (it can yield - * the actual peak rate only if there is only one process doing I/O, - * and the process does sequential I/O). + * In the following definitions, ref_rate[0] and ref_wr_duration[0] + * are the reference values for a rotational device, whereas + * ref_rate[1] and ref_wr_duration[1] are the reference values for a + * non-rotational device. The reference rates are not the actual peak + * rates of the devices used as a reference, but slightly lower + * values. The reason for using slightly lower values is that the + * peak-rate estimator tends to yield slightly lower values than the + * actual peak rate (it can yield the actual peak rate only if there + * is only one process doing I/O, and the process does sequential + * I/O). * - * Both the reference peak rates and the thresholds are measured in - * sectors/usec, left-shifted by BFQ_RATE_SHIFT. + * The reference peak rates are measured in sectors/usec, left-shifted + * by BFQ_RATE_SHIFT. */ -static int R_slow[2] = {1000, 10700}; -static int R_fast[2] = {14000, 33000}; +static int ref_rate[2] = {14000, 33000}; /* - * To improve readability, a conversion function is used to initialize the - * following arrays, which entails that they can be initialized only in a - * function. + * To improve readability, a conversion function is used to initialize + * the following array, which entails that the array can be + * initialized only in a function. */ -static int T_slow[2]; -static int T_fast[2]; -static int device_speed_thresh[2]; +static int ref_wr_duration[2]; /* * BFQ uses the above-detailed, time-based weight-raising mechanism to @@ -487,46 +507,6 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd, } /* - * See the comments on bfq_limit_depth for the purpose of - * the depths set in the function. - */ -static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) -{ - bfqd->sb_shift = bt->sb.shift; - - /* - * In-word depths if no bfq_queue is being weight-raised: - * leaving 25% of tags only for sync reads. - * - * In next formulas, right-shift the value - * (1U<<bfqd->sb_shift), instead of computing directly - * (1U<<(bfqd->sb_shift - something)), to be robust against - * any possible value of bfqd->sb_shift, without having to - * limit 'something'. - */ - /* no more than 50% of tags for async I/O */ - bfqd->word_depths[0][0] = max((1U<<bfqd->sb_shift)>>1, 1U); - /* - * no more than 75% of tags for sync writes (25% extra tags - * w.r.t. async I/O, to prevent async I/O from starving sync - * writes) - */ - bfqd->word_depths[0][1] = max(((1U<<bfqd->sb_shift) * 3)>>2, 1U); - - /* - * In-word depths in case some bfq_queue is being weight- - * raised: leaving ~63% of tags for sync reads. This is the - * highest percentage for which, in our tests, application - * start-up times didn't suffer from any regression due to tag - * shortage. - */ - /* no more than ~18% of tags for async I/O */ - bfqd->word_depths[1][0] = max(((1U<<bfqd->sb_shift) * 3)>>4, 1U); - /* no more than ~37% of tags for sync writes (~20% extra tags) */ - bfqd->word_depths[1][1] = max(((1U<<bfqd->sb_shift) * 6)>>4, 1U); -} - -/* * Async I/O can easily starve sync I/O (both sync reads and sync * writes), by consuming all tags. Similarly, storms of sync writes, * such as those that sync(2) may trigger, can starve sync reads. @@ -535,25 +515,11 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) */ static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) { - struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct bfq_data *bfqd = data->q->elevator->elevator_data; - struct sbitmap_queue *bt; if (op_is_sync(op) && !op_is_write(op)) return; - if (data->flags & BLK_MQ_REQ_RESERVED) { - if (unlikely(!tags->nr_reserved_tags)) { - WARN_ON_ONCE(1); - return; - } - bt = &tags->breserved_tags; - } else - bt = &tags->bitmap_tags; - - if (unlikely(bfqd->sb_shift != bt->sb.shift)) - bfq_update_depths(bfqd, bt); - data->shallow_depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; @@ -906,26 +872,30 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) if (bfqd->bfq_wr_max_time > 0) return bfqd->bfq_wr_max_time; - dur = bfqd->RT_prod; + dur = bfqd->rate_dur_prod; do_div(dur, bfqd->peak_rate); /* - * Limit duration between 3 and 13 seconds. Tests show that - * higher values than 13 seconds often yield the opposite of - * the desired result, i.e., worsen responsiveness by letting - * non-interactive and non-soft-real-time applications - * preserve weight raising for a too long time interval. + * Limit duration between 3 and 25 seconds. The upper limit + * has been conservatively set after the following worst case: + * on a QEMU/KVM virtual machine + * - running in a slow PC + * - with a virtual disk stacked on a slow low-end 5400rpm HDD + * - serving a heavy I/O workload, such as the sequential reading + * of several files + * mplayer took 23 seconds to start, if constantly weight-raised. + * + * As for higher values than that accomodating the above bad + * scenario, tests show that higher values would often yield + * the opposite of the desired result, i.e., would worsen + * responsiveness by allowing non-interactive applications to + * preserve weight raising for too long. * * On the other end, lower values than 3 seconds make it * difficult for most interactive tasks to complete their jobs * before weight-raising finishes. */ - if (dur > msecs_to_jiffies(13000)) - dur = msecs_to_jiffies(13000); - else if (dur < msecs_to_jiffies(3000)) - dur = msecs_to_jiffies(3000); - - return dur; + return clamp_val(dur, msecs_to_jiffies(3000), msecs_to_jiffies(25000)); } /* switch back from soft real-time to interactive weight raising */ @@ -1393,15 +1363,6 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, } /* - * Return the farthest future time instant according to jiffies - * macros. - */ -static unsigned long bfq_greatest_from_now(void) -{ - return jiffies + MAX_JIFFY_OFFSET; -} - -/* * Return the farthest past time instant according to jiffies * macros. */ @@ -1545,7 +1506,8 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, in_burst = bfq_bfqq_in_large_burst(bfqq); soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && !in_burst && - time_is_before_jiffies(bfqq->soft_rt_next_start); + time_is_before_jiffies(bfqq->soft_rt_next_start) && + bfqq->dispatched == 0; *interactive = !in_burst && idle_for_long_time; wr_or_deserves_wr = bfqd->low_latency && (bfqq->wr_coeff > 1 || @@ -1858,6 +1820,8 @@ static int bfq_request_merge(struct request_queue *q, struct request **req, return ELEVATOR_NO_MERGE; } +static struct bfq_queue *bfq_init_rq(struct request *rq); + static void bfq_request_merged(struct request_queue *q, struct request *req, enum elv_merge type) { @@ -1866,7 +1830,7 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, blk_rq_pos(req) < blk_rq_pos(container_of(rb_prev(&req->rb_node), struct request, rb_node))) { - struct bfq_queue *bfqq = RQ_BFQQ(req); + struct bfq_queue *bfqq = bfq_init_rq(req); struct bfq_data *bfqd = bfqq->bfqd; struct request *prev, *next_rq; @@ -1891,14 +1855,25 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, } } +/* + * This function is called to notify the scheduler that the requests + * rq and 'next' have been merged, with 'next' going away. BFQ + * exploits this hook to address the following issue: if 'next' has a + * fifo_time lower that rq, then the fifo_time of rq must be set to + * the value of 'next', to not forget the greater age of 'next'. + * + * NOTE: in this function we assume that rq is in a bfq_queue, basing + * on that rq is picked from the hash table q->elevator->hash, which, + * in its turn, is filled only with I/O requests present in + * bfq_queues, while BFQ is in use for the request queue q. In fact, + * the function that fills this hash table (elv_rqhash_add) is called + * only by bfq_insert_request. + */ static void bfq_requests_merged(struct request_queue *q, struct request *rq, struct request *next) { - struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); - - if (!RB_EMPTY_NODE(&rq->rb_node)) - goto end; - spin_lock_irq(&bfqq->bfqd->lock); + struct bfq_queue *bfqq = bfq_init_rq(rq), + *next_bfqq = bfq_init_rq(next); /* * If next and rq belong to the same bfq_queue and next is older @@ -1920,11 +1895,6 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, if (bfqq->next_rq == next) bfqq->next_rq = rq; - bfq_remove_request(q, next); - bfqg_stats_update_io_remove(bfqq_group(bfqq), next->cmd_flags); - - spin_unlock_irq(&bfqq->bfqd->lock); -end: bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); } @@ -2506,37 +2476,15 @@ static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) /* * Update parameters related to throughput and responsiveness, as a * function of the estimated peak rate. See comments on - * bfq_calc_max_budget(), and on T_slow and T_fast arrays. + * bfq_calc_max_budget(), and on the ref_wr_duration array. */ static void update_thr_responsiveness_params(struct bfq_data *bfqd) { - int dev_type = blk_queue_nonrot(bfqd->queue); - - if (bfqd->bfq_user_max_budget == 0) + if (bfqd->bfq_user_max_budget == 0) { bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); - - if (bfqd->device_speed == BFQ_BFQD_FAST && - bfqd->peak_rate < device_speed_thresh[dev_type]) { - bfqd->device_speed = BFQ_BFQD_SLOW; - bfqd->RT_prod = R_slow[dev_type] * - T_slow[dev_type]; - } else if (bfqd->device_speed == BFQ_BFQD_SLOW && - bfqd->peak_rate > device_speed_thresh[dev_type]) { - bfqd->device_speed = BFQ_BFQD_FAST; - bfqd->RT_prod = R_fast[dev_type] * - T_fast[dev_type]; + bfq_log(bfqd, "new max_budget = %d", bfqd->bfq_max_budget); } - - bfq_log(bfqd, -"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec", - dev_type == 0 ? "ROT" : "NONROT", - bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW", - bfqd->device_speed == BFQ_BFQD_FAST ? - (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT : - (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT, - (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>> - BFQ_RATE_SHIFT); } static void bfq_reset_rate_computation(struct bfq_data *bfqd, @@ -3266,23 +3214,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, bfq_bfqq_softrt_next_start(bfqd, bfqq); else { /* - * The application is still waiting for the - * completion of one or more requests: - * prevent it from possibly being incorrectly - * deemed as soft real-time by setting its - * soft_rt_next_start to infinity. In fact, - * without this assignment, the application - * would be incorrectly deemed as soft - * real-time if: - * 1) it issued a new request before the - * completion of all its in-flight - * requests, and - * 2) at that time, its soft_rt_next_start - * happened to be in the past. - */ - bfqq->soft_rt_next_start = - bfq_greatest_from_now(); - /* * Schedule an update of soft_rt_next_start to when * the task may be discovered to be isochronous. */ @@ -4540,14 +4471,12 @@ static inline void bfq_update_insert_stats(struct request_queue *q, unsigned int cmd_flags) {} #endif -static void bfq_prepare_request(struct request *rq, struct bio *bio); - static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool at_head) { struct request_queue *q = hctx->queue; struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_queue *bfqq; bool idle_timer_disabled = false; unsigned int cmd_flags; @@ -4562,24 +4491,13 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_mq_sched_request_inserted(rq); spin_lock_irq(&bfqd->lock); + bfqq = bfq_init_rq(rq); if (at_head || blk_rq_is_passthrough(rq)) { if (at_head) list_add(&rq->queuelist, &bfqd->dispatch); else list_add_tail(&rq->queuelist, &bfqd->dispatch); - } else { - if (WARN_ON_ONCE(!bfqq)) { - /* - * This should never happen. Most likely rq is - * a requeued regular request, being - * re-inserted without being first - * re-prepared. Do a prepare, to avoid - * failure. - */ - bfq_prepare_request(rq, rq->bio); - bfqq = RQ_BFQQ(rq); - } - + } else { /* bfqq is assumed to be non null here */ idle_timer_disabled = __bfq_insert_request(bfqd, rq); /* * Update bfqq, because, if a queue merge has occurred @@ -4778,8 +4696,8 @@ static void bfq_finish_requeue_request(struct request *rq) if (rq->rq_flags & RQF_STARTED) bfqg_stats_update_completion(bfqq_group(bfqq), - rq_start_time_ns(rq), - rq_io_start_time_ns(rq), + rq->start_time_ns, + rq->io_start_time_ns, rq->cmd_flags); if (likely(rq->rq_flags & RQF_STARTED)) { @@ -4922,11 +4840,48 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, } /* - * Allocate bfq data structures associated with this request. + * Only reset private fields. The actual request preparation will be + * performed by bfq_init_rq, when rq is either inserted or merged. See + * comments on bfq_init_rq for the reason behind this delayed + * preparation. */ static void bfq_prepare_request(struct request *rq, struct bio *bio) { + /* + * Regardless of whether we have an icq attached, we have to + * clear the scheduler pointers, as they might point to + * previously allocated bic/bfqq structs. + */ + rq->elv.priv[0] = rq->elv.priv[1] = NULL; +} + +/* + * If needed, init rq, allocate bfq data structures associated with + * rq, and increment reference counters in the destination bfq_queue + * for rq. Return the destination bfq_queue for rq, or NULL is rq is + * not associated with any bfq_queue. + * + * This function is invoked by the functions that perform rq insertion + * or merging. One may have expected the above preparation operations + * to be performed in bfq_prepare_request, and not delayed to when rq + * is inserted or merged. The rationale behind this delayed + * preparation is that, after the prepare_request hook is invoked for + * rq, rq may still be transformed into a request with no icq, i.e., a + * request not associated with any queue. No bfq hook is invoked to + * signal this tranformation. As a consequence, should these + * preparation operations be performed when the prepare_request hook + * is invoked, and should rq be transformed one moment later, bfq + * would end up in an inconsistent state, because it would have + * incremented some queue counters for an rq destined to + * transformation, without any chance to correctly lower these + * counters back. In contrast, no transformation can still happen for + * rq after rq has been inserted or merged. So, it is safe to execute + * these preparation operations when rq is finally inserted or merged. + */ +static struct bfq_queue *bfq_init_rq(struct request *rq) +{ struct request_queue *q = rq->q; + struct bio *bio = rq->bio; struct bfq_data *bfqd = q->elevator->elevator_data; struct bfq_io_cq *bic; const int is_sync = rq_is_sync(rq); @@ -4934,20 +4889,21 @@ static void bfq_prepare_request(struct request *rq, struct bio *bio) bool new_queue = false; bool bfqq_already_existing = false, split = false; + if (unlikely(!rq->elv.icq)) + return NULL; + /* - * Even if we don't have an icq attached, we should still clear - * the scheduler pointers, as they might point to previously - * allocated bic/bfqq structs. + * Assuming that elv.priv[1] is set only if everything is set + * for this rq. This holds true, because this function is + * invoked only for insertion or merging, and, after such + * events, a request cannot be manipulated any longer before + * being removed from bfq. */ - if (!rq->elv.icq) { - rq->elv.priv[0] = rq->elv.priv[1] = NULL; - return; - } + if (rq->elv.priv[1]) + return rq->elv.priv[1]; bic = icq_to_bic(rq->elv.icq); - spin_lock_irq(&bfqd->lock); - bfq_check_ioprio_change(bic, bio); bfq_bic_update_cgroup(bic, bio); @@ -5006,7 +4962,7 @@ static void bfq_prepare_request(struct request *rq, struct bio *bio) if (unlikely(bfq_bfqq_just_created(bfqq))) bfq_handle_burst(bfqd, bfqq); - spin_unlock_irq(&bfqd->lock); + return bfqq; } static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) @@ -5105,6 +5061,64 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); } +/* + * See the comments on bfq_limit_depth for the purpose of + * the depths set in the function. Return minimum shallow depth we'll use. + */ +static unsigned int bfq_update_depths(struct bfq_data *bfqd, + struct sbitmap_queue *bt) +{ + unsigned int i, j, min_shallow = UINT_MAX; + + /* + * In-word depths if no bfq_queue is being weight-raised: + * leaving 25% of tags only for sync reads. + * + * In next formulas, right-shift the value + * (1U<<bt->sb.shift), instead of computing directly + * (1U<<(bt->sb.shift - something)), to be robust against + * any possible value of bt->sb.shift, without having to + * limit 'something'. + */ + /* no more than 50% of tags for async I/O */ + bfqd->word_depths[0][0] = max((1U << bt->sb.shift) >> 1, 1U); + /* + * no more than 75% of tags for sync writes (25% extra tags + * w.r.t. async I/O, to prevent async I/O from starving sync + * writes) + */ + bfqd->word_depths[0][1] = max(((1U << bt->sb.shift) * 3) >> 2, 1U); + + /* + * In-word depths in case some bfq_queue is being weight- + * raised: leaving ~63% of tags for sync reads. This is the + * highest percentage for which, in our tests, application + * start-up times didn't suffer from any regression due to tag + * shortage. + */ + /* no more than ~18% of tags for async I/O */ + bfqd->word_depths[1][0] = max(((1U << bt->sb.shift) * 3) >> 4, 1U); + /* no more than ~37% of tags for sync writes (~20% extra tags) */ + bfqd->word_depths[1][1] = max(((1U << bt->sb.shift) * 6) >> 4, 1U); + + for (i = 0; i < 2; i++) + for (j = 0; j < 2; j++) + min_shallow = min(min_shallow, bfqd->word_depths[i][j]); + + return min_shallow; +} + +static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index) +{ + struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; + struct blk_mq_tags *tags = hctx->sched_tags; + unsigned int min_shallow; + + min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags); + sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow); + return 0; +} + static void bfq_exit_queue(struct elevator_queue *e) { struct bfq_data *bfqd = e->elevator_data; @@ -5242,14 +5256,12 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->wr_busy_queues = 0; /* - * Begin by assuming, optimistically, that the device is a - * high-speed one, and that its peak rate is equal to 2/3 of - * the highest reference rate. + * Begin by assuming, optimistically, that the device peak + * rate is equal to 2/3 of the highest reference rate. */ - bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * - T_fast[blk_queue_nonrot(bfqd->queue)]; - bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3; - bfqd->device_speed = BFQ_BFQD_FAST; + bfqd->rate_dur_prod = ref_rate[blk_queue_nonrot(bfqd->queue)] * + ref_wr_duration[blk_queue_nonrot(bfqd->queue)]; + bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3; spin_lock_init(&bfqd->lock); @@ -5526,6 +5538,7 @@ static struct elevator_type iosched_bfq_mq = { .requests_merged = bfq_requests_merged, .request_merged = bfq_request_merged, .has_work = bfq_has_work, + .init_hctx = bfq_init_hctx, .init_sched = bfq_init_queue, .exit_sched = bfq_exit_queue, }, @@ -5556,8 +5569,8 @@ static int __init bfq_init(void) /* * Times to load large popular applications for the typical * systems installed on the reference devices (see the - * comments before the definitions of the next two - * arrays). Actually, we use slightly slower values, as the + * comments before the definition of the next + * array). Actually, we use slightly lower values, as the * estimated peak rate tends to be smaller than the actual * peak rate. The reason for this last fact is that estimates * are computed over much shorter time intervals than the long @@ -5566,25 +5579,8 @@ static int __init bfq_init(void) * scheduler cannot rely on a peak-rate-evaluation workload to * be run for a long time. */ - T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */ - T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */ - T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */ - T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */ - - /* - * Thresholds that determine the switch between speed classes - * (see the comments before the definition of the array - * device_speed_thresh). These thresholds are biased towards - * transitions to the fast class. This is safer than the - * opposite bias. In fact, a wrong transition to the slow - * class results in short weight-raising periods, because the - * speed of the device then tends to be higher that the - * reference peak rate. On the opposite end, a wrong - * transition to the fast class tends to increase - * weight-raising periods, because of the opposite reason. - */ - device_speed_thresh[0] = (4 * R_slow[0]) / 3; - device_speed_thresh[1] = (4 * R_slow[1]) / 3; + ref_wr_duration[0] = msecs_to_jiffies(7000); /* actually 8 sec */ + ref_wr_duration[1] = msecs_to_jiffies(2500); /* actually 3 sec */ ret = elv_register(&iosched_bfq_mq); if (ret) diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index ae2f3da..0f712e0 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -399,11 +399,6 @@ struct bfq_io_cq { struct bfq_ttime saved_ttime; }; -enum bfq_device_speed { - BFQ_BFQD_FAST, - BFQ_BFQD_SLOW, -}; - /** * struct bfq_data - per-device data structure. * @@ -611,12 +606,11 @@ struct bfq_data { /* Max service-rate for a soft real-time queue, in sectors/sec */ unsigned int bfq_wr_max_softrt_rate; /* - * Cached value of the product R*T, used for computing the - * maximum duration of weight raising automatically. + * Cached value of the product ref_rate*ref_wr_duration, used + * for computing the maximum duration of weight raising + * automatically. */ - u64 RT_prod; - /* device-speed class for the low-latency heuristic */ - enum bfq_device_speed device_speed; + u64 rate_dur_prod; /* fallback dummy bfqq for extreme OOM conditions */ struct bfq_queue oom_bfqq; @@ -636,12 +630,6 @@ struct bfq_data { struct bfq_queue *bio_bfqq; /* - * Cached sbitmap shift, used to compute depth limits in - * bfq_update_depths. - */ - unsigned int sb_shift; - - /* * Depth limits used in bfq_limit_depth (see comments on the * function) */ @@ -732,9 +720,9 @@ struct bfqg_stats { /* total time with empty current active q with other requests queued */ struct blkg_stat empty_time; /* fields after this shouldn't be cleared on stat reset */ - uint64_t start_group_wait_time; - uint64_t start_idle_time; - uint64_t start_empty_time; + u64 start_group_wait_time; + u64 start_idle_time; + u64 start_empty_time; uint16_t flags; #endif /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ }; @@ -856,8 +844,8 @@ void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, unsigned int op); void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op); void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op); -void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time, - uint64_t io_start_time, unsigned int op); +void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, + u64 io_start_time_ns, unsigned int op); void bfqg_stats_update_dequeue(struct bfq_group *bfqg); void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); void bfqg_stats_update_idle_time(struct bfq_group *bfqg); diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 9cfdd6c..add7c7c 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -56,12 +56,12 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, struct bio_set *bs = bio->bi_pool; unsigned inline_vecs; - if (!bs || !bs->bio_integrity_pool) { + if (!bs || !mempool_initialized(&bs->bio_integrity_pool)) { bip = kmalloc(sizeof(struct bio_integrity_payload) + sizeof(struct bio_vec) * nr_vecs, gfp_mask); inline_vecs = nr_vecs; } else { - bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); + bip = mempool_alloc(&bs->bio_integrity_pool, gfp_mask); inline_vecs = BIP_INLINE_VECS; } @@ -74,7 +74,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, unsigned long idx = 0; bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx, - bs->bvec_integrity_pool); + &bs->bvec_integrity_pool); if (!bip->bip_vec) goto err; bip->bip_max_vcnt = bvec_nr_vecs(idx); @@ -90,7 +90,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, return bip; err: - mempool_free(bip, bs->bio_integrity_pool); + mempool_free(bip, &bs->bio_integrity_pool); return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(bio_integrity_alloc); @@ -111,10 +111,10 @@ static void bio_integrity_free(struct bio *bio) kfree(page_address(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset); - if (bs && bs->bio_integrity_pool) { - bvec_free(bs->bvec_integrity_pool, bip->bip_vec, bip->bip_slab); + if (bs && mempool_initialized(&bs->bio_integrity_pool)) { + bvec_free(&bs->bvec_integrity_pool, bip->bip_vec, bip->bip_slab); - mempool_free(bip, bs->bio_integrity_pool); + mempool_free(bip, &bs->bio_integrity_pool); } else { kfree(bip); } @@ -465,16 +465,15 @@ EXPORT_SYMBOL(bio_integrity_clone); int bioset_integrity_create(struct bio_set *bs, int pool_size) { - if (bs->bio_integrity_pool) + if (mempool_initialized(&bs->bio_integrity_pool)) return 0; - bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab); - if (!bs->bio_integrity_pool) + if (mempool_init_slab_pool(&bs->bio_integrity_pool, + pool_size, bip_slab)) return -1; - bs->bvec_integrity_pool = biovec_create_pool(pool_size); - if (!bs->bvec_integrity_pool) { - mempool_destroy(bs->bio_integrity_pool); + if (biovec_init_pool(&bs->bvec_integrity_pool, pool_size)) { + mempool_exit(&bs->bio_integrity_pool); return -1; } @@ -484,8 +483,8 @@ EXPORT_SYMBOL(bioset_integrity_create); void bioset_integrity_free(struct bio_set *bs) { - mempool_destroy(bs->bio_integrity_pool); - mempool_destroy(bs->bvec_integrity_pool); + mempool_exit(&bs->bio_integrity_pool); + mempool_exit(&bs->bvec_integrity_pool); } EXPORT_SYMBOL(bioset_integrity_free); diff --git a/block/bio.c b/block/bio.c index 53e0f0a..595663e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -53,7 +53,7 @@ static struct biovec_slab bvec_slabs[BVEC_POOL_NR] __read_mostly = { * fs_bio_set is the bio_set containing bio and iovec memory pools used by * IO code that does not need private memory pools. */ -struct bio_set *fs_bio_set; +struct bio_set fs_bio_set; EXPORT_SYMBOL(fs_bio_set); /* @@ -254,7 +254,7 @@ static void bio_free(struct bio *bio) bio_uninit(bio); if (bs) { - bvec_free(bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio)); + bvec_free(&bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio)); /* * If we have front padding, adjust the bio pointer before freeing @@ -262,7 +262,7 @@ static void bio_free(struct bio *bio) p = bio; p -= bs->front_pad; - mempool_free(p, bs->bio_pool); + mempool_free(p, &bs->bio_pool); } else { /* Bio was allocated by bio_kmalloc() */ kfree(bio); @@ -454,7 +454,8 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, inline_vecs = nr_iovecs; } else { /* should not use nobvec bioset for nr_iovecs > 0 */ - if (WARN_ON_ONCE(!bs->bvec_pool && nr_iovecs > 0)) + if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && + nr_iovecs > 0)) return NULL; /* * generic_make_request() converts recursion to iteration; this @@ -483,11 +484,11 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, bs->rescue_workqueue) gfp_mask &= ~__GFP_DIRECT_RECLAIM; - p = mempool_alloc(bs->bio_pool, gfp_mask); + p = mempool_alloc(&bs->bio_pool, gfp_mask); if (!p && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs); gfp_mask = saved_gfp; - p = mempool_alloc(bs->bio_pool, gfp_mask); + p = mempool_alloc(&bs->bio_pool, gfp_mask); } front_pad = bs->front_pad; @@ -503,11 +504,11 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, if (nr_iovecs > inline_vecs) { unsigned long idx = 0; - bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); + bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool); if (!bvl && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs); gfp_mask = saved_gfp; - bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); + bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool); } if (unlikely(!bvl)) @@ -524,25 +525,25 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, return bio; err_free: - mempool_free(p, bs->bio_pool); + mempool_free(p, &bs->bio_pool); return NULL; } EXPORT_SYMBOL(bio_alloc_bioset); -void zero_fill_bio(struct bio *bio) +void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) { unsigned long flags; struct bio_vec bv; struct bvec_iter iter; - bio_for_each_segment(bv, bio, iter) { + __bio_for_each_segment(bv, bio, iter, start) { char *data = bvec_kmap_irq(&bv, &flags); memset(data, 0, bv.bv_len); flush_dcache_page(bv.bv_page); bvec_kunmap_irq(data, &flags); } } -EXPORT_SYMBOL(zero_fill_bio); +EXPORT_SYMBOL(zero_fill_bio_iter); /** * bio_put - release a reference to a bio @@ -970,27 +971,68 @@ void bio_advance(struct bio *bio, unsigned bytes) } EXPORT_SYMBOL(bio_advance); +void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, + struct bio *src, struct bvec_iter *src_iter) +{ + struct bio_vec src_bv, dst_bv; + void *src_p, *dst_p; + unsigned bytes; + + while (src_iter->bi_size && dst_iter->bi_size) { + src_bv = bio_iter_iovec(src, *src_iter); + dst_bv = bio_iter_iovec(dst, *dst_iter); + + bytes = min(src_bv.bv_len, dst_bv.bv_len); + + src_p = kmap_atomic(src_bv.bv_page); + dst_p = kmap_atomic(dst_bv.bv_page); + + memcpy(dst_p + dst_bv.bv_offset, + src_p + src_bv.bv_offset, + bytes); + + kunmap_atomic(dst_p); + kunmap_atomic(src_p); + + flush_dcache_page(dst_bv.bv_page); + + bio_advance_iter(src, src_iter, bytes); + bio_advance_iter(dst, dst_iter, bytes); + } +} +EXPORT_SYMBOL(bio_copy_data_iter); + /** - * bio_copy_data - copy contents of data buffers from one chain of bios to - * another - * @src: source bio list - * @dst: destination bio list - * - * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats - * @src and @dst as linked lists of bios. + * bio_copy_data - copy contents of data buffers from one bio to another + * @src: source bio + * @dst: destination bio * * Stops when it reaches the end of either @src or @dst - that is, copies * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). */ void bio_copy_data(struct bio *dst, struct bio *src) { - struct bvec_iter src_iter, dst_iter; - struct bio_vec src_bv, dst_bv; - void *src_p, *dst_p; - unsigned bytes; + struct bvec_iter src_iter = src->bi_iter; + struct bvec_iter dst_iter = dst->bi_iter; - src_iter = src->bi_iter; - dst_iter = dst->bi_iter; + bio_copy_data_iter(dst, &dst_iter, src, &src_iter); +} +EXPORT_SYMBOL(bio_copy_data); + +/** + * bio_list_copy_data - copy contents of data buffers from one chain of bios to + * another + * @src: source bio list + * @dst: destination bio list + * + * Stops when it reaches the end of either the @src list or @dst list - that is, + * copies min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of + * bios). + */ +void bio_list_copy_data(struct bio *dst, struct bio *src) +{ + struct bvec_iter src_iter = src->bi_iter; + struct bvec_iter dst_iter = dst->bi_iter; while (1) { if (!src_iter.bi_size) { @@ -1009,26 +1051,10 @@ void bio_copy_data(struct bio *dst, struct bio *src) dst_iter = dst->bi_iter; } - src_bv = bio_iter_iovec(src, src_iter); - dst_bv = bio_iter_iovec(dst, dst_iter); - - bytes = min(src_bv.bv_len, dst_bv.bv_len); - - src_p = kmap_atomic(src_bv.bv_page); - dst_p = kmap_atomic(dst_bv.bv_page); - - memcpy(dst_p + dst_bv.bv_offset, - src_p + src_bv.bv_offset, - bytes); - - kunmap_atomic(dst_p); - kunmap_atomic(src_p); - - bio_advance_iter(src, &src_iter, bytes); - bio_advance_iter(dst, &dst_iter, bytes); + bio_copy_data_iter(dst, &dst_iter, src, &src_iter); } } -EXPORT_SYMBOL(bio_copy_data); +EXPORT_SYMBOL(bio_list_copy_data); struct bio_map_data { int is_our_pages; @@ -1584,6 +1610,7 @@ void bio_set_pages_dirty(struct bio *bio) set_page_dirty_lock(page); } } +EXPORT_SYMBOL_GPL(bio_set_pages_dirty); static void bio_release_pages(struct bio *bio) { @@ -1667,6 +1694,7 @@ void bio_check_pages_dirty(struct bio *bio) bio_put(bio); } } +EXPORT_SYMBOL_GPL(bio_check_pages_dirty); void generic_start_io_acct(struct request_queue *q, int rw, unsigned long sectors, struct hd_struct *part) @@ -1749,6 +1777,9 @@ again: if (!bio_integrity_endio(bio)) return; + if (WARN_ONCE(bio->bi_next, "driver left bi_next not NULL")) + bio->bi_next = NULL; + /* * Need to have a real endio function for chained bios, otherwise * various corner cases will break (like stacking block devices that @@ -1848,30 +1879,38 @@ EXPORT_SYMBOL_GPL(bio_trim); * create memory pools for biovec's in a bio_set. * use the global biovec slabs created for general use. */ -mempool_t *biovec_create_pool(int pool_entries) +int biovec_init_pool(mempool_t *pool, int pool_entries) { struct biovec_slab *bp = bvec_slabs + BVEC_POOL_MAX; - return mempool_create_slab_pool(pool_entries, bp->slab); + return mempool_init_slab_pool(pool, pool_entries, bp->slab); } -void bioset_free(struct bio_set *bs) +/* + * bioset_exit - exit a bioset initialized with bioset_init() + * + * May be called on a zeroed but uninitialized bioset (i.e. allocated with + * kzalloc()). + */ +void bioset_exit(struct bio_set *bs) { if (bs->rescue_workqueue) destroy_workqueue(bs->rescue_workqueue); + bs->rescue_workqueue = NULL; - mempool_destroy(bs->bio_pool); - mempool_destroy(bs->bvec_pool); + mempool_exit(&bs->bio_pool); + mempool_exit(&bs->bvec_pool); bioset_integrity_free(bs); - bio_put_slab(bs); - - kfree(bs); + if (bs->bio_slab) + bio_put_slab(bs); + bs->bio_slab = NULL; } -EXPORT_SYMBOL(bioset_free); +EXPORT_SYMBOL(bioset_exit); /** - * bioset_create - Create a bio_set + * bioset_init - Initialize a bio_set + * @bs: pool to initialize * @pool_size: Number of bio and bio_vecs to cache in the mempool * @front_pad: Number of bytes to allocate in front of the returned bio * @flags: Flags to modify behavior, currently %BIOSET_NEED_BVECS @@ -1890,16 +1929,12 @@ EXPORT_SYMBOL(bioset_free); * dispatch queued requests when the mempool runs out of space. * */ -struct bio_set *bioset_create(unsigned int pool_size, - unsigned int front_pad, - int flags) +int bioset_init(struct bio_set *bs, + unsigned int pool_size, + unsigned int front_pad, + int flags) { unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); - struct bio_set *bs; - - bs = kzalloc(sizeof(*bs), GFP_KERNEL); - if (!bs) - return NULL; bs->front_pad = front_pad; @@ -1908,34 +1943,29 @@ struct bio_set *bioset_create(unsigned int pool_size, INIT_WORK(&bs->rescue_work, bio_alloc_rescue); bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad); - if (!bs->bio_slab) { - kfree(bs); - return NULL; - } + if (!bs->bio_slab) + return -ENOMEM; - bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab); - if (!bs->bio_pool) + if (mempool_init_slab_pool(&bs->bio_pool, pool_size, bs->bio_slab)) goto bad; - if (flags & BIOSET_NEED_BVECS) { - bs->bvec_pool = biovec_create_pool(pool_size); - if (!bs->bvec_pool) - goto bad; - } + if ((flags & BIOSET_NEED_BVECS) && + biovec_init_pool(&bs->bvec_pool, pool_size)) + goto bad; if (!(flags & BIOSET_NEED_RESCUER)) - return bs; + return 0; bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); if (!bs->rescue_workqueue) goto bad; - return bs; + return 0; bad: - bioset_free(bs); - return NULL; + bioset_exit(bs); + return -ENOMEM; } -EXPORT_SYMBOL(bioset_create); +EXPORT_SYMBOL(bioset_init); #ifdef CONFIG_BLK_CGROUP @@ -2020,11 +2050,10 @@ static int __init init_bio(void) bio_integrity_init(); biovec_init_slabs(); - fs_bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); - if (!fs_bio_set) + if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS)) panic("bio: can't allocate bios\n"); - if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE)) + if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE)) panic("bio: can't create integrity pool\n"); return 0; diff --git a/block/blk-core.c b/block/blk-core.c index 85909b4..3f56be1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -196,15 +196,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq) RB_CLEAR_NODE(&rq->rb_node); rq->tag = -1; rq->internal_tag = -1; - rq->start_time = jiffies; - set_start_time_ns(rq); + rq->start_time_ns = ktime_get_ns(); rq->part = NULL; - seqcount_init(&rq->gstate_seq); - u64_stats_init(&rq->aborted_gstate_sync); - /* - * See comment of blk_mq_init_request - */ - WRITE_ONCE(rq->gstate, MQ_RQ_GEN_INC); } EXPORT_SYMBOL(blk_rq_init); @@ -280,6 +273,10 @@ static void req_bio_endio(struct request *rq, struct bio *bio, bio_advance(bio, nbytes); /* don't actually finish bio if it's part of flush sequence */ + /* + * XXX this code looks suspicious - it's not consistent with advancing + * req->bio in caller + */ if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) bio_endio(bio); } @@ -360,7 +357,6 @@ EXPORT_SYMBOL(blk_start_queue_async); void blk_start_queue(struct request_queue *q) { lockdep_assert_held(q->queue_lock); - WARN_ON(!in_interrupt() && !irqs_disabled()); WARN_ON_ONCE(q->mq_ops); queue_flag_clear(QUEUE_FLAG_STOPPED, q); @@ -996,18 +992,24 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id, spinlock_t *lock) { struct request_queue *q; + int ret; q = kmem_cache_alloc_node(blk_requestq_cachep, gfp_mask | __GFP_ZERO, node_id); if (!q) return NULL; + INIT_LIST_HEAD(&q->queue_head); + q->last_merge = NULL; + q->end_sector = 0; + q->boundary_rq = NULL; + q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); if (q->id < 0) goto fail_q; - q->bio_split = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); - if (!q->bio_split) + ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); + if (ret) goto fail_id; q->backing_dev_info = bdi_alloc_node(gfp_mask, node_id); @@ -1079,7 +1081,7 @@ fail_bdi: fail_stats: bdi_put(q->backing_dev_info); fail_split: - bioset_free(q->bio_split); + bioset_exit(&q->bio_split); fail_id: ida_simple_remove(&blk_queue_ida, q->id); fail_q: @@ -1173,16 +1175,8 @@ int blk_init_allocated_queue(struct request_queue *q) q->sg_reserved_size = INT_MAX; - /* Protect q->elevator from elevator_change */ - mutex_lock(&q->sysfs_lock); - - /* init elevator */ - if (elevator_init(q, NULL)) { - mutex_unlock(&q->sysfs_lock); + if (elevator_init(q)) goto out_exit_flush_rq; - } - - mutex_unlock(&q->sysfs_lock); return 0; out_exit_flush_rq: @@ -1334,6 +1328,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr) * @op: operation and flags * @bio: bio to allocate request for (can be %NULL) * @flags: BLQ_MQ_REQ_* flags + * @gfp_mask: allocator flags * * Get a free request from @q. This function may fail under memory * pressure or if @q is dead. @@ -1343,7 +1338,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr) * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *__get_request(struct request_list *rl, unsigned int op, - struct bio *bio, blk_mq_req_flags_t flags) + struct bio *bio, blk_mq_req_flags_t flags, gfp_t gfp_mask) { struct request_queue *q = rl->q; struct request *rq; @@ -1352,8 +1347,6 @@ static struct request *__get_request(struct request_list *rl, unsigned int op, struct io_cq *icq = NULL; const bool is_sync = op_is_sync(op); int may_queue; - gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : - __GFP_DIRECT_RECLAIM; req_flags_t rq_flags = RQF_ALLOCED; lockdep_assert_held(q->queue_lock); @@ -1517,8 +1510,9 @@ rq_starved: * @op: operation and flags * @bio: bio to allocate request for (can be %NULL) * @flags: BLK_MQ_REQ_* flags. + * @gfp: allocator flags * - * Get a free request from @q. If %__GFP_DIRECT_RECLAIM is set in @gfp_mask, + * Get a free request from @q. If %BLK_MQ_REQ_NOWAIT is set in @flags, * this function keeps retrying under memory pressure and fails iff @q is dead. * * Must be called with @q->queue_lock held and, @@ -1526,7 +1520,7 @@ rq_starved: * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *get_request(struct request_queue *q, unsigned int op, - struct bio *bio, blk_mq_req_flags_t flags) + struct bio *bio, blk_mq_req_flags_t flags, gfp_t gfp) { const bool is_sync = op_is_sync(op); DEFINE_WAIT(wait); @@ -1538,7 +1532,7 @@ static struct request *get_request(struct request_queue *q, unsigned int op, rl = blk_get_rl(q, bio); /* transferred to @rq on success */ retry: - rq = __get_request(rl, op, bio, flags); + rq = __get_request(rl, op, bio, flags, gfp); if (!IS_ERR(rq)) return rq; @@ -1579,8 +1573,7 @@ static struct request *blk_old_get_request(struct request_queue *q, unsigned int op, blk_mq_req_flags_t flags) { struct request *rq; - gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : - __GFP_DIRECT_RECLAIM; + gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : GFP_NOIO; int ret = 0; WARN_ON_ONCE(q->mq_ops); @@ -1592,7 +1585,7 @@ static struct request *blk_old_get_request(struct request_queue *q, if (ret) return ERR_PTR(ret); spin_lock_irq(q->queue_lock); - rq = get_request(q, op, NULL, flags); + rq = get_request(q, op, NULL, flags, gfp_mask); if (IS_ERR(rq)) { spin_unlock_irq(q->queue_lock); blk_queue_exit(q); @@ -1607,13 +1600,13 @@ static struct request *blk_old_get_request(struct request_queue *q, } /** - * blk_get_request_flags - allocate a request + * blk_get_request - allocate a request * @q: request queue to allocate a request for * @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC. * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT. */ -struct request *blk_get_request_flags(struct request_queue *q, unsigned int op, - blk_mq_req_flags_t flags) +struct request *blk_get_request(struct request_queue *q, unsigned int op, + blk_mq_req_flags_t flags) { struct request *req; @@ -1632,14 +1625,6 @@ struct request *blk_get_request_flags(struct request_queue *q, unsigned int op, return req; } -EXPORT_SYMBOL(blk_get_request_flags); - -struct request *blk_get_request(struct request_queue *q, unsigned int op, - gfp_t gfp_mask) -{ - return blk_get_request_flags(q, op, gfp_mask & __GFP_DIRECT_RECLAIM ? - 0 : BLK_MQ_REQ_NOWAIT); -} EXPORT_SYMBOL(blk_get_request); /** @@ -1660,7 +1645,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) blk_delete_timer(rq); blk_clear_rq_complete(rq); trace_block_rq_requeue(q, rq); - wbt_requeue(q->rq_wb, &rq->issue_stat); + wbt_requeue(q->rq_wb, rq); if (rq->rq_flags & RQF_QUEUED) blk_queue_end_tag(q, rq); @@ -1767,7 +1752,7 @@ void __blk_put_request(struct request_queue *q, struct request *req) /* this is a bio leak */ WARN_ON(req->bio != NULL); - wbt_done(q->rq_wb, &req->issue_stat); + wbt_done(q->rq_wb, req); /* * Request may not have originated from ll_rw_blk. if not, @@ -2066,7 +2051,7 @@ get_rq: * Returns with the queue unlocked. */ blk_queue_enter_live(q); - req = get_request(q, bio->bi_opf, bio, 0); + req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO); if (IS_ERR(req)) { blk_queue_exit(q); __wbt_done(q->rq_wb, wb_acct); @@ -2078,7 +2063,7 @@ get_rq: goto out_unlock; } - wbt_track(&req->issue_stat, wb_acct); + wbt_track(req, wb_acct); /* * After dropping the lock and possibly sleeping here, our request @@ -2392,7 +2377,9 @@ blk_qc_t generic_make_request(struct bio *bio) if (bio->bi_opf & REQ_NOWAIT) flags = BLK_MQ_REQ_NOWAIT; - if (blk_queue_enter(q, flags) < 0) { + if (bio_flagged(bio, BIO_QUEUE_ENTERED)) + blk_queue_enter_live(q); + else if (blk_queue_enter(q, flags) < 0) { if (!blk_queue_dying(q) && (bio->bi_opf & REQ_NOWAIT)) bio_wouldblock_error(bio); else @@ -2727,7 +2714,7 @@ void blk_account_io_completion(struct request *req, unsigned int bytes) } } -void blk_account_io_done(struct request *req) +void blk_account_io_done(struct request *req, u64 now) { /* * Account IO completion. flush_rq isn't accounted as a @@ -2735,11 +2722,12 @@ void blk_account_io_done(struct request *req) * containing request is enough. */ if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { - unsigned long duration = jiffies - req->start_time; + unsigned long duration; const int rw = rq_data_dir(req); struct hd_struct *part; int cpu; + duration = nsecs_to_jiffies(now - req->start_time_ns); cpu = part_stat_lock(); part = req->part; @@ -2970,10 +2958,8 @@ static void blk_dequeue_request(struct request *rq) * and to it is freed is accounted as io that is in progress at * the driver side. */ - if (blk_account_rq(rq)) { + if (blk_account_rq(rq)) q->in_flight[rq_is_sync(rq)]++; - set_io_start_time_ns(rq); - } } /** @@ -2992,9 +2978,12 @@ void blk_start_request(struct request *req) blk_dequeue_request(req); if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) { - blk_stat_set_issue(&req->issue_stat, blk_rq_sectors(req)); + req->io_start_time_ns = ktime_get_ns(); +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW + req->throtl_size = blk_rq_sectors(req); +#endif req->rq_flags |= RQF_STATS; - wbt_issue(req->q->rq_wb, &req->issue_stat); + wbt_issue(req->q->rq_wb, req); } BUG_ON(blk_rq_is_complete(req)); @@ -3092,8 +3081,10 @@ bool blk_update_request(struct request *req, blk_status_t error, struct bio *bio = req->bio; unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); - if (bio_bytes == bio->bi_iter.bi_size) + if (bio_bytes == bio->bi_iter.bi_size) { req->bio = bio->bi_next; + bio->bi_next = NULL; + } /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); @@ -3190,12 +3181,13 @@ EXPORT_SYMBOL_GPL(blk_unprep_request); void blk_finish_request(struct request *req, blk_status_t error) { struct request_queue *q = req->q; + u64 now = ktime_get_ns(); lockdep_assert_held(req->q->queue_lock); WARN_ON_ONCE(q->mq_ops); if (req->rq_flags & RQF_STATS) - blk_stat_add(req); + blk_stat_add(req, now); if (req->rq_flags & RQF_QUEUED) blk_queue_end_tag(q, req); @@ -3210,10 +3202,10 @@ void blk_finish_request(struct request *req, blk_status_t error) if (req->rq_flags & RQF_DONTPREP) blk_unprep_request(req); - blk_account_io_done(req); + blk_account_io_done(req, now); if (req->end_io) { - wbt_done(req->q->rq_wb, &req->issue_stat); + wbt_done(req->q->rq_wb, req); req->end_io(req, error); } else { if (blk_bidi_rq(req)) @@ -3519,7 +3511,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio *bio, *bio_src; if (!bs) - bs = fs_bio_set; + bs = &fs_bio_set; __rq_for_each_bio(bio_src, rq_src) { bio = bio_clone_fast(bio_src, gfp_mask, bs); @@ -3630,7 +3622,7 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth, blk_run_queue_async(q); else __blk_run_queue(q); - spin_unlock(q->queue_lock); + spin_unlock_irq(q->queue_lock); } static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule) @@ -3678,7 +3670,6 @@ EXPORT_SYMBOL(blk_check_plugged); void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) { struct request_queue *q; - unsigned long flags; struct request *rq; LIST_HEAD(list); unsigned int depth; @@ -3698,11 +3689,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) q = NULL; depth = 0; - /* - * Save and disable interrupts here, to avoid doing it for every - * queue lock we have to take. - */ - local_irq_save(flags); while (!list_empty(&list)) { rq = list_entry_rq(list.next); list_del_init(&rq->queuelist); @@ -3715,7 +3701,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) queue_unplugged(q, depth, from_schedule); q = rq->q; depth = 0; - spin_lock(q->queue_lock); + spin_lock_irq(q->queue_lock); } /* @@ -3742,8 +3728,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) */ if (q) queue_unplugged(q, depth, from_schedule); - - local_irq_restore(flags); } void blk_finish_plug(struct blk_plug *plug) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index feb3057..6121611 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -333,34 +333,34 @@ static ssize_t integrity_device_show(struct blk_integrity *bi, char *page) } static struct integrity_sysfs_entry integrity_format_entry = { - .attr = { .name = "format", .mode = S_IRUGO }, + .attr = { .name = "format", .mode = 0444 }, .show = integrity_format_show, }; static struct integrity_sysfs_entry integrity_tag_size_entry = { - .attr = { .name = "tag_size", .mode = S_IRUGO }, + .attr = { .name = "tag_size", .mode = 0444 }, .show = integrity_tag_size_show, }; static struct integrity_sysfs_entry integrity_interval_entry = { - .attr = { .name = "protection_interval_bytes", .mode = S_IRUGO }, + .attr = { .name = "protection_interval_bytes", .mode = 0444 }, .show = integrity_interval_show, }; static struct integrity_sysfs_entry integrity_verify_entry = { - .attr = { .name = "read_verify", .mode = S_IRUGO | S_IWUSR }, + .attr = { .name = "read_verify", .mode = 0644 }, .show = integrity_verify_show, .store = integrity_verify_store, }; static struct integrity_sysfs_entry integrity_generate_entry = { - .attr = { .name = "write_generate", .mode = S_IRUGO | S_IWUSR }, + .attr = { .name = "write_generate", .mode = 0644 }, .show = integrity_generate_show, .store = integrity_generate_store, }; static struct integrity_sysfs_entry integrity_device_entry = { - .attr = { .name = "device_is_integrity_capable", .mode = S_IRUGO }, + .attr = { .name = "device_is_integrity_capable", .mode = 0444 }, .show = integrity_device_show, }; diff --git a/block/blk-lib.c b/block/blk-lib.c index a676084..8faa70f 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -62,10 +62,16 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, unsigned int req_sects; sector_t end_sect, tmp; - /* Make sure bi_size doesn't overflow */ - req_sects = min_t(sector_t, nr_sects, UINT_MAX >> 9); + /* + * Issue in chunks of the user defined max discard setting, + * ensuring that bi_size doesn't overflow + */ + req_sects = min_t(sector_t, nr_sects, + q->limits.max_discard_sectors); + if (req_sects > UINT_MAX >> 9) + req_sects = UINT_MAX >> 9; - /** + /* * If splitting a request, and the next starting sector would be * misaligned, stop the discard at the previous aligned sector. */ diff --git a/block/blk-merge.c b/block/blk-merge.c index 782940c..aaec38c 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -188,16 +188,16 @@ void blk_queue_split(struct request_queue *q, struct bio **bio) switch (bio_op(*bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: - split = blk_bio_discard_split(q, *bio, q->bio_split, &nsegs); + split = blk_bio_discard_split(q, *bio, &q->bio_split, &nsegs); break; case REQ_OP_WRITE_ZEROES: - split = blk_bio_write_zeroes_split(q, *bio, q->bio_split, &nsegs); + split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split, &nsegs); break; case REQ_OP_WRITE_SAME: - split = blk_bio_write_same_split(q, *bio, q->bio_split, &nsegs); + split = blk_bio_write_same_split(q, *bio, &q->bio_split, &nsegs); break; default: - split = blk_bio_segment_split(q, *bio, q->bio_split, &nsegs); + split = blk_bio_segment_split(q, *bio, &q->bio_split, &nsegs); break; } @@ -210,6 +210,16 @@ void blk_queue_split(struct request_queue *q, struct bio **bio) /* there isn't chance to merge the splitted bio */ split->bi_opf |= REQ_NOMERGE; + /* + * Since we're recursing into make_request here, ensure + * that we mark this bio as already having entered the queue. + * If not, and the queue is going away, we can get stuck + * forever on waiting for the queue reference to drop. But + * that will never happen, as we're already holding a + * reference to it. + */ + bio_set_flag(*bio, BIO_QUEUE_ENTERED); + bio_chain(split, *bio); trace_block_split(q, split, (*bio)->bi_iter.bi_sector); generic_make_request(*bio); @@ -724,13 +734,12 @@ static struct request *attempt_merge(struct request_queue *q, } /* - * At this point we have either done a back merge - * or front merge. We need the smaller start_time of - * the merged requests to be the current request - * for accounting purposes. + * At this point we have either done a back merge or front merge. We + * need the smaller start_time_ns of the merged requests to be the + * current request for accounting purposes. */ - if (time_after(req->start_time, next->start_time)) - req->start_time = next->start_time; + if (next->start_time_ns < req->start_time_ns) + req->start_time_ns = next->start_time_ns; req->biotail->bi_next = next->bio; req->biotail = next->biotail; diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 3080e18..ffa6223 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -344,7 +344,6 @@ static const char *const rqf_name[] = { RQF_NAME(STATS), RQF_NAME(SPECIAL_PAYLOAD), RQF_NAME(ZONE_WRITE_LOCKED), - RQF_NAME(MQ_TIMEOUT_EXPIRED), RQF_NAME(MQ_POLL_SLEPT), }; #undef RQF_NAME diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 25c14c5..56c493c 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -268,19 +268,16 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge); /* - * Reverse check our software queue for entries that we could potentially - * merge with. Currently includes a hand-wavy stop count of 8, to not spend - * too much time checking for merges. + * Iterate list of requests and see if we can merge this bio with any + * of them. */ -static bool blk_mq_attempt_merge(struct request_queue *q, - struct blk_mq_ctx *ctx, struct bio *bio) +bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, + struct bio *bio) { struct request *rq; int checked = 8; - lockdep_assert_held(&ctx->lock); - - list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) { + list_for_each_entry_reverse(rq, list, queuelist) { bool merged = false; if (!checked--) @@ -305,13 +302,30 @@ static bool blk_mq_attempt_merge(struct request_queue *q, continue; } - if (merged) - ctx->rq_merged++; return merged; } return false; } +EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge); + +/* + * Reverse check our software queue for entries that we could potentially + * merge with. Currently includes a hand-wavy stop count of 8, to not spend + * too much time checking for merges. + */ +static bool blk_mq_attempt_merge(struct request_queue *q, + struct blk_mq_ctx *ctx, struct bio *bio) +{ + lockdep_assert_held(&ctx->lock); + + if (blk_mq_bio_list_merge(q, &ctx->rq_list, bio)) { + ctx->rq_merged++; + return true; + } + + return false; +} bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) { @@ -571,6 +585,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) if (!e) { q->elevator = NULL; + q->nr_requests = q->tag_set->queue_depth; return 0; } @@ -633,14 +648,3 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) blk_mq_sched_tags_teardown(q); q->elevator = NULL; } - -int blk_mq_sched_init(struct request_queue *q) -{ - int ret; - - mutex_lock(&q->sysfs_lock); - ret = elevator_init(q, NULL); - mutex_unlock(&q->sysfs_lock); - - return ret; -} diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 1e9c901..0cb8f93 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -33,8 +33,6 @@ int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx); -int blk_mq_sched_init(struct request_queue *q); - static inline bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) { diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index a54b4b0..aafb442 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -166,15 +166,15 @@ static struct attribute *default_ctx_attrs[] = { }; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = { - .attr = {.name = "nr_tags", .mode = S_IRUGO }, + .attr = {.name = "nr_tags", .mode = 0444 }, .show = blk_mq_hw_sysfs_nr_tags_show, }; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_reserved_tags = { - .attr = {.name = "nr_reserved_tags", .mode = S_IRUGO }, + .attr = {.name = "nr_reserved_tags", .mode = 0444 }, .show = blk_mq_hw_sysfs_nr_reserved_tags_show, }; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = { - .attr = {.name = "cpu_list", .mode = S_IRUGO }, + .attr = {.name = "cpu_list", .mode = 0444 }, .show = blk_mq_hw_sysfs_cpus_show, }; diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 336dde0..70356a2a 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -134,6 +134,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) ws = bt_wait_ptr(bt, data->hctx); drop_ctx = data->ctx == NULL; do { + struct sbitmap_queue *bt_prev; + /* * We're out of tags on this hardware queue, kick any * pending IO submits before going to sleep waiting for @@ -159,6 +161,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) if (data->ctx) blk_mq_put_ctx(data->ctx); + bt_prev = bt; io_schedule(); data->ctx = blk_mq_get_ctx(data->q); @@ -170,6 +173,15 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) bt = &tags->bitmap_tags; finish_wait(&ws->wait, &wait); + + /* + * If destination hw queue is changed, fake wake up on + * previous queue for compensating the wake up miss, so + * other allocations on previous queue won't be starved. + */ + if (bt != bt_prev) + sbitmap_queue_wake_up(bt_prev); + ws = bt_wait_ptr(bt, data->hctx); } while (1); @@ -259,7 +271,7 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) * test and set the bit before assining ->rqs[]. */ rq = tags->rqs[bitnr]; - if (rq) + if (rq && blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) iter_data->fn(rq, iter_data->data, reserved); return true; diff --git a/block/blk-mq.c b/block/blk-mq.c index 9ce9cac..6332940 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -309,7 +309,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, RB_CLEAR_NODE(&rq->rb_node); rq->rq_disk = NULL; rq->part = NULL; - rq->start_time = jiffies; + rq->start_time_ns = ktime_get_ns(); + rq->io_start_time_ns = 0; rq->nr_phys_segments = 0; #if defined(CONFIG_BLK_DEV_INTEGRITY) rq->nr_integrity_segments = 0; @@ -328,11 +329,10 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, #ifdef CONFIG_BLK_CGROUP rq->rl = NULL; - set_start_time_ns(rq); - rq->io_start_time_ns = 0; #endif data->ctx->rq_dispatched[op_is_sync(op)]++; + refcount_set(&rq->ref, 1); return rq; } @@ -361,9 +361,11 @@ static struct request *blk_mq_get_request(struct request_queue *q, /* * Flush requests are special and go directly to the - * dispatch list. + * dispatch list. Don't include reserved tags in the + * limiting, as it isn't useful. */ - if (!op_is_flush(op) && e->type->ops.mq.limit_depth) + if (!op_is_flush(op) && e->type->ops.mq.limit_depth && + !(data->flags & BLK_MQ_REQ_RESERVED)) e->type->ops.mq.limit_depth(op, data); } @@ -464,13 +466,27 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, } EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); +static void __blk_mq_free_request(struct request *rq) +{ + struct request_queue *q = rq->q; + struct blk_mq_ctx *ctx = rq->mq_ctx; + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + const int sched_tag = rq->internal_tag; + + if (rq->tag != -1) + blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); + if (sched_tag != -1) + blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag); + blk_mq_sched_restart(hctx); + blk_queue_exit(q); +} + void blk_mq_free_request(struct request *rq) { struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); - const int sched_tag = rq->internal_tag; if (rq->rq_flags & RQF_ELVPRIV) { if (e && e->type->ops.mq.finish_request) @@ -488,27 +504,30 @@ void blk_mq_free_request(struct request *rq) if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) laptop_io_completion(q->backing_dev_info); - wbt_done(q->rq_wb, &rq->issue_stat); + wbt_done(q->rq_wb, rq); if (blk_rq_rl(rq)) blk_put_rl(blk_rq_rl(rq)); - blk_mq_rq_update_state(rq, MQ_RQ_IDLE); - if (rq->tag != -1) - blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); - if (sched_tag != -1) - blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag); - blk_mq_sched_restart(hctx); - blk_queue_exit(q); + WRITE_ONCE(rq->state, MQ_RQ_IDLE); + if (refcount_dec_and_test(&rq->ref)) + __blk_mq_free_request(rq); } EXPORT_SYMBOL_GPL(blk_mq_free_request); inline void __blk_mq_end_request(struct request *rq, blk_status_t error) { - blk_account_io_done(rq); + u64 now = ktime_get_ns(); + + if (rq->rq_flags & RQF_STATS) { + blk_mq_poll_stats_start(rq->q); + blk_stat_add(rq, now); + } + + blk_account_io_done(rq, now); if (rq->end_io) { - wbt_done(rq->q->rq_wb, &rq->issue_stat); + wbt_done(rq->q->rq_wb, rq); rq->end_io(rq, error); } else { if (unlikely(blk_bidi_rq(rq))) @@ -539,15 +558,12 @@ static void __blk_mq_complete_request(struct request *rq) bool shared = false; int cpu; - WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT); - blk_mq_rq_update_state(rq, MQ_RQ_COMPLETE); + if (cmpxchg(&rq->state, MQ_RQ_IN_FLIGHT, MQ_RQ_COMPLETE) != + MQ_RQ_IN_FLIGHT) + return; if (rq->internal_tag != -1) blk_mq_sched_completed_request(rq); - if (rq->rq_flags & RQF_STATS) { - blk_mq_poll_stats_start(rq->q); - blk_stat_add(rq); - } if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { rq->q->softirq_done_fn(rq); @@ -589,36 +605,6 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) *srcu_idx = srcu_read_lock(hctx->srcu); } -static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate) -{ - unsigned long flags; - - /* - * blk_mq_rq_aborted_gstate() is used from the completion path and - * can thus be called from irq context. u64_stats_fetch in the - * middle of update on the same CPU leads to lockup. Disable irq - * while updating. - */ - local_irq_save(flags); - u64_stats_update_begin(&rq->aborted_gstate_sync); - rq->aborted_gstate = gstate; - u64_stats_update_end(&rq->aborted_gstate_sync); - local_irq_restore(flags); -} - -static u64 blk_mq_rq_aborted_gstate(struct request *rq) -{ - unsigned int start; - u64 aborted_gstate; - - do { - start = u64_stats_fetch_begin(&rq->aborted_gstate_sync); - aborted_gstate = rq->aborted_gstate; - } while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start)); - - return aborted_gstate; -} - /** * blk_mq_complete_request - end I/O on a request * @rq: the request being processed @@ -629,28 +615,9 @@ static u64 blk_mq_rq_aborted_gstate(struct request *rq) **/ void blk_mq_complete_request(struct request *rq) { - struct request_queue *q = rq->q; - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu); - int srcu_idx; - - if (unlikely(blk_should_fake_timeout(q))) + if (unlikely(blk_should_fake_timeout(rq->q))) return; - - /* - * If @rq->aborted_gstate equals the current instance, timeout is - * claiming @rq and we lost. This is synchronized through - * hctx_lock(). See blk_mq_timeout_work() for details. - * - * Completion path never blocks and we can directly use RCU here - * instead of hctx_lock() which can be either RCU or SRCU. - * However, that would complicate paths which want to synchronize - * against us. Let stay in sync with the issue path so that - * hctx_lock() covers both issue and completion paths. - */ - hctx_lock(hctx, &srcu_idx); - if (blk_mq_rq_aborted_gstate(rq) != rq->gstate) - __blk_mq_complete_request(rq); - hctx_unlock(hctx, srcu_idx); + __blk_mq_complete_request(rq); } EXPORT_SYMBOL(blk_mq_complete_request); @@ -669,32 +636,18 @@ void blk_mq_start_request(struct request *rq) trace_block_rq_issue(q, rq); if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { - blk_stat_set_issue(&rq->issue_stat, blk_rq_sectors(rq)); + rq->io_start_time_ns = ktime_get_ns(); +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW + rq->throtl_size = blk_rq_sectors(rq); +#endif rq->rq_flags |= RQF_STATS; - wbt_issue(q->rq_wb, &rq->issue_stat); + wbt_issue(q->rq_wb, rq); } WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE); - /* - * Mark @rq in-flight which also advances the generation number, - * and register for timeout. Protect with a seqcount to allow the - * timeout path to read both @rq->gstate and @rq->deadline - * coherently. - * - * This is the only place where a request is marked in-flight. If - * the timeout path reads an in-flight @rq->gstate, the - * @rq->deadline it reads together under @rq->gstate_seq is - * guaranteed to be the matching one. - */ - preempt_disable(); - write_seqcount_begin(&rq->gstate_seq); - - blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT); blk_add_timer(rq); - - write_seqcount_end(&rq->gstate_seq); - preempt_enable(); + WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT); if (q->dma_drain_size && blk_rq_bytes(rq)) { /* @@ -707,11 +660,6 @@ void blk_mq_start_request(struct request *rq) } EXPORT_SYMBOL(blk_mq_start_request); -/* - * When we reach here because queue is busy, it's safe to change the state - * to IDLE without checking @rq->aborted_gstate because we should still be - * holding the RCU read lock and thus protected against timeout. - */ static void __blk_mq_requeue_request(struct request *rq) { struct request_queue *q = rq->q; @@ -719,10 +667,10 @@ static void __blk_mq_requeue_request(struct request *rq) blk_mq_put_driver_tag(rq); trace_block_rq_requeue(q, rq); - wbt_requeue(q->rq_wb, &rq->issue_stat); + wbt_requeue(q->rq_wb, rq); - if (blk_mq_rq_state(rq) != MQ_RQ_IDLE) { - blk_mq_rq_update_state(rq, MQ_RQ_IDLE); + if (blk_mq_request_started(rq)) { + WRITE_ONCE(rq->state, MQ_RQ_IDLE); if (q->dma_drain_size && blk_rq_bytes(rq)) rq->nr_phys_segments--; } @@ -820,101 +768,79 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) } EXPORT_SYMBOL(blk_mq_tag_to_rq); -struct blk_mq_timeout_data { - unsigned long next; - unsigned int next_set; - unsigned int nr_expired; -}; - static void blk_mq_rq_timed_out(struct request *req, bool reserved) { - const struct blk_mq_ops *ops = req->q->mq_ops; - enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; - - req->rq_flags |= RQF_MQ_TIMEOUT_EXPIRED; - - if (ops->timeout) - ret = ops->timeout(req, reserved); + if (req->q->mq_ops->timeout) { + enum blk_eh_timer_return ret; - switch (ret) { - case BLK_EH_HANDLED: - __blk_mq_complete_request(req); - break; - case BLK_EH_RESET_TIMER: - /* - * As nothing prevents from completion happening while - * ->aborted_gstate is set, this may lead to ignored - * completions and further spurious timeouts. - */ - blk_mq_rq_update_aborted_gstate(req, 0); - blk_add_timer(req); - break; - case BLK_EH_NOT_HANDLED: - break; - default: - printk(KERN_ERR "block: bad eh return: %d\n", ret); - break; + ret = req->q->mq_ops->timeout(req, reserved); + if (ret == BLK_EH_DONE) + return; + WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER); } + + blk_add_timer(req); } -static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, - struct request *rq, void *priv, bool reserved) +static bool blk_mq_req_expired(struct request *rq, unsigned long *next) { - struct blk_mq_timeout_data *data = priv; - unsigned long gstate, deadline; - int start; + unsigned long deadline; - might_sleep(); - - if (rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) - return; + if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT) + return false; - /* read coherent snapshots of @rq->state_gen and @rq->deadline */ - while (true) { - start = read_seqcount_begin(&rq->gstate_seq); - gstate = READ_ONCE(rq->gstate); - deadline = blk_rq_deadline(rq); - if (!read_seqcount_retry(&rq->gstate_seq, start)) - break; - cond_resched(); - } + deadline = blk_rq_deadline(rq); + if (time_after_eq(jiffies, deadline)) + return true; - /* if in-flight && overdue, mark for abortion */ - if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT && - time_after_eq(jiffies, deadline)) { - blk_mq_rq_update_aborted_gstate(rq, gstate); - data->nr_expired++; - hctx->nr_expired++; - } else if (!data->next_set || time_after(data->next, deadline)) { - data->next = deadline; - data->next_set = 1; - } + if (*next == 0) + *next = deadline; + else if (time_after(*next, deadline)) + *next = deadline; + return false; } -static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx, +static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { + unsigned long *next = priv; + /* - * We marked @rq->aborted_gstate and waited for RCU. If there were - * completions that we lost to, they would have finished and - * updated @rq->gstate by now; otherwise, the completion path is - * now guaranteed to see @rq->aborted_gstate and yield. If - * @rq->aborted_gstate still matches @rq->gstate, @rq is ours. + * Just do a quick check if it is expired before locking the request in + * so we're not unnecessarilly synchronizing across CPUs. + */ + if (!blk_mq_req_expired(rq, next)) + return; + + /* + * We have reason to believe the request may be expired. Take a + * reference on the request to lock this request lifetime into its + * currently allocated context to prevent it from being reallocated in + * the event the completion by-passes this timeout handler. + * + * If the reference was already released, then the driver beat the + * timeout handler to posting a natural completion. */ - if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) && - READ_ONCE(rq->gstate) == rq->aborted_gstate) + if (!refcount_inc_not_zero(&rq->ref)) + return; + + /* + * The request is now locked and cannot be reallocated underneath the + * timeout handler's processing. Re-verify this exact request is truly + * expired; if it is not expired, then the request was completed and + * reallocated as a new request. + */ + if (blk_mq_req_expired(rq, next)) blk_mq_rq_timed_out(rq, reserved); + if (refcount_dec_and_test(&rq->ref)) + __blk_mq_free_request(rq); } static void blk_mq_timeout_work(struct work_struct *work) { struct request_queue *q = container_of(work, struct request_queue, timeout_work); - struct blk_mq_timeout_data data = { - .next = 0, - .next_set = 0, - .nr_expired = 0, - }; + unsigned long next = 0; struct blk_mq_hw_ctx *hctx; int i; @@ -934,39 +860,10 @@ static void blk_mq_timeout_work(struct work_struct *work) if (!percpu_ref_tryget(&q->q_usage_counter)) return; - /* scan for the expired ones and set their ->aborted_gstate */ - blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); - - if (data.nr_expired) { - bool has_rcu = false; - - /* - * Wait till everyone sees ->aborted_gstate. The - * sequential waits for SRCUs aren't ideal. If this ever - * becomes a problem, we can add per-hw_ctx rcu_head and - * wait in parallel. - */ - queue_for_each_hw_ctx(q, hctx, i) { - if (!hctx->nr_expired) - continue; - - if (!(hctx->flags & BLK_MQ_F_BLOCKING)) - has_rcu = true; - else - synchronize_srcu(hctx->srcu); - - hctx->nr_expired = 0; - } - if (has_rcu) - synchronize_rcu(); - - /* terminate the ones we won */ - blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL); - } + blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next); - if (data.next_set) { - data.next = blk_rq_timeout(round_jiffies_up(data.next)); - mod_timer(&q->timeout, data.next); + if (next != 0) { + mod_timer(&q->timeout, next); } else { /* * Request timeouts are handled as a forward rolling timer. If @@ -1029,7 +926,7 @@ static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; spin_lock(&ctx->lock); - if (unlikely(!list_empty(&ctx->rq_list))) { + if (!list_empty(&ctx->rq_list)) { dispatch_data->rq = list_entry_rq(ctx->rq_list.next); list_del_init(&dispatch_data->rq->queuelist); if (list_empty(&ctx->rq_list)) @@ -1716,15 +1613,6 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) blk_account_io_start(rq, true); } -static inline void blk_mq_queue_io(struct blk_mq_hw_ctx *hctx, - struct blk_mq_ctx *ctx, - struct request *rq) -{ - spin_lock(&ctx->lock); - __blk_mq_insert_request(hctx, rq, false); - spin_unlock(&ctx->lock); -} - static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) { if (rq->tag != -1) @@ -1882,7 +1770,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) return BLK_QC_T_NONE; } - wbt_track(&rq->issue_stat, wb_acct); + wbt_track(rq, wb_acct); cookie = request_to_qc_t(data.hctx, rq); @@ -1949,15 +1837,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); blk_mq_try_issue_directly(data.hctx, rq, &cookie); - } else if (q->elevator) { - blk_mq_put_ctx(data.ctx); - blk_mq_bio_to_request(rq, bio); - blk_mq_sched_insert_request(rq, false, true, true); } else { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); - blk_mq_queue_io(data.hctx, data.ctx, rq); - blk_mq_run_hw_queue(data.hctx, true); + blk_mq_sched_insert_request(rq, false, true, true); } return cookie; @@ -2056,15 +1939,7 @@ static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, return ret; } - seqcount_init(&rq->gstate_seq); - u64_stats_init(&rq->aborted_gstate_sync); - /* - * start gstate with gen 1 instead of 0, otherwise it will be equal - * to aborted_gstate, and be identified timed out by - * blk_mq_terminate_expired. - */ - WRITE_ONCE(rq->gstate, MQ_RQ_GEN_INC); - + WRITE_ONCE(rq->state, MQ_RQ_IDLE); return 0; } @@ -2365,6 +2240,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) queue_for_each_hw_ctx(q, hctx, i) { cpumask_clear(hctx->cpumask); hctx->nr_ctx = 0; + hctx->dispatch_from = NULL; } /* @@ -2697,7 +2573,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, if (!(set->flags & BLK_MQ_F_NO_SCHED)) { int ret; - ret = blk_mq_sched_init(q); + ret = elevator_init_mq(q); if (ret) return ERR_PTR(ret); } diff --git a/block/blk-mq.h b/block/blk-mq.h index e1bb420..89231e4 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -30,20 +30,6 @@ struct blk_mq_ctx { struct kobject kobj; } ____cacheline_aligned_in_smp; -/* - * Bits for request->gstate. The lower two bits carry MQ_RQ_* state value - * and the upper bits the generation number. - */ -enum mq_rq_state { - MQ_RQ_IDLE = 0, - MQ_RQ_IN_FLIGHT = 1, - MQ_RQ_COMPLETE = 2, - - MQ_RQ_STATE_BITS = 2, - MQ_RQ_STATE_MASK = (1 << MQ_RQ_STATE_BITS) - 1, - MQ_RQ_GEN_INC = 1 << MQ_RQ_STATE_BITS, -}; - void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); @@ -107,33 +93,9 @@ void blk_mq_release(struct request_queue *q); * blk_mq_rq_state() - read the current MQ_RQ_* state of a request * @rq: target request. */ -static inline int blk_mq_rq_state(struct request *rq) +static inline enum mq_rq_state blk_mq_rq_state(struct request *rq) { - return READ_ONCE(rq->gstate) & MQ_RQ_STATE_MASK; -} - -/** - * blk_mq_rq_update_state() - set the current MQ_RQ_* state of a request - * @rq: target request. - * @state: new state to set. - * - * Set @rq's state to @state. The caller is responsible for ensuring that - * there are no other updaters. A request can transition into IN_FLIGHT - * only from IDLE and doing so increments the generation number. - */ -static inline void blk_mq_rq_update_state(struct request *rq, - enum mq_rq_state state) -{ - u64 old_val = READ_ONCE(rq->gstate); - u64 new_val = (old_val & ~MQ_RQ_STATE_MASK) | state; - - if (state == MQ_RQ_IN_FLIGHT) { - WARN_ON_ONCE((old_val & MQ_RQ_STATE_MASK) != MQ_RQ_IDLE); - new_val += MQ_RQ_GEN_INC; - } - - /* avoid exposing interim values */ - WRITE_ONCE(rq->gstate, new_val); + return READ_ONCE(rq->state); } static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, diff --git a/block/blk-stat.c b/block/blk-stat.c index bd365a9..175c143 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -47,19 +47,15 @@ static void __blk_stat_add(struct blk_rq_stat *stat, u64 value) stat->nr_samples++; } -void blk_stat_add(struct request *rq) +void blk_stat_add(struct request *rq, u64 now) { struct request_queue *q = rq->q; struct blk_stat_callback *cb; struct blk_rq_stat *stat; int bucket; - u64 now, value; + u64 value; - now = __blk_stat_time(ktime_to_ns(ktime_get())); - if (now < blk_stat_time(&rq->issue_stat)) - return; - - value = now - blk_stat_time(&rq->issue_stat); + value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0; blk_throtl_stat_add(rq, value); diff --git a/block/blk-stat.h b/block/blk-stat.h index 2dd3634..78399cd 100644 --- a/block/blk-stat.h +++ b/block/blk-stat.h @@ -8,21 +8,6 @@ #include <linux/rcupdate.h> #include <linux/timer.h> -/* - * from upper: - * 3 bits: reserved for other usage - * 12 bits: size - * 49 bits: time - */ -#define BLK_STAT_RES_BITS 3 -#define BLK_STAT_SIZE_BITS 12 -#define BLK_STAT_RES_SHIFT (64 - BLK_STAT_RES_BITS) -#define BLK_STAT_SIZE_SHIFT (BLK_STAT_RES_SHIFT - BLK_STAT_SIZE_BITS) -#define BLK_STAT_TIME_MASK ((1ULL << BLK_STAT_SIZE_SHIFT) - 1) -#define BLK_STAT_SIZE_MASK \ - (((1ULL << BLK_STAT_SIZE_BITS) - 1) << BLK_STAT_SIZE_SHIFT) -#define BLK_STAT_RES_MASK (~((1ULL << BLK_STAT_RES_SHIFT) - 1)) - /** * struct blk_stat_callback - Block statistics callback. * @@ -80,35 +65,7 @@ struct blk_stat_callback { struct blk_queue_stats *blk_alloc_queue_stats(void); void blk_free_queue_stats(struct blk_queue_stats *); -void blk_stat_add(struct request *); - -static inline u64 __blk_stat_time(u64 time) -{ - return time & BLK_STAT_TIME_MASK; -} - -static inline u64 blk_stat_time(struct blk_issue_stat *stat) -{ - return __blk_stat_time(stat->stat); -} - -static inline sector_t blk_capped_size(sector_t size) -{ - return size & ((1ULL << BLK_STAT_SIZE_BITS) - 1); -} - -static inline sector_t blk_stat_size(struct blk_issue_stat *stat) -{ - return (stat->stat & BLK_STAT_SIZE_MASK) >> BLK_STAT_SIZE_SHIFT; -} - -static inline void blk_stat_set_issue(struct blk_issue_stat *stat, - sector_t size) -{ - stat->stat = (stat->stat & BLK_STAT_RES_MASK) | - (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK) | - (((u64)blk_capped_size(size)) << BLK_STAT_SIZE_SHIFT); -} +void blk_stat_add(struct request *rq, u64 now); /* record time/size info in request but not add a callback */ void blk_stat_enable_accounting(struct request_queue *q); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index d00d1b0..94987b1 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -491,188 +491,198 @@ static ssize_t queue_wc_store(struct request_queue *q, const char *page, return count; } +static ssize_t queue_fua_show(struct request_queue *q, char *page) +{ + return sprintf(page, "%u\n", test_bit(QUEUE_FLAG_FUA, &q->queue_flags)); +} + static ssize_t queue_dax_show(struct request_queue *q, char *page) { return queue_var_show(blk_queue_dax(q), page); } static struct queue_sysfs_entry queue_requests_entry = { - .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, + .attr = {.name = "nr_requests", .mode = 0644 }, .show = queue_requests_show, .store = queue_requests_store, }; static struct queue_sysfs_entry queue_ra_entry = { - .attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR }, + .attr = {.name = "read_ahead_kb", .mode = 0644 }, .show = queue_ra_show, .store = queue_ra_store, }; static struct queue_sysfs_entry queue_max_sectors_entry = { - .attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR }, + .attr = {.name = "max_sectors_kb", .mode = 0644 }, .show = queue_max_sectors_show, .store = queue_max_sectors_store, }; static struct queue_sysfs_entry queue_max_hw_sectors_entry = { - .attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO }, + .attr = {.name = "max_hw_sectors_kb", .mode = 0444 }, .show = queue_max_hw_sectors_show, }; static struct queue_sysfs_entry queue_max_segments_entry = { - .attr = {.name = "max_segments", .mode = S_IRUGO }, + .attr = {.name = "max_segments", .mode = 0444 }, .show = queue_max_segments_show, }; static struct queue_sysfs_entry queue_max_discard_segments_entry = { - .attr = {.name = "max_discard_segments", .mode = S_IRUGO }, + .attr = {.name = "max_discard_segments", .mode = 0444 }, .show = queue_max_discard_segments_show, }; static struct queue_sysfs_entry queue_max_integrity_segments_entry = { - .attr = {.name = "max_integrity_segments", .mode = S_IRUGO }, + .attr = {.name = "max_integrity_segments", .mode = 0444 }, .show = queue_max_integrity_segments_show, }; static struct queue_sysfs_entry queue_max_segment_size_entry = { - .attr = {.name = "max_segment_size", .mode = S_IRUGO }, + .attr = {.name = "max_segment_size", .mode = 0444 }, .show = queue_max_segment_size_show, }; static struct queue_sysfs_entry queue_iosched_entry = { - .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR }, + .attr = {.name = "scheduler", .mode = 0644 }, .show = elv_iosched_show, .store = elv_iosched_store, }; static struct queue_sysfs_entry queue_hw_sector_size_entry = { - .attr = {.name = "hw_sector_size", .mode = S_IRUGO }, + .attr = {.name = "hw_sector_size", .mode = 0444 }, .show = queue_logical_block_size_show, }; static struct queue_sysfs_entry queue_logical_block_size_entry = { - .attr = {.name = "logical_block_size", .mode = S_IRUGO }, + .attr = {.name = "logical_block_size", .mode = 0444 }, .show = queue_logical_block_size_show, }; static struct queue_sysfs_entry queue_physical_block_size_entry = { - .attr = {.name = "physical_block_size", .mode = S_IRUGO }, + .attr = {.name = "physical_block_size", .mode = 0444 }, .show = queue_physical_block_size_show, }; static struct queue_sysfs_entry queue_chunk_sectors_entry = { - .attr = {.name = "chunk_sectors", .mode = S_IRUGO }, + .attr = {.name = "chunk_sectors", .mode = 0444 }, .show = queue_chunk_sectors_show, }; static struct queue_sysfs_entry queue_io_min_entry = { - .attr = {.name = "minimum_io_size", .mode = S_IRUGO }, + .attr = {.name = "minimum_io_size", .mode = 0444 }, .show = queue_io_min_show, }; static struct queue_sysfs_entry queue_io_opt_entry = { - .attr = {.name = "optimal_io_size", .mode = S_IRUGO }, + .attr = {.name = "optimal_io_size", .mode = 0444 }, .show = queue_io_opt_show, }; static struct queue_sysfs_entry queue_discard_granularity_entry = { - .attr = {.name = "discard_granularity", .mode = S_IRUGO }, + .attr = {.name = "discard_granularity", .mode = 0444 }, .show = queue_discard_granularity_show, }; static struct queue_sysfs_entry queue_discard_max_hw_entry = { - .attr = {.name = "discard_max_hw_bytes", .mode = S_IRUGO }, + .attr = {.name = "discard_max_hw_bytes", .mode = 0444 }, .show = queue_discard_max_hw_show, }; static struct queue_sysfs_entry queue_discard_max_entry = { - .attr = {.name = "discard_max_bytes", .mode = S_IRUGO | S_IWUSR }, + .attr = {.name = "discard_max_bytes", .mode = 0644 }, .show = queue_discard_max_show, .store = queue_discard_max_store, }; static struct queue_sysfs_entry queue_discard_zeroes_data_entry = { - .attr = {.name = "discard_zeroes_data", .mode = S_IRUGO }, + .attr = {.name = "discard_zeroes_data", .mode = 0444 }, .show = queue_discard_zeroes_data_show, }; static struct queue_sysfs_entry queue_write_same_max_entry = { - .attr = {.name = "write_same_max_bytes", .mode = S_IRUGO }, + .attr = {.name = "write_same_max_bytes", .mode = 0444 }, .show = queue_write_same_max_show, }; static struct queue_sysfs_entry queue_write_zeroes_max_entry = { - .attr = {.name = "write_zeroes_max_bytes", .mode = S_IRUGO }, + .attr = {.name = "write_zeroes_max_bytes", .mode = 0444 }, .show = queue_write_zeroes_max_show, }; static struct queue_sysfs_entry queue_nonrot_entry = { - .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, + .attr = {.name = "rotational", .mode = 0644 }, .show = queue_show_nonrot, .store = queue_store_nonrot, }; static struct queue_sysfs_entry queue_zoned_entry = { - .attr = {.name = "zoned", .mode = S_IRUGO }, + .attr = {.name = "zoned", .mode = 0444 }, .show = queue_zoned_show, }; static struct queue_sysfs_entry queue_nomerges_entry = { - .attr = {.name = "nomerges", .mode = S_IRUGO | S_IWUSR }, + .attr = {.name = "nomerges", .mode = 0644 }, .show = queue_nomerges_show, .store = queue_nomerges_store, }; static struct queue_sysfs_entry queue_rq_affinity_entry = { - .attr = {.name = "rq_affinity", .mode = S_IRUGO | S_IWUSR }, + .attr = {.name = "rq_affinity", .mode = 0644 }, .show = queue_rq_affinity_show, .store = queue_rq_affinity_store, }; static struct queue_sysfs_entry queue_iostats_entry = { - .attr = {.name = "iostats", .mode = S_IRUGO | S_IWUSR }, + .attr = {.name = "iostats", .mode = 0644 }, .show = queue_show_iostats, .store = queue_store_iostats, }; static struct queue_sysfs_entry queue_random_entry = { - .attr = {.name = "add_random", .mode = S_IRUGO | S_IWUSR }, + .attr = {.name = "add_random", .mode = 0644 }, .show = queue_show_random, .store = queue_store_random, }; static struct queue_sysfs_entry queue_poll_entry = { - .attr = {.name = "io_poll", .mode = S_IRUGO | S_IWUSR }, + .attr = {.name = "io_poll", .mode = 0644 }, .show = queue_poll_show, .store = queue_poll_store, }; static struct queue_sysfs_entry queue_poll_delay_entry = { - .attr = {.name = "io_poll_delay", .mode = S_IRUGO | S_IWUSR }, + .attr = {.name = "io_poll_delay", .mode = 0644 }, .show = queue_poll_delay_show, .store = queue_poll_delay_store, }; static struct queue_sysfs_entry queue_wc_entry = { - .attr = {.name = "write_cache", .mode = S_IRUGO | S_IWUSR }, + .attr = {.name = "write_cache", .mode = 0644 }, .show = queue_wc_show, .store = queue_wc_store, }; +static struct queue_sysfs_entry queue_fua_entry = { + .attr = {.name = "fua", .mode = 0444 }, + .show = queue_fua_show, +}; + static struct queue_sysfs_entry queue_dax_entry = { - .attr = {.name = "dax", .mode = S_IRUGO }, + .attr = {.name = "dax", .mode = 0444 }, .show = queue_dax_show, }; static struct queue_sysfs_entry queue_wb_lat_entry = { - .attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR }, + .attr = {.name = "wbt_lat_usec", .mode = 0644 }, .show = queue_wb_lat_show, .store = queue_wb_lat_store, }; #ifdef CONFIG_BLK_DEV_THROTTLING_LOW static struct queue_sysfs_entry throtl_sample_time_entry = { - .attr = {.name = "throttle_sample_time", .mode = S_IRUGO | S_IWUSR }, + .attr = {.name = "throttle_sample_time", .mode = 0644 }, .show = blk_throtl_sample_time_show, .store = blk_throtl_sample_time_store, }; @@ -708,6 +718,7 @@ static struct attribute *default_attrs[] = { &queue_random_entry.attr, &queue_poll_entry.attr, &queue_wc_entry.attr, + &queue_fua_entry.attr, &queue_dax_entry.attr, &queue_wb_lat_entry.attr, &queue_poll_delay_entry.attr, @@ -813,8 +824,7 @@ static void __blk_release_queue(struct work_struct *work) if (q->mq_ops) blk_mq_debugfs_unregister(q); - if (q->bio_split) - bioset_free(q->bio_split); + bioset_exit(&q->bio_split); ida_simple_remove(&blk_queue_ida, q->id); call_rcu(&q->rcu_head, blk_free_queue_rcu); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index c5a1316..82282e6 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -36,8 +36,6 @@ static int throtl_quantum = 32; */ #define LATENCY_FILTERED_HD (1000L) /* 1ms */ -#define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT) - static struct blkcg_policy blkcg_policy_throtl; /* A workqueue to queue throttle related work */ @@ -821,7 +819,7 @@ static bool throtl_slice_used(struct throtl_grp *tg, bool rw) if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) return false; - return 1; + return true; } /* Trim the used slices and adjust slice start accordingly */ @@ -931,7 +929,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, if (wait) *wait = jiffy_wait; - return 0; + return false; } static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, @@ -974,7 +972,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); if (wait) *wait = jiffy_wait; - return 0; + return false; } /* @@ -1024,7 +1022,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, tg_with_in_iops_limit(tg, bio, &iops_wait)) { if (wait) *wait = 0; - return 1; + return true; } max_wait = max(bps_wait, iops_wait); @@ -1035,7 +1033,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, if (time_before(tg->slice_end[rw], jiffies + max_wait)) throtl_extend_slice(tg, rw, jiffies + max_wait); - return 0; + return false; } static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) @@ -1209,7 +1207,7 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) while (1) { struct throtl_grp *tg = throtl_rb_first(parent_sq); - struct throtl_service_queue *sq = &tg->service_queue; + struct throtl_service_queue *sq; if (!tg) break; @@ -1221,6 +1219,7 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) nr_disp += throtl_dispatch_tg(tg); + sq = &tg->service_queue; if (sq->nr_queued[0] || sq->nr_queued[1]) tg_update_disptime(tg); @@ -2139,7 +2138,7 @@ static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio) bio->bi_cg_private = tg; blkg_get(tg_to_blkg(tg)); } - blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio)); + bio_issue_init(&bio->bi_issue, bio_sectors(bio)); #endif } @@ -2251,7 +2250,7 @@ out: #ifdef CONFIG_BLK_DEV_THROTTLING_LOW if (throttled || !td->track_bio_latency) - bio->bi_issue_stat.stat |= SKIP_LATENCY; + bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY; #endif return throttled; } @@ -2281,8 +2280,7 @@ void blk_throtl_stat_add(struct request *rq, u64 time_ns) struct request_queue *q = rq->q; struct throtl_data *td = q->td; - throtl_track_latency(td, blk_stat_size(&rq->issue_stat), - req_op(rq), time_ns >> 10); + throtl_track_latency(td, rq->throtl_size, req_op(rq), time_ns >> 10); } void blk_throtl_bio_endio(struct bio *bio) @@ -2302,8 +2300,8 @@ void blk_throtl_bio_endio(struct bio *bio) finish_time_ns = ktime_get_ns(); tg->last_finish_time = finish_time_ns >> 10; - start_time = blk_stat_time(&bio->bi_issue_stat) >> 10; - finish_time = __blk_stat_time(finish_time_ns) >> 10; + start_time = bio_issue_time(&bio->bi_issue) >> 10; + finish_time = __bio_issue_time(finish_time_ns) >> 10; if (!start_time || finish_time <= start_time) { blkg_put(tg_to_blkg(tg)); return; @@ -2311,16 +2309,15 @@ void blk_throtl_bio_endio(struct bio *bio) lat = finish_time - start_time; /* this is only for bio based driver */ - if (!(bio->bi_issue_stat.stat & SKIP_LATENCY)) - throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat), - bio_op(bio), lat); + if (!(bio->bi_issue.value & BIO_ISSUE_THROTL_SKIP_LATENCY)) + throtl_track_latency(tg->td, bio_issue_size(&bio->bi_issue), + bio_op(bio), lat); if (tg->latency_target && lat >= tg->td->filtered_latency) { int bucket; unsigned int threshold; - bucket = request_bucket_index( - blk_stat_size(&bio->bi_issue_stat)); + bucket = request_bucket_index(bio_issue_size(&bio->bi_issue)); threshold = tg->td->avg_buckets[rw][bucket].latency + tg->latency_target; if (lat > threshold) diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 652d4d4..4b8a48d 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -86,14 +86,11 @@ static void blk_rq_timed_out(struct request *req) if (q->rq_timed_out_fn) ret = q->rq_timed_out_fn(req); switch (ret) { - case BLK_EH_HANDLED: - __blk_complete_request(req); - break; case BLK_EH_RESET_TIMER: blk_add_timer(req); blk_clear_rq_complete(req); break; - case BLK_EH_NOT_HANDLED: + case BLK_EH_DONE: /* * LLD handles this for now but in the future * we can send a request msg to abort the command @@ -214,7 +211,6 @@ void blk_add_timer(struct request *req) req->timeout = q->rq_timeout; blk_rq_set_deadline(req, jiffies + req->timeout); - req->rq_flags &= ~RQF_MQ_TIMEOUT_EXPIRED; /* * Only the non-mq case needs to add the request to a protected list. diff --git a/block/blk-wbt.c b/block/blk-wbt.c index f92fc84..4f89b28 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -29,6 +29,26 @@ #define CREATE_TRACE_POINTS #include <trace/events/wbt.h> +static inline void wbt_clear_state(struct request *rq) +{ + rq->wbt_flags = 0; +} + +static inline enum wbt_flags wbt_flags(struct request *rq) +{ + return rq->wbt_flags; +} + +static inline bool wbt_is_tracked(struct request *rq) +{ + return rq->wbt_flags & WBT_TRACKED; +} + +static inline bool wbt_is_read(struct request *rq) +{ + return rq->wbt_flags & WBT_READ; +} + enum { /* * Default setting, we'll scale up (to 75% of QD max) or down (min 1) @@ -101,9 +121,15 @@ static bool wb_recent_wait(struct rq_wb *rwb) return time_before(jiffies, wb->dirty_sleep + HZ); } -static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, bool is_kswapd) +static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, + enum wbt_flags wb_acct) { - return &rwb->rq_wait[is_kswapd]; + if (wb_acct & WBT_KSWAPD) + return &rwb->rq_wait[WBT_RWQ_KSWAPD]; + else if (wb_acct & WBT_DISCARD) + return &rwb->rq_wait[WBT_RWQ_DISCARD]; + + return &rwb->rq_wait[WBT_RWQ_BG]; } static void rwb_wake_all(struct rq_wb *rwb) @@ -126,7 +152,7 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct) if (!(wb_acct & WBT_TRACKED)) return; - rqw = get_rq_wait(rwb, wb_acct & WBT_KSWAPD); + rqw = get_rq_wait(rwb, wb_acct); inflight = atomic_dec_return(&rqw->inflight); /* @@ -139,10 +165,13 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct) } /* - * If the device does write back caching, drop further down - * before we wake people up. + * For discards, our limit is always the background. For writes, if + * the device does write back caching, drop further down before we + * wake people up. */ - if (rwb->wc && !wb_recent_wait(rwb)) + if (wb_acct & WBT_DISCARD) + limit = rwb->wb_background; + else if (rwb->wc && !wb_recent_wait(rwb)) limit = 0; else limit = rwb->wb_normal; @@ -165,24 +194,24 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct) * Called on completion of a request. Note that it's also called when * a request is merged, when the request gets freed. */ -void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat) +void wbt_done(struct rq_wb *rwb, struct request *rq) { if (!rwb) return; - if (!wbt_is_tracked(stat)) { - if (rwb->sync_cookie == stat) { + if (!wbt_is_tracked(rq)) { + if (rwb->sync_cookie == rq) { rwb->sync_issue = 0; rwb->sync_cookie = NULL; } - if (wbt_is_read(stat)) + if (wbt_is_read(rq)) wb_timestamp(rwb, &rwb->last_comp); } else { - WARN_ON_ONCE(stat == rwb->sync_cookie); - __wbt_done(rwb, wbt_stat_to_mask(stat)); + WARN_ON_ONCE(rq == rwb->sync_cookie); + __wbt_done(rwb, wbt_flags(rq)); } - wbt_clear_state(stat); + wbt_clear_state(rq); } /* @@ -479,6 +508,9 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) { unsigned int limit; + if ((rw & REQ_OP_MASK) == REQ_OP_DISCARD) + return rwb->wb_background; + /* * At this point we know it's a buffered write. If this is * kswapd trying to free memory, or REQ_SYNC is set, then @@ -529,11 +561,12 @@ static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw, * Block if we will exceed our limit, or if we are currently waiting for * the timer to kick off queuing again. */ -static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock) +static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, + unsigned long rw, spinlock_t *lock) __releases(lock) __acquires(lock) { - struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd()); + struct rq_wait *rqw = get_rq_wait(rwb, wb_acct); DEFINE_WAIT(wait); if (may_queue(rwb, rqw, &wait, rw)) @@ -559,21 +592,20 @@ static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock) static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) { - const int op = bio_op(bio); - - /* - * If not a WRITE, do nothing - */ - if (op != REQ_OP_WRITE) - return false; - - /* - * Don't throttle WRITE_ODIRECT - */ - if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) == (REQ_SYNC | REQ_IDLE)) + switch (bio_op(bio)) { + case REQ_OP_WRITE: + /* + * Don't throttle WRITE_ODIRECT + */ + if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) == + (REQ_SYNC | REQ_IDLE)) + return false; + /* fallthrough */ + case REQ_OP_DISCARD: + return true; + default: return false; - - return true; + } } /* @@ -584,7 +616,7 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) */ enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock) { - unsigned int ret = 0; + enum wbt_flags ret = 0; if (!rwb_enabled(rwb)) return 0; @@ -598,41 +630,42 @@ enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock) return ret; } - __wbt_wait(rwb, bio->bi_opf, lock); + if (current_is_kswapd()) + ret |= WBT_KSWAPD; + if (bio_op(bio) == REQ_OP_DISCARD) + ret |= WBT_DISCARD; + + __wbt_wait(rwb, ret, bio->bi_opf, lock); if (!blk_stat_is_active(rwb->cb)) rwb_arm_timer(rwb); - if (current_is_kswapd()) - ret |= WBT_KSWAPD; - return ret | WBT_TRACKED; } -void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat) +void wbt_issue(struct rq_wb *rwb, struct request *rq) { if (!rwb_enabled(rwb)) return; /* - * Track sync issue, in case it takes a long time to complete. Allows - * us to react quicker, if a sync IO takes a long time to complete. - * Note that this is just a hint. 'stat' can go away when the - * request completes, so it's important we never dereference it. We - * only use the address to compare with, which is why we store the - * sync_issue time locally. + * Track sync issue, in case it takes a long time to complete. Allows us + * to react quicker, if a sync IO takes a long time to complete. Note + * that this is just a hint. The request can go away when it completes, + * so it's important we never dereference it. We only use the address to + * compare with, which is why we store the sync_issue time locally. */ - if (wbt_is_read(stat) && !rwb->sync_issue) { - rwb->sync_cookie = stat; - rwb->sync_issue = blk_stat_time(stat); + if (wbt_is_read(rq) && !rwb->sync_issue) { + rwb->sync_cookie = rq; + rwb->sync_issue = rq->io_start_time_ns; } } -void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat) +void wbt_requeue(struct rq_wb *rwb, struct request *rq) { if (!rwb_enabled(rwb)) return; - if (stat == rwb->sync_cookie) { + if (rq == rwb->sync_cookie) { rwb->sync_issue = 0; rwb->sync_cookie = NULL; } @@ -701,7 +734,7 @@ static int wbt_data_dir(const struct request *rq) if (op == REQ_OP_READ) return READ; - else if (op == REQ_OP_WRITE || op == REQ_OP_FLUSH) + else if (op_is_write(op)) return WRITE; /* don't account */ @@ -713,8 +746,6 @@ int wbt_init(struct request_queue *q) struct rq_wb *rwb; int i; - BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS); - rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); if (!rwb) return -ENOMEM; diff --git a/block/blk-wbt.h b/block/blk-wbt.h index a232c98..300df53 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -14,12 +14,16 @@ enum wbt_flags { WBT_TRACKED = 1, /* write, tracked for throttling */ WBT_READ = 2, /* read */ WBT_KSWAPD = 4, /* write, from kswapd */ + WBT_DISCARD = 8, /* discard */ - WBT_NR_BITS = 3, /* number of bits */ + WBT_NR_BITS = 4, /* number of bits */ }; enum { - WBT_NUM_RWQ = 2, + WBT_RWQ_BG = 0, + WBT_RWQ_KSWAPD, + WBT_RWQ_DISCARD, + WBT_NUM_RWQ, }; /* @@ -31,31 +35,6 @@ enum { WBT_STATE_ON_MANUAL = 2, }; -static inline void wbt_clear_state(struct blk_issue_stat *stat) -{ - stat->stat &= ~BLK_STAT_RES_MASK; -} - -static inline enum wbt_flags wbt_stat_to_mask(struct blk_issue_stat *stat) -{ - return (stat->stat & BLK_STAT_RES_MASK) >> BLK_STAT_RES_SHIFT; -} - -static inline void wbt_track(struct blk_issue_stat *stat, enum wbt_flags wb_acct) -{ - stat->stat |= ((u64) wb_acct) << BLK_STAT_RES_SHIFT; -} - -static inline bool wbt_is_tracked(struct blk_issue_stat *stat) -{ - return (stat->stat >> BLK_STAT_RES_SHIFT) & WBT_TRACKED; -} - -static inline bool wbt_is_read(struct blk_issue_stat *stat) -{ - return (stat->stat >> BLK_STAT_RES_SHIFT) & WBT_READ; -} - struct rq_wait { wait_queue_head_t wait; atomic_t inflight; @@ -84,7 +63,7 @@ struct rq_wb { struct blk_stat_callback *cb; - s64 sync_issue; + u64 sync_issue; void *sync_cookie; unsigned int wc; @@ -109,14 +88,19 @@ static inline unsigned int wbt_inflight(struct rq_wb *rwb) #ifdef CONFIG_BLK_WBT +static inline void wbt_track(struct request *rq, enum wbt_flags flags) +{ + rq->wbt_flags |= flags; +} + void __wbt_done(struct rq_wb *, enum wbt_flags); -void wbt_done(struct rq_wb *, struct blk_issue_stat *); +void wbt_done(struct rq_wb *, struct request *); enum wbt_flags wbt_wait(struct rq_wb *, struct bio *, spinlock_t *); int wbt_init(struct request_queue *); void wbt_exit(struct request_queue *); void wbt_update_limits(struct rq_wb *); -void wbt_requeue(struct rq_wb *, struct blk_issue_stat *); -void wbt_issue(struct rq_wb *, struct blk_issue_stat *); +void wbt_requeue(struct rq_wb *, struct request *); +void wbt_issue(struct rq_wb *, struct request *); void wbt_disable_default(struct request_queue *); void wbt_enable_default(struct request_queue *); @@ -127,10 +111,13 @@ u64 wbt_default_latency_nsec(struct request_queue *); #else +static inline void wbt_track(struct request *rq, enum wbt_flags flags) +{ +} static inline void __wbt_done(struct rq_wb *rwb, enum wbt_flags flags) { } -static inline void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat) +static inline void wbt_done(struct rq_wb *rwb, struct request *rq) { } static inline enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, @@ -148,10 +135,10 @@ static inline void wbt_exit(struct request_queue *q) static inline void wbt_update_limits(struct rq_wb *rwb) { } -static inline void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat) +static inline void wbt_requeue(struct rq_wb *rwb, struct request *rq) { } -static inline void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat) +static inline void wbt_issue(struct rq_wb *rwb, struct request *rq) { } static inline void wbt_disable_default(struct request_queue *q) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 08e84ef..3d08dc84 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -328,7 +328,11 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, if (!rep.nr_zones) return -EINVAL; - zones = kcalloc(rep.nr_zones, sizeof(struct blk_zone), GFP_KERNEL); + if (rep.nr_zones > INT_MAX / sizeof(struct blk_zone)) + return -ERANGE; + + zones = kvmalloc(rep.nr_zones * sizeof(struct blk_zone), + GFP_KERNEL | __GFP_ZERO); if (!zones) return -ENOMEM; @@ -350,7 +354,7 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, } out: - kfree(zones); + kvfree(zones); return ret; } diff --git a/block/blk.h b/block/blk.h index b034fd2..8d23aea9 100644 --- a/block/blk.h +++ b/block/blk.h @@ -186,7 +186,7 @@ unsigned int blk_plug_queued_count(struct request_queue *q); void blk_account_io_start(struct request *req, bool new_io); void blk_account_io_completion(struct request *req, unsigned int bytes); -void blk_account_io_done(struct request *req); +void blk_account_io_done(struct request *req, u64 now); /* * EH timer and IO completion will both attempt to 'grab' the request, make @@ -231,6 +231,9 @@ static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq e->type->ops.sq.elevator_deactivate_req_fn(q, rq); } +int elevator_init(struct request_queue *); +int elevator_init_mq(struct request_queue *q); +void elevator_exit(struct request_queue *, struct elevator_queue *); int elv_register_queue(struct request_queue *q); void elv_unregister_queue(struct request_queue *q); diff --git a/block/bounce.c b/block/bounce.c index dd0b93f..fd31347 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -28,28 +28,29 @@ #define POOL_SIZE 64 #define ISA_POOL_SIZE 16 -static struct bio_set *bounce_bio_set, *bounce_bio_split; -static mempool_t *page_pool, *isa_page_pool; +static struct bio_set bounce_bio_set, bounce_bio_split; +static mempool_t page_pool, isa_page_pool; #if defined(CONFIG_HIGHMEM) static __init int init_emergency_pool(void) { + int ret; #if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG) if (max_pfn <= max_low_pfn) return 0; #endif - page_pool = mempool_create_page_pool(POOL_SIZE, 0); - BUG_ON(!page_pool); + ret = mempool_init_page_pool(&page_pool, POOL_SIZE, 0); + BUG_ON(ret); pr_info("pool size: %d pages\n", POOL_SIZE); - bounce_bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); - BUG_ON(!bounce_bio_set); - if (bioset_integrity_create(bounce_bio_set, BIO_POOL_SIZE)) + ret = bioset_init(&bounce_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); + BUG_ON(ret); + if (bioset_integrity_create(&bounce_bio_set, BIO_POOL_SIZE)) BUG_ON(1); - bounce_bio_split = bioset_create(BIO_POOL_SIZE, 0, 0); - BUG_ON(!bounce_bio_split); + ret = bioset_init(&bounce_bio_split, BIO_POOL_SIZE, 0, 0); + BUG_ON(ret); return 0; } @@ -63,14 +64,11 @@ __initcall(init_emergency_pool); */ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) { - unsigned long flags; unsigned char *vto; - local_irq_save(flags); vto = kmap_atomic(to->bv_page); memcpy(vto + to->bv_offset, vfrom, to->bv_len); kunmap_atomic(vto); - local_irq_restore(flags); } #else /* CONFIG_HIGHMEM */ @@ -94,12 +92,14 @@ static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) */ int init_emergency_isa_pool(void) { - if (isa_page_pool) + int ret; + + if (mempool_initialized(&isa_page_pool)) return 0; - isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa, - mempool_free_pages, (void *) 0); - BUG_ON(!isa_page_pool); + ret = mempool_init(&isa_page_pool, ISA_POOL_SIZE, mempool_alloc_pages_isa, + mempool_free_pages, (void *) 0); + BUG_ON(ret); pr_info("isa pool size: %d pages\n", ISA_POOL_SIZE); return 0; @@ -166,13 +166,13 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool) static void bounce_end_io_write(struct bio *bio) { - bounce_end_io(bio, page_pool); + bounce_end_io(bio, &page_pool); } static void bounce_end_io_write_isa(struct bio *bio) { - bounce_end_io(bio, isa_page_pool); + bounce_end_io(bio, &isa_page_pool); } static void __bounce_end_io_read(struct bio *bio, mempool_t *pool) @@ -187,12 +187,12 @@ static void __bounce_end_io_read(struct bio *bio, mempool_t *pool) static void bounce_end_io_read(struct bio *bio) { - __bounce_end_io_read(bio, page_pool); + __bounce_end_io_read(bio, &page_pool); } static void bounce_end_io_read_isa(struct bio *bio) { - __bounce_end_io_read(bio, isa_page_pool); + __bounce_end_io_read(bio, &isa_page_pool); } static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, @@ -217,13 +217,13 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, return; if (!passthrough && sectors < bio_sectors(*bio_orig)) { - bio = bio_split(*bio_orig, sectors, GFP_NOIO, bounce_bio_split); + bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split); bio_chain(bio, *bio_orig); generic_make_request(*bio_orig); *bio_orig = bio; } bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL : - bounce_bio_set); + &bounce_bio_set); bio_for_each_segment_all(to, bio, i) { struct page *page = to->bv_page; @@ -250,7 +250,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, bio->bi_flags |= (1 << BIO_BOUNCED); - if (pool == page_pool) { + if (pool == &page_pool) { bio->bi_end_io = bounce_end_io_write; if (rw == READ) bio->bi_end_io = bounce_end_io_read; @@ -282,10 +282,10 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) if (!(q->bounce_gfp & GFP_DMA)) { if (q->limits.bounce_pfn >= blk_max_pfn) return; - pool = page_pool; + pool = &page_pool; } else { - BUG_ON(!isa_page_pool); - pool = isa_page_pool; + BUG_ON(!mempool_initialized(&isa_page_pool)); + pool = &isa_page_pool; } /* diff --git a/block/bsg-lib.c b/block/bsg-lib.c index fc2e5ff..9419def 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -303,11 +303,9 @@ static void bsg_exit_rq(struct request_queue *q, struct request *req) * @name: device to give bsg device * @job_fn: bsg job handler * @dd_job_size: size of LLD data needed for each job - * @release: @dev release function */ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, - bsg_job_fn *job_fn, int dd_job_size, - void (*release)(struct device *)) + bsg_job_fn *job_fn, int dd_job_size) { struct request_queue *q; int ret; @@ -331,7 +329,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, blk_queue_softirq_done(q, bsg_softirq_done); blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT); - ret = bsg_register_queue(q, dev, name, &bsg_transport_ops, release); + ret = bsg_register_queue(q, dev, name, &bsg_transport_ops); if (ret) { printk(KERN_ERR "%s: bsg interface failed to " "initialize - register queue\n", dev->kobj.name); diff --git a/block/bsg.c b/block/bsg.c index defa06c..132e657 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -226,8 +226,7 @@ bsg_map_hdr(struct request_queue *q, struct sg_io_v4 *hdr, fmode_t mode) return ERR_PTR(ret); rq = blk_get_request(q, hdr->dout_xfer_len ? - REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, - GFP_KERNEL); + REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0); if (IS_ERR(rq)) return rq; @@ -249,7 +248,7 @@ bsg_map_hdr(struct request_queue *q, struct sg_io_v4 *hdr, fmode_t mode) goto out; } - next_rq = blk_get_request(q, REQ_OP_SCSI_IN, GFP_KERNEL); + next_rq = blk_get_request(q, REQ_OP_SCSI_IN, 0); if (IS_ERR(next_rq)) { ret = PTR_ERR(next_rq); goto out; @@ -650,18 +649,6 @@ static struct bsg_device *bsg_alloc_device(void) return bd; } -static void bsg_kref_release_function(struct kref *kref) -{ - struct bsg_class_device *bcd = - container_of(kref, struct bsg_class_device, ref); - struct device *parent = bcd->parent; - - if (bcd->release) - bcd->release(bcd->parent); - - put_device(parent); -} - static int bsg_put_device(struct bsg_device *bd) { int ret = 0, do_free; @@ -694,7 +681,6 @@ static int bsg_put_device(struct bsg_device *bd) kfree(bd); out: - kref_put(&q->bsg_dev.ref, bsg_kref_release_function); if (do_free) blk_put_queue(q); return ret; @@ -760,8 +746,6 @@ static struct bsg_device *bsg_get_device(struct inode *inode, struct file *file) */ mutex_lock(&bsg_mutex); bcd = idr_find(&bsg_minor_idr, iminor(inode)); - if (bcd) - kref_get(&bcd->ref); mutex_unlock(&bsg_mutex); if (!bcd) @@ -772,8 +756,6 @@ static struct bsg_device *bsg_get_device(struct inode *inode, struct file *file) return bd; bd = bsg_add_device(inode, bcd->queue, file); - if (IS_ERR(bd)) - kref_put(&bcd->ref, bsg_kref_release_function); return bd; } @@ -913,25 +895,17 @@ void bsg_unregister_queue(struct request_queue *q) sysfs_remove_link(&q->kobj, "bsg"); device_unregister(bcd->class_dev); bcd->class_dev = NULL; - kref_put(&bcd->ref, bsg_kref_release_function); mutex_unlock(&bsg_mutex); } EXPORT_SYMBOL_GPL(bsg_unregister_queue); int bsg_register_queue(struct request_queue *q, struct device *parent, - const char *name, const struct bsg_ops *ops, - void (*release)(struct device *)) + const char *name, const struct bsg_ops *ops) { struct bsg_class_device *bcd; dev_t dev; int ret; struct device *class_dev = NULL; - const char *devname; - - if (name) - devname = name; - else - devname = dev_name(parent); /* * we need a proper transport to send commands, not a stacked device @@ -955,15 +929,12 @@ int bsg_register_queue(struct request_queue *q, struct device *parent, bcd->minor = ret; bcd->queue = q; - bcd->parent = get_device(parent); - bcd->release = release; bcd->ops = ops; - kref_init(&bcd->ref); dev = MKDEV(bsg_major, bcd->minor); - class_dev = device_create(bsg_class, parent, dev, NULL, "%s", devname); + class_dev = device_create(bsg_class, parent, dev, NULL, "%s", name); if (IS_ERR(class_dev)) { ret = PTR_ERR(class_dev); - goto put_dev; + goto idr_remove; } bcd->class_dev = class_dev; @@ -978,8 +949,7 @@ int bsg_register_queue(struct request_queue *q, struct device *parent, unregister_class_dev: device_unregister(class_dev); -put_dev: - put_device(parent); +idr_remove: idr_remove(&bsg_minor_idr, bcd->minor); unlock: mutex_unlock(&bsg_mutex); @@ -993,7 +963,7 @@ int bsg_scsi_register_queue(struct request_queue *q, struct device *parent) return -EINVAL; } - return bsg_register_queue(q, parent, NULL, &bsg_scsi_ops, NULL); + return bsg_register_queue(q, parent, dev_name(parent), &bsg_scsi_ops); } EXPORT_SYMBOL_GPL(bsg_scsi_register_queue); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 9f342ef..82b6c27 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -210,9 +210,9 @@ struct cfqg_stats { /* total time with empty current active q with other requests queued */ struct blkg_stat empty_time; /* fields after this shouldn't be cleared on stat reset */ - uint64_t start_group_wait_time; - uint64_t start_idle_time; - uint64_t start_empty_time; + u64 start_group_wait_time; + u64 start_idle_time; + u64 start_empty_time; uint16_t flags; #endif /* CONFIG_DEBUG_BLK_CGROUP */ #endif /* CONFIG_CFQ_GROUP_IOSCHED */ @@ -491,13 +491,13 @@ CFQG_FLAG_FNS(empty) /* This should be called with the queue_lock held. */ static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats) { - unsigned long long now; + u64 now; if (!cfqg_stats_waiting(stats)) return; - now = sched_clock(); - if (time_after64(now, stats->start_group_wait_time)) + now = ktime_get_ns(); + if (now > stats->start_group_wait_time) blkg_stat_add(&stats->group_wait_time, now - stats->start_group_wait_time); cfqg_stats_clear_waiting(stats); @@ -513,20 +513,20 @@ static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, return; if (cfqg == curr_cfqg) return; - stats->start_group_wait_time = sched_clock(); + stats->start_group_wait_time = ktime_get_ns(); cfqg_stats_mark_waiting(stats); } /* This should be called with the queue_lock held. */ static void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { - unsigned long long now; + u64 now; if (!cfqg_stats_empty(stats)) return; - now = sched_clock(); - if (time_after64(now, stats->start_empty_time)) + now = ktime_get_ns(); + if (now > stats->start_empty_time) blkg_stat_add(&stats->empty_time, now - stats->start_empty_time); cfqg_stats_clear_empty(stats); @@ -552,7 +552,7 @@ static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) if (cfqg_stats_empty(stats)) return; - stats->start_empty_time = sched_clock(); + stats->start_empty_time = ktime_get_ns(); cfqg_stats_mark_empty(stats); } @@ -561,9 +561,9 @@ static void cfqg_stats_update_idle_time(struct cfq_group *cfqg) struct cfqg_stats *stats = &cfqg->stats; if (cfqg_stats_idling(stats)) { - unsigned long long now = sched_clock(); + u64 now = ktime_get_ns(); - if (time_after64(now, stats->start_idle_time)) + if (now > stats->start_idle_time) blkg_stat_add(&stats->idle_time, now - stats->start_idle_time); cfqg_stats_clear_idling(stats); @@ -576,7 +576,7 @@ static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) BUG_ON(cfqg_stats_idling(stats)); - stats->start_idle_time = sched_clock(); + stats->start_idle_time = ktime_get_ns(); cfqg_stats_mark_idling(stats); } @@ -701,17 +701,19 @@ static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, } static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, - uint64_t start_time, uint64_t io_start_time, - unsigned int op) + u64 start_time_ns, + u64 io_start_time_ns, + unsigned int op) { struct cfqg_stats *stats = &cfqg->stats; - unsigned long long now = sched_clock(); + u64 now = ktime_get_ns(); - if (time_after64(now, io_start_time)) - blkg_rwstat_add(&stats->service_time, op, now - io_start_time); - if (time_after64(io_start_time, start_time)) + if (now > io_start_time_ns) + blkg_rwstat_add(&stats->service_time, op, + now - io_start_time_ns); + if (io_start_time_ns > start_time_ns) blkg_rwstat_add(&stats->wait_time, op, - io_start_time - start_time); + io_start_time_ns - start_time_ns); } /* @stats = 0 */ @@ -797,8 +799,9 @@ static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, unsigned int op) { } static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, - uint64_t start_time, uint64_t io_start_time, - unsigned int op) { } + u64 start_time_ns, + u64 io_start_time_ns, + unsigned int op) { } #endif /* CONFIG_CFQ_GROUP_IOSCHED */ @@ -4225,8 +4228,8 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfqd->rq_in_driver--; cfqq->dispatched--; (RQ_CFQG(rq))->dispatched--; - cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq), - rq_io_start_time_ns(rq), rq->cmd_flags); + cfqg_stats_update_completion(cfqq->cfqg, rq->start_time_ns, + rq->io_start_time_ns, rq->cmd_flags); cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; @@ -4242,16 +4245,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfqq_type(cfqq)); st->ttime.last_end_request = now; - /* - * We have to do this check in jiffies since start_time is in - * jiffies and it is not trivial to convert to ns. If - * cfq_fifo_expire[1] ever comes close to 1 jiffie, this test - * will become problematic but so far we are fine (the default - * is 128 ms). - */ - if (!time_after(rq->start_time + - nsecs_to_jiffies(cfqd->cfq_fifo_expire[1]), - jiffies)) + if (rq->start_time_ns + cfqd->cfq_fifo_expire[1] <= now) cfqd->last_delayed_sync = now; } @@ -4792,7 +4786,7 @@ USEC_STORE_FUNCTION(cfq_target_latency_us_store, &cfqd->cfq_target_latency, 1, U #undef USEC_STORE_FUNCTION #define CFQ_ATTR(name) \ - __ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store) + __ATTR(name, 0644, cfq_##name##_show, cfq_##name##_store) static struct elv_fs_entry cfq_attrs[] = { CFQ_ATTR(quantum), diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index 9de9f15..ef2f1f0 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -512,8 +512,7 @@ STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0); #undef STORE_FUNCTION #define DD_ATTR(name) \ - __ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \ - deadline_##name##_store) + __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store) static struct elv_fs_entry deadline_attrs[] = { DD_ATTR(read_expire), diff --git a/block/elevator.c b/block/elevator.c index e87e9b4..fa828b5 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -199,76 +199,46 @@ static void elevator_release(struct kobject *kobj) kfree(e); } -int elevator_init(struct request_queue *q, char *name) +/* + * Use the default elevator specified by config boot param for non-mq devices, + * or by config option. Don't try to load modules as we could be running off + * async and request_module() isn't allowed from async. + */ +int elevator_init(struct request_queue *q) { struct elevator_type *e = NULL; - int err; + int err = 0; /* * q->sysfs_lock must be held to provide mutual exclusion between * elevator_switch() and here. */ - lockdep_assert_held(&q->sysfs_lock); - + mutex_lock(&q->sysfs_lock); if (unlikely(q->elevator)) - return 0; - - INIT_LIST_HEAD(&q->queue_head); - q->last_merge = NULL; - q->end_sector = 0; - q->boundary_rq = NULL; - - if (name) { - e = elevator_get(q, name, true); - if (!e) - return -EINVAL; - } + goto out_unlock; - /* - * Use the default elevator specified by config boot param for - * non-mq devices, or by config option. Don't try to load modules - * as we could be running off async and request_module() isn't - * allowed from async. - */ - if (!e && !q->mq_ops && *chosen_elevator) { + if (*chosen_elevator) { e = elevator_get(q, chosen_elevator, false); if (!e) printk(KERN_ERR "I/O scheduler %s not found\n", chosen_elevator); } + if (!e) + e = elevator_get(q, CONFIG_DEFAULT_IOSCHED, false); if (!e) { - /* - * For blk-mq devices, we default to using mq-deadline, - * if available, for single queue devices. If deadline - * isn't available OR we have multiple queues, default - * to "none". - */ - if (q->mq_ops) { - if (q->nr_hw_queues == 1) - e = elevator_get(q, "mq-deadline", false); - if (!e) - return 0; - } else - e = elevator_get(q, CONFIG_DEFAULT_IOSCHED, false); - - if (!e) { - printk(KERN_ERR - "Default I/O scheduler not found. " \ - "Using noop.\n"); - e = elevator_get(q, "noop", false); - } + printk(KERN_ERR + "Default I/O scheduler not found. Using noop.\n"); + e = elevator_get(q, "noop", false); } - if (e->uses_mq) - err = blk_mq_init_sched(q, e); - else - err = e->ops.sq.elevator_init_fn(q, e); + err = e->ops.sq.elevator_init_fn(q, e); if (err) elevator_put(e); +out_unlock: + mutex_unlock(&q->sysfs_lock); return err; } -EXPORT_SYMBOL(elevator_init); void elevator_exit(struct request_queue *q, struct elevator_queue *e) { @@ -281,7 +251,6 @@ void elevator_exit(struct request_queue *q, struct elevator_queue *e) kobject_put(&e->kobj); } -EXPORT_SYMBOL(elevator_exit); static inline void __elv_rqhash_del(struct request *rq) { @@ -1005,6 +974,40 @@ out: } /* + * For blk-mq devices, we default to using mq-deadline, if available, for single + * queue devices. If deadline isn't available OR we have multiple queues, + * default to "none". + */ +int elevator_init_mq(struct request_queue *q) +{ + struct elevator_type *e; + int err = 0; + + if (q->nr_hw_queues != 1) + return 0; + + /* + * q->sysfs_lock must be held to provide mutual exclusion between + * elevator_switch() and here. + */ + mutex_lock(&q->sysfs_lock); + if (unlikely(q->elevator)) + goto out_unlock; + + e = elevator_get(q, "mq-deadline", false); + if (!e) + goto out_unlock; + + err = blk_mq_init_sched(q, e); + if (err) + elevator_put(e); +out_unlock: + mutex_unlock(&q->sysfs_lock); + return err; +} + + +/* * switch to new_e io scheduler. be careful not to introduce deadlocks - * we don't free the old io scheduler, before we have allocated what we * need for the new one. this way we have a chance of going back to the old diff --git a/block/genhd.c b/block/genhd.c index c4513fe..4d69403 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1139,28 +1139,25 @@ static ssize_t disk_discard_alignment_show(struct device *dev, return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue)); } -static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); -static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); -static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); -static DEVICE_ATTR(hidden, S_IRUGO, disk_hidden_show, NULL); -static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); -static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); -static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); -static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show, - NULL); -static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); -static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); -static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); -static DEVICE_ATTR(badblocks, S_IRUGO | S_IWUSR, disk_badblocks_show, - disk_badblocks_store); +static DEVICE_ATTR(range, 0444, disk_range_show, NULL); +static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL); +static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL); +static DEVICE_ATTR(hidden, 0444, disk_hidden_show, NULL); +static DEVICE_ATTR(ro, 0444, disk_ro_show, NULL); +static DEVICE_ATTR(size, 0444, part_size_show, NULL); +static DEVICE_ATTR(alignment_offset, 0444, disk_alignment_offset_show, NULL); +static DEVICE_ATTR(discard_alignment, 0444, disk_discard_alignment_show, NULL); +static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL); +static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); +static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); +static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = - __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); + __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store); #endif #ifdef CONFIG_FAIL_IO_TIMEOUT static struct device_attribute dev_attr_fail_timeout = - __ATTR(io-timeout-fail, S_IRUGO|S_IWUSR, part_timeout_show, - part_timeout_store); + __ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store); #endif static struct attribute *disk_attrs[] = { @@ -1924,9 +1921,9 @@ static ssize_t disk_events_poll_msecs_store(struct device *dev, return count; } -static const DEVICE_ATTR(events, S_IRUGO, disk_events_show, NULL); -static const DEVICE_ATTR(events_async, S_IRUGO, disk_events_async_show, NULL); -static const DEVICE_ATTR(events_poll_msecs, S_IRUGO|S_IWUSR, +static const DEVICE_ATTR(events, 0444, disk_events_show, NULL); +static const DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL); +static const DEVICE_ATTR(events_poll_msecs, 0644, disk_events_poll_msecs_show, disk_events_poll_msecs_store); diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 0d6d25e3..a1660ba 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -72,6 +72,19 @@ static const unsigned int kyber_batch_size[] = { [KYBER_OTHER] = 8, }; +/* + * There is a same mapping between ctx & hctx and kcq & khd, + * we use request->mq_ctx->index_hw to index the kcq in khd. + */ +struct kyber_ctx_queue { + /* + * Used to ensure operations on rq_list and kcq_map to be an atmoic one. + * Also protect the rqs on rq_list when merge. + */ + spinlock_t lock; + struct list_head rq_list[KYBER_NUM_DOMAINS]; +} ____cacheline_aligned_in_smp; + struct kyber_queue_data { struct request_queue *q; @@ -99,6 +112,8 @@ struct kyber_hctx_data { struct list_head rqs[KYBER_NUM_DOMAINS]; unsigned int cur_domain; unsigned int batching; + struct kyber_ctx_queue *kcqs; + struct sbitmap kcq_map[KYBER_NUM_DOMAINS]; wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS]; struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS]; atomic_t wait_index[KYBER_NUM_DOMAINS]; @@ -107,10 +122,8 @@ struct kyber_hctx_data { static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags, void *key); -static int rq_sched_domain(const struct request *rq) +static unsigned int kyber_sched_domain(unsigned int op) { - unsigned int op = rq->cmd_flags; - if ((op & REQ_OP_MASK) == REQ_OP_READ) return KYBER_READ; else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op)) @@ -284,6 +297,11 @@ static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd) return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift; } +static int kyber_bucket_fn(const struct request *rq) +{ + return kyber_sched_domain(rq->cmd_flags); +} + static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q) { struct kyber_queue_data *kqd; @@ -297,7 +315,7 @@ static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q) goto err; kqd->q = q; - kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, rq_sched_domain, + kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn, KYBER_NUM_DOMAINS, kqd); if (!kqd->cb) goto err_kqd; @@ -376,8 +394,18 @@ static void kyber_exit_sched(struct elevator_queue *e) kfree(kqd); } +static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq) +{ + unsigned int i; + + spin_lock_init(&kcq->lock); + for (i = 0; i < KYBER_NUM_DOMAINS; i++) + INIT_LIST_HEAD(&kcq->rq_list[i]); +} + static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { + struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; struct kyber_hctx_data *khd; int i; @@ -385,6 +413,24 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) if (!khd) return -ENOMEM; + khd->kcqs = kmalloc_array_node(hctx->nr_ctx, + sizeof(struct kyber_ctx_queue), + GFP_KERNEL, hctx->numa_node); + if (!khd->kcqs) + goto err_khd; + + for (i = 0; i < hctx->nr_ctx; i++) + kyber_ctx_queue_init(&khd->kcqs[i]); + + for (i = 0; i < KYBER_NUM_DOMAINS; i++) { + if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx, + ilog2(8), GFP_KERNEL, hctx->numa_node)) { + while (--i >= 0) + sbitmap_free(&khd->kcq_map[i]); + goto err_kcqs; + } + } + spin_lock_init(&khd->lock); for (i = 0; i < KYBER_NUM_DOMAINS; i++) { @@ -400,12 +446,26 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) khd->batching = 0; hctx->sched_data = khd; + sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags, + kqd->async_depth); return 0; + +err_kcqs: + kfree(khd->kcqs); +err_khd: + kfree(khd); + return -ENOMEM; } static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { + struct kyber_hctx_data *khd = hctx->sched_data; + int i; + + for (i = 0; i < KYBER_NUM_DOMAINS; i++) + sbitmap_free(&khd->kcq_map[i]); + kfree(khd->kcqs); kfree(hctx->sched_data); } @@ -427,7 +487,7 @@ static void rq_clear_domain_token(struct kyber_queue_data *kqd, nr = rq_get_domain_token(rq); if (nr != -1) { - sched_domain = rq_sched_domain(rq); + sched_domain = kyber_sched_domain(rq->cmd_flags); sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr, rq->mq_ctx->cpu); } @@ -446,11 +506,51 @@ static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) } } +static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) +{ + struct kyber_hctx_data *khd = hctx->sched_data; + struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue); + struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw]; + unsigned int sched_domain = kyber_sched_domain(bio->bi_opf); + struct list_head *rq_list = &kcq->rq_list[sched_domain]; + bool merged; + + spin_lock(&kcq->lock); + merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio); + spin_unlock(&kcq->lock); + blk_mq_put_ctx(ctx); + + return merged; +} + static void kyber_prepare_request(struct request *rq, struct bio *bio) { rq_set_domain_token(rq, -1); } +static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx, + struct list_head *rq_list, bool at_head) +{ + struct kyber_hctx_data *khd = hctx->sched_data; + struct request *rq, *next; + + list_for_each_entry_safe(rq, next, rq_list, queuelist) { + unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags); + struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw]; + struct list_head *head = &kcq->rq_list[sched_domain]; + + spin_lock(&kcq->lock); + if (at_head) + list_move(&rq->queuelist, head); + else + list_move_tail(&rq->queuelist, head); + sbitmap_set_bit(&khd->kcq_map[sched_domain], + rq->mq_ctx->index_hw); + blk_mq_sched_request_inserted(rq); + spin_unlock(&kcq->lock); + } +} + static void kyber_finish_request(struct request *rq) { struct kyber_queue_data *kqd = rq->q->elevator->elevator_data; @@ -469,7 +569,7 @@ static void kyber_completed_request(struct request *rq) * Check if this request met our latency goal. If not, quickly gather * some statistics and start throttling. */ - sched_domain = rq_sched_domain(rq); + sched_domain = kyber_sched_domain(rq->cmd_flags); switch (sched_domain) { case KYBER_READ: target = kqd->read_lat_nsec; @@ -485,29 +585,48 @@ static void kyber_completed_request(struct request *rq) if (blk_stat_is_active(kqd->cb)) return; - now = __blk_stat_time(ktime_to_ns(ktime_get())); - if (now < blk_stat_time(&rq->issue_stat)) + now = ktime_get_ns(); + if (now < rq->io_start_time_ns) return; - latency = now - blk_stat_time(&rq->issue_stat); + latency = now - rq->io_start_time_ns; if (latency > target) blk_stat_activate_msecs(kqd->cb, 10); } -static void kyber_flush_busy_ctxs(struct kyber_hctx_data *khd, - struct blk_mq_hw_ctx *hctx) +struct flush_kcq_data { + struct kyber_hctx_data *khd; + unsigned int sched_domain; + struct list_head *list; +}; + +static bool flush_busy_kcq(struct sbitmap *sb, unsigned int bitnr, void *data) { - LIST_HEAD(rq_list); - struct request *rq, *next; + struct flush_kcq_data *flush_data = data; + struct kyber_ctx_queue *kcq = &flush_data->khd->kcqs[bitnr]; - blk_mq_flush_busy_ctxs(hctx, &rq_list); - list_for_each_entry_safe(rq, next, &rq_list, queuelist) { - unsigned int sched_domain; + spin_lock(&kcq->lock); + list_splice_tail_init(&kcq->rq_list[flush_data->sched_domain], + flush_data->list); + sbitmap_clear_bit(sb, bitnr); + spin_unlock(&kcq->lock); - sched_domain = rq_sched_domain(rq); - list_move_tail(&rq->queuelist, &khd->rqs[sched_domain]); - } + return true; +} + +static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd, + unsigned int sched_domain, + struct list_head *list) +{ + struct flush_kcq_data data = { + .khd = khd, + .sched_domain = sched_domain, + .list = list, + }; + + sbitmap_for_each_set(&khd->kcq_map[sched_domain], + flush_busy_kcq, &data); } static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags, @@ -570,26 +689,23 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd, static struct request * kyber_dispatch_cur_domain(struct kyber_queue_data *kqd, struct kyber_hctx_data *khd, - struct blk_mq_hw_ctx *hctx, - bool *flushed) + struct blk_mq_hw_ctx *hctx) { struct list_head *rqs; struct request *rq; int nr; rqs = &khd->rqs[khd->cur_domain]; - rq = list_first_entry_or_null(rqs, struct request, queuelist); /* - * If there wasn't already a pending request and we haven't flushed the - * software queues yet, flush the software queues and check again. + * If we already have a flushed request, then we just need to get a + * token for it. Otherwise, if there are pending requests in the kcqs, + * flush the kcqs, but only if we can get a token. If not, we should + * leave the requests in the kcqs so that they can be merged. Note that + * khd->lock serializes the flushes, so if we observed any bit set in + * the kcq_map, we will always get a request. */ - if (!rq && !*flushed) { - kyber_flush_busy_ctxs(khd, hctx); - *flushed = true; - rq = list_first_entry_or_null(rqs, struct request, queuelist); - } - + rq = list_first_entry_or_null(rqs, struct request, queuelist); if (rq) { nr = kyber_get_domain_token(kqd, khd, hctx); if (nr >= 0) { @@ -598,6 +714,16 @@ kyber_dispatch_cur_domain(struct kyber_queue_data *kqd, list_del_init(&rq->queuelist); return rq; } + } else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) { + nr = kyber_get_domain_token(kqd, khd, hctx); + if (nr >= 0) { + kyber_flush_busy_kcqs(khd, khd->cur_domain, rqs); + rq = list_first_entry(rqs, struct request, queuelist); + khd->batching++; + rq_set_domain_token(rq, nr); + list_del_init(&rq->queuelist); + return rq; + } } /* There were either no pending requests or no tokens. */ @@ -608,7 +734,6 @@ static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx) { struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; struct kyber_hctx_data *khd = hctx->sched_data; - bool flushed = false; struct request *rq; int i; @@ -619,7 +744,7 @@ static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx) * from the batch. */ if (khd->batching < kyber_batch_size[khd->cur_domain]) { - rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed); + rq = kyber_dispatch_cur_domain(kqd, khd, hctx); if (rq) goto out; } @@ -640,7 +765,7 @@ static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx) else khd->cur_domain++; - rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed); + rq = kyber_dispatch_cur_domain(kqd, khd, hctx); if (rq) goto out; } @@ -657,10 +782,12 @@ static bool kyber_has_work(struct blk_mq_hw_ctx *hctx) int i; for (i = 0; i < KYBER_NUM_DOMAINS; i++) { - if (!list_empty_careful(&khd->rqs[i])) + if (!list_empty_careful(&khd->rqs[i]) || + sbitmap_any_bit_set(&khd->kcq_map[i])) return true; } - return sbitmap_any_bit_set(&hctx->ctx_map); + + return false; } #define KYBER_LAT_SHOW_STORE(op) \ @@ -831,7 +958,9 @@ static struct elevator_type kyber_sched = { .init_hctx = kyber_init_hctx, .exit_hctx = kyber_exit_hctx, .limit_depth = kyber_limit_depth, + .bio_merge = kyber_bio_merge, .prepare_request = kyber_prepare_request, + .insert_requests = kyber_insert_requests, .finish_request = kyber_finish_request, .requeue_request = kyber_finish_request, .completed_request = kyber_completed_request, diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 8ec0ba9..099a9e05 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -630,8 +630,7 @@ STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0); #undef STORE_FUNCTION #define DD_ATTR(name) \ - __ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \ - deadline_##name##_store) + __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store) static struct elv_fs_entry deadline_attrs[] = { DD_ATTR(read_expire), diff --git a/block/partition-generic.c b/block/partition-generic.c index db57cce..3dcfd4e 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -179,18 +179,17 @@ ssize_t part_fail_store(struct device *dev, } #endif -static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); -static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); -static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); -static DEVICE_ATTR(ro, S_IRUGO, part_ro_show, NULL); -static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); -static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show, - NULL); -static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); -static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); +static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); +static DEVICE_ATTR(start, 0444, part_start_show, NULL); +static DEVICE_ATTR(size, 0444, part_size_show, NULL); +static DEVICE_ATTR(ro, 0444, part_ro_show, NULL); +static DEVICE_ATTR(alignment_offset, 0444, part_alignment_offset_show, NULL); +static DEVICE_ATTR(discard_alignment, 0444, part_discard_alignment_show, NULL); +static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); +static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = - __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); + __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store); #endif static struct attribute *part_attrs[] = { @@ -291,8 +290,7 @@ static ssize_t whole_disk_show(struct device *dev, { return 0; } -static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH, - whole_disk_show, NULL); +static DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL); /* * Must be called either with bd_mutex held, before a disk can be opened or @@ -518,7 +516,7 @@ rescan: if (disk->fops->revalidate_disk) disk->fops->revalidate_disk(disk); - check_disk_size_change(disk, bdev); + check_disk_size_change(disk, bdev, true); bdev->bd_invalidated = 0; if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) return 0; @@ -643,7 +641,7 @@ int invalidate_partitions(struct gendisk *disk, struct block_device *bdev) return res; set_capacity(disk, 0); - check_disk_size_change(disk, bdev); + check_disk_size_change(disk, bdev, false); bdev->bd_invalidated = 0; /* tell userspace that the media / partition table may have changed */ kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 60b471f..533f4ae 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -321,8 +321,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, at_head = 1; ret = -ENOMEM; - rq = blk_get_request(q, writing ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, - GFP_KERNEL); + rq = blk_get_request(q, writing ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0); if (IS_ERR(rq)) return PTR_ERR(rq); req = scsi_req(rq); @@ -449,8 +448,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, } - rq = blk_get_request(q, in_len ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, - __GFP_RECLAIM); + rq = blk_get_request(q, in_len ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0); if (IS_ERR(rq)) { err = PTR_ERR(rq); goto error_free_buffer; @@ -501,7 +499,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, break; } - if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_RECLAIM)) { + if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, GFP_NOIO)) { err = DRIVER_ERROR << 24; goto error; } @@ -538,7 +536,7 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, struct request *rq; int err; - rq = blk_get_request(q, REQ_OP_SCSI_OUT, __GFP_RECLAIM); + rq = blk_get_request(q, REQ_OP_SCSI_OUT, 0); if (IS_ERR(rq)) return PTR_ERR(rq); rq->timeout = BLK_DEFAULT_SG_TIMEOUT; diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index 513b260b..a2398e2 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -500,57 +500,6 @@ void ata_eh_release(struct ata_port *ap) mutex_unlock(&ap->host->eh_mutex); } -/** - * ata_scsi_timed_out - SCSI layer time out callback - * @cmd: timed out SCSI command - * - * Handles SCSI layer timeout. We race with normal completion of - * the qc for @cmd. If the qc is already gone, we lose and let - * the scsi command finish (EH_HANDLED). Otherwise, the qc has - * timed out and EH should be invoked. Prevent ata_qc_complete() - * from finishing it by setting EH_SCHEDULED and return - * EH_NOT_HANDLED. - * - * TODO: kill this function once old EH is gone. - * - * LOCKING: - * Called from timer context - * - * RETURNS: - * EH_HANDLED or EH_NOT_HANDLED - */ -enum blk_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd) -{ - struct Scsi_Host *host = cmd->device->host; - struct ata_port *ap = ata_shost_to_port(host); - unsigned long flags; - struct ata_queued_cmd *qc; - enum blk_eh_timer_return ret; - - DPRINTK("ENTER\n"); - - if (ap->ops->error_handler) { - ret = BLK_EH_NOT_HANDLED; - goto out; - } - - ret = BLK_EH_HANDLED; - spin_lock_irqsave(ap->lock, flags); - qc = ata_qc_from_tag(ap, ap->link.active_tag); - if (qc) { - WARN_ON(qc->scsicmd != cmd); - qc->flags |= ATA_QCFLAG_EH_SCHEDULED; - qc->err_mask |= AC_ERR_TIMEOUT; - ret = BLK_EH_NOT_HANDLED; - } - spin_unlock_irqrestore(ap->lock, flags); - - out: - DPRINTK("EXIT, ret=%d\n", ret); - return ret; -} -EXPORT_SYMBOL(ata_scsi_timed_out); - static void ata_eh_unload(struct ata_port *ap) { struct ata_link *link; diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index f781eff..7c3887a 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -1179,7 +1179,6 @@ static bool DAC960_V1_EnableMemoryMailboxInterface(DAC960_Controller_T if (pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(32))) return DAC960_Failure(Controller, "DMA mask out of range"); - Controller->BounceBufferLimit = DMA_BIT_MASK(32); if ((hw_type == DAC960_PD_Controller) || (hw_type == DAC960_P_Controller)) { CommandMailboxesSize = 0; @@ -1380,11 +1379,8 @@ static bool DAC960_V2_EnableMemoryMailboxInterface(DAC960_Controller_T dma_addr_t CommandMailboxDMA; DAC960_V2_CommandStatus_T CommandStatus; - if (!pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(64))) - Controller->BounceBufferLimit = DMA_BIT_MASK(64); - else if (!pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(32))) - Controller->BounceBufferLimit = DMA_BIT_MASK(32); - else + if (pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(64)) && + pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(32))) return DAC960_Failure(Controller, "DMA mask out of range"); /* This is a temporary dma mapping, used only in the scope of this function */ @@ -2540,7 +2536,6 @@ static bool DAC960_RegisterBlockDevice(DAC960_Controller_T *Controller) continue; } Controller->RequestQueue[n] = RequestQueue; - blk_queue_bounce_limit(RequestQueue, Controller->BounceBufferLimit); RequestQueue->queuedata = Controller; blk_queue_max_segments(RequestQueue, Controller->DriverScatterGatherLimit); blk_queue_max_hw_sectors(RequestQueue, Controller->MaxBlocksPerCommand); @@ -6594,7 +6589,7 @@ static void DAC960_CreateProcEntries(DAC960_Controller_T *Controller) DAC960_ProcDirectoryEntry); proc_create_data("initial_status", 0, ControllerProcEntry, &dac960_initial_status_proc_fops, Controller); proc_create_data("current_status", 0, ControllerProcEntry, &dac960_current_status_proc_fops, Controller); - proc_create_data("user_command", S_IWUSR | S_IRUSR, ControllerProcEntry, &dac960_user_command_proc_fops, Controller); + proc_create_data("user_command", 0600, ControllerProcEntry, &dac960_user_command_proc_fops, Controller); Controller->ControllerProcEntry = ControllerProcEntry; } diff --git a/drivers/block/DAC960.h b/drivers/block/DAC960.h index 21aff47..1439e65 100644 --- a/drivers/block/DAC960.h +++ b/drivers/block/DAC960.h @@ -2295,7 +2295,6 @@ typedef struct DAC960_Controller unsigned short MaxBlocksPerCommand; unsigned short ControllerScatterGatherLimit; unsigned short DriverScatterGatherLimit; - u64 BounceBufferLimit; unsigned int CombinedStatusBufferLength; unsigned int InitialStatusLength; unsigned int CurrentStatusLength; diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 6797e6c..429ebb8 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -159,14 +159,14 @@ static int aoe_debugfs_open(struct inode *inode, struct file *file) return single_open(file, aoedisk_debugfs_show, inode->i_private); } -static DEVICE_ATTR(state, S_IRUGO, aoedisk_show_state, NULL); -static DEVICE_ATTR(mac, S_IRUGO, aoedisk_show_mac, NULL); -static DEVICE_ATTR(netif, S_IRUGO, aoedisk_show_netif, NULL); +static DEVICE_ATTR(state, 0444, aoedisk_show_state, NULL); +static DEVICE_ATTR(mac, 0444, aoedisk_show_mac, NULL); +static DEVICE_ATTR(netif, 0444, aoedisk_show_netif, NULL); static struct device_attribute dev_attr_firmware_version = { - .attr = { .name = "firmware-version", .mode = S_IRUGO }, + .attr = { .name = "firmware-version", .mode = 0444 }, .show = aoedisk_show_fwver, }; -static DEVICE_ATTR(payload, S_IRUGO, aoedisk_show_payload, NULL); +static DEVICE_ATTR(payload, 0444, aoedisk_show_payload, NULL); static struct attribute *aoe_attrs[] = { &dev_attr_state.attr, @@ -388,7 +388,6 @@ aoeblk_gdalloc(void *vp) d->aoemajor, d->aoeminor); goto err_mempool; } - blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); spin_lock_irqsave(&d->lock, flags); WARN_ON(!(d->flags & DEVFL_GD_NOW)); diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 540bb60..096882e 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -1032,8 +1032,9 @@ bvcpy(struct sk_buff *skb, struct bio *bio, struct bvec_iter iter, long cnt) iter.bi_size = cnt; __bio_for_each_segment(bv, bio, iter, iter) { - char *p = page_address(bv.bv_page) + bv.bv_offset; + char *p = kmap_atomic(bv.bv_page) + bv.bv_offset; skb_copy_bits(skb, soff, p, bv.bv_len); + kunmap_atomic(p); soff += bv.bv_len; } } diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 66cb0f8..bb97659 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -331,15 +331,15 @@ static const struct block_device_operations brd_fops = { * And now the modules code and kernel interface. */ static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT; -module_param(rd_nr, int, S_IRUGO); +module_param(rd_nr, int, 0444); MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices"); unsigned long rd_size = CONFIG_BLK_DEV_RAM_SIZE; -module_param(rd_size, ulong, S_IRUGO); +module_param(rd_size, ulong, 0444); MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes."); static int max_part = 1; -module_param(max_part, int, S_IRUGO); +module_param(max_part, int, 0444); MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices"); MODULE_LICENSE("GPL"); @@ -402,6 +402,10 @@ static struct brd_device *brd_alloc(int i) set_capacity(disk, rd_size * 2); disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO; + /* Tell the block layer that this is not a rotational device */ + blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); + blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue); + return brd; out_free_queue: diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 9f4e6f5..11a85b7 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -977,7 +977,7 @@ static void drbd_bm_endio(struct bio *bio) bm_page_unlock_io(device, idx); if (ctx->flags & BM_AIO_COPY_PAGES) - mempool_free(bio->bi_io_vec[0].bv_page, drbd_md_io_page_pool); + mempool_free(bio->bi_io_vec[0].bv_page, &drbd_md_io_page_pool); bio_put(bio); @@ -1014,7 +1014,8 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho bm_set_page_unchanged(b->bm_pages[page_nr]); if (ctx->flags & BM_AIO_COPY_PAGES) { - page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_RECLAIM); + page = mempool_alloc(&drbd_md_io_page_pool, + GFP_NOIO | __GFP_HIGHMEM); copy_highpage(page, b->bm_pages[page_nr]); bm_store_page_idx(page, page_nr); } else diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c index ab21976..5d5e8d6 100644 --- a/drivers/block/drbd/drbd_debugfs.c +++ b/drivers/block/drbd/drbd_debugfs.c @@ -481,9 +481,9 @@ void drbd_debugfs_resource_add(struct drbd_resource *resource) goto fail; resource->debugfs_res_connections = dentry; - dentry = debugfs_create_file("in_flight_summary", S_IRUSR|S_IRGRP, - resource->debugfs_res, resource, - &in_flight_summary_fops); + dentry = debugfs_create_file("in_flight_summary", 0440, + resource->debugfs_res, resource, + &in_flight_summary_fops); if (IS_ERR_OR_NULL(dentry)) goto fail; resource->debugfs_res_in_flight_summary = dentry; @@ -645,16 +645,16 @@ void drbd_debugfs_connection_add(struct drbd_connection *connection) goto fail; connection->debugfs_conn = dentry; - dentry = debugfs_create_file("callback_history", S_IRUSR|S_IRGRP, - connection->debugfs_conn, connection, - &connection_callback_history_fops); + dentry = debugfs_create_file("callback_history", 0440, + connection->debugfs_conn, connection, + &connection_callback_history_fops); if (IS_ERR_OR_NULL(dentry)) goto fail; connection->debugfs_conn_callback_history = dentry; - dentry = debugfs_create_file("oldest_requests", S_IRUSR|S_IRGRP, - connection->debugfs_conn, connection, - &connection_oldest_requests_fops); + dentry = debugfs_create_file("oldest_requests", 0440, + connection->debugfs_conn, connection, + &connection_oldest_requests_fops); if (IS_ERR_OR_NULL(dentry)) goto fail; connection->debugfs_conn_oldest_requests = dentry; @@ -824,7 +824,7 @@ void drbd_debugfs_device_add(struct drbd_device *device) device->debugfs_minor = dentry; #define DCF(name) do { \ - dentry = debugfs_create_file(#name, S_IRUSR|S_IRGRP, \ + dentry = debugfs_create_file(#name, 0440, \ device->debugfs_vol, device, \ &device_ ## name ## _fops); \ if (IS_ERR_OR_NULL(dentry)) \ diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 06ecee1..21b4186 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1405,8 +1405,8 @@ extern struct kmem_cache *drbd_request_cache; extern struct kmem_cache *drbd_ee_cache; /* peer requests */ extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ -extern mempool_t *drbd_request_mempool; -extern mempool_t *drbd_ee_mempool; +extern mempool_t drbd_request_mempool; +extern mempool_t drbd_ee_mempool; /* drbd's page pool, used to buffer data received from the peer, * or data requested by the peer. @@ -1432,16 +1432,16 @@ extern wait_queue_head_t drbd_pp_wait; * 128 should be plenty, currently we probably can get away with as few as 1. */ #define DRBD_MIN_POOL_PAGES 128 -extern mempool_t *drbd_md_io_page_pool; +extern mempool_t drbd_md_io_page_pool; /* We also need to make sure we get a bio * when we need it for housekeeping purposes */ -extern struct bio_set *drbd_md_io_bio_set; +extern struct bio_set drbd_md_io_bio_set; /* to allocate from that set */ extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); /* And a bio_set for cloning */ -extern struct bio_set *drbd_io_bio_set; +extern struct bio_set drbd_io_bio_set; extern struct mutex resources_mutex; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 185f1ef..a233e71 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -124,11 +124,11 @@ struct kmem_cache *drbd_request_cache; struct kmem_cache *drbd_ee_cache; /* peer requests */ struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ -mempool_t *drbd_request_mempool; -mempool_t *drbd_ee_mempool; -mempool_t *drbd_md_io_page_pool; -struct bio_set *drbd_md_io_bio_set; -struct bio_set *drbd_io_bio_set; +mempool_t drbd_request_mempool; +mempool_t drbd_ee_mempool; +mempool_t drbd_md_io_page_pool; +struct bio_set drbd_md_io_bio_set; +struct bio_set drbd_io_bio_set; /* I do not use a standard mempool, because: 1) I want to hand out the pre-allocated objects first. @@ -153,10 +153,10 @@ struct bio *bio_alloc_drbd(gfp_t gfp_mask) { struct bio *bio; - if (!drbd_md_io_bio_set) + if (!bioset_initialized(&drbd_md_io_bio_set)) return bio_alloc(gfp_mask, 1); - bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set); + bio = bio_alloc_bioset(gfp_mask, 1, &drbd_md_io_bio_set); if (!bio) return NULL; return bio; @@ -2097,16 +2097,11 @@ static void drbd_destroy_mempools(void) /* D_ASSERT(device, atomic_read(&drbd_pp_vacant)==0); */ - if (drbd_io_bio_set) - bioset_free(drbd_io_bio_set); - if (drbd_md_io_bio_set) - bioset_free(drbd_md_io_bio_set); - if (drbd_md_io_page_pool) - mempool_destroy(drbd_md_io_page_pool); - if (drbd_ee_mempool) - mempool_destroy(drbd_ee_mempool); - if (drbd_request_mempool) - mempool_destroy(drbd_request_mempool); + bioset_exit(&drbd_io_bio_set); + bioset_exit(&drbd_md_io_bio_set); + mempool_exit(&drbd_md_io_page_pool); + mempool_exit(&drbd_ee_mempool); + mempool_exit(&drbd_request_mempool); if (drbd_ee_cache) kmem_cache_destroy(drbd_ee_cache); if (drbd_request_cache) @@ -2116,11 +2111,6 @@ static void drbd_destroy_mempools(void) if (drbd_al_ext_cache) kmem_cache_destroy(drbd_al_ext_cache); - drbd_io_bio_set = NULL; - drbd_md_io_bio_set = NULL; - drbd_md_io_page_pool = NULL; - drbd_ee_mempool = NULL; - drbd_request_mempool = NULL; drbd_ee_cache = NULL; drbd_request_cache = NULL; drbd_bm_ext_cache = NULL; @@ -2133,18 +2123,7 @@ static int drbd_create_mempools(void) { struct page *page; const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * drbd_minor_count; - int i; - - /* prepare our caches and mempools */ - drbd_request_mempool = NULL; - drbd_ee_cache = NULL; - drbd_request_cache = NULL; - drbd_bm_ext_cache = NULL; - drbd_al_ext_cache = NULL; - drbd_pp_pool = NULL; - drbd_md_io_page_pool = NULL; - drbd_md_io_bio_set = NULL; - drbd_io_bio_set = NULL; + int i, ret; /* caches */ drbd_request_cache = kmem_cache_create( @@ -2168,26 +2147,26 @@ static int drbd_create_mempools(void) goto Enomem; /* mempools */ - drbd_io_bio_set = bioset_create(BIO_POOL_SIZE, 0, 0); - if (drbd_io_bio_set == NULL) + ret = bioset_init(&drbd_io_bio_set, BIO_POOL_SIZE, 0, 0); + if (ret) goto Enomem; - drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0, - BIOSET_NEED_BVECS); - if (drbd_md_io_bio_set == NULL) + ret = bioset_init(&drbd_md_io_bio_set, DRBD_MIN_POOL_PAGES, 0, + BIOSET_NEED_BVECS); + if (ret) goto Enomem; - drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0); - if (drbd_md_io_page_pool == NULL) + ret = mempool_init_page_pool(&drbd_md_io_page_pool, DRBD_MIN_POOL_PAGES, 0); + if (ret) goto Enomem; - drbd_request_mempool = mempool_create_slab_pool(number, - drbd_request_cache); - if (drbd_request_mempool == NULL) + ret = mempool_init_slab_pool(&drbd_request_mempool, number, + drbd_request_cache); + if (ret) goto Enomem; - drbd_ee_mempool = mempool_create_slab_pool(number, drbd_ee_cache); - if (drbd_ee_mempool == NULL) + ret = mempool_init_slab_pool(&drbd_ee_mempool, number, drbd_ee_cache); + if (ret) goto Enomem; /* drbd's page pool */ @@ -3010,7 +2989,7 @@ static int __init drbd_init(void) goto fail; err = -ENOMEM; - drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL); + drbd_proc = proc_create_data("drbd", S_IFREG | 0444 , NULL, &drbd_proc_fops, NULL); if (!drbd_proc) { pr_err("unable to register proc file\n"); goto fail; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index c72dee0..be9450f 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -378,7 +378,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto if (drbd_insert_fault(device, DRBD_FAULT_AL_EE)) return NULL; - peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM); + peer_req = mempool_alloc(&drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM); if (!peer_req) { if (!(gfp_mask & __GFP_NOWARN)) drbd_err(device, "%s: allocation failed\n", __func__); @@ -409,7 +409,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto return peer_req; fail: - mempool_free(peer_req, drbd_ee_mempool); + mempool_free(peer_req, &drbd_ee_mempool); return NULL; } @@ -426,7 +426,7 @@ void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request * peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO; drbd_al_complete_io(device, &peer_req->i); } - mempool_free(peer_req, drbd_ee_mempool); + mempool_free(peer_req, &drbd_ee_mempool); } int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list) diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index a500e73..a47e498 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -55,7 +55,7 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio { struct drbd_request *req; - req = mempool_alloc(drbd_request_mempool, GFP_NOIO); + req = mempool_alloc(&drbd_request_mempool, GFP_NOIO); if (!req) return NULL; memset(req, 0, sizeof(*req)); @@ -184,7 +184,7 @@ void drbd_req_destroy(struct kref *kref) } } - mempool_free(req, drbd_request_mempool); + mempool_free(req, &drbd_request_mempool); } static void wake_all_senders(struct drbd_connection *connection) diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index cb97b3b..94c6540 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -269,7 +269,7 @@ enum drbd_req_state_bits { static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src) { struct bio *bio; - bio = bio_clone_fast(bio_src, GFP_NOIO, drbd_io_bio_set); + bio = bio_clone_fast(bio_src, GFP_NOIO, &drbd_io_bio_set); req->private_bio = bio; diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 8ec7235..8871b50 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4450,7 +4450,7 @@ static ssize_t floppy_cmos_show(struct device *dev, return sprintf(buf, "%X\n", UDP->cmos); } -static DEVICE_ATTR(cmos, S_IRUGO, floppy_cmos_show, NULL); +static DEVICE_ATTR(cmos, 0444, floppy_cmos_show, NULL); static struct attribute *floppy_dev_attrs[] = { &dev_attr_cmos.attr, diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 55cf554..4838b0d 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -732,7 +732,7 @@ static ssize_t loop_attr_do_show_##_name(struct device *d, \ return loop_attr_show(d, b, loop_attr_##_name##_show); \ } \ static struct device_attribute loop_attr_##_name = \ - __ATTR(_name, S_IRUGO, loop_attr_do_show_##_name, NULL); + __ATTR(_name, 0444, loop_attr_do_show_##_name, NULL); static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf) { @@ -809,16 +809,17 @@ static struct attribute_group loop_attribute_group = { .attrs= loop_attrs, }; -static int loop_sysfs_init(struct loop_device *lo) +static void loop_sysfs_init(struct loop_device *lo) { - return sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj, - &loop_attribute_group); + lo->sysfs_inited = !sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj, + &loop_attribute_group); } static void loop_sysfs_exit(struct loop_device *lo) { - sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj, - &loop_attribute_group); + if (lo->sysfs_inited) + sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj, + &loop_attribute_group); } static void loop_config_discard(struct loop_device *lo) @@ -1677,9 +1678,9 @@ static const struct block_device_operations lo_fops = { * And now the modules code and kernel interface. */ static int max_loop; -module_param(max_loop, int, S_IRUGO); +module_param(max_loop, int, 0444); MODULE_PARM_DESC(max_loop, "Maximum number of loop devices"); -module_param(max_part, int, S_IRUGO); +module_param(max_part, int, 0444); MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); diff --git a/drivers/block/loop.h b/drivers/block/loop.h index b78de98..4d42c7a 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -58,6 +58,7 @@ struct loop_device { struct kthread_worker worker; struct task_struct *worker_task; bool use_dio; + bool sysfs_inited; struct request_queue *lo_queue; struct blk_mq_tag_set tag_set; diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 769c551..c73626d 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -2285,7 +2285,7 @@ static ssize_t mtip_hw_show_status(struct device *dev, return size; } -static DEVICE_ATTR(status, S_IRUGO, mtip_hw_show_status, NULL); +static DEVICE_ATTR(status, 0444, mtip_hw_show_status, NULL); /* debugsfs entries */ @@ -2566,10 +2566,9 @@ static int mtip_hw_debugfs_init(struct driver_data *dd) return -1; } - debugfs_create_file("flags", S_IRUGO, dd->dfs_node, dd, - &mtip_flags_fops); - debugfs_create_file("registers", S_IRUGO, dd->dfs_node, dd, - &mtip_regs_fops); + debugfs_create_file("flags", 0444, dd->dfs_node, dd, &mtip_flags_fops); + debugfs_create_file("registers", 0444, dd->dfs_node, dd, + &mtip_regs_fops); return 0; } @@ -2726,15 +2725,11 @@ static void mtip_softirq_done_fn(struct request *rq) blk_mq_end_request(rq, cmd->status); } -static void mtip_abort_cmd(struct request *req, void *data, - bool reserved) +static void mtip_abort_cmd(struct request *req, void *data, bool reserved) { struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req); struct driver_data *dd = data; - if (!blk_mq_request_started(req)) - return; - dbg_printk(MTIP_DRV_NAME " Aborting request, tag = %d\n", req->tag); clear_bit(req->tag, dd->port->cmds_to_issue); @@ -2742,14 +2737,10 @@ static void mtip_abort_cmd(struct request *req, void *data, mtip_softirq_done_fn(req); } -static void mtip_queue_cmd(struct request *req, void *data, - bool reserved) +static void mtip_queue_cmd(struct request *req, void *data, bool reserved) { struct driver_data *dd = data; - if (!blk_mq_request_started(req)) - return; - set_bit(req->tag, dd->port->cmds_to_issue); blk_abort_request(req); } @@ -3720,7 +3711,8 @@ static enum blk_eh_timer_return mtip_cmd_timeout(struct request *req, struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req); cmd->status = BLK_STS_TIMEOUT; - return BLK_EH_HANDLED; + blk_mq_complete_request(req); + return BLK_EH_DONE; } if (test_bit(req->tag, dd->port->cmds_to_issue)) @@ -3862,7 +3854,6 @@ skip_create_disk: blk_queue_max_hw_sectors(dd->queue, 0xffff); blk_queue_max_segment_size(dd->queue, 0x400000); blk_queue_io_min(dd->queue, 4096); - blk_queue_bounce_limit(dd->queue, dd->pdev->dma_mask); /* Signal trim support */ if (dd->trim_supp == true) { @@ -4273,7 +4264,7 @@ static int mtip_pci_probe(struct pci_dev *pdev, if (!dd->isr_workq) { dev_warn(&pdev->dev, "Can't create wq %d\n", dd->instance); rv = -ENOMEM; - goto block_initialize_err; + goto setmask_err; } memset(cpu_list, 0, sizeof(cpu_list)); @@ -4614,7 +4605,7 @@ static int __init mtip_init(void) } if (dfs_parent) { dfs_device_status = debugfs_create_file("device_status", - S_IRUGO, dfs_parent, NULL, + 0444, dfs_parent, NULL, &mtip_device_status_fops); if (IS_ERR_OR_NULL(dfs_device_status)) { pr_err("Error creating device_status node\n"); diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index afbc202..3ed1ef8 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -166,16 +166,19 @@ static ssize_t pid_show(struct device *dev, } static const struct device_attribute pid_attr = { - .attr = { .name = "pid", .mode = S_IRUGO}, + .attr = { .name = "pid", .mode = 0444}, .show = pid_show, }; static void nbd_dev_remove(struct nbd_device *nbd) { struct gendisk *disk = nbd->disk; + struct request_queue *q; + if (disk) { + q = disk->queue; del_gendisk(disk); - blk_cleanup_queue(disk->queue); + blk_cleanup_queue(q); blk_mq_free_tag_set(&nbd->tag_set); disk->private_data = NULL; put_disk(disk); @@ -213,7 +216,15 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock, } if (!nsock->dead) { kernel_sock_shutdown(nsock->sock, SHUT_RDWR); - atomic_dec(&nbd->config->live_connections); + if (atomic_dec_return(&nbd->config->live_connections) == 0) { + if (test_and_clear_bit(NBD_DISCONNECT_REQUESTED, + &nbd->config->runtime_flags)) { + set_bit(NBD_DISCONNECTED, + &nbd->config->runtime_flags); + dev_info(nbd_to_dev(nbd), + "Disconnected due to user request.\n"); + } + } } nsock->dead = true; nsock->pending = NULL; @@ -231,9 +242,22 @@ static void nbd_size_clear(struct nbd_device *nbd) static void nbd_size_update(struct nbd_device *nbd) { struct nbd_config *config = nbd->config; + struct block_device *bdev = bdget_disk(nbd->disk, 0); + + if (config->flags & NBD_FLAG_SEND_TRIM) { + nbd->disk->queue->limits.discard_granularity = config->blksize; + blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX); + } blk_queue_logical_block_size(nbd->disk->queue, config->blksize); blk_queue_physical_block_size(nbd->disk->queue, config->blksize); set_capacity(nbd->disk, config->bytesize >> 9); + if (bdev) { + if (bdev->bd_disk) + bd_set_size(bdev, config->bytesize); + else + bdev->bd_invalidated = 1; + bdput(bdev); + } kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE); } @@ -243,6 +267,8 @@ static void nbd_size_set(struct nbd_device *nbd, loff_t blocksize, struct nbd_config *config = nbd->config; config->blksize = blocksize; config->bytesize = blocksize * nr_blocks; + if (nbd->task_recv != NULL) + nbd_size_update(nbd); } static void nbd_complete_rq(struct request *req) @@ -286,13 +312,15 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, if (!refcount_inc_not_zero(&nbd->config_refs)) { cmd->status = BLK_STS_TIMEOUT; - return BLK_EH_HANDLED; + goto done; } config = nbd->config; if (config->num_connections > 1) { dev_err_ratelimited(nbd_to_dev(nbd), - "Connection timed out, retrying\n"); + "Connection timed out, retrying (%d/%d alive)\n", + atomic_read(&config->live_connections), + config->num_connections); /* * Hooray we have more connections, requeue this IO, the submit * path will put it on a real connection. @@ -314,7 +342,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, } blk_mq_requeue_request(req, true); nbd_config_put(nbd); - return BLK_EH_NOT_HANDLED; + return BLK_EH_DONE; } } else { dev_err_ratelimited(nbd_to_dev(nbd), @@ -324,8 +352,9 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, cmd->status = BLK_STS_IOERR; sock_shutdown(nbd); nbd_config_put(nbd); - - return BLK_EH_HANDLED; +done: + blk_mq_complete_request(req); + return BLK_EH_DONE; } /* @@ -647,11 +676,8 @@ static void recv_work(struct work_struct *work) static void nbd_clear_req(struct request *req, void *data, bool reserved) { - struct nbd_cmd *cmd; + struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); - if (!blk_mq_request_started(req)) - return; - cmd = blk_mq_rq_to_pdu(req); cmd->status = BLK_STS_IOERR; blk_mq_complete_request(req); } @@ -714,10 +740,9 @@ static int wait_for_reconnect(struct nbd_device *nbd) return 0; if (test_bit(NBD_DISCONNECTED, &config->runtime_flags)) return 0; - wait_event_timeout(config->conn_wait, - atomic_read(&config->live_connections), - config->dead_conn_timeout); - return atomic_read(&config->live_connections); + return wait_event_timeout(config->conn_wait, + atomic_read(&config->live_connections) > 0, + config->dead_conn_timeout) > 0; } static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) @@ -950,10 +975,6 @@ static void nbd_bdev_reset(struct block_device *bdev) if (bdev->bd_openers > 1) return; bd_set_size(bdev, 0); - if (max_part > 0) { - blkdev_reread_part(bdev); - bdev->bd_invalidated = 1; - } } static void nbd_parse_flags(struct nbd_device *nbd) @@ -1040,6 +1061,8 @@ static void nbd_config_put(struct nbd_device *nbd) nbd->config = NULL; nbd->tag_set.timeout = 0; + nbd->disk->queue->limits.discard_granularity = 0; + blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX); blk_queue_flag_clear(QUEUE_FLAG_DISCARD, nbd->disk->queue); mutex_unlock(&nbd->config_lock); @@ -1109,7 +1132,6 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *b if (ret) return ret; - bd_set_size(bdev, config->bytesize); if (max_part) bdev->bd_invalidated = 1; mutex_unlock(&nbd->config_lock); @@ -1118,7 +1140,7 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *b if (ret) sock_shutdown(nbd); mutex_lock(&nbd->config_lock); - bd_set_size(bdev, 0); + nbd_bdev_reset(bdev); /* user requested, ignore socket errors */ if (test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags)) ret = 0; @@ -1269,6 +1291,9 @@ static int nbd_open(struct block_device *bdev, fmode_t mode) refcount_set(&nbd->config_refs, 1); refcount_inc(&nbd->refs); mutex_unlock(&nbd->config_lock); + bdev->bd_invalidated = 1; + } else if (nbd_disconnected(nbd->config)) { + bdev->bd_invalidated = 1; } out: mutex_unlock(&nbd_index_mutex); @@ -1490,8 +1515,8 @@ static int nbd_dev_add(int index) */ blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue); - disk->queue->limits.discard_granularity = 512; - blk_queue_max_discard_sectors(disk->queue, UINT_MAX); + disk->queue->limits.discard_granularity = 0; + blk_queue_max_discard_sectors(disk->queue, 0); blk_queue_max_segment_size(disk->queue, UINT_MAX); blk_queue_max_segments(disk->queue, USHRT_MAX); blk_queue_max_hw_sectors(disk->queue, 65536); @@ -1755,6 +1780,7 @@ static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info) } mutex_lock(&nbd->config_lock); nbd_disconnect(nbd); + nbd_clear_sock(nbd); mutex_unlock(&nbd->config_lock); if (test_and_clear_bit(NBD_HAS_CONFIG_REF, &nbd->config->runtime_flags)) @@ -2093,7 +2119,8 @@ static int __init nbd_init(void) if (nbds_max > 1UL << (MINORBITS - part_shift)) return -EINVAL; recv_workqueue = alloc_workqueue("knbd-recv", - WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); + WQ_MEM_RECLAIM | WQ_HIGHPRI | + WQ_UNBOUND, 0); if (!recv_workqueue) return -ENOMEM; diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index a765532..2bdadd7 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -157,23 +157,23 @@ enum { }; static int g_no_sched; -module_param_named(no_sched, g_no_sched, int, S_IRUGO); +module_param_named(no_sched, g_no_sched, int, 0444); MODULE_PARM_DESC(no_sched, "No io scheduler"); static int g_submit_queues = 1; -module_param_named(submit_queues, g_submit_queues, int, S_IRUGO); +module_param_named(submit_queues, g_submit_queues, int, 0444); MODULE_PARM_DESC(submit_queues, "Number of submission queues"); static int g_home_node = NUMA_NO_NODE; -module_param_named(home_node, g_home_node, int, S_IRUGO); +module_param_named(home_node, g_home_node, int, 0444); MODULE_PARM_DESC(home_node, "Home node for the device"); #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION static char g_timeout_str[80]; -module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), S_IRUGO); +module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), 0444); static char g_requeue_str[80]; -module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), S_IRUGO); +module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), 0444); #endif static int g_queue_mode = NULL_Q_MQ; @@ -203,27 +203,27 @@ static const struct kernel_param_ops null_queue_mode_param_ops = { .get = param_get_int, }; -device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, S_IRUGO); +device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, 0444); MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)"); static int g_gb = 250; -module_param_named(gb, g_gb, int, S_IRUGO); +module_param_named(gb, g_gb, int, 0444); MODULE_PARM_DESC(gb, "Size in GB"); static int g_bs = 512; -module_param_named(bs, g_bs, int, S_IRUGO); +module_param_named(bs, g_bs, int, 0444); MODULE_PARM_DESC(bs, "Block size (in bytes)"); static int nr_devices = 1; -module_param(nr_devices, int, S_IRUGO); +module_param(nr_devices, int, 0444); MODULE_PARM_DESC(nr_devices, "Number of devices to register"); static bool g_blocking; -module_param_named(blocking, g_blocking, bool, S_IRUGO); +module_param_named(blocking, g_blocking, bool, 0444); MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device"); static bool shared_tags; -module_param(shared_tags, bool, S_IRUGO); +module_param(shared_tags, bool, 0444); MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq"); static int g_irqmode = NULL_IRQ_SOFTIRQ; @@ -239,19 +239,19 @@ static const struct kernel_param_ops null_irqmode_param_ops = { .get = param_get_int, }; -device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, S_IRUGO); +device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, 0444); MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer"); static unsigned long g_completion_nsec = 10000; -module_param_named(completion_nsec, g_completion_nsec, ulong, S_IRUGO); +module_param_named(completion_nsec, g_completion_nsec, ulong, 0444); MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns"); static int g_hw_queue_depth = 64; -module_param_named(hw_queue_depth, g_hw_queue_depth, int, S_IRUGO); +module_param_named(hw_queue_depth, g_hw_queue_depth, int, 0444); MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64"); static bool g_use_per_node_hctx; -module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, S_IRUGO); +module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444); MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false"); static struct nullb_device *null_alloc_dev(void); @@ -1365,7 +1365,8 @@ static blk_qc_t null_queue_bio(struct request_queue *q, struct bio *bio) static enum blk_eh_timer_return null_rq_timed_out_fn(struct request *rq) { pr_info("null: rq %p timed out\n", rq); - return BLK_EH_HANDLED; + blk_mq_complete_request(rq); + return BLK_EH_DONE; } static int null_rq_prep_fn(struct request_queue *q, struct request *req) @@ -1427,7 +1428,8 @@ static void null_request_fn(struct request_queue *q) static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res) { pr_info("null: rq %p timed out\n", rq); - return BLK_EH_HANDLED; + blk_mq_complete_request(rq); + return BLK_EH_DONE; } static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 27a44b9..8961b190 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -740,7 +740,7 @@ static int pd_special_command(struct pd_unit *disk, { struct request *rq; - rq = blk_get_request(disk->gd->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); + rq = blk_get_request(disk->gd->queue, REQ_OP_DRV_IN, 0); if (IS_ERR(rq)) return PTR_ERR(rq); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index c61d20c..1a2c010 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -97,8 +97,8 @@ static int pktdev_major; static int write_congestion_on = PKT_WRITE_CONGESTION_ON; static int write_congestion_off = PKT_WRITE_CONGESTION_OFF; static struct mutex ctl_mutex; /* Serialize open/close/setup/teardown */ -static mempool_t *psd_pool; -static struct bio_set *pkt_bio_set; +static mempool_t psd_pool; +static struct bio_set pkt_bio_set; static struct class *class_pktcdvd = NULL; /* /sys/class/pktcdvd */ static struct dentry *pkt_debugfs_root = NULL; /* /sys/kernel/debug/pktcdvd */ @@ -478,8 +478,8 @@ static void pkt_debugfs_dev_new(struct pktcdvd_device *pd) if (!pd->dfs_d_root) return; - pd->dfs_f_info = debugfs_create_file("info", S_IRUGO, - pd->dfs_d_root, pd, &debug_fops); + pd->dfs_f_info = debugfs_create_file("info", 0444, + pd->dfs_d_root, pd, &debug_fops); } static void pkt_debugfs_dev_remove(struct pktcdvd_device *pd) @@ -631,7 +631,7 @@ static inline struct pkt_rb_node *pkt_rbtree_next(struct pkt_rb_node *node) static void pkt_rbtree_erase(struct pktcdvd_device *pd, struct pkt_rb_node *node) { rb_erase(&node->rb_node, &pd->bio_queue); - mempool_free(node, pd->rb_pool); + mempool_free(node, &pd->rb_pool); pd->bio_queue_size--; BUG_ON(pd->bio_queue_size < 0); } @@ -704,13 +704,13 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * int ret = 0; rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ? - REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, __GFP_RECLAIM); + REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0); if (IS_ERR(rq)) return PTR_ERR(rq); if (cgc->buflen) { ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen, - __GFP_RECLAIM); + GFP_NOIO); if (ret) goto out; } @@ -1285,7 +1285,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) * Fill-in bvec with data from orig_bios. */ spin_lock(&pkt->lock); - bio_copy_data(pkt->w_bio, pkt->orig_bios.head); + bio_list_copy_data(pkt->w_bio, pkt->orig_bios.head); pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE); spin_unlock(&pkt->lock); @@ -2303,14 +2303,14 @@ static void pkt_end_io_read_cloned(struct bio *bio) psd->bio->bi_status = bio->bi_status; bio_put(bio); bio_endio(psd->bio); - mempool_free(psd, psd_pool); + mempool_free(psd, &psd_pool); pkt_bio_finished(pd); } static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio) { - struct bio *cloned_bio = bio_clone_fast(bio, GFP_NOIO, pkt_bio_set); - struct packet_stacked_data *psd = mempool_alloc(psd_pool, GFP_NOIO); + struct bio *cloned_bio = bio_clone_fast(bio, GFP_NOIO, &pkt_bio_set); + struct packet_stacked_data *psd = mempool_alloc(&psd_pool, GFP_NOIO); psd->pd = pd; psd->bio = bio; @@ -2381,7 +2381,7 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio) /* * No matching packet found. Store the bio in the work queue. */ - node = mempool_alloc(pd->rb_pool, GFP_NOIO); + node = mempool_alloc(&pd->rb_pool, GFP_NOIO); node->bio = bio; spin_lock(&pd->lock); BUG_ON(pd->bio_queue_size < 0); @@ -2451,7 +2451,7 @@ static blk_qc_t pkt_make_request(struct request_queue *q, struct bio *bio) split = bio_split(bio, last_zone - bio->bi_iter.bi_sector, - GFP_NOIO, pkt_bio_set); + GFP_NOIO, &pkt_bio_set); bio_chain(split, bio); } else { split = bio; @@ -2707,9 +2707,9 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) if (!pd) goto out_mutex; - pd->rb_pool = mempool_create_kmalloc_pool(PKT_RB_POOL_SIZE, - sizeof(struct pkt_rb_node)); - if (!pd->rb_pool) + ret = mempool_init_kmalloc_pool(&pd->rb_pool, PKT_RB_POOL_SIZE, + sizeof(struct pkt_rb_node)); + if (ret) goto out_mem; INIT_LIST_HEAD(&pd->cdrw.pkt_free_list); @@ -2766,7 +2766,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) out_mem2: put_disk(disk); out_mem: - mempool_destroy(pd->rb_pool); + mempool_exit(&pd->rb_pool); kfree(pd); out_mutex: mutex_unlock(&ctl_mutex); @@ -2817,7 +2817,7 @@ static int pkt_remove_dev(dev_t pkt_dev) blk_cleanup_queue(pd->disk->queue); put_disk(pd->disk); - mempool_destroy(pd->rb_pool); + mempool_exit(&pd->rb_pool); kfree(pd); /* This is safe: open() is still holding a reference. */ @@ -2914,14 +2914,14 @@ static int __init pkt_init(void) mutex_init(&ctl_mutex); - psd_pool = mempool_create_kmalloc_pool(PSD_POOL_SIZE, - sizeof(struct packet_stacked_data)); - if (!psd_pool) - return -ENOMEM; - pkt_bio_set = bioset_create(BIO_POOL_SIZE, 0, 0); - if (!pkt_bio_set) { - mempool_destroy(psd_pool); - return -ENOMEM; + ret = mempool_init_kmalloc_pool(&psd_pool, PSD_POOL_SIZE, + sizeof(struct packet_stacked_data)); + if (ret) + return ret; + ret = bioset_init(&pkt_bio_set, BIO_POOL_SIZE, 0, 0); + if (ret) { + mempool_exit(&psd_pool); + return ret; } ret = register_blkdev(pktdev_major, DRIVER_NAME); @@ -2954,8 +2954,8 @@ out_misc: out: unregister_blkdev(pktdev_major, DRIVER_NAME); out2: - mempool_destroy(psd_pool); - bioset_free(pkt_bio_set); + mempool_exit(&psd_pool); + bioset_exit(&pkt_bio_set); return ret; } @@ -2968,8 +2968,8 @@ static void __exit pkt_exit(void) pkt_sysfs_cleanup(); unregister_blkdev(pktdev_major, DRIVER_NAME); - mempool_destroy(psd_pool); - bioset_free(pkt_bio_set); + mempool_exit(&psd_pool); + bioset_exit(&pkt_bio_set); } MODULE_DESCRIPTION("Packet writing layer for CD/DVD drives"); diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index 075662f..afe1508 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -465,8 +465,6 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) priv->queue = queue; queue->queuedata = dev; - blk_queue_bounce_limit(queue, BLK_BOUNCE_HIGH); - blk_queue_max_hw_sectors(queue, dev->bounce_size >> 9); blk_queue_segment_boundary(queue, -1UL); blk_queue_dma_alignment(queue, dev->blk_size-1); diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 33b36fe..af35404 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -424,7 +424,7 @@ static struct workqueue_struct *rbd_wq; * single-major requires >= 0.75 version of userspace rbd utility. */ static bool single_major = true; -module_param(single_major, bool, S_IRUGO); +module_param(single_major, bool, 0444); MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)"); static ssize_t rbd_add(struct bus_type *bus, const char *buf, @@ -468,11 +468,11 @@ static ssize_t rbd_supported_features_show(struct bus_type *bus, char *buf) return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED); } -static BUS_ATTR(add, S_IWUSR, NULL, rbd_add); -static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove); -static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major); -static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major); -static BUS_ATTR(supported_features, S_IRUGO, rbd_supported_features_show, NULL); +static BUS_ATTR(add, 0200, NULL, rbd_add); +static BUS_ATTR(remove, 0200, NULL, rbd_remove); +static BUS_ATTR(add_single_major, 0200, NULL, rbd_add_single_major); +static BUS_ATTR(remove_single_major, 0200, NULL, rbd_remove_single_major); +static BUS_ATTR(supported_features, 0444, rbd_supported_features_show, NULL); static struct attribute *rbd_bus_attrs[] = { &bus_attr_add.attr, @@ -4204,22 +4204,22 @@ static ssize_t rbd_image_refresh(struct device *dev, return size; } -static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); -static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); -static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); -static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL); -static DEVICE_ATTR(client_addr, S_IRUGO, rbd_client_addr_show, NULL); -static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); -static DEVICE_ATTR(cluster_fsid, S_IRUGO, rbd_cluster_fsid_show, NULL); -static DEVICE_ATTR(config_info, S_IRUSR, rbd_config_info_show, NULL); -static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); -static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); -static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); -static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); -static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); -static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); -static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL); -static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL); +static DEVICE_ATTR(size, 0444, rbd_size_show, NULL); +static DEVICE_ATTR(features, 0444, rbd_features_show, NULL); +static DEVICE_ATTR(major, 0444, rbd_major_show, NULL); +static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL); +static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL); +static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL); +static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL); +static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL); +static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL); +static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL); +static DEVICE_ATTR(name, 0444, rbd_name_show, NULL); +static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL); +static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh); +static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL); +static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL); +static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL); static struct attribute *rbd_attrs[] = { &dev_attr_size.attr, diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c index 34997df..09537be 100644 --- a/drivers/block/rsxx/core.c +++ b/drivers/block/rsxx/core.c @@ -247,19 +247,19 @@ static void rsxx_debugfs_dev_new(struct rsxx_cardinfo *card) if (IS_ERR_OR_NULL(card->debugfs_dir)) goto failed_debugfs_dir; - debugfs_stats = debugfs_create_file("stats", S_IRUGO, + debugfs_stats = debugfs_create_file("stats", 0444, card->debugfs_dir, card, &debugfs_stats_fops); if (IS_ERR_OR_NULL(debugfs_stats)) goto failed_debugfs_stats; - debugfs_pci_regs = debugfs_create_file("pci_regs", S_IRUGO, + debugfs_pci_regs = debugfs_create_file("pci_regs", 0444, card->debugfs_dir, card, &debugfs_pci_regs_fops); if (IS_ERR_OR_NULL(debugfs_pci_regs)) goto failed_debugfs_pci_regs; - debugfs_cram = debugfs_create_file("cram", S_IRUGO | S_IWUSR, + debugfs_cram = debugfs_create_file("cram", 0644, card->debugfs_dir, card, &debugfs_cram_fops); if (IS_ERR_OR_NULL(debugfs_cram)) diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index 08586dc..4d90e5e 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -567,7 +567,7 @@ static struct carm_request *carm_get_special(struct carm_host *host) if (!crq) return NULL; - rq = blk_get_request(host->oob_q, REQ_OP_DRV_OUT, GFP_KERNEL); + rq = blk_get_request(host->oob_q, REQ_OP_DRV_OUT, 0); if (IS_ERR(rq)) { spin_lock_irqsave(&host->lock, flags); carm_put_request(host, crq); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 4a07593c..23752dc 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -298,7 +298,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str) struct request *req; int err; - req = blk_get_request(q, REQ_OP_DRV_IN, GFP_KERNEL); + req = blk_get_request(q, REQ_OP_DRV_IN, 0); if (IS_ERR(req)) return PTR_ERR(req); @@ -371,7 +371,7 @@ static ssize_t virtblk_serial_show(struct device *dev, return err; } -static DEVICE_ATTR(serial, S_IRUGO, virtblk_serial_show, NULL); +static DEVICE_ATTR(serial, 0444, virtblk_serial_show, NULL); /* The queue's logical block size must be set before calling this */ static void virtblk_update_capacity(struct virtio_blk *vblk, bool resize) @@ -576,10 +576,10 @@ virtblk_cache_type_show(struct device *dev, struct device_attribute *attr, } static const struct device_attribute dev_attr_cache_type_ro = - __ATTR(cache_type, S_IRUGO, + __ATTR(cache_type, 0444, virtblk_cache_type_show, NULL); static const struct device_attribute dev_attr_cache_type_rw = - __ATTR(cache_type, S_IRUGO|S_IWUSR, + __ATTR(cache_type, 0644, virtblk_cache_type_show, virtblk_cache_type_store); static int virtblk_init_request(struct blk_mq_tag_set *set, struct request *rq, diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 987d665..b55b245 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -98,7 +98,7 @@ MODULE_PARM_DESC(max_queues, * backend, 4KB page granularity is used. */ unsigned int xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER; -module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO); +module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, 0444); MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring"); /* * The LRU mechanism to clean the lists of persistent grants needs to diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 21c1be1..66412eed 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -367,7 +367,7 @@ int __init xen_blkif_interface_init(void) out: \ return sprintf(buf, format, result); \ } \ - static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) + static DEVICE_ATTR(name, 0444, show_##name, NULL) VBD_SHOW_ALLRING(oo_req, "%llu\n"); VBD_SHOW_ALLRING(rd_req, "%llu\n"); @@ -403,7 +403,7 @@ static const struct attribute_group xen_vbdstat_group = { \ return sprintf(buf, format, ##args); \ } \ - static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) + static DEVICE_ATTR(name, 0444, show_##name, NULL) VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor); VBD_SHOW(mode, "%s\n", be->mode); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 2a8e781..ae00a82 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -129,13 +129,12 @@ static const struct block_device_operations xlvbd_block_fops; */ static unsigned int xen_blkif_max_segments = 32; -module_param_named(max_indirect_segments, xen_blkif_max_segments, uint, - S_IRUGO); +module_param_named(max_indirect_segments, xen_blkif_max_segments, uint, 0444); MODULE_PARM_DESC(max_indirect_segments, "Maximum amount of segments in indirect requests (default is 32)"); static unsigned int xen_blkif_max_queues = 4; -module_param_named(max_queues, xen_blkif_max_queues, uint, S_IRUGO); +module_param_named(max_queues, xen_blkif_max_queues, uint, 0444); MODULE_PARM_DESC(max_queues, "Maximum number of hardware queues/rings used per virtual disk"); /* @@ -143,7 +142,7 @@ MODULE_PARM_DESC(max_queues, "Maximum number of hardware queues/rings used per v * backend, 4KB page granularity is used. */ static unsigned int xen_blkif_max_ring_order; -module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO); +module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, 0444); MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring"); #define BLK_RING_SIZE(info) \ diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index bfc566d..9adc8c3 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -2192,7 +2192,7 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf, len = nr * CD_FRAMESIZE_RAW; - rq = blk_get_request(q, REQ_OP_SCSI_IN, GFP_KERNEL); + rq = blk_get_request(q, REQ_OP_SCSI_IN, 0); if (IS_ERR(rq)) { ret = PTR_ERR(rq); break; diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 0e6bc63..8b2b72b 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -92,7 +92,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk, struct request *rq; int error; - rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); + rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0); ide_req(rq)->type = ATA_PRIV_MISC; rq->special = (char *)pc; diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 5a8e8e3..32af6f0 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -437,7 +437,7 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, bool delay = false; rq = blk_get_request(drive->queue, - write ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, __GFP_RECLAIM); + write ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); memcpy(scsi_req(rq)->cmd, cmd, BLK_MAX_CDB); ide_req(rq)->type = ATA_PRIV_PC; rq->rq_flags |= rq_flags; diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c index 2acca12..b132240 100644 --- a/drivers/ide/ide-cd_ioctl.c +++ b/drivers/ide/ide-cd_ioctl.c @@ -304,7 +304,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi) struct request *rq; int ret; - rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); + rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0); ide_req(rq)->type = ATA_PRIV_MISC; rq->rq_flags = RQF_QUIET; blk_execute_rq(drive->queue, cd->disk, rq, 0); diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c index 4e20747..f4f8afd 100644 --- a/drivers/ide/ide-devsets.c +++ b/drivers/ide/ide-devsets.c @@ -166,7 +166,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting, if (!(setting->flags & DS_SYNC)) return setting->set(drive, arg); - rq = blk_get_request(q, REQ_OP_DRV_IN, __GFP_RECLAIM); + rq = blk_get_request(q, REQ_OP_DRV_IN, 0); ide_req(rq)->type = ATA_PRIV_MISC; scsi_req(rq)->cmd_len = 5; scsi_req(rq)->cmd[0] = REQ_DEVSET_EXEC; diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index f1a7c58..e3b4e65 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -478,7 +478,7 @@ static int set_multcount(ide_drive_t *drive, int arg) if (drive->special_flags & IDE_SFLAG_SET_MULTMODE) return -EBUSY; - rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); + rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0); ide_req(rq)->type = ATA_PRIV_TASKFILE; drive->mult_req = arg; diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c index 3661abb..af5119a 100644 --- a/drivers/ide/ide-ioctls.c +++ b/drivers/ide/ide-ioctls.c @@ -125,7 +125,7 @@ static int ide_cmd_ioctl(ide_drive_t *drive, unsigned long arg) if (NULL == (void *) arg) { struct request *rq; - rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); + rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0); ide_req(rq)->type = ATA_PRIV_TASKFILE; blk_execute_rq(drive->queue, NULL, rq, 0); err = scsi_req(rq)->result ? -EIO : 0; @@ -222,7 +222,7 @@ static int generic_drive_reset(ide_drive_t *drive) struct request *rq; int ret = 0; - rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); + rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0); ide_req(rq)->type = ATA_PRIV_MISC; scsi_req(rq)->cmd_len = 1; scsi_req(rq)->cmd[0] = REQ_DRIVE_RESET; diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c index 6465bcc..622f0ed 100644 --- a/drivers/ide/ide-park.c +++ b/drivers/ide/ide-park.c @@ -32,7 +32,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout) } spin_unlock_irq(&hwif->lock); - rq = blk_get_request(q, REQ_OP_DRV_IN, __GFP_RECLAIM); + rq = blk_get_request(q, REQ_OP_DRV_IN, 0); scsi_req(rq)->cmd[0] = REQ_PARK_HEADS; scsi_req(rq)->cmd_len = 1; ide_req(rq)->type = ATA_PRIV_MISC; @@ -47,7 +47,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout) * Make sure that *some* command is sent to the drive after the * timeout has expired, so power management will be reenabled. */ - rq = blk_get_request(q, REQ_OP_DRV_IN, GFP_NOWAIT); + rq = blk_get_request(q, REQ_OP_DRV_IN, BLK_MQ_REQ_NOWAIT); if (IS_ERR(rq)) goto out; diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index ad8a125..59217aa 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -19,7 +19,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) } memset(&rqpm, 0, sizeof(rqpm)); - rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); + rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0); ide_req(rq)->type = ATA_PRIV_PM_SUSPEND; rq->special = &rqpm; rqpm.pm_step = IDE_PM_START_SUSPEND; @@ -90,8 +90,7 @@ int generic_ide_resume(struct device *dev) } memset(&rqpm, 0, sizeof(rqpm)); - rq = blk_get_request_flags(drive->queue, REQ_OP_DRV_IN, - BLK_MQ_REQ_PREEMPT); + rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, BLK_MQ_REQ_PREEMPT); ide_req(rq)->type = ATA_PRIV_PM_RESUME; rq->special = &rqpm; rqpm.pm_step = IDE_PM_START_RESUME; diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index fd57e8c..62c1a19 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -854,7 +854,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size) BUG_ON(cmd != REQ_IDETAPE_READ && cmd != REQ_IDETAPE_WRITE); BUG_ON(size < 0 || size % tape->blk_size); - rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); + rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0); ide_req(rq)->type = ATA_PRIV_MISC; scsi_req(rq)->cmd[13] = cmd; rq->rq_disk = tape->disk; @@ -862,7 +862,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size) if (size) { ret = blk_rq_map_kern(drive->queue, rq, tape->buf, size, - __GFP_RECLAIM); + GFP_NOIO); if (ret) goto out_put; } diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index abe0822..c034cd9 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -431,7 +431,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf, rq = blk_get_request(drive->queue, (cmd->tf_flags & IDE_TFLAG_WRITE) ? - REQ_OP_DRV_OUT : REQ_OP_DRV_IN, __GFP_RECLAIM); + REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); ide_req(rq)->type = ATA_PRIV_TASKFILE; /* @@ -442,7 +442,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf, */ if (nsect) { error = blk_rq_map_kern(drive->queue, rq, buf, - nsect * SECTOR_SIZE, __GFP_RECLAIM); + nsect * SECTOR_SIZE, GFP_NOIO); if (error) goto put_req; } diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 63171cd..60aa7bc 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -431,7 +431,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) return 0; err_sysfs: if (tt->exit) - tt->exit(targetdata); + tt->exit(targetdata, true); err_init: blk_cleanup_queue(tqueue); tdisk->queue = NULL; @@ -446,7 +446,7 @@ err_reserve: return ret; } -static void __nvm_remove_target(struct nvm_target *t) +static void __nvm_remove_target(struct nvm_target *t, bool graceful) { struct nvm_tgt_type *tt = t->type; struct gendisk *tdisk = t->disk; @@ -459,7 +459,7 @@ static void __nvm_remove_target(struct nvm_target *t) tt->sysfs_exit(tdisk); if (tt->exit) - tt->exit(tdisk->private_data); + tt->exit(tdisk->private_data, graceful); nvm_remove_tgt_dev(t->dev, 1); put_disk(tdisk); @@ -489,7 +489,7 @@ static int nvm_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove) mutex_unlock(&dev->mlock); return 1; } - __nvm_remove_target(t); + __nvm_remove_target(t, true); mutex_unlock(&dev->mlock); return 0; @@ -963,7 +963,7 @@ void nvm_unregister(struct nvm_dev *dev) list_for_each_entry_safe(t, tmp, &dev->targets, list) { if (t->dev->parent != dev) continue; - __nvm_remove_target(t); + __nvm_remove_target(t, false); } mutex_unlock(&dev->mlock); diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c index 29a2311..b1c6d7e 100644 --- a/drivers/lightnvm/pblk-cache.c +++ b/drivers/lightnvm/pblk-cache.c @@ -44,13 +44,15 @@ retry: goto out; } - if (unlikely(!bio_has_data(bio))) - goto out; - pblk_ppa_set_empty(&w_ctx.ppa); w_ctx.flags = flags; - if (bio->bi_opf & REQ_PREFLUSH) + if (bio->bi_opf & REQ_PREFLUSH) { w_ctx.flags |= PBLK_FLUSH_ENTRY; + pblk_write_kick(pblk); + } + + if (unlikely(!bio_has_data(bio))) + goto out; for (i = 0; i < nr_entries; i++) { void *data = bio_data(bio); diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 94d5d97..ed9cc97 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -40,7 +40,7 @@ static void pblk_line_mark_bb(struct work_struct *work) } kfree(ppa); - mempool_free(line_ws, pblk->gen_ws_pool); + mempool_free(line_ws, &pblk->gen_ws_pool); } static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line, @@ -102,7 +102,7 @@ static void pblk_end_io_erase(struct nvm_rq *rqd) struct pblk *pblk = rqd->private; __pblk_end_io_erase(pblk, rqd); - mempool_free(rqd, pblk->e_rq_pool); + mempool_free(rqd, &pblk->e_rq_pool); } /* @@ -237,15 +237,15 @@ struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int type) switch (type) { case PBLK_WRITE: case PBLK_WRITE_INT: - pool = pblk->w_rq_pool; + pool = &pblk->w_rq_pool; rq_size = pblk_w_rq_size; break; case PBLK_READ: - pool = pblk->r_rq_pool; + pool = &pblk->r_rq_pool; rq_size = pblk_g_rq_size; break; default: - pool = pblk->e_rq_pool; + pool = &pblk->e_rq_pool; rq_size = pblk_g_rq_size; } @@ -265,20 +265,22 @@ void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type) case PBLK_WRITE: kfree(((struct pblk_c_ctx *)nvm_rq_to_pdu(rqd))->lun_bitmap); case PBLK_WRITE_INT: - pool = pblk->w_rq_pool; + pool = &pblk->w_rq_pool; break; case PBLK_READ: - pool = pblk->r_rq_pool; + pool = &pblk->r_rq_pool; break; case PBLK_ERASE: - pool = pblk->e_rq_pool; + pool = &pblk->e_rq_pool; break; default: pr_err("pblk: trying to free unknown rqd type\n"); return; } - nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); + if (rqd->meta_list) + nvm_dev_dma_free(dev->parent, rqd->meta_list, + rqd->dma_meta_list); mempool_free(rqd, pool); } @@ -292,7 +294,7 @@ void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off, for (i = off; i < nr_pages + off; i++) { bv = bio->bi_io_vec[i]; - mempool_free(bv.bv_page, pblk->page_bio_pool); + mempool_free(bv.bv_page, &pblk->page_bio_pool); } } @@ -304,23 +306,23 @@ int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, int i, ret; for (i = 0; i < nr_pages; i++) { - page = mempool_alloc(pblk->page_bio_pool, flags); + page = mempool_alloc(&pblk->page_bio_pool, flags); ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0); if (ret != PBLK_EXPOSED_PAGE_SIZE) { pr_err("pblk: could not add page to bio\n"); - mempool_free(page, pblk->page_bio_pool); + mempool_free(page, &pblk->page_bio_pool); goto err; } } return 0; err: - pblk_bio_free_pages(pblk, bio, 0, i - 1); + pblk_bio_free_pages(pblk, bio, (bio->bi_vcnt - i), i); return -1; } -static void pblk_write_kick(struct pblk *pblk) +void pblk_write_kick(struct pblk *pblk) { wake_up_process(pblk->writer_ts); mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(1000)); @@ -342,13 +344,6 @@ void pblk_write_should_kick(struct pblk *pblk) pblk_write_kick(pblk); } -void pblk_end_io_sync(struct nvm_rq *rqd) -{ - struct completion *waiting = rqd->private; - - complete(waiting); -} - static void pblk_wait_for_meta(struct pblk *pblk) { do { @@ -380,7 +375,13 @@ struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line) lockdep_assert_held(&line->lock); - if (!vsc) { + if (line->w_err_gc->has_write_err) { + if (line->gc_group != PBLK_LINEGC_WERR) { + line->gc_group = PBLK_LINEGC_WERR; + move_list = &l_mg->gc_werr_list; + pblk_rl_werr_line_in(&pblk->rl); + } + } else if (!vsc) { if (line->gc_group != PBLK_LINEGC_FULL) { line->gc_group = PBLK_LINEGC_FULL; move_list = &l_mg->gc_full_list; @@ -467,16 +468,13 @@ int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd) { struct nvm_tgt_dev *dev = pblk->dev; -#ifdef CONFIG_NVM_DEBUG - int ret; + atomic_inc(&pblk->inflight_io); - ret = pblk_check_io(pblk, rqd); - if (ret) - return ret; +#ifdef CONFIG_NVM_DEBUG + if (pblk_check_io(pblk, rqd)) + return NVM_IO_ERR; #endif - atomic_inc(&pblk->inflight_io); - return nvm_submit_io(dev, rqd); } @@ -484,16 +482,13 @@ int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd) { struct nvm_tgt_dev *dev = pblk->dev; -#ifdef CONFIG_NVM_DEBUG - int ret; + atomic_inc(&pblk->inflight_io); - ret = pblk_check_io(pblk, rqd); - if (ret) - return ret; +#ifdef CONFIG_NVM_DEBUG + if (pblk_check_io(pblk, rqd)) + return NVM_IO_ERR; #endif - atomic_inc(&pblk->inflight_io); - return nvm_submit_io_sync(dev, rqd); } @@ -856,9 +851,10 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, atomic_dec(&pblk->inflight_io); if (rqd.error) { - if (dir == PBLK_WRITE) + if (dir == PBLK_WRITE) { pblk_log_write_err(pblk, &rqd); - else if (dir == PBLK_READ) + ret = 1; + } else if (dir == PBLK_READ) pblk_log_read_err(pblk, &rqd); } @@ -1071,6 +1067,25 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line, return 1; } +static int pblk_line_alloc_bitmaps(struct pblk *pblk, struct pblk_line *line) +{ + struct pblk_line_meta *lm = &pblk->lm; + + line->map_bitmap = kzalloc(lm->sec_bitmap_len, GFP_KERNEL); + if (!line->map_bitmap) + return -ENOMEM; + + /* will be initialized using bb info from map_bitmap */ + line->invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_KERNEL); + if (!line->invalid_bitmap) { + kfree(line->map_bitmap); + line->map_bitmap = NULL; + return -ENOMEM; + } + + return 0; +} + /* For now lines are always assumed full lines. Thus, smeta former and current * lun bitmaps are omitted. */ @@ -1108,7 +1123,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line, if (init && pblk_line_submit_smeta_io(pblk, line, off, PBLK_WRITE)) { pr_debug("pblk: line smeta I/O failed. Retry\n"); - return 1; + return 0; } bitmap_copy(line->invalid_bitmap, line->map_bitmap, lm->sec_per_line); @@ -1174,19 +1189,9 @@ static int pblk_prepare_new_line(struct pblk *pblk, struct pblk_line *line) static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line) { struct pblk_line_meta *lm = &pblk->lm; + int blk_in_line = atomic_read(&line->blk_in_line); int blk_to_erase; - line->map_bitmap = kzalloc(lm->sec_bitmap_len, GFP_ATOMIC); - if (!line->map_bitmap) - return -ENOMEM; - - /* will be initialized using bb info from map_bitmap */ - line->invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_ATOMIC); - if (!line->invalid_bitmap) { - kfree(line->map_bitmap); - return -ENOMEM; - } - /* Bad blocks do not need to be erased */ bitmap_copy(line->erase_bitmap, line->blk_bitmap, lm->blk_per_line); @@ -1199,16 +1204,19 @@ static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line) blk_to_erase = pblk_prepare_new_line(pblk, line); line->state = PBLK_LINESTATE_FREE; } else { - blk_to_erase = atomic_read(&line->blk_in_line); + blk_to_erase = blk_in_line; } - if (line->state != PBLK_LINESTATE_FREE) { - kfree(line->map_bitmap); - kfree(line->invalid_bitmap); + if (blk_in_line < lm->min_blk_line) { spin_unlock(&line->lock); + return -EAGAIN; + } + + if (line->state != PBLK_LINESTATE_FREE) { WARN(1, "pblk: corrupted line %d, state %d\n", line->id, line->state); - return -EAGAIN; + spin_unlock(&line->lock); + return -EINTR; } line->state = PBLK_LINESTATE_OPEN; @@ -1241,13 +1249,16 @@ int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line) } spin_unlock(&l_mg->free_lock); - pblk_rl_free_lines_dec(&pblk->rl, line, true); + ret = pblk_line_alloc_bitmaps(pblk, line); + if (ret) + return ret; if (!pblk_line_init_bb(pblk, line, 0)) { list_add(&line->list, &l_mg->free_list); return -EINTR; } + pblk_rl_free_lines_dec(&pblk->rl, line, true); return 0; } @@ -1259,6 +1270,24 @@ void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line) line->emeta = NULL; } +static void pblk_line_reinit(struct pblk_line *line) +{ + *line->vsc = cpu_to_le32(EMPTY_ENTRY); + + line->map_bitmap = NULL; + line->invalid_bitmap = NULL; + line->smeta = NULL; + line->emeta = NULL; +} + +void pblk_line_free(struct pblk_line *line) +{ + kfree(line->map_bitmap); + kfree(line->invalid_bitmap); + + pblk_line_reinit(line); +} + struct pblk_line *pblk_line_get(struct pblk *pblk) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; @@ -1292,10 +1321,14 @@ retry: ret = pblk_line_prepare(pblk, line); if (ret) { - if (ret == -EAGAIN) { + switch (ret) { + case -EAGAIN: + list_add(&line->list, &l_mg->bad_list); + goto retry; + case -EINTR: list_add(&line->list, &l_mg->corrupt_list); goto retry; - } else { + default: pr_err("pblk: failed to prepare line %d\n", line->id); list_add(&line->list, &l_mg->free_list); l_mg->nr_free_lines++; @@ -1321,11 +1354,14 @@ retry: return NULL; } + retry_line->map_bitmap = line->map_bitmap; + retry_line->invalid_bitmap = line->invalid_bitmap; retry_line->smeta = line->smeta; retry_line->emeta = line->emeta; retry_line->meta_line = line->meta_line; - pblk_line_free(pblk, line); + pblk_line_reinit(line); + l_mg->data_line = retry_line; spin_unlock(&l_mg->free_lock); @@ -1378,6 +1414,9 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk) } spin_unlock(&l_mg->free_lock); + if (pblk_line_alloc_bitmaps(pblk, line)) + return NULL; + if (pblk_line_erase(pblk, line)) { line = pblk_line_retry(pblk, line); if (!line) @@ -1449,7 +1488,7 @@ static void pblk_line_close_meta_sync(struct pblk *pblk) flush_workqueue(pblk->close_wq); } -void pblk_pipeline_stop(struct pblk *pblk) +void __pblk_pipeline_flush(struct pblk *pblk) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; int ret; @@ -1474,6 +1513,11 @@ void pblk_pipeline_stop(struct pblk *pblk) flush_workqueue(pblk->bb_wq); pblk_line_close_meta_sync(pblk); +} + +void __pblk_pipeline_stop(struct pblk *pblk) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; spin_lock(&l_mg->free_lock); pblk->state = PBLK_STATE_STOPPED; @@ -1482,6 +1526,12 @@ void pblk_pipeline_stop(struct pblk *pblk) spin_unlock(&l_mg->free_lock); } +void pblk_pipeline_stop(struct pblk *pblk) +{ + __pblk_pipeline_flush(pblk); + __pblk_pipeline_stop(pblk); +} + struct pblk_line *pblk_line_replace_data(struct pblk *pblk) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; @@ -1511,6 +1561,9 @@ retry_erase: goto retry_erase; } + if (pblk_line_alloc_bitmaps(pblk, new)) + return NULL; + retry_setup: if (!pblk_line_init_metadata(pblk, new, cur)) { new = pblk_line_retry(pblk, new); @@ -1550,19 +1603,6 @@ out: return new; } -void pblk_line_free(struct pblk *pblk, struct pblk_line *line) -{ - kfree(line->map_bitmap); - kfree(line->invalid_bitmap); - - *line->vsc = cpu_to_le32(EMPTY_ENTRY); - - line->map_bitmap = NULL; - line->invalid_bitmap = NULL; - line->smeta = NULL; - line->emeta = NULL; -} - static void __pblk_line_put(struct pblk *pblk, struct pblk_line *line) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; @@ -1572,9 +1612,14 @@ static void __pblk_line_put(struct pblk *pblk, struct pblk_line *line) WARN_ON(line->state != PBLK_LINESTATE_GC); line->state = PBLK_LINESTATE_FREE; line->gc_group = PBLK_LINEGC_NONE; - pblk_line_free(pblk, line); - spin_unlock(&line->lock); + pblk_line_free(line); + + if (line->w_err_gc->has_write_err) { + pblk_rl_werr_line_out(&pblk->rl); + line->w_err_gc->has_write_err = 0; + } + spin_unlock(&line->lock); atomic_dec(&gc->pipeline_gc); spin_lock(&l_mg->free_lock); @@ -1593,7 +1638,7 @@ static void pblk_line_put_ws(struct work_struct *work) struct pblk_line *line = line_put_ws->line; __pblk_line_put(pblk, line); - mempool_free(line_put_ws, pblk->gen_ws_pool); + mempool_free(line_put_ws, &pblk->gen_ws_pool); } void pblk_line_put(struct kref *ref) @@ -1610,7 +1655,7 @@ void pblk_line_put_wq(struct kref *ref) struct pblk *pblk = line->pblk; struct pblk_line_ws *line_put_ws; - line_put_ws = mempool_alloc(pblk->gen_ws_pool, GFP_ATOMIC); + line_put_ws = mempool_alloc(&pblk->gen_ws_pool, GFP_ATOMIC); if (!line_put_ws) return; @@ -1737,11 +1782,34 @@ void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line) spin_lock(&l_mg->close_lock); spin_lock(&line->lock); + + /* Update the in-memory start address for emeta, in case it has + * shifted due to write errors + */ + if (line->emeta_ssec != line->cur_sec) + line->emeta_ssec = line->cur_sec; + list_add_tail(&line->list, &l_mg->emeta_list); spin_unlock(&line->lock); spin_unlock(&l_mg->close_lock); pblk_line_should_sync_meta(pblk); + + +} + +static void pblk_save_lba_list(struct pblk *pblk, struct pblk_line *line) +{ + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + unsigned int lba_list_size = lm->emeta_len[2]; + struct pblk_w_err_gc *w_err_gc = line->w_err_gc; + struct pblk_emeta *emeta = line->emeta; + + w_err_gc->lba_list = pblk_malloc(lba_list_size, + l_mg->emeta_alloc_type, GFP_KERNEL); + memcpy(w_err_gc->lba_list, emeta_to_lbas(pblk, emeta->buf), + lba_list_size); } void pblk_line_close_ws(struct work_struct *work) @@ -1750,9 +1818,16 @@ void pblk_line_close_ws(struct work_struct *work) ws); struct pblk *pblk = line_ws->pblk; struct pblk_line *line = line_ws->line; + struct pblk_w_err_gc *w_err_gc = line->w_err_gc; + + /* Write errors makes the emeta start address stored in smeta invalid, + * so keep a copy of the lba list until we've gc'd the line + */ + if (w_err_gc->has_write_err) + pblk_save_lba_list(pblk, line); pblk_line_close(pblk, line); - mempool_free(line_ws, pblk->gen_ws_pool); + mempool_free(line_ws, &pblk->gen_ws_pool); } void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, @@ -1761,7 +1836,7 @@ void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, { struct pblk_line_ws *line_ws; - line_ws = mempool_alloc(pblk->gen_ws_pool, gfp_mask); + line_ws = mempool_alloc(&pblk->gen_ws_pool, gfp_mask); line_ws->pblk = pblk; line_ws->line = line; diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index 6851a5c..df88f1b 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -129,6 +129,53 @@ out: kfree(gc_rq_ws); } +static __le64 *get_lba_list_from_emeta(struct pblk *pblk, + struct pblk_line *line) +{ + struct line_emeta *emeta_buf; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line_meta *lm = &pblk->lm; + unsigned int lba_list_size = lm->emeta_len[2]; + __le64 *lba_list; + int ret; + + emeta_buf = pblk_malloc(lm->emeta_len[0], + l_mg->emeta_alloc_type, GFP_KERNEL); + if (!emeta_buf) + return NULL; + + ret = pblk_line_read_emeta(pblk, line, emeta_buf); + if (ret) { + pr_err("pblk: line %d read emeta failed (%d)\n", + line->id, ret); + pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); + return NULL; + } + + /* If this read fails, it means that emeta is corrupted. + * For now, leave the line untouched. + * TODO: Implement a recovery routine that scans and moves + * all sectors on the line. + */ + + ret = pblk_recov_check_emeta(pblk, emeta_buf); + if (ret) { + pr_err("pblk: inconsistent emeta (line %d)\n", + line->id); + pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); + return NULL; + } + + lba_list = pblk_malloc(lba_list_size, + l_mg->emeta_alloc_type, GFP_KERNEL); + if (lba_list) + memcpy(lba_list, emeta_to_lbas(pblk, emeta_buf), lba_list_size); + + pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); + + return lba_list; +} + static void pblk_gc_line_prepare_ws(struct work_struct *work) { struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws, @@ -138,46 +185,26 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work) struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line_meta *lm = &pblk->lm; struct pblk_gc *gc = &pblk->gc; - struct line_emeta *emeta_buf; struct pblk_line_ws *gc_rq_ws; struct pblk_gc_rq *gc_rq; __le64 *lba_list; unsigned long *invalid_bitmap; int sec_left, nr_secs, bit; - int ret; invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_KERNEL); if (!invalid_bitmap) goto fail_free_ws; - emeta_buf = pblk_malloc(lm->emeta_len[0], l_mg->emeta_alloc_type, - GFP_KERNEL); - if (!emeta_buf) { - pr_err("pblk: cannot use GC emeta\n"); - goto fail_free_bitmap; - } - - ret = pblk_line_read_emeta(pblk, line, emeta_buf); - if (ret) { - pr_err("pblk: line %d read emeta failed (%d)\n", line->id, ret); - goto fail_free_emeta; - } - - /* If this read fails, it means that emeta is corrupted. For now, leave - * the line untouched. TODO: Implement a recovery routine that scans and - * moves all sectors on the line. - */ - - ret = pblk_recov_check_emeta(pblk, emeta_buf); - if (ret) { - pr_err("pblk: inconsistent emeta (line %d)\n", line->id); - goto fail_free_emeta; - } - - lba_list = emeta_to_lbas(pblk, emeta_buf); - if (!lba_list) { - pr_err("pblk: could not interpret emeta (line %d)\n", line->id); - goto fail_free_emeta; + if (line->w_err_gc->has_write_err) { + lba_list = line->w_err_gc->lba_list; + line->w_err_gc->lba_list = NULL; + } else { + lba_list = get_lba_list_from_emeta(pblk, line); + if (!lba_list) { + pr_err("pblk: could not interpret emeta (line %d)\n", + line->id); + goto fail_free_ws; + } } spin_lock(&line->lock); @@ -187,14 +214,14 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work) if (sec_left < 0) { pr_err("pblk: corrupted GC line (%d)\n", line->id); - goto fail_free_emeta; + goto fail_free_lba_list; } bit = -1; next_rq: gc_rq = kmalloc(sizeof(struct pblk_gc_rq), GFP_KERNEL); if (!gc_rq) - goto fail_free_emeta; + goto fail_free_lba_list; nr_secs = 0; do { @@ -240,7 +267,7 @@ next_rq: goto next_rq; out: - pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); + pblk_mfree(lba_list, l_mg->emeta_alloc_type); kfree(line_ws); kfree(invalid_bitmap); @@ -251,9 +278,8 @@ out: fail_free_gc_rq: kfree(gc_rq); -fail_free_emeta: - pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); -fail_free_bitmap: +fail_free_lba_list: + pblk_mfree(lba_list, l_mg->emeta_alloc_type); kfree(invalid_bitmap); fail_free_ws: kfree(line_ws); @@ -349,12 +375,14 @@ static struct pblk_line *pblk_gc_get_victim_line(struct pblk *pblk, static bool pblk_gc_should_run(struct pblk_gc *gc, struct pblk_rl *rl) { unsigned int nr_blocks_free, nr_blocks_need; + unsigned int werr_lines = atomic_read(&rl->werr_lines); nr_blocks_need = pblk_rl_high_thrs(rl); nr_blocks_free = pblk_rl_nr_free_blks(rl); /* This is not critical, no need to take lock here */ - return ((gc->gc_active) && (nr_blocks_need > nr_blocks_free)); + return ((werr_lines > 0) || + ((gc->gc_active) && (nr_blocks_need > nr_blocks_free))); } void pblk_gc_free_full_lines(struct pblk *pblk) @@ -649,7 +677,7 @@ fail_free_main_kthread: return ret; } -void pblk_gc_exit(struct pblk *pblk) +void pblk_gc_exit(struct pblk *pblk, bool graceful) { struct pblk_gc *gc = &pblk->gc; @@ -663,10 +691,12 @@ void pblk_gc_exit(struct pblk *pblk) if (gc->gc_reader_ts) kthread_stop(gc->gc_reader_ts); - flush_workqueue(gc->gc_reader_wq); - destroy_workqueue(gc->gc_reader_wq); + if (graceful) { + flush_workqueue(gc->gc_reader_wq); + flush_workqueue(gc->gc_line_reader_wq); + } - flush_workqueue(gc->gc_line_reader_wq); + destroy_workqueue(gc->gc_reader_wq); destroy_workqueue(gc->gc_line_reader_wq); if (gc->gc_writer_ts) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 91a5bc2..ce561f5 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -20,10 +20,15 @@ #include "pblk.h" +unsigned int write_buffer_size; + +module_param(write_buffer_size, uint, 0644); +MODULE_PARM_DESC(write_buffer_size, "number of entries in a write buffer"); + static struct kmem_cache *pblk_ws_cache, *pblk_rec_cache, *pblk_g_rq_cache, *pblk_w_rq_cache; static DECLARE_RWSEM(pblk_lock); -struct bio_set *pblk_bio_set; +struct bio_set pblk_bio_set; static int pblk_rw_io(struct request_queue *q, struct pblk *pblk, struct bio *bio) @@ -127,10 +132,8 @@ static int pblk_l2p_recover(struct pblk *pblk, bool factory_init) if (!line) { /* Configure next line for user data */ line = pblk_line_get_first_data(pblk); - if (!line) { - pr_err("pblk: line list corrupted\n"); + if (!line) return -EFAULT; - } } return 0; @@ -141,6 +144,7 @@ static int pblk_l2p_init(struct pblk *pblk, bool factory_init) sector_t i; struct ppa_addr ppa; size_t map_size; + int ret = 0; map_size = pblk_trans_map_size(pblk); pblk->trans_map = vmalloc(map_size); @@ -152,7 +156,11 @@ static int pblk_l2p_init(struct pblk *pblk, bool factory_init) for (i = 0; i < pblk->rl.nr_secs; i++) pblk_trans_map_set(pblk, i, ppa); - return pblk_l2p_recover(pblk, factory_init); + ret = pblk_l2p_recover(pblk, factory_init); + if (ret) + vfree(pblk->trans_map); + + return ret; } static void pblk_rwb_free(struct pblk *pblk) @@ -169,10 +177,15 @@ static int pblk_rwb_init(struct pblk *pblk) struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; struct pblk_rb_entry *entries; - unsigned long nr_entries; + unsigned long nr_entries, buffer_size; unsigned int power_size, power_seg_sz; - nr_entries = pblk_rb_calculate_size(pblk->pgs_in_buffer); + if (write_buffer_size && (write_buffer_size > pblk->pgs_in_buffer)) + buffer_size = write_buffer_size; + else + buffer_size = pblk->pgs_in_buffer; + + nr_entries = pblk_rb_calculate_size(buffer_size); entries = vzalloc(nr_entries * sizeof(struct pblk_rb_entry)); if (!entries) @@ -341,7 +354,7 @@ static int pblk_core_init(struct pblk *pblk) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; - int max_write_ppas; + int ret, max_write_ppas; atomic64_set(&pblk->user_wa, 0); atomic64_set(&pblk->pad_wa, 0); @@ -375,33 +388,33 @@ static int pblk_core_init(struct pblk *pblk) goto fail_free_pad_dist; /* Internal bios can be at most the sectors signaled by the device. */ - pblk->page_bio_pool = mempool_create_page_pool(NVM_MAX_VLBA, 0); - if (!pblk->page_bio_pool) + ret = mempool_init_page_pool(&pblk->page_bio_pool, NVM_MAX_VLBA, 0); + if (ret) goto free_global_caches; - pblk->gen_ws_pool = mempool_create_slab_pool(PBLK_GEN_WS_POOL_SIZE, - pblk_ws_cache); - if (!pblk->gen_ws_pool) + ret = mempool_init_slab_pool(&pblk->gen_ws_pool, PBLK_GEN_WS_POOL_SIZE, + pblk_ws_cache); + if (ret) goto free_page_bio_pool; - pblk->rec_pool = mempool_create_slab_pool(geo->all_luns, - pblk_rec_cache); - if (!pblk->rec_pool) + ret = mempool_init_slab_pool(&pblk->rec_pool, geo->all_luns, + pblk_rec_cache); + if (ret) goto free_gen_ws_pool; - pblk->r_rq_pool = mempool_create_slab_pool(geo->all_luns, - pblk_g_rq_cache); - if (!pblk->r_rq_pool) + ret = mempool_init_slab_pool(&pblk->r_rq_pool, geo->all_luns, + pblk_g_rq_cache); + if (ret) goto free_rec_pool; - pblk->e_rq_pool = mempool_create_slab_pool(geo->all_luns, - pblk_g_rq_cache); - if (!pblk->e_rq_pool) + ret = mempool_init_slab_pool(&pblk->e_rq_pool, geo->all_luns, + pblk_g_rq_cache); + if (ret) goto free_r_rq_pool; - pblk->w_rq_pool = mempool_create_slab_pool(geo->all_luns, - pblk_w_rq_cache); - if (!pblk->w_rq_pool) + ret = mempool_init_slab_pool(&pblk->w_rq_pool, geo->all_luns, + pblk_w_rq_cache); + if (ret) goto free_e_rq_pool; pblk->close_wq = alloc_workqueue("pblk-close-wq", @@ -423,6 +436,7 @@ static int pblk_core_init(struct pblk *pblk) goto free_r_end_wq; INIT_LIST_HEAD(&pblk->compl_list); + INIT_LIST_HEAD(&pblk->resubmit_list); return 0; @@ -433,17 +447,17 @@ free_bb_wq: free_close_wq: destroy_workqueue(pblk->close_wq); free_w_rq_pool: - mempool_destroy(pblk->w_rq_pool); + mempool_exit(&pblk->w_rq_pool); free_e_rq_pool: - mempool_destroy(pblk->e_rq_pool); + mempool_exit(&pblk->e_rq_pool); free_r_rq_pool: - mempool_destroy(pblk->r_rq_pool); + mempool_exit(&pblk->r_rq_pool); free_rec_pool: - mempool_destroy(pblk->rec_pool); + mempool_exit(&pblk->rec_pool); free_gen_ws_pool: - mempool_destroy(pblk->gen_ws_pool); + mempool_exit(&pblk->gen_ws_pool); free_page_bio_pool: - mempool_destroy(pblk->page_bio_pool); + mempool_exit(&pblk->page_bio_pool); free_global_caches: pblk_free_global_caches(pblk); fail_free_pad_dist: @@ -462,12 +476,12 @@ static void pblk_core_free(struct pblk *pblk) if (pblk->bb_wq) destroy_workqueue(pblk->bb_wq); - mempool_destroy(pblk->page_bio_pool); - mempool_destroy(pblk->gen_ws_pool); - mempool_destroy(pblk->rec_pool); - mempool_destroy(pblk->r_rq_pool); - mempool_destroy(pblk->e_rq_pool); - mempool_destroy(pblk->w_rq_pool); + mempool_exit(&pblk->page_bio_pool); + mempool_exit(&pblk->gen_ws_pool); + mempool_exit(&pblk->rec_pool); + mempool_exit(&pblk->r_rq_pool); + mempool_exit(&pblk->e_rq_pool); + mempool_exit(&pblk->w_rq_pool); pblk_free_global_caches(pblk); kfree(pblk->pad_dist); @@ -489,11 +503,17 @@ static void pblk_line_mg_free(struct pblk *pblk) } } -static void pblk_line_meta_free(struct pblk_line *line) +static void pblk_line_meta_free(struct pblk_line_mgmt *l_mg, + struct pblk_line *line) { + struct pblk_w_err_gc *w_err_gc = line->w_err_gc; + kfree(line->blk_bitmap); kfree(line->erase_bitmap); kfree(line->chks); + + pblk_mfree(w_err_gc->lba_list, l_mg->emeta_alloc_type); + kfree(w_err_gc); } static void pblk_lines_free(struct pblk *pblk) @@ -506,8 +526,8 @@ static void pblk_lines_free(struct pblk *pblk) for (i = 0; i < l_mg->nr_lines; i++) { line = &pblk->lines[i]; - pblk_line_free(pblk, line); - pblk_line_meta_free(line); + pblk_line_free(line); + pblk_line_meta_free(l_mg, line); } spin_unlock(&l_mg->free_lock); @@ -748,14 +768,14 @@ static int pblk_setup_line_meta_20(struct pblk *pblk, struct pblk_line *line, chunk->cnlb = chunk_meta->cnlb; chunk->wp = chunk_meta->wp; - if (!(chunk->state & NVM_CHK_ST_OFFLINE)) - continue; - if (chunk->type & NVM_CHK_TP_SZ_SPEC) { WARN_ONCE(1, "pblk: custom-sized chunks unsupported\n"); continue; } + if (!(chunk->state & NVM_CHK_ST_OFFLINE)) + continue; + set_bit(pos, line->blk_bitmap); nr_bad_chks++; } @@ -809,20 +829,28 @@ static int pblk_alloc_line_meta(struct pblk *pblk, struct pblk_line *line) return -ENOMEM; line->erase_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL); - if (!line->erase_bitmap) { - kfree(line->blk_bitmap); - return -ENOMEM; - } + if (!line->erase_bitmap) + goto free_blk_bitmap; + line->chks = kmalloc(lm->blk_per_line * sizeof(struct nvm_chk_meta), GFP_KERNEL); - if (!line->chks) { - kfree(line->erase_bitmap); - kfree(line->blk_bitmap); - return -ENOMEM; - } + if (!line->chks) + goto free_erase_bitmap; + + line->w_err_gc = kzalloc(sizeof(struct pblk_w_err_gc), GFP_KERNEL); + if (!line->w_err_gc) + goto free_chks; return 0; + +free_chks: + kfree(line->chks); +free_erase_bitmap: + kfree(line->erase_bitmap); +free_blk_bitmap: + kfree(line->blk_bitmap); + return -ENOMEM; } static int pblk_line_mg_init(struct pblk *pblk) @@ -847,12 +875,14 @@ static int pblk_line_mg_init(struct pblk *pblk) INIT_LIST_HEAD(&l_mg->gc_mid_list); INIT_LIST_HEAD(&l_mg->gc_low_list); INIT_LIST_HEAD(&l_mg->gc_empty_list); + INIT_LIST_HEAD(&l_mg->gc_werr_list); INIT_LIST_HEAD(&l_mg->emeta_list); - l_mg->gc_lists[0] = &l_mg->gc_high_list; - l_mg->gc_lists[1] = &l_mg->gc_mid_list; - l_mg->gc_lists[2] = &l_mg->gc_low_list; + l_mg->gc_lists[0] = &l_mg->gc_werr_list; + l_mg->gc_lists[1] = &l_mg->gc_high_list; + l_mg->gc_lists[2] = &l_mg->gc_mid_list; + l_mg->gc_lists[3] = &l_mg->gc_low_list; spin_lock_init(&l_mg->free_lock); spin_lock_init(&l_mg->close_lock); @@ -1047,6 +1077,11 @@ static int pblk_lines_init(struct pblk *pblk) nr_free_chks += pblk_setup_line_meta(pblk, line, chunk_meta, i); } + if (!nr_free_chks) { + pr_err("pblk: too many bad blocks prevent for sane instance\n"); + return -EINTR; + } + pblk_set_provision(pblk, nr_free_chks); kfree(chunk_meta); @@ -1054,7 +1089,7 @@ static int pblk_lines_init(struct pblk *pblk) fail_free_lines: while (--i >= 0) - pblk_line_meta_free(&pblk->lines[i]); + pblk_line_meta_free(l_mg, &pblk->lines[i]); kfree(pblk->lines); fail_free_chunk_meta: kfree(chunk_meta); @@ -1110,23 +1145,25 @@ static void pblk_free(struct pblk *pblk) kfree(pblk); } -static void pblk_tear_down(struct pblk *pblk) +static void pblk_tear_down(struct pblk *pblk, bool graceful) { - pblk_pipeline_stop(pblk); + if (graceful) + __pblk_pipeline_flush(pblk); + __pblk_pipeline_stop(pblk); pblk_writer_stop(pblk); pblk_rb_sync_l2p(&pblk->rwb); pblk_rl_free(&pblk->rl); - pr_debug("pblk: consistent tear down\n"); + pr_debug("pblk: consistent tear down (graceful:%d)\n", graceful); } -static void pblk_exit(void *private) +static void pblk_exit(void *private, bool graceful) { struct pblk *pblk = private; down_write(&pblk_lock); - pblk_gc_exit(pblk); - pblk_tear_down(pblk); + pblk_gc_exit(pblk, graceful); + pblk_tear_down(pblk, graceful); #ifdef CONFIG_NVM_DEBUG pr_info("pblk exit: L2P CRC: %x\n", pblk_l2p_crc(pblk)); @@ -1175,6 +1212,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, pblk->state = PBLK_STATE_RUNNING; pblk->gc.gc_enabled = 0; + spin_lock_init(&pblk->resubmit_lock); spin_lock_init(&pblk->trans_lock); spin_lock_init(&pblk->lock); @@ -1297,18 +1335,18 @@ static int __init pblk_module_init(void) { int ret; - pblk_bio_set = bioset_create(BIO_POOL_SIZE, 0, 0); - if (!pblk_bio_set) - return -ENOMEM; + ret = bioset_init(&pblk_bio_set, BIO_POOL_SIZE, 0, 0); + if (ret) + return ret; ret = nvm_register_tgt_type(&tt_pblk); if (ret) - bioset_free(pblk_bio_set); + bioset_exit(&pblk_bio_set); return ret; } static void pblk_module_exit(void) { - bioset_free(pblk_bio_set); + bioset_exit(&pblk_bio_set); nvm_unregister_tgt_type(&tt_pblk); } diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c index 20dbaa8..953ca31 100644 --- a/drivers/lightnvm/pblk-map.c +++ b/drivers/lightnvm/pblk-map.c @@ -18,11 +18,11 @@ #include "pblk.h" -static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry, - struct ppa_addr *ppa_list, - unsigned long *lun_bitmap, - struct pblk_sec_meta *meta_list, - unsigned int valid_secs) +static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry, + struct ppa_addr *ppa_list, + unsigned long *lun_bitmap, + struct pblk_sec_meta *meta_list, + unsigned int valid_secs) { struct pblk_line *line = pblk_line_get_data(pblk); struct pblk_emeta *emeta; @@ -35,8 +35,14 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry, if (pblk_line_is_full(line)) { struct pblk_line *prev_line = line; + /* If we cannot allocate a new line, make sure to store metadata + * on current line and then fail + */ line = pblk_line_replace_data(pblk); pblk_line_close_meta(pblk, prev_line); + + if (!line) + return -EINTR; } emeta = line->emeta; @@ -74,6 +80,7 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry, } pblk_down_rq(pblk, ppa_list, nr_secs, lun_bitmap); + return 0; } void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, @@ -87,8 +94,12 @@ void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, for (i = off; i < rqd->nr_ppas; i += min) { map_secs = (i + min > valid_secs) ? (valid_secs % min) : min; - pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i], - lun_bitmap, &meta_list[i], map_secs); + if (pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i], + lun_bitmap, &meta_list[i], map_secs)) { + bio_put(rqd->bio); + pblk_free_rqd(pblk, rqd, PBLK_WRITE); + pblk_pipeline_stop(pblk); + } } } @@ -108,8 +119,12 @@ void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, for (i = 0; i < rqd->nr_ppas; i += min) { map_secs = (i + min > valid_secs) ? (valid_secs % min) : min; - pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i], - lun_bitmap, &meta_list[i], map_secs); + if (pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i], + lun_bitmap, &meta_list[i], map_secs)) { + bio_put(rqd->bio); + pblk_free_rqd(pblk, rqd, PBLK_WRITE); + pblk_pipeline_stop(pblk); + } erase_lun = pblk_ppa_to_pos(geo, rqd->ppa_list[i]); diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c index 52fdd85..00cd1f2 100644 --- a/drivers/lightnvm/pblk-rb.c +++ b/drivers/lightnvm/pblk-rb.c @@ -142,10 +142,9 @@ static void clean_wctx(struct pblk_w_ctx *w_ctx) { int flags; -try: flags = READ_ONCE(w_ctx->flags); - if (!(flags & PBLK_SUBMITTED_ENTRY)) - goto try; + WARN_ONCE(!(flags & PBLK_SUBMITTED_ENTRY), + "pblk: overwriting unsubmitted data\n"); /* Release flags on context. Protect from writes and reads */ smp_store_release(&w_ctx->flags, PBLK_WRITABLE_ENTRY); @@ -350,7 +349,7 @@ void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, } static int pblk_rb_flush_point_set(struct pblk_rb *rb, struct bio *bio, - unsigned int pos) + unsigned int pos) { struct pblk_rb_entry *entry; unsigned int sync, flush_point; @@ -420,7 +419,7 @@ void pblk_rb_flush(struct pblk_rb *rb) if (pblk_rb_flush_point_set(rb, NULL, mem)) return; - pblk_write_should_kick(pblk); + pblk_write_kick(pblk); } static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries, @@ -504,45 +503,6 @@ int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries, } /* - * The caller of this function must ensure that the backpointer will not - * overwrite the entries passed on the list. - */ -unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio, - struct list_head *list, - unsigned int max) -{ - struct pblk_rb_entry *entry, *tentry; - struct page *page; - unsigned int read = 0; - int ret; - - list_for_each_entry_safe(entry, tentry, list, index) { - if (read > max) { - pr_err("pblk: too many entries on list\n"); - goto out; - } - - page = virt_to_page(entry->data); - if (!page) { - pr_err("pblk: could not allocate write bio page\n"); - goto out; - } - - ret = bio_add_page(bio, page, rb->seg_size, 0); - if (ret != rb->seg_size) { - pr_err("pblk: could not add page to write bio\n"); - goto out; - } - - list_del(&entry->index); - read++; - } - -out: - return read; -} - -/* * Read available entries on rb and add them to the given bio. To avoid a memory * copy, a page reference to the write buffer is used to be added to the bio. * diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 9eee10f..1869469 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -39,10 +39,10 @@ static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio, } static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, - sector_t blba, unsigned long *read_bitmap) + struct bio *bio, sector_t blba, + unsigned long *read_bitmap) { struct pblk_sec_meta *meta_list = rqd->meta_list; - struct bio *bio = rqd->bio; struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS]; int nr_secs = rqd->nr_ppas; bool advanced_bio = false; @@ -102,32 +102,69 @@ next: #endif } -static int pblk_submit_read_io(struct pblk *pblk, struct nvm_rq *rqd) + +static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd, + sector_t blba) { - int err; + struct pblk_sec_meta *meta_lba_list = rqd->meta_list; + int nr_lbas = rqd->nr_ppas; + int i; - err = pblk_submit_io(pblk, rqd); - if (err) - return NVM_IO_ERR; + for (i = 0; i < nr_lbas; i++) { + u64 lba = le64_to_cpu(meta_lba_list[i].lba); + + if (lba == ADDR_EMPTY) + continue; + + if (lba != blba + i) { +#ifdef CONFIG_NVM_DEBUG + struct ppa_addr *p; - return NVM_IO_OK; + p = (nr_lbas == 1) ? &rqd->ppa_list[i] : &rqd->ppa_addr; + print_ppa(&pblk->dev->geo, p, "seq", i); +#endif + pr_err("pblk: corrupted read LBA (%llu/%llu)\n", + lba, (u64)blba + i); + WARN_ON(1); + } + } } -static void pblk_read_check(struct pblk *pblk, struct nvm_rq *rqd, - sector_t blba) +/* + * There can be holes in the lba list. + */ +static void pblk_read_check_rand(struct pblk *pblk, struct nvm_rq *rqd, + u64 *lba_list, int nr_lbas) { - struct pblk_sec_meta *meta_list = rqd->meta_list; - int nr_lbas = rqd->nr_ppas; - int i; + struct pblk_sec_meta *meta_lba_list = rqd->meta_list; + int i, j; - for (i = 0; i < nr_lbas; i++) { - u64 lba = le64_to_cpu(meta_list[i].lba); + for (i = 0, j = 0; i < nr_lbas; i++) { + u64 lba = lba_list[i]; + u64 meta_lba; if (lba == ADDR_EMPTY) continue; - WARN(lba != blba + i, "pblk: corrupted read LBA\n"); + meta_lba = le64_to_cpu(meta_lba_list[j].lba); + + if (lba != meta_lba) { +#ifdef CONFIG_NVM_DEBUG + struct ppa_addr *p; + int nr_ppas = rqd->nr_ppas; + + p = (nr_ppas == 1) ? &rqd->ppa_list[j] : &rqd->ppa_addr; + print_ppa(&pblk->dev->geo, p, "seq", j); +#endif + pr_err("pblk: corrupted read LBA (%llu/%llu)\n", + lba, meta_lba); + WARN_ON(1); + } + + j++; } + + WARN_ONCE(j != rqd->nr_ppas, "pblk: corrupted random request\n"); } static void pblk_read_put_rqd_kref(struct pblk *pblk, struct nvm_rq *rqd) @@ -152,7 +189,6 @@ static void pblk_end_user_read(struct bio *bio) WARN_ONCE(bio->bi_status, "pblk: corrupted read bio\n"); #endif bio_endio(bio); - bio_put(bio); } static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd, @@ -160,23 +196,18 @@ static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd, { struct nvm_tgt_dev *dev = pblk->dev; struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); - struct bio *bio = rqd->bio; + struct bio *int_bio = rqd->bio; unsigned long start_time = r_ctx->start_time; generic_end_io_acct(dev->q, READ, &pblk->disk->part0, start_time); if (rqd->error) pblk_log_read_err(pblk, rqd); -#ifdef CONFIG_NVM_DEBUG - else - WARN_ONCE(bio->bi_status, "pblk: corrupted read error\n"); -#endif - pblk_read_check(pblk, rqd, r_ctx->lba); + pblk_read_check_seq(pblk, rqd, r_ctx->lba); - bio_put(bio); - if (r_ctx->private) - pblk_end_user_read((struct bio *)r_ctx->private); + if (int_bio) + bio_put(int_bio); if (put_line) pblk_read_put_rqd_kref(pblk, rqd); @@ -193,16 +224,19 @@ static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd, static void pblk_end_io_read(struct nvm_rq *rqd) { struct pblk *pblk = rqd->private; + struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); + struct bio *bio = (struct bio *)r_ctx->private; + pblk_end_user_read(bio); __pblk_end_io_read(pblk, rqd, true); } -static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, - unsigned int bio_init_idx, - unsigned long *read_bitmap) +static int pblk_partial_read(struct pblk *pblk, struct nvm_rq *rqd, + struct bio *orig_bio, unsigned int bio_init_idx, + unsigned long *read_bitmap) { - struct bio *new_bio, *bio = rqd->bio; struct pblk_sec_meta *meta_list = rqd->meta_list; + struct bio *new_bio; struct bio_vec src_bv, dst_bv; void *ppa_ptr = NULL; void *src_p, *dst_p; @@ -219,11 +253,11 @@ static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, new_bio = bio_alloc(GFP_KERNEL, nr_holes); if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes)) - goto err; + goto fail_add_pages; if (nr_holes != new_bio->bi_vcnt) { pr_err("pblk: malformed bio\n"); - goto err; + goto fail; } for (i = 0; i < nr_secs; i++) @@ -246,7 +280,7 @@ static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, if (ret) { bio_put(rqd->bio); pr_err("pblk: sync read IO submission failed\n"); - goto err; + goto fail; } if (rqd->error) { @@ -282,7 +316,7 @@ static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, meta_list[hole].lba = lba_list_media[i]; src_bv = new_bio->bi_io_vec[i++]; - dst_bv = bio->bi_io_vec[bio_init_idx + hole]; + dst_bv = orig_bio->bi_io_vec[bio_init_idx + hole]; src_p = kmap_atomic(src_bv.bv_page); dst_p = kmap_atomic(dst_bv.bv_page); @@ -294,35 +328,33 @@ static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, kunmap_atomic(src_p); kunmap_atomic(dst_p); - mempool_free(src_bv.bv_page, pblk->page_bio_pool); + mempool_free(src_bv.bv_page, &pblk->page_bio_pool); hole = find_next_zero_bit(read_bitmap, nr_secs, hole + 1); } while (hole < nr_secs); bio_put(new_bio); - /* Complete the original bio and associated request */ - bio_endio(bio); - rqd->bio = bio; + /* restore original request */ + rqd->bio = NULL; rqd->nr_ppas = nr_secs; __pblk_end_io_read(pblk, rqd, false); - return NVM_IO_OK; - -err: - pr_err("pblk: failed to perform partial read\n"); + return NVM_IO_DONE; +fail: /* Free allocated pages in new bio */ - pblk_bio_free_pages(pblk, bio, 0, new_bio->bi_vcnt); + pblk_bio_free_pages(pblk, new_bio, 0, new_bio->bi_vcnt); +fail_add_pages: + pr_err("pblk: failed to perform partial read\n"); __pblk_end_io_read(pblk, rqd, false); return NVM_IO_ERR; } -static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, +static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio, sector_t lba, unsigned long *read_bitmap) { struct pblk_sec_meta *meta_list = rqd->meta_list; - struct bio *bio = rqd->bio; struct ppa_addr ppa; pblk_lookup_l2p_seq(pblk, &ppa, lba, 1); @@ -386,14 +418,15 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) rqd = pblk_alloc_rqd(pblk, PBLK_READ); rqd->opcode = NVM_OP_PREAD; - rqd->bio = bio; rqd->nr_ppas = nr_secs; + rqd->bio = NULL; /* cloned bio if needed */ rqd->private = pblk; rqd->end_io = pblk_end_io_read; r_ctx = nvm_rq_to_pdu(rqd); r_ctx->start_time = jiffies; r_ctx->lba = blba; + r_ctx->private = bio; /* original bio */ /* Save the index for this bio's start. This is needed in case * we need to fill a partial read. @@ -411,17 +444,15 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size; rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size; - pblk_read_ppalist_rq(pblk, rqd, blba, &read_bitmap); + pblk_read_ppalist_rq(pblk, rqd, bio, blba, &read_bitmap); } else { - pblk_read_rq(pblk, rqd, blba, &read_bitmap); + pblk_read_rq(pblk, rqd, bio, blba, &read_bitmap); } - bio_get(bio); if (bitmap_full(&read_bitmap, nr_secs)) { - bio_endio(bio); atomic_inc(&pblk->inflight_io); __pblk_end_io_read(pblk, rqd, false); - return NVM_IO_OK; + return NVM_IO_DONE; } /* All sectors are to be read from the device */ @@ -429,20 +460,17 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) struct bio *int_bio = NULL; /* Clone read bio to deal with read errors internally */ - int_bio = bio_clone_fast(bio, GFP_KERNEL, pblk_bio_set); + int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set); if (!int_bio) { pr_err("pblk: could not clone read bio\n"); goto fail_end_io; } rqd->bio = int_bio; - r_ctx->private = bio; - ret = pblk_submit_read_io(pblk, rqd); - if (ret) { + if (pblk_submit_io(pblk, rqd)) { pr_err("pblk: read IO submission failed\n"); - if (int_bio) - bio_put(int_bio); + ret = NVM_IO_ERR; goto fail_end_io; } @@ -452,7 +480,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) /* The read bio request could be partially filled by the write buffer, * but there are some holes that need to be read from the drive. */ - return pblk_partial_read_bio(pblk, rqd, bio_init_idx, &read_bitmap); + return pblk_partial_read(pblk, rqd, bio, bio_init_idx, &read_bitmap); fail_rqd_free: pblk_free_rqd(pblk, rqd, PBLK_READ); @@ -585,6 +613,8 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) goto err_free_bio; } + pblk_read_check_rand(pblk, &rqd, gc_rq->lba_list, gc_rq->nr_secs); + atomic_dec(&pblk->inflight_io); if (rqd.error) { diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index 3e079c2..5983428 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -16,97 +16,6 @@ #include "pblk.h" -void pblk_submit_rec(struct work_struct *work) -{ - struct pblk_rec_ctx *recovery = - container_of(work, struct pblk_rec_ctx, ws_rec); - struct pblk *pblk = recovery->pblk; - struct nvm_rq *rqd = recovery->rqd; - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); - struct bio *bio; - unsigned int nr_rec_secs; - unsigned int pgs_read; - int ret; - - nr_rec_secs = bitmap_weight((unsigned long int *)&rqd->ppa_status, - NVM_MAX_VLBA); - - bio = bio_alloc(GFP_KERNEL, nr_rec_secs); - - bio->bi_iter.bi_sector = 0; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - rqd->bio = bio; - rqd->nr_ppas = nr_rec_secs; - - pgs_read = pblk_rb_read_to_bio_list(&pblk->rwb, bio, &recovery->failed, - nr_rec_secs); - if (pgs_read != nr_rec_secs) { - pr_err("pblk: could not read recovery entries\n"); - goto err; - } - - if (pblk_setup_w_rec_rq(pblk, rqd, c_ctx)) { - pr_err("pblk: could not setup recovery request\n"); - goto err; - } - -#ifdef CONFIG_NVM_DEBUG - atomic_long_add(nr_rec_secs, &pblk->recov_writes); -#endif - - ret = pblk_submit_io(pblk, rqd); - if (ret) { - pr_err("pblk: I/O submission failed: %d\n", ret); - goto err; - } - - mempool_free(recovery, pblk->rec_pool); - return; - -err: - bio_put(bio); - pblk_free_rqd(pblk, rqd, PBLK_WRITE); -} - -int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, - struct pblk_rec_ctx *recovery, u64 *comp_bits, - unsigned int comp) -{ - struct nvm_rq *rec_rqd; - struct pblk_c_ctx *rec_ctx; - int nr_entries = c_ctx->nr_valid + c_ctx->nr_padded; - - rec_rqd = pblk_alloc_rqd(pblk, PBLK_WRITE); - rec_ctx = nvm_rq_to_pdu(rec_rqd); - - /* Copy completion bitmap, but exclude the first X completed entries */ - bitmap_shift_right((unsigned long int *)&rec_rqd->ppa_status, - (unsigned long int *)comp_bits, - comp, NVM_MAX_VLBA); - - /* Save the context for the entries that need to be re-written and - * update current context with the completed entries. - */ - rec_ctx->sentry = pblk_rb_wrap_pos(&pblk->rwb, c_ctx->sentry + comp); - if (comp >= c_ctx->nr_valid) { - rec_ctx->nr_valid = 0; - rec_ctx->nr_padded = nr_entries - comp; - - c_ctx->nr_padded = comp - c_ctx->nr_valid; - } else { - rec_ctx->nr_valid = c_ctx->nr_valid - comp; - rec_ctx->nr_padded = c_ctx->nr_padded; - - c_ctx->nr_valid = comp; - c_ctx->nr_padded = 0; - } - - recovery->rqd = rec_rqd; - recovery->pblk = pblk; - - return 0; -} - int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta_buf) { u32 crc; @@ -865,18 +774,30 @@ static void pblk_recov_wa_counters(struct pblk *pblk, } static int pblk_line_was_written(struct pblk_line *line, - struct pblk_line_meta *lm) + struct pblk *pblk) { - int i; - int state_mask = NVM_CHK_ST_OFFLINE | NVM_CHK_ST_FREE; + struct pblk_line_meta *lm = &pblk->lm; + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct nvm_chk_meta *chunk; + struct ppa_addr bppa; + int smeta_blk; - for (i = 0; i < lm->blk_per_line; i++) { - if (!(line->chks[i].state & state_mask)) - return 1; - } + if (line->state == PBLK_LINESTATE_BAD) + return 0; - return 0; + smeta_blk = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line); + if (smeta_blk >= lm->blk_per_line) + return 0; + + bppa = pblk->luns[smeta_blk].bppa; + chunk = &line->chks[pblk_ppa_to_pos(geo, bppa)]; + + if (chunk->state & NVM_CHK_ST_FREE) + return 0; + + return 1; } struct pblk_line *pblk_recov_l2p(struct pblk *pblk) @@ -915,7 +836,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) line->lun_bitmap = ((void *)(smeta_buf)) + sizeof(struct line_smeta); - if (!pblk_line_was_written(line, lm)) + if (!pblk_line_was_written(line, pblk)) continue; /* Lines that cannot be read are assumed as not written here */ diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c index 883a711..6a0616a 100644 --- a/drivers/lightnvm/pblk-rl.c +++ b/drivers/lightnvm/pblk-rl.c @@ -73,6 +73,16 @@ void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries) pblk_rl_kick_u_timer(rl); } +void pblk_rl_werr_line_in(struct pblk_rl *rl) +{ + atomic_inc(&rl->werr_lines); +} + +void pblk_rl_werr_line_out(struct pblk_rl *rl) +{ + atomic_dec(&rl->werr_lines); +} + void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries) { atomic_add(nr_entries, &rl->rb_gc_cnt); @@ -99,11 +109,21 @@ static void __pblk_rl_update_rates(struct pblk_rl *rl, { struct pblk *pblk = container_of(rl, struct pblk, rl); int max = rl->rb_budget; + int werr_gc_needed = atomic_read(&rl->werr_lines); if (free_blocks >= rl->high) { - rl->rb_user_max = max; - rl->rb_gc_max = 0; - rl->rb_state = PBLK_RL_HIGH; + if (werr_gc_needed) { + /* Allocate a small budget for recovering + * lines with write errors + */ + rl->rb_gc_max = 1 << rl->rb_windows_pw; + rl->rb_user_max = max - rl->rb_gc_max; + rl->rb_state = PBLK_RL_WERR; + } else { + rl->rb_user_max = max; + rl->rb_gc_max = 0; + rl->rb_state = PBLK_RL_OFF; + } } else if (free_blocks < rl->high) { int shift = rl->high_pw - rl->rb_windows_pw; int user_windows = free_blocks >> shift; @@ -124,7 +144,7 @@ static void __pblk_rl_update_rates(struct pblk_rl *rl, rl->rb_state = PBLK_RL_LOW; } - if (rl->rb_state == (PBLK_RL_MID | PBLK_RL_LOW)) + if (rl->rb_state != PBLK_RL_OFF) pblk_gc_should_start(pblk); else pblk_gc_should_stop(pblk); @@ -221,6 +241,7 @@ void pblk_rl_init(struct pblk_rl *rl, int budget) atomic_set(&rl->rb_user_cnt, 0); atomic_set(&rl->rb_gc_cnt, 0); atomic_set(&rl->rb_space, -1); + atomic_set(&rl->werr_lines, 0); timer_setup(&rl->u_timer, pblk_rl_u_timer, 0); diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c index e61909a..88a0a7c 100644 --- a/drivers/lightnvm/pblk-sysfs.c +++ b/drivers/lightnvm/pblk-sysfs.c @@ -173,6 +173,8 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) int free_line_cnt = 0, closed_line_cnt = 0, emeta_line_cnt = 0; int d_line_cnt = 0, l_line_cnt = 0; int gc_full = 0, gc_high = 0, gc_mid = 0, gc_low = 0, gc_empty = 0; + int gc_werr = 0; + int bad = 0, cor = 0; int msecs = 0, cur_sec = 0, vsc = 0, sec_in_line = 0; int map_weight = 0, meta_weight = 0; @@ -237,6 +239,15 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) gc_empty++; } + list_for_each_entry(line, &l_mg->gc_werr_list, list) { + if (line->type == PBLK_LINETYPE_DATA) + d_line_cnt++; + else if (line->type == PBLK_LINETYPE_LOG) + l_line_cnt++; + closed_line_cnt++; + gc_werr++; + } + list_for_each_entry(line, &l_mg->bad_list, list) bad++; list_for_each_entry(line, &l_mg->corrupt_list, list) @@ -275,8 +286,8 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) l_mg->nr_lines); sz += snprintf(page + sz, PAGE_SIZE - sz, - "GC: full:%d, high:%d, mid:%d, low:%d, empty:%d, queue:%d\n", - gc_full, gc_high, gc_mid, gc_low, gc_empty, + "GC: full:%d, high:%d, mid:%d, low:%d, empty:%d, werr: %d, queue:%d\n", + gc_full, gc_high, gc_mid, gc_low, gc_empty, gc_werr, atomic_read(&pblk->gc.read_inflight_gc)); sz += snprintf(page + sz, PAGE_SIZE - sz, diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index 3e6f1eb..f353e52 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -103,68 +103,150 @@ retry: pblk_rb_sync_end(&pblk->rwb, &flags); } -/* When a write fails, we are not sure whether the block has grown bad or a page - * range is more susceptible to write errors. If a high number of pages fail, we - * assume that the block is bad and we mark it accordingly. In all cases, we - * remap and resubmit the failed entries as fast as possible; if a flush is - * waiting on a completion, the whole stack would stall otherwise. - */ -static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd) +/* Map remaining sectors in chunk, starting from ppa */ +static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa) { - void *comp_bits = &rqd->ppa_status; - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); - struct pblk_rec_ctx *recovery; - struct ppa_addr *ppa_list = rqd->ppa_list; - int nr_ppas = rqd->nr_ppas; - unsigned int c_entries; - int bit, ret; + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line *line; + struct ppa_addr map_ppa = *ppa; + u64 paddr; + int done = 0; - if (unlikely(nr_ppas == 1)) - ppa_list = &rqd->ppa_addr; + line = &pblk->lines[pblk_ppa_to_line(*ppa)]; + spin_lock(&line->lock); - recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC); + while (!done) { + paddr = pblk_dev_ppa_to_line_addr(pblk, map_ppa); - INIT_LIST_HEAD(&recovery->failed); + if (!test_and_set_bit(paddr, line->map_bitmap)) + line->left_msecs--; - bit = -1; - while ((bit = find_next_bit(comp_bits, nr_ppas, bit + 1)) < nr_ppas) { - struct pblk_rb_entry *entry; - struct ppa_addr ppa; + if (!test_and_set_bit(paddr, line->invalid_bitmap)) + le32_add_cpu(line->vsc, -1); - /* Logic error */ - if (bit > c_ctx->nr_valid) { - WARN_ONCE(1, "pblk: corrupted write request\n"); - mempool_free(recovery, pblk->rec_pool); - goto out; + if (geo->version == NVM_OCSSD_SPEC_12) { + map_ppa.ppa++; + if (map_ppa.g.pg == geo->num_pg) + done = 1; + } else { + map_ppa.m.sec++; + if (map_ppa.m.sec == geo->clba) + done = 1; } + } - ppa = ppa_list[bit]; - entry = pblk_rb_sync_scan_entry(&pblk->rwb, &ppa); - if (!entry) { - pr_err("pblk: could not scan entry on write failure\n"); - mempool_free(recovery, pblk->rec_pool); - goto out; - } + line->w_err_gc->has_write_err = 1; + spin_unlock(&line->lock); +} - /* The list is filled first and emptied afterwards. No need for - * protecting it with a lock +static void pblk_prepare_resubmit(struct pblk *pblk, unsigned int sentry, + unsigned int nr_entries) +{ + struct pblk_rb *rb = &pblk->rwb; + struct pblk_rb_entry *entry; + struct pblk_line *line; + struct pblk_w_ctx *w_ctx; + struct ppa_addr ppa_l2p; + int flags; + unsigned int pos, i; + + spin_lock(&pblk->trans_lock); + pos = sentry; + for (i = 0; i < nr_entries; i++) { + entry = &rb->entries[pos]; + w_ctx = &entry->w_ctx; + + /* Check if the lba has been overwritten */ + ppa_l2p = pblk_trans_map_get(pblk, w_ctx->lba); + if (!pblk_ppa_comp(ppa_l2p, entry->cacheline)) + w_ctx->lba = ADDR_EMPTY; + + /* Mark up the entry as submittable again */ + flags = READ_ONCE(w_ctx->flags); + flags |= PBLK_WRITTEN_DATA; + /* Release flags on write context. Protect from writes */ + smp_store_release(&w_ctx->flags, flags); + + /* Decrese the reference count to the line as we will + * re-map these entries */ - list_add_tail(&entry->index, &recovery->failed); + line = &pblk->lines[pblk_ppa_to_line(w_ctx->ppa)]; + kref_put(&line->ref, pblk_line_put); + + pos = (pos + 1) & (rb->nr_entries - 1); } + spin_unlock(&pblk->trans_lock); +} - c_entries = find_first_bit(comp_bits, nr_ppas); - ret = pblk_recov_setup_rq(pblk, c_ctx, recovery, comp_bits, c_entries); - if (ret) { - pr_err("pblk: could not recover from write failure\n"); - mempool_free(recovery, pblk->rec_pool); - goto out; +static void pblk_queue_resubmit(struct pblk *pblk, struct pblk_c_ctx *c_ctx) +{ + struct pblk_c_ctx *r_ctx; + + r_ctx = kzalloc(sizeof(struct pblk_c_ctx), GFP_KERNEL); + if (!r_ctx) + return; + + r_ctx->lun_bitmap = NULL; + r_ctx->sentry = c_ctx->sentry; + r_ctx->nr_valid = c_ctx->nr_valid; + r_ctx->nr_padded = c_ctx->nr_padded; + + spin_lock(&pblk->resubmit_lock); + list_add_tail(&r_ctx->list, &pblk->resubmit_list); + spin_unlock(&pblk->resubmit_lock); + +#ifdef CONFIG_NVM_DEBUG + atomic_long_add(c_ctx->nr_valid, &pblk->recov_writes); +#endif +} + +static void pblk_submit_rec(struct work_struct *work) +{ + struct pblk_rec_ctx *recovery = + container_of(work, struct pblk_rec_ctx, ws_rec); + struct pblk *pblk = recovery->pblk; + struct nvm_rq *rqd = recovery->rqd; + struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); + struct ppa_addr *ppa_list; + + pblk_log_write_err(pblk, rqd); + + if (rqd->nr_ppas == 1) + ppa_list = &rqd->ppa_addr; + else + ppa_list = rqd->ppa_list; + + pblk_map_remaining(pblk, ppa_list); + pblk_queue_resubmit(pblk, c_ctx); + + pblk_up_rq(pblk, rqd->ppa_list, rqd->nr_ppas, c_ctx->lun_bitmap); + if (c_ctx->nr_padded) + pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid, + c_ctx->nr_padded); + bio_put(rqd->bio); + pblk_free_rqd(pblk, rqd, PBLK_WRITE); + mempool_free(recovery, &pblk->rec_pool); + + atomic_dec(&pblk->inflight_io); +} + + +static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd) +{ + struct pblk_rec_ctx *recovery; + + recovery = mempool_alloc(&pblk->rec_pool, GFP_ATOMIC); + if (!recovery) { + pr_err("pblk: could not allocate recovery work\n"); + return; } + recovery->pblk = pblk; + recovery->rqd = rqd; + INIT_WORK(&recovery->ws_rec, pblk_submit_rec); queue_work(pblk->close_wq, &recovery->ws_rec); - -out: - pblk_complete_write(pblk, rqd, c_ctx); } static void pblk_end_io_write(struct nvm_rq *rqd) @@ -173,8 +255,8 @@ static void pblk_end_io_write(struct nvm_rq *rqd) struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); if (rqd->error) { - pblk_log_write_err(pblk, rqd); - return pblk_end_w_fail(pblk, rqd); + pblk_end_w_fail(pblk, rqd); + return; } #ifdef CONFIG_NVM_DEBUG else @@ -198,6 +280,7 @@ static void pblk_end_io_write_meta(struct nvm_rq *rqd) if (rqd->error) { pblk_log_write_err(pblk, rqd); pr_err("pblk: metadata I/O failed. Line %d\n", line->id); + line->w_err_gc->has_write_err = 1; } sync = atomic_add_return(rqd->nr_ppas, &emeta->sync); @@ -266,31 +349,6 @@ static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd, return 0; } -int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd, - struct pblk_c_ctx *c_ctx) -{ - struct pblk_line_meta *lm = &pblk->lm; - unsigned long *lun_bitmap; - int ret; - - lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL); - if (!lun_bitmap) - return -ENOMEM; - - c_ctx->lun_bitmap = lun_bitmap; - - ret = pblk_alloc_w_rq(pblk, rqd, rqd->nr_ppas, pblk_end_io_write); - if (ret) - return ret; - - pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, c_ctx->nr_valid, 0); - - rqd->ppa_status = (u64)0; - rqd->flags = pblk_set_progr_mode(pblk, PBLK_WRITE); - - return ret; -} - static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail, unsigned int secs_to_flush) { @@ -339,6 +397,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len, l_mg->emeta_alloc_type, GFP_KERNEL); if (IS_ERR(bio)) { + pr_err("pblk: failed to map emeta io"); ret = PTR_ERR(bio); goto fail_free_rqd; } @@ -515,26 +574,54 @@ static int pblk_submit_write(struct pblk *pblk) unsigned int secs_avail, secs_to_sync, secs_to_com; unsigned int secs_to_flush; unsigned long pos; + unsigned int resubmit; - /* If there are no sectors in the cache, flushes (bios without data) - * will be cleared on the cache threads - */ - secs_avail = pblk_rb_read_count(&pblk->rwb); - if (!secs_avail) - return 1; - - secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb); - if (!secs_to_flush && secs_avail < pblk->min_write_pgs) - return 1; - - secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush); - if (secs_to_sync > pblk->max_write_pgs) { - pr_err("pblk: bad buffer sync calculation\n"); - return 1; - } + spin_lock(&pblk->resubmit_lock); + resubmit = !list_empty(&pblk->resubmit_list); + spin_unlock(&pblk->resubmit_lock); + + /* Resubmit failed writes first */ + if (resubmit) { + struct pblk_c_ctx *r_ctx; + + spin_lock(&pblk->resubmit_lock); + r_ctx = list_first_entry(&pblk->resubmit_list, + struct pblk_c_ctx, list); + list_del(&r_ctx->list); + spin_unlock(&pblk->resubmit_lock); + + secs_avail = r_ctx->nr_valid; + pos = r_ctx->sentry; + + pblk_prepare_resubmit(pblk, pos, secs_avail); + secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, + secs_avail); - secs_to_com = (secs_to_sync > secs_avail) ? secs_avail : secs_to_sync; - pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com); + kfree(r_ctx); + } else { + /* If there are no sectors in the cache, + * flushes (bios without data) will be cleared on + * the cache threads + */ + secs_avail = pblk_rb_read_count(&pblk->rwb); + if (!secs_avail) + return 1; + + secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb); + if (!secs_to_flush && secs_avail < pblk->min_write_pgs) + return 1; + + secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, + secs_to_flush); + if (secs_to_sync > pblk->max_write_pgs) { + pr_err("pblk: bad buffer sync calculation\n"); + return 1; + } + + secs_to_com = (secs_to_sync > secs_avail) ? + secs_avail : secs_to_sync; + pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com); + } bio = bio_alloc(GFP_KERNEL, secs_to_sync); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 9c682ac..34cc1d6 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -89,12 +89,14 @@ struct pblk_sec_meta { /* The number of GC lists and the rate-limiter states go together. This way the * rate-limiter can dictate how much GC is needed based on resource utilization. */ -#define PBLK_GC_NR_LISTS 3 +#define PBLK_GC_NR_LISTS 4 enum { - PBLK_RL_HIGH = 1, - PBLK_RL_MID = 2, - PBLK_RL_LOW = 3, + PBLK_RL_OFF = 0, + PBLK_RL_WERR = 1, + PBLK_RL_HIGH = 2, + PBLK_RL_MID = 3, + PBLK_RL_LOW = 4 }; #define pblk_dma_meta_size (sizeof(struct pblk_sec_meta) * PBLK_MAX_REQ_ADDRS) @@ -128,7 +130,6 @@ struct pblk_pad_rq { struct pblk_rec_ctx { struct pblk *pblk; struct nvm_rq *rqd; - struct list_head failed; struct work_struct ws_rec; }; @@ -279,6 +280,8 @@ struct pblk_rl { int rb_user_active; int rb_gc_active; + atomic_t werr_lines; /* Number of write error lines that needs gc */ + struct timer_list u_timer; unsigned long long nr_secs; @@ -312,6 +315,7 @@ enum { PBLK_LINEGC_MID = 23, PBLK_LINEGC_HIGH = 24, PBLK_LINEGC_FULL = 25, + PBLK_LINEGC_WERR = 26 }; #define PBLK_MAGIC 0x70626c6b /*pblk*/ @@ -413,6 +417,11 @@ struct pblk_smeta { struct line_smeta *buf; /* smeta buffer in persistent format */ }; +struct pblk_w_err_gc { + int has_write_err; + __le64 *lba_list; +}; + struct pblk_line { struct pblk *pblk; unsigned int id; /* Line number corresponds to the @@ -458,6 +467,8 @@ struct pblk_line { struct kref ref; /* Write buffer L2P references */ + struct pblk_w_err_gc *w_err_gc; /* Write error gc recovery metadata */ + spinlock_t lock; /* Necessary for invalid_bitmap only */ }; @@ -489,6 +500,8 @@ struct pblk_line_mgmt { struct list_head gc_mid_list; /* Full lines ready to GC, mid isc */ struct list_head gc_low_list; /* Full lines ready to GC, low isc */ + struct list_head gc_werr_list; /* Write err recovery list */ + struct list_head gc_full_list; /* Full lines ready to GC, no valid */ struct list_head gc_empty_list; /* Full lines close, all valid */ @@ -664,12 +677,15 @@ struct pblk { struct list_head compl_list; - mempool_t *page_bio_pool; - mempool_t *gen_ws_pool; - mempool_t *rec_pool; - mempool_t *r_rq_pool; - mempool_t *w_rq_pool; - mempool_t *e_rq_pool; + spinlock_t resubmit_lock; /* Resubmit list lock */ + struct list_head resubmit_list; /* Resubmit list for failed writes*/ + + mempool_t page_bio_pool; + mempool_t gen_ws_pool; + mempool_t rec_pool; + mempool_t r_rq_pool; + mempool_t w_rq_pool; + mempool_t e_rq_pool; struct workqueue_struct *close_wq; struct workqueue_struct *bb_wq; @@ -713,9 +729,6 @@ void pblk_rb_sync_l2p(struct pblk_rb *rb); unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, unsigned int pos, unsigned int nr_entries, unsigned int count); -unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio, - struct list_head *list, - unsigned int max); int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba, struct ppa_addr ppa, int bio_iter, bool advanced_bio); unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int entries); @@ -766,11 +779,13 @@ struct pblk_line *pblk_line_get_data(struct pblk *pblk); struct pblk_line *pblk_line_get_erase(struct pblk *pblk); int pblk_line_erase(struct pblk *pblk, struct pblk_line *line); int pblk_line_is_full(struct pblk_line *line); -void pblk_line_free(struct pblk *pblk, struct pblk_line *line); +void pblk_line_free(struct pblk_line *line); void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line); void pblk_line_close(struct pblk *pblk, struct pblk_line *line); void pblk_line_close_ws(struct work_struct *work); void pblk_pipeline_stop(struct pblk *pblk); +void __pblk_pipeline_stop(struct pblk *pblk); +void __pblk_pipeline_flush(struct pblk *pblk); void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, void (*work)(struct work_struct *), gfp_t gfp_mask, struct workqueue_struct *wq); @@ -794,7 +809,6 @@ void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas); void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, unsigned long *lun_bitmap); -void pblk_end_io_sync(struct nvm_rq *rqd); int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, int nr_pages); void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off, @@ -837,23 +851,20 @@ void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, int pblk_write_ts(void *data); void pblk_write_timer_fn(struct timer_list *t); void pblk_write_should_kick(struct pblk *pblk); +void pblk_write_kick(struct pblk *pblk); /* * pblk read path */ -extern struct bio_set *pblk_bio_set; +extern struct bio_set pblk_bio_set; int pblk_submit_read(struct pblk *pblk, struct bio *bio); int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq); /* * pblk recovery */ -void pblk_submit_rec(struct work_struct *work); struct pblk_line *pblk_recov_l2p(struct pblk *pblk); int pblk_recov_pad(struct pblk *pblk); int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta); -int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, - struct pblk_rec_ctx *recovery, u64 *comp_bits, - unsigned int comp); /* * pblk gc @@ -864,7 +875,7 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, #define PBLK_GC_RSV_LINE 1 /* Reserved lines for GC */ int pblk_gc_init(struct pblk *pblk); -void pblk_gc_exit(struct pblk *pblk); +void pblk_gc_exit(struct pblk *pblk, bool graceful); void pblk_gc_should_start(struct pblk *pblk); void pblk_gc_should_stop(struct pblk *pblk); void pblk_gc_should_kick(struct pblk *pblk); @@ -894,6 +905,9 @@ void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line, bool used); int pblk_rl_is_limit(struct pblk_rl *rl); +void pblk_rl_werr_line_in(struct pblk_rl *rl); +void pblk_rl_werr_line_out(struct pblk_rl *rl); + /* * pblk sysfs */ diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 3a0cfb2..d6bf294f 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -269,7 +269,7 @@ struct bcache_device { atomic_t *stripe_sectors_dirty; unsigned long *full_dirty_stripes; - struct bio_set *bio_split; + struct bio_set bio_split; unsigned data_csum:1; @@ -345,6 +345,7 @@ struct cached_dev { struct keybuf writeback_keys; + struct task_struct *status_update_thread; /* * Order the write-half of writeback operations strongly in dispatch * order. (Maintain LBA order; don't allow reads completing out of @@ -392,6 +393,7 @@ struct cached_dev { #define DEFAULT_CACHED_DEV_ERROR_LIMIT 64 atomic_t io_errors; unsigned error_limit; + unsigned offline_seconds; char backing_dev_name[BDEVNAME_SIZE]; }; @@ -528,9 +530,9 @@ struct cache_set { struct closure sb_write; struct semaphore sb_write_mutex; - mempool_t *search; - mempool_t *bio_meta; - struct bio_set *bio_split; + mempool_t search; + mempool_t bio_meta; + struct bio_set bio_split; /* For the btree cache */ struct shrinker shrink; @@ -655,7 +657,7 @@ struct cache_set { * A btree node on disk could have too many bsets for an iterator to fit * on the stack - have to dynamically allocate them */ - mempool_t *fill_iter; + mempool_t fill_iter; struct bset_sort_state sort; @@ -956,8 +958,6 @@ void bch_prio_write(struct cache *); void bch_write_bdev_super(struct cached_dev *, struct closure *); extern struct workqueue_struct *bcache_wq; -extern const char * const bch_cache_modes[]; -extern const char * const bch_stop_on_failure_modes[]; extern struct mutex bch_register_lock; extern struct list_head bch_cache_sets; diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 579c696..f3403b4 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -1118,8 +1118,7 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, void bch_bset_sort_state_free(struct bset_sort_state *state) { - if (state->pool) - mempool_destroy(state->pool); + mempool_exit(&state->pool); } int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order) @@ -1129,11 +1128,7 @@ int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order) state->page_order = page_order; state->crit_factor = int_sqrt(1 << page_order); - state->pool = mempool_create_page_pool(1, page_order); - if (!state->pool) - return -ENOMEM; - - return 0; + return mempool_init_page_pool(&state->pool, 1, page_order); } EXPORT_SYMBOL(bch_bset_sort_state_init); @@ -1191,7 +1186,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter, BUG_ON(order > state->page_order); - outp = mempool_alloc(state->pool, GFP_NOIO); + outp = mempool_alloc(&state->pool, GFP_NOIO); out = page_address(outp); used_mempool = true; order = state->page_order; @@ -1220,7 +1215,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter, } if (used_mempool) - mempool_free(virt_to_page(out), state->pool); + mempool_free(virt_to_page(out), &state->pool); else free_pages((unsigned long) out, order); diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 0c24280..b867f22 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -347,7 +347,7 @@ static inline struct bkey *bch_bset_search(struct btree_keys *b, /* Sorting */ struct bset_sort_state { - mempool_t *pool; + mempool_t pool; unsigned page_order; unsigned crit_factor; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 17936b2..2a0968c0 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -204,7 +204,7 @@ void bch_btree_node_read_done(struct btree *b) struct bset *i = btree_bset_first(b); struct btree_iter *iter; - iter = mempool_alloc(b->c->fill_iter, GFP_NOIO); + iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO); iter->size = b->c->sb.bucket_size / b->c->sb.block_size; iter->used = 0; @@ -271,7 +271,7 @@ void bch_btree_node_read_done(struct btree *b) bch_bset_init_next(&b->keys, write_block(b), bset_magic(&b->c->sb)); out: - mempool_free(iter, b->c->fill_iter); + mempool_free(iter, &b->c->fill_iter); return; err: set_btree_node_io_error(b); diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 2ddf851..9612873 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -17,12 +17,12 @@ void bch_bbio_free(struct bio *bio, struct cache_set *c) { struct bbio *b = container_of(bio, struct bbio, bio); - mempool_free(b, c->bio_meta); + mempool_free(b, &c->bio_meta); } struct bio *bch_bbio_alloc(struct cache_set *c) { - struct bbio *b = mempool_alloc(c->bio_meta, GFP_NOIO); + struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO); struct bio *bio = &b->bio; bio_init(bio, bio->bi_inline_vecs, bucket_pages(c)); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 8e3e865..ae67f5f 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -213,7 +213,7 @@ static void bch_data_insert_start(struct closure *cl) do { unsigned i; struct bkey *k; - struct bio_set *split = op->c->bio_split; + struct bio_set *split = &op->c->bio_split; /* 1 for the device pointer and 1 for the chksum */ if (bch_keylist_realloc(&op->insert_keys, @@ -548,7 +548,7 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) n = bio_next_split(bio, min_t(uint64_t, INT_MAX, KEY_OFFSET(k) - bio->bi_iter.bi_sector), - GFP_NOIO, s->d->bio_split); + GFP_NOIO, &s->d->bio_split); bio_key = &container_of(n, struct bbio, bio)->key; bch_bkey_copy_single_ptr(bio_key, k, ptr); @@ -707,7 +707,7 @@ static void search_free(struct closure *cl) bio_complete(s); closure_debug_destroy(cl); - mempool_free(s, s->d->c->search); + mempool_free(s, &s->d->c->search); } static inline struct search *search_alloc(struct bio *bio, @@ -715,7 +715,7 @@ static inline struct search *search_alloc(struct bio *bio, { struct search *s; - s = mempool_alloc(d->c->search, GFP_NOIO); + s = mempool_alloc(&d->c->search, GFP_NOIO); closure_init(&s->cl, NULL); do_bio_hook(s, bio, request_endio); @@ -864,7 +864,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, s->cache_missed = 1; if (s->cache_miss || s->iop.bypass) { - miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split); + miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split); ret = miss == bio ? MAP_DONE : MAP_CONTINUE; goto out_submit; } @@ -887,14 +887,14 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, s->iop.replace = true; - miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split); + miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split); /* btree_search_recurse()'s btree iterator is no good anymore */ ret = miss == bio ? MAP_DONE : -EINTR; cache_bio = bio_alloc_bioset(GFP_NOWAIT, DIV_ROUND_UP(s->insert_bio_sectors, PAGE_SECTORS), - dc->disk.bio_split); + &dc->disk.bio_split); if (!cache_bio) goto out_submit; @@ -1008,7 +1008,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) struct bio *flush; flush = bio_alloc_bioset(GFP_NOIO, 0, - dc->disk.bio_split); + &dc->disk.bio_split); if (!flush) { s->iop.status = BLK_STS_RESOURCE; goto insert_data; @@ -1021,7 +1021,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) closure_bio_submit(s->iop.c, flush, cl); } } else { - s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split); + s->iop.bio = bio_clone_fast(bio, GFP_NOIO, &dc->disk.bio_split); /* I/O request sent to backing device */ bio->bi_end_io = backing_request_endio; closure_bio_submit(s->iop.c, bio, cl); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 3dea06b..a31e55b 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -37,24 +37,6 @@ static const char invalid_uuid[] = { 0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99 }; -/* Default is -1; we skip past it for struct cached_dev's cache mode */ -const char * const bch_cache_modes[] = { - "default", - "writethrough", - "writeback", - "writearound", - "none", - NULL -}; - -/* Default is -1; we skip past it for stop_when_cache_set_failed */ -const char * const bch_stop_on_failure_modes[] = { - "default", - "auto", - "always", - NULL -}; - static struct kobject *bcache_kobj; struct mutex bch_register_lock; LIST_HEAD(bch_cache_sets); @@ -654,6 +636,11 @@ static int ioctl_dev(struct block_device *b, fmode_t mode, unsigned int cmd, unsigned long arg) { struct bcache_device *d = b->bd_disk->private_data; + struct cached_dev *dc = container_of(d, struct cached_dev, disk); + + if (dc->io_disable) + return -EIO; + return d->ioctl(d, mode, cmd, arg); } @@ -766,8 +753,7 @@ static void bcache_device_free(struct bcache_device *d) put_disk(d->disk); } - if (d->bio_split) - bioset_free(d->bio_split); + bioset_exit(&d->bio_split); kvfree(d->full_dirty_stripes); kvfree(d->stripe_sectors_dirty); @@ -809,9 +795,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, if (idx < 0) return idx; - if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio), - BIOSET_NEED_BVECS | - BIOSET_NEED_RESCUER)) || + if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio), + BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) || !(d->disk = alloc_disk(BCACHE_MINORS))) { ida_simple_remove(&bcache_device_idx, idx); return -ENOMEM; @@ -864,6 +849,44 @@ static void calc_cached_dev_sectors(struct cache_set *c) c->cached_dev_sectors = sectors; } +#define BACKING_DEV_OFFLINE_TIMEOUT 5 +static int cached_dev_status_update(void *arg) +{ + struct cached_dev *dc = arg; + struct request_queue *q; + + /* + * If this delayed worker is stopping outside, directly quit here. + * dc->io_disable might be set via sysfs interface, so check it + * here too. + */ + while (!kthread_should_stop() && !dc->io_disable) { + q = bdev_get_queue(dc->bdev); + if (blk_queue_dying(q)) + dc->offline_seconds++; + else + dc->offline_seconds = 0; + + if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) { + pr_err("%s: device offline for %d seconds", + dc->backing_dev_name, + BACKING_DEV_OFFLINE_TIMEOUT); + pr_err("%s: disable I/O request due to backing " + "device offline", dc->disk.name); + dc->io_disable = true; + /* let others know earlier that io_disable is true */ + smp_mb(); + bcache_device_stop(&dc->disk); + break; + } + schedule_timeout_interruptible(HZ); + } + + wait_for_kthread_stop(); + return 0; +} + + void bch_cached_dev_run(struct cached_dev *dc) { struct bcache_device *d = &dc->disk; @@ -906,6 +929,14 @@ void bch_cached_dev_run(struct cached_dev *dc) if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) pr_debug("error creating sysfs link"); + + dc->status_update_thread = kthread_run(cached_dev_status_update, + dc, "bcache_status_update"); + if (IS_ERR(dc->status_update_thread)) { + pr_warn("failed to create bcache_status_update kthread, " + "continue to run without monitoring backing " + "device status"); + } } /* @@ -1139,6 +1170,8 @@ static void cached_dev_free(struct closure *cl) kthread_stop(dc->writeback_thread); if (dc->writeback_write_wq) destroy_workqueue(dc->writeback_write_wq); + if (!IS_ERR_OR_NULL(dc->status_update_thread)) + kthread_stop(dc->status_update_thread); if (atomic_read(&dc->running)) bd_unlink_disk_holder(dc->bdev, dc->disk.disk); @@ -1465,14 +1498,10 @@ static void cache_set_free(struct closure *cl) if (c->moving_gc_wq) destroy_workqueue(c->moving_gc_wq); - if (c->bio_split) - bioset_free(c->bio_split); - if (c->fill_iter) - mempool_destroy(c->fill_iter); - if (c->bio_meta) - mempool_destroy(c->bio_meta); - if (c->search) - mempool_destroy(c->search); + bioset_exit(&c->bio_split); + mempool_exit(&c->fill_iter); + mempool_exit(&c->bio_meta); + mempool_exit(&c->search); kfree(c->devices); mutex_lock(&bch_register_lock); @@ -1683,21 +1712,17 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) INIT_LIST_HEAD(&c->btree_cache_freed); INIT_LIST_HEAD(&c->data_buckets); - c->search = mempool_create_slab_pool(32, bch_search_cache); - if (!c->search) - goto err; - iter_size = (sb->bucket_size / sb->block_size + 1) * sizeof(struct btree_iter_set); if (!(c->devices = kzalloc(c->nr_uuids * sizeof(void *), GFP_KERNEL)) || - !(c->bio_meta = mempool_create_kmalloc_pool(2, - sizeof(struct bbio) + sizeof(struct bio_vec) * - bucket_pages(c))) || - !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || - !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio), - BIOSET_NEED_BVECS | - BIOSET_NEED_RESCUER)) || + mempool_init_slab_pool(&c->search, 32, bch_search_cache) || + mempool_init_kmalloc_pool(&c->bio_meta, 2, + sizeof(struct bbio) + sizeof(struct bio_vec) * + bucket_pages(c)) || + mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || + bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio), + BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) || !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || !(c->moving_gc_wq = alloc_workqueue("bcache_gc", WQ_MEM_RECLAIM, 0)) || diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index dfeef58..8ccbc8f3 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -16,6 +16,22 @@ #include <linux/sort.h> #include <linux/sched/clock.h> +/* Default is -1; we skip past it for struct cached_dev's cache mode */ +static const char * const bch_cache_modes[] = { + "writethrough", + "writeback", + "writearound", + "none", + NULL +}; + +/* Default is -1; we skip past it for stop_when_cache_set_failed */ +static const char * const bch_stop_on_failure_modes[] = { + "auto", + "always", + NULL +}; + static const char * const cache_replacement_policies[] = { "lru", "fifo", @@ -114,6 +130,20 @@ rw_attribute(btree_shrinker_disabled); rw_attribute(copy_gc_enabled); rw_attribute(size); +static ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], + size_t selected) +{ + char *out = buf; + size_t i; + + for (i = 0; list[i]; i++) + out += snprintf(out, buf + size - out, + i == selected ? "[%s] " : "%s ", list[i]); + + out[-1] = '\n'; + return out - buf; +} + SHOW(__bch_cached_dev) { struct cached_dev *dc = container_of(kobj, struct cached_dev, @@ -124,12 +154,12 @@ SHOW(__bch_cached_dev) if (attr == &sysfs_cache_mode) return bch_snprint_string_list(buf, PAGE_SIZE, - bch_cache_modes + 1, + bch_cache_modes, BDEV_CACHE_MODE(&dc->sb)); if (attr == &sysfs_stop_when_cache_set_failed) return bch_snprint_string_list(buf, PAGE_SIZE, - bch_stop_on_failure_modes + 1, + bch_stop_on_failure_modes, dc->stop_when_cache_set_failed); @@ -253,8 +283,7 @@ STORE(__cached_dev) bch_cached_dev_run(dc); if (attr == &sysfs_cache_mode) { - v = bch_read_string_list(buf, bch_cache_modes + 1); - + v = __sysfs_match_string(bch_cache_modes, -1, buf); if (v < 0) return v; @@ -265,8 +294,7 @@ STORE(__cached_dev) } if (attr == &sysfs_stop_when_cache_set_failed) { - v = bch_read_string_list(buf, bch_stop_on_failure_modes + 1); - + v = __sysfs_match_string(bch_stop_on_failure_modes, -1, buf); if (v < 0) return v; @@ -635,6 +663,7 @@ SHOW_LOCKED(bch_cache_set) STORE(__bch_cache_set) { struct cache_set *c = container_of(kobj, struct cache_set, kobj); + ssize_t v; if (attr == &sysfs_unregister) bch_cache_set_unregister(c); @@ -698,8 +727,7 @@ STORE(__bch_cache_set) c->congested_write_threshold_us); if (attr == &sysfs_errors) { - ssize_t v = bch_read_string_list(buf, error_actions); - + v = __sysfs_match_string(error_actions, -1, buf); if (v < 0) return v; @@ -714,8 +742,7 @@ STORE(__bch_cache_set) c->error_decay = strtoul_or_return(buf) / 88; if (attr == &sysfs_io_disable) { - int v = strtoul_or_return(buf); - + v = strtoul_or_return(buf); if (v) { if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags)) @@ -929,6 +956,7 @@ SHOW_LOCKED(bch_cache) STORE(__bch_cache) { struct cache *ca = container_of(kobj, struct cache, kobj); + ssize_t v; if (attr == &sysfs_discard) { bool v = strtoul_or_return(buf); @@ -943,8 +971,7 @@ STORE(__bch_cache) } if (attr == &sysfs_cache_replacement_policy) { - ssize_t v = bch_read_string_list(buf, cache_replacement_policies); - + v = __sysfs_match_string(cache_replacement_policies, -1, buf); if (v < 0) return v; diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 74febd5..fc479b0 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -120,41 +120,6 @@ ssize_t bch_hprint(char *buf, int64_t v) return sprintf(buf, "%llu.%i%c", q, t * 10 / 1024, units[u]); } -ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], - size_t selected) -{ - char *out = buf; - size_t i; - - for (i = 0; list[i]; i++) - out += snprintf(out, buf + size - out, - i == selected ? "[%s] " : "%s ", list[i]); - - out[-1] = '\n'; - return out - buf; -} - -ssize_t bch_read_string_list(const char *buf, const char * const list[]) -{ - size_t i; - char *s, *d = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL); - if (!d) - return -ENOMEM; - - s = strim(d); - - for (i = 0; list[i]; i++) - if (!strcmp(list[i], s)) - break; - - kfree(d); - - if (!list[i]) - return -EINVAL; - - return i; -} - bool bch_is_zero(const char *p, size_t n) { size_t i; diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 2680245..cced87f 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -365,11 +365,6 @@ ssize_t bch_hprint(char *buf, int64_t v); bool bch_is_zero(const char *p, size_t n); int bch_parse_uuid(const char *s, char *uuid); -ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], - size_t selected); - -ssize_t bch_read_string_list(const char *buf, const char * const list[]); - struct time_stats { spinlock_t lock; /* diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c index 874841f..8e33a38 100644 --- a/drivers/md/dm-bio-prison-v1.c +++ b/drivers/md/dm-bio-prison-v1.c @@ -19,7 +19,7 @@ struct dm_bio_prison { spinlock_t lock; - mempool_t *cell_pool; + mempool_t cell_pool; struct rb_root cells; }; @@ -34,14 +34,15 @@ static struct kmem_cache *_cell_cache; struct dm_bio_prison *dm_bio_prison_create(void) { struct dm_bio_prison *prison = kmalloc(sizeof(*prison), GFP_KERNEL); + int ret; if (!prison) return NULL; spin_lock_init(&prison->lock); - prison->cell_pool = mempool_create_slab_pool(MIN_CELLS, _cell_cache); - if (!prison->cell_pool) { + ret = mempool_init_slab_pool(&prison->cell_pool, MIN_CELLS, _cell_cache); + if (ret) { kfree(prison); return NULL; } @@ -54,21 +55,21 @@ EXPORT_SYMBOL_GPL(dm_bio_prison_create); void dm_bio_prison_destroy(struct dm_bio_prison *prison) { - mempool_destroy(prison->cell_pool); + mempool_exit(&prison->cell_pool); kfree(prison); } EXPORT_SYMBOL_GPL(dm_bio_prison_destroy); struct dm_bio_prison_cell *dm_bio_prison_alloc_cell(struct dm_bio_prison *prison, gfp_t gfp) { - return mempool_alloc(prison->cell_pool, gfp); + return mempool_alloc(&prison->cell_pool, gfp); } EXPORT_SYMBOL_GPL(dm_bio_prison_alloc_cell); void dm_bio_prison_free_cell(struct dm_bio_prison *prison, struct dm_bio_prison_cell *cell) { - mempool_free(cell, prison->cell_pool); + mempool_free(cell, &prison->cell_pool); } EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell); diff --git a/drivers/md/dm-bio-prison-v2.c b/drivers/md/dm-bio-prison-v2.c index 8ce3a1a..601b156 100644 --- a/drivers/md/dm-bio-prison-v2.c +++ b/drivers/md/dm-bio-prison-v2.c @@ -21,7 +21,7 @@ struct dm_bio_prison_v2 { struct workqueue_struct *wq; spinlock_t lock; - mempool_t *cell_pool; + mempool_t cell_pool; struct rb_root cells; }; @@ -36,6 +36,7 @@ static struct kmem_cache *_cell_cache; struct dm_bio_prison_v2 *dm_bio_prison_create_v2(struct workqueue_struct *wq) { struct dm_bio_prison_v2 *prison = kmalloc(sizeof(*prison), GFP_KERNEL); + int ret; if (!prison) return NULL; @@ -43,8 +44,8 @@ struct dm_bio_prison_v2 *dm_bio_prison_create_v2(struct workqueue_struct *wq) prison->wq = wq; spin_lock_init(&prison->lock); - prison->cell_pool = mempool_create_slab_pool(MIN_CELLS, _cell_cache); - if (!prison->cell_pool) { + ret = mempool_init_slab_pool(&prison->cell_pool, MIN_CELLS, _cell_cache); + if (ret) { kfree(prison); return NULL; } @@ -57,21 +58,21 @@ EXPORT_SYMBOL_GPL(dm_bio_prison_create_v2); void dm_bio_prison_destroy_v2(struct dm_bio_prison_v2 *prison) { - mempool_destroy(prison->cell_pool); + mempool_exit(&prison->cell_pool); kfree(prison); } EXPORT_SYMBOL_GPL(dm_bio_prison_destroy_v2); struct dm_bio_prison_cell_v2 *dm_bio_prison_alloc_cell_v2(struct dm_bio_prison_v2 *prison, gfp_t gfp) { - return mempool_alloc(prison->cell_pool, gfp); + return mempool_alloc(&prison->cell_pool, gfp); } EXPORT_SYMBOL_GPL(dm_bio_prison_alloc_cell_v2); void dm_bio_prison_free_cell_v2(struct dm_bio_prison_v2 *prison, struct dm_bio_prison_cell_v2 *cell) { - mempool_free(cell, prison->cell_pool); + mempool_free(cell, &prison->cell_pool); } EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell_v2); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index da20863..001c712 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -447,9 +447,9 @@ struct cache { struct work_struct migration_worker; struct delayed_work waker; struct dm_bio_prison_v2 *prison; - struct bio_set *bs; + struct bio_set bs; - mempool_t *migration_pool; + mempool_t migration_pool; struct dm_cache_policy *policy; unsigned policy_nr_args; @@ -550,7 +550,7 @@ static struct dm_cache_migration *alloc_migration(struct cache *cache) { struct dm_cache_migration *mg; - mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT); + mg = mempool_alloc(&cache->migration_pool, GFP_NOWAIT); if (!mg) return NULL; @@ -569,7 +569,7 @@ static void free_migration(struct dm_cache_migration *mg) if (atomic_dec_and_test(&cache->nr_allocated_migrations)) wake_up(&cache->migration_wait); - mempool_free(mg, cache->migration_pool); + mempool_free(mg, &cache->migration_pool); } /*----------------------------------------------------------------*/ @@ -924,7 +924,7 @@ static void issue_op(struct bio *bio, void *context) static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio, dm_oblock_t oblock, dm_cblock_t cblock) { - struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, cache->bs); + struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, &cache->bs); BUG_ON(!origin_bio); @@ -2011,7 +2011,7 @@ static void destroy(struct cache *cache) { unsigned i; - mempool_destroy(cache->migration_pool); + mempool_exit(&cache->migration_pool); if (cache->prison) dm_bio_prison_destroy_v2(cache->prison); @@ -2047,8 +2047,7 @@ static void destroy(struct cache *cache) kfree(cache->ctr_args[i]); kfree(cache->ctr_args); - if (cache->bs) - bioset_free(cache->bs); + bioset_exit(&cache->bs); kfree(cache); } @@ -2498,8 +2497,8 @@ static int cache_create(struct cache_args *ca, struct cache **result) cache->features = ca->features; if (writethrough_mode(cache)) { /* Create bioset for writethrough bios issued to origin */ - cache->bs = bioset_create(BIO_POOL_SIZE, 0, 0); - if (!cache->bs) + r = bioset_init(&cache->bs, BIO_POOL_SIZE, 0, 0); + if (r) goto bad; } @@ -2630,9 +2629,9 @@ static int cache_create(struct cache_args *ca, struct cache **result) goto bad; } - cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE, - migration_cache); - if (!cache->migration_pool) { + r = mempool_init_slab_pool(&cache->migration_pool, MIGRATION_POOL_SIZE, + migration_cache); + if (r) { *error = "Error creating cache's migration mempool"; goto bad; } diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 3222e21..f21c5d2 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -91,8 +91,8 @@ struct mapped_device { /* * io objects are allocated from here. */ - struct bio_set *io_bs; - struct bio_set *bs; + struct bio_set io_bs; + struct bio_set bs; /* * freeze/thaw support require holding onto a super block diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 44ff473..da02f4d 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -143,14 +143,14 @@ struct crypt_config { * pool for per bio private data, crypto requests, * encryption requeusts/buffer pages and integrity tags */ - mempool_t *req_pool; - mempool_t *page_pool; - mempool_t *tag_pool; + mempool_t req_pool; + mempool_t page_pool; + mempool_t tag_pool; unsigned tag_pool_max_sectors; struct percpu_counter n_allocated_pages; - struct bio_set *bs; + struct bio_set bs; struct mutex bio_alloc_lock; struct workqueue_struct *io_queue; @@ -1245,7 +1245,7 @@ static void crypt_alloc_req_skcipher(struct crypt_config *cc, unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1); if (!ctx->r.req) - ctx->r.req = mempool_alloc(cc->req_pool, GFP_NOIO); + ctx->r.req = mempool_alloc(&cc->req_pool, GFP_NOIO); skcipher_request_set_tfm(ctx->r.req, cc->cipher_tfm.tfms[key_index]); @@ -1262,7 +1262,7 @@ static void crypt_alloc_req_aead(struct crypt_config *cc, struct convert_context *ctx) { if (!ctx->r.req_aead) - ctx->r.req_aead = mempool_alloc(cc->req_pool, GFP_NOIO); + ctx->r.req_aead = mempool_alloc(&cc->req_pool, GFP_NOIO); aead_request_set_tfm(ctx->r.req_aead, cc->cipher_tfm.tfms_aead[0]); @@ -1290,7 +1290,7 @@ static void crypt_free_req_skcipher(struct crypt_config *cc, struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size); if ((struct skcipher_request *)(io + 1) != req) - mempool_free(req, cc->req_pool); + mempool_free(req, &cc->req_pool); } static void crypt_free_req_aead(struct crypt_config *cc, @@ -1299,7 +1299,7 @@ static void crypt_free_req_aead(struct crypt_config *cc, struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size); if ((struct aead_request *)(io + 1) != req) - mempool_free(req, cc->req_pool); + mempool_free(req, &cc->req_pool); } static void crypt_free_req(struct crypt_config *cc, void *req, struct bio *base_bio) @@ -1409,7 +1409,7 @@ retry: if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM)) mutex_lock(&cc->bio_alloc_lock); - clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs); + clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, &cc->bs); if (!clone) goto out; @@ -1418,7 +1418,7 @@ retry: remaining_size = size; for (i = 0; i < nr_iovecs; i++) { - page = mempool_alloc(cc->page_pool, gfp_mask); + page = mempool_alloc(&cc->page_pool, gfp_mask); if (!page) { crypt_free_buffer_pages(cc, clone); bio_put(clone); @@ -1453,7 +1453,7 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone) bio_for_each_segment_all(bv, clone, i) { BUG_ON(!bv->bv_page); - mempool_free(bv->bv_page, cc->page_pool); + mempool_free(bv->bv_page, &cc->page_pool); } } @@ -1492,7 +1492,7 @@ static void crypt_dec_pending(struct dm_crypt_io *io) crypt_free_req(cc, io->ctx.r.req, base_bio); if (unlikely(io->integrity_metadata_from_pool)) - mempool_free(io->integrity_metadata, io->cc->tag_pool); + mempool_free(io->integrity_metadata, &io->cc->tag_pool); else kfree(io->integrity_metadata); @@ -1565,7 +1565,7 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) * biovecs we don't need to worry about the block layer * modifying the biovec array; so leverage bio_clone_fast(). */ - clone = bio_clone_fast(io->base_bio, gfp, cc->bs); + clone = bio_clone_fast(io->base_bio, gfp, &cc->bs); if (!clone) return 1; @@ -2219,15 +2219,13 @@ static void crypt_dtr(struct dm_target *ti) crypt_free_tfms(cc); - if (cc->bs) - bioset_free(cc->bs); + bioset_exit(&cc->bs); - mempool_destroy(cc->page_pool); - mempool_destroy(cc->req_pool); - mempool_destroy(cc->tag_pool); + mempool_exit(&cc->page_pool); + mempool_exit(&cc->req_pool); + mempool_exit(&cc->tag_pool); - if (cc->page_pool) - WARN_ON(percpu_counter_sum(&cc->n_allocated_pages) != 0); + WARN_ON(percpu_counter_sum(&cc->n_allocated_pages) != 0); percpu_counter_destroy(&cc->n_allocated_pages); if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) @@ -2743,8 +2741,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) iv_size_padding = align_mask; } - ret = -ENOMEM; - /* ...| IV + padding | original IV | original sec. number | bio tag offset | */ additional_req_size = sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size + @@ -2752,8 +2748,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) sizeof(uint64_t) + sizeof(unsigned int); - cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + additional_req_size); - if (!cc->req_pool) { + ret = mempool_init_kmalloc_pool(&cc->req_pool, MIN_IOS, cc->dmreq_start + additional_req_size); + if (ret) { ti->error = "Cannot allocate crypt request mempool"; goto bad; } @@ -2762,14 +2758,14 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + additional_req_size, ARCH_KMALLOC_MINALIGN); - cc->page_pool = mempool_create(BIO_MAX_PAGES, crypt_page_alloc, crypt_page_free, cc); - if (!cc->page_pool) { + ret = mempool_init(&cc->page_pool, BIO_MAX_PAGES, crypt_page_alloc, crypt_page_free, cc); + if (ret) { ti->error = "Cannot allocate page mempool"; goto bad; } - cc->bs = bioset_create(MIN_IOS, 0, BIOSET_NEED_BVECS); - if (!cc->bs) { + ret = bioset_init(&cc->bs, MIN_IOS, 0, BIOSET_NEED_BVECS); + if (ret) { ti->error = "Cannot allocate crypt bioset"; goto bad; } @@ -2806,11 +2802,10 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (!cc->tag_pool_max_sectors) cc->tag_pool_max_sectors = 1; - cc->tag_pool = mempool_create_kmalloc_pool(MIN_IOS, + ret = mempool_init_kmalloc_pool(&cc->tag_pool, MIN_IOS, cc->tag_pool_max_sectors * cc->on_disk_tag_size); - if (!cc->tag_pool) { + if (ret) { ti->error = "Cannot allocate integrity tags mempool"; - ret = -ENOMEM; goto bad; } @@ -2903,7 +2898,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN)))) { if (bio_sectors(bio) > cc->tag_pool_max_sectors) dm_accept_partial_bio(bio, cc->tag_pool_max_sectors); - io->integrity_metadata = mempool_alloc(cc->tag_pool, GFP_NOIO); + io->integrity_metadata = mempool_alloc(&cc->tag_pool, GFP_NOIO); io->integrity_metadata_from_pool = true; } } diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 514fb4a..fc68c7a 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -142,7 +142,7 @@ struct dm_integrity_c { unsigned tag_size; __s8 log2_tag_size; sector_t start; - mempool_t *journal_io_mempool; + mempool_t journal_io_mempool; struct dm_io_client *io; struct dm_bufio_client *bufio; struct workqueue_struct *metadata_wq; @@ -1817,7 +1817,7 @@ static void complete_copy_from_journal(unsigned long error, void *context) struct journal_completion *comp = io->comp; struct dm_integrity_c *ic = comp->ic; remove_range(ic, &io->range); - mempool_free(io, ic->journal_io_mempool); + mempool_free(io, &ic->journal_io_mempool); if (unlikely(error != 0)) dm_integrity_io_error(ic, "copying from journal", -EIO); complete_journal_op(comp); @@ -1886,7 +1886,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start, } next_loop = k - 1; - io = mempool_alloc(ic->journal_io_mempool, GFP_NOIO); + io = mempool_alloc(&ic->journal_io_mempool, GFP_NOIO); io->comp = ∁ io->range.logical_sector = sec; io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block; @@ -1918,7 +1918,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start, if (j == k) { remove_range_unlocked(ic, &io->range); spin_unlock_irq(&ic->endio_wait.lock); - mempool_free(io, ic->journal_io_mempool); + mempool_free(io, &ic->journal_io_mempool); goto skip_io; } for (l = j; l < k; l++) { @@ -2980,9 +2980,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } - ic->journal_io_mempool = mempool_create_slab_pool(JOURNAL_IO_MEMPOOL, journal_io_cache); - if (!ic->journal_io_mempool) { - r = -ENOMEM; + r = mempool_init_slab_pool(&ic->journal_io_mempool, JOURNAL_IO_MEMPOOL, journal_io_cache); + if (r) { ti->error = "Cannot allocate mempool"; goto bad; } @@ -3196,7 +3195,7 @@ static void dm_integrity_dtr(struct dm_target *ti) destroy_workqueue(ic->writer_wq); if (ic->bufio) dm_bufio_client_destroy(ic->bufio); - mempool_destroy(ic->journal_io_mempool); + mempool_exit(&ic->journal_io_mempool); if (ic->io) dm_io_client_destroy(ic->io); if (ic->dev) diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index a8d914d..53c6ed0 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -22,8 +22,8 @@ #define DM_IO_MAX_REGIONS BITS_PER_LONG struct dm_io_client { - mempool_t *pool; - struct bio_set *bios; + mempool_t pool; + struct bio_set bios; }; /* @@ -49,32 +49,33 @@ struct dm_io_client *dm_io_client_create(void) { struct dm_io_client *client; unsigned min_ios = dm_get_reserved_bio_based_ios(); + int ret; client = kmalloc(sizeof(*client), GFP_KERNEL); if (!client) return ERR_PTR(-ENOMEM); - client->pool = mempool_create_slab_pool(min_ios, _dm_io_cache); - if (!client->pool) + ret = mempool_init_slab_pool(&client->pool, min_ios, _dm_io_cache); + if (ret) goto bad; - client->bios = bioset_create(min_ios, 0, BIOSET_NEED_BVECS); - if (!client->bios) + ret = bioset_init(&client->bios, min_ios, 0, BIOSET_NEED_BVECS); + if (ret) goto bad; return client; bad: - mempool_destroy(client->pool); + mempool_exit(&client->pool); kfree(client); - return ERR_PTR(-ENOMEM); + return ERR_PTR(ret); } EXPORT_SYMBOL(dm_io_client_create); void dm_io_client_destroy(struct dm_io_client *client) { - mempool_destroy(client->pool); - bioset_free(client->bios); + mempool_exit(&client->pool); + bioset_exit(&client->bios); kfree(client); } EXPORT_SYMBOL(dm_io_client_destroy); @@ -120,7 +121,7 @@ static void complete_io(struct io *io) invalidate_kernel_vmap_range(io->vma_invalidate_address, io->vma_invalidate_size); - mempool_free(io, io->client->pool); + mempool_free(io, &io->client->pool); fn(error_bits, context); } @@ -344,7 +345,7 @@ static void do_region(int op, int op_flags, unsigned region, dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT))); } - bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); + bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, &io->client->bios); bio->bi_iter.bi_sector = where->sector + (where->count - remaining); bio_set_dev(bio, where->bdev); bio->bi_end_io = endio; @@ -442,7 +443,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, init_completion(&sio.wait); - io = mempool_alloc(client->pool, GFP_NOIO); + io = mempool_alloc(&client->pool, GFP_NOIO); io->error_bits = 0; atomic_set(&io->count, 1); /* see dispatch_io() */ io->client = client; @@ -474,7 +475,7 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions, return -EIO; } - io = mempool_alloc(client->pool, GFP_NOIO); + io = mempool_alloc(&client->pool, GFP_NOIO); io->error_bits = 0; atomic_set(&io->count, 1); /* see dispatch_io() */ io->client = client; diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index e6e7c68..c89a675 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -47,7 +47,7 @@ struct dm_kcopyd_client { wait_queue_head_t destroyq; atomic_t nr_jobs; - mempool_t *job_pool; + mempool_t job_pool; struct workqueue_struct *kcopyd_wq; struct work_struct kcopyd_work; @@ -479,7 +479,7 @@ static int run_complete_job(struct kcopyd_job *job) */ if (job->master_job == job) { mutex_destroy(&job->lock); - mempool_free(job, kc->job_pool); + mempool_free(job, &kc->job_pool); } fn(read_err, write_err, context); @@ -751,7 +751,7 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, * Allocate an array of jobs consisting of one master job * followed by SPLIT_COUNT sub jobs. */ - job = mempool_alloc(kc->job_pool, GFP_NOIO); + job = mempool_alloc(&kc->job_pool, GFP_NOIO); mutex_init(&job->lock); /* @@ -835,7 +835,7 @@ void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc, { struct kcopyd_job *job; - job = mempool_alloc(kc->job_pool, GFP_NOIO); + job = mempool_alloc(&kc->job_pool, GFP_NOIO); memset(job, 0, sizeof(struct kcopyd_job)); job->kc = kc; @@ -879,7 +879,7 @@ int kcopyd_cancel(struct kcopyd_job *job, int block) *---------------------------------------------------------------*/ struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *throttle) { - int r = -ENOMEM; + int r; struct dm_kcopyd_client *kc; kc = kmalloc(sizeof(*kc), GFP_KERNEL); @@ -892,14 +892,16 @@ struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *thro INIT_LIST_HEAD(&kc->pages_jobs); kc->throttle = throttle; - kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache); - if (!kc->job_pool) + r = mempool_init_slab_pool(&kc->job_pool, MIN_JOBS, _job_cache); + if (r) goto bad_slab; INIT_WORK(&kc->kcopyd_work, do_work); kc->kcopyd_wq = alloc_workqueue("kcopyd", WQ_MEM_RECLAIM, 0); - if (!kc->kcopyd_wq) + if (!kc->kcopyd_wq) { + r = -ENOMEM; goto bad_workqueue; + } kc->pages = NULL; kc->nr_reserved_pages = kc->nr_free_pages = 0; @@ -923,7 +925,7 @@ bad_io_client: bad_client_pages: destroy_workqueue(kc->kcopyd_wq); bad_workqueue: - mempool_destroy(kc->job_pool); + mempool_exit(&kc->job_pool); bad_slab: kfree(kc); @@ -942,7 +944,7 @@ void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc) destroy_workqueue(kc->kcopyd_wq); dm_io_client_destroy(kc->io_client); client_free_pages(kc); - mempool_destroy(kc->job_pool); + mempool_exit(&kc->job_pool); kfree(kc); } EXPORT_SYMBOL(dm_kcopyd_client_destroy); diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index 53b7b06d..52090be 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c @@ -76,7 +76,7 @@ struct log_c { */ uint32_t integrated_flush; - mempool_t *flush_entry_pool; + mempool_t flush_entry_pool; }; static struct kmem_cache *_flush_entry_cache; @@ -249,11 +249,10 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, goto out; } - lc->flush_entry_pool = mempool_create_slab_pool(FLUSH_ENTRY_POOL_SIZE, - _flush_entry_cache); - if (!lc->flush_entry_pool) { + r = mempool_init_slab_pool(&lc->flush_entry_pool, FLUSH_ENTRY_POOL_SIZE, + _flush_entry_cache); + if (r) { DMERR("Failed to create flush_entry_pool"); - r = -ENOMEM; goto out; } @@ -313,7 +312,7 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, out: kfree(devices_rdata); if (r) { - mempool_destroy(lc->flush_entry_pool); + mempool_exit(&lc->flush_entry_pool); kfree(lc); kfree(ctr_str); } else { @@ -342,7 +341,7 @@ static void userspace_dtr(struct dm_dirty_log *log) if (lc->log_dev) dm_put_device(lc->ti, lc->log_dev); - mempool_destroy(lc->flush_entry_pool); + mempool_exit(&lc->flush_entry_pool); kfree(lc->usr_argv_str); kfree(lc); @@ -570,7 +569,7 @@ static int userspace_flush(struct dm_dirty_log *log) int mark_list_is_empty; int clear_list_is_empty; struct dm_dirty_log_flush_entry *fe, *tmp_fe; - mempool_t *flush_entry_pool = lc->flush_entry_pool; + mempool_t *flush_entry_pool = &lc->flush_entry_pool; spin_lock_irqsave(&lc->flush_lock, flags); list_splice_init(&lc->mark_list, &mark_list); @@ -653,7 +652,7 @@ static void userspace_mark_region(struct dm_dirty_log *log, region_t region) struct dm_dirty_log_flush_entry *fe; /* Wait for an allocation, but _never_ fail */ - fe = mempool_alloc(lc->flush_entry_pool, GFP_NOIO); + fe = mempool_alloc(&lc->flush_entry_pool, GFP_NOIO); BUG_ON(!fe); spin_lock_irqsave(&lc->flush_lock, flags); @@ -687,7 +686,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region) * to cause the region to be resync'ed when the * device is activated next time. */ - fe = mempool_alloc(lc->flush_entry_pool, GFP_ATOMIC); + fe = mempool_alloc(&lc->flush_entry_pool, GFP_ATOMIC); if (!fe) { DMERR("Failed to allocate memory to clear region."); return; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 203a041..d94ba6f 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -520,7 +520,8 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, bdev = pgpath->path.dev->bdev; q = bdev_get_queue(bdev); - clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE, GFP_ATOMIC); + clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE, + BLK_MQ_REQ_NOWAIT); if (IS_ERR(clone)) { /* EBUSY, ENODEV or EWOULDBLOCK: requeue */ if (blk_queue_dying(q)) { diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index 85c32b2..43149eb 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c @@ -63,7 +63,7 @@ struct dm_region_hash { /* hash table */ rwlock_t hash_lock; - mempool_t *region_pool; + mempool_t region_pool; unsigned mask; unsigned nr_buckets; unsigned prime; @@ -169,6 +169,7 @@ struct dm_region_hash *dm_region_hash_create( struct dm_region_hash *rh; unsigned nr_buckets, max_buckets; size_t i; + int ret; /* * Calculate a suitable number of buckets for our hash @@ -220,9 +221,9 @@ struct dm_region_hash *dm_region_hash_create( INIT_LIST_HEAD(&rh->failed_recovered_regions); rh->flush_failure = 0; - rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, - sizeof(struct dm_region)); - if (!rh->region_pool) { + ret = mempool_init_kmalloc_pool(&rh->region_pool, MIN_REGIONS, + sizeof(struct dm_region)); + if (ret) { vfree(rh->buckets); kfree(rh); rh = ERR_PTR(-ENOMEM); @@ -242,14 +243,14 @@ void dm_region_hash_destroy(struct dm_region_hash *rh) list_for_each_entry_safe(reg, nreg, rh->buckets + h, hash_list) { BUG_ON(atomic_read(®->pending)); - mempool_free(reg, rh->region_pool); + mempool_free(reg, &rh->region_pool); } } if (rh->log) dm_dirty_log_destroy(rh->log); - mempool_destroy(rh->region_pool); + mempool_exit(&rh->region_pool); vfree(rh->buckets); kfree(rh); } @@ -287,7 +288,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region) { struct dm_region *reg, *nreg; - nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); + nreg = mempool_alloc(&rh->region_pool, GFP_ATOMIC); if (unlikely(!nreg)) nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL); @@ -303,7 +304,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region) reg = __rh_lookup(rh, region); if (reg) /* We lost the race. */ - mempool_free(nreg, rh->region_pool); + mempool_free(nreg, &rh->region_pool); else { __rh_insert(rh, nreg); if (nreg->state == DM_RH_CLEAN) { @@ -481,17 +482,17 @@ void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled) list_for_each_entry_safe(reg, next, &recovered, list) { rh->log->type->clear_region(rh->log, reg->key); complete_resync_work(reg, 1); - mempool_free(reg, rh->region_pool); + mempool_free(reg, &rh->region_pool); } list_for_each_entry_safe(reg, next, &failed_recovered, list) { complete_resync_work(reg, errors_handled ? 0 : 1); - mempool_free(reg, rh->region_pool); + mempool_free(reg, &rh->region_pool); } list_for_each_entry_safe(reg, next, &clean, list) { rh->log->type->clear_region(rh->log, reg->key); - mempool_free(reg, rh->region_pool); + mempool_free(reg, &rh->region_pool); } rh->log->type->flush(rh->log); diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index bf0b840..6e547b8 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -406,7 +406,7 @@ static blk_status_t dm_dispatch_clone_request(struct request *clone, struct requ if (blk_queue_io_stat(clone->q)) clone->rq_flags |= RQF_IO_STAT; - clone->start_time = jiffies; + clone->start_time_ns = ktime_get_ns(); r = blk_insert_cloned_request(clone->q, clone); if (r != BLK_STS_OK && r != BLK_STS_RESOURCE && r != BLK_STS_DEV_RESOURCE) /* must complete clone in terms of original request */ @@ -433,7 +433,7 @@ static int setup_clone(struct request *clone, struct request *rq, { int r; - r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask, + r = blk_rq_prep_clone(clone, rq, &tio->md->bs, gfp_mask, dm_rq_bio_constructor, tio); if (r) return r; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 216035b..b11ddc5 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -87,7 +87,7 @@ struct dm_snapshot { */ struct list_head out_of_order_list; - mempool_t *pending_pool; + mempool_t pending_pool; struct dm_exception_table pending; struct dm_exception_table complete; @@ -682,7 +682,7 @@ static void free_completed_exception(struct dm_exception *e) static struct dm_snap_pending_exception *alloc_pending_exception(struct dm_snapshot *s) { - struct dm_snap_pending_exception *pe = mempool_alloc(s->pending_pool, + struct dm_snap_pending_exception *pe = mempool_alloc(&s->pending_pool, GFP_NOIO); atomic_inc(&s->pending_exceptions_count); @@ -695,7 +695,7 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe) { struct dm_snapshot *s = pe->snap; - mempool_free(pe, s->pending_pool); + mempool_free(pe, &s->pending_pool); smp_mb__before_atomic(); atomic_dec(&s->pending_exceptions_count); } @@ -1196,10 +1196,9 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad_kcopyd; } - s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache); - if (!s->pending_pool) { + r = mempool_init_slab_pool(&s->pending_pool, MIN_IOS, pending_cache); + if (r) { ti->error = "Could not allocate mempool for pending exceptions"; - r = -ENOMEM; goto bad_pending_pool; } @@ -1259,7 +1258,7 @@ bad_read_metadata: unregister_snapshot(s); bad_load_and_register: - mempool_destroy(s->pending_pool); + mempool_exit(&s->pending_pool); bad_pending_pool: dm_kcopyd_client_destroy(s->kcopyd_client); @@ -1355,7 +1354,7 @@ static void snapshot_dtr(struct dm_target *ti) while (atomic_read(&s->pending_exceptions_count)) msleep(1); /* - * Ensure instructions in mempool_destroy aren't reordered + * Ensure instructions in mempool_exit aren't reordered * before atomic_read. */ smp_mb(); @@ -1367,7 +1366,7 @@ static void snapshot_dtr(struct dm_target *ti) __free_exceptions(s); - mempool_destroy(s->pending_pool); + mempool_exit(&s->pending_pool); dm_exception_store_destroy(s->store); diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index b111074..6c92382 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -260,7 +260,7 @@ struct pool { struct dm_deferred_set *all_io_ds; struct dm_thin_new_mapping *next_mapping; - mempool_t *mapping_pool; + mempool_t mapping_pool; process_bio_fn process_bio; process_bio_fn process_discard; @@ -917,7 +917,7 @@ static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) { cell_error(m->tc->pool, m->cell); list_del(&m->list); - mempool_free(m, m->tc->pool->mapping_pool); + mempool_free(m, &m->tc->pool->mapping_pool); } static void process_prepared_mapping(struct dm_thin_new_mapping *m) @@ -961,7 +961,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) out: list_del(&m->list); - mempool_free(m, pool->mapping_pool); + mempool_free(m, &pool->mapping_pool); } /*----------------------------------------------------------------*/ @@ -971,7 +971,7 @@ static void free_discard_mapping(struct dm_thin_new_mapping *m) struct thin_c *tc = m->tc; if (m->cell) cell_defer_no_holder(tc, m->cell); - mempool_free(m, tc->pool->mapping_pool); + mempool_free(m, &tc->pool->mapping_pool); } static void process_prepared_discard_fail(struct dm_thin_new_mapping *m) @@ -999,7 +999,7 @@ static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m) bio_endio(m->bio); cell_defer_no_holder(tc, m->cell); - mempool_free(m, tc->pool->mapping_pool); + mempool_free(m, &tc->pool->mapping_pool); } /*----------------------------------------------------------------*/ @@ -1092,7 +1092,7 @@ static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m) metadata_operation_failed(pool, "dm_thin_remove_range", r); bio_io_error(m->bio); cell_defer_no_holder(tc, m->cell); - mempool_free(m, pool->mapping_pool); + mempool_free(m, &pool->mapping_pool); return; } @@ -1105,7 +1105,7 @@ static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m) metadata_operation_failed(pool, "dm_pool_inc_data_range", r); bio_io_error(m->bio); cell_defer_no_holder(tc, m->cell); - mempool_free(m, pool->mapping_pool); + mempool_free(m, &pool->mapping_pool); return; } @@ -1150,7 +1150,7 @@ static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m) bio_endio(m->bio); cell_defer_no_holder(tc, m->cell); - mempool_free(m, pool->mapping_pool); + mempool_free(m, &pool->mapping_pool); } static void process_prepared(struct pool *pool, struct list_head *head, @@ -1196,7 +1196,7 @@ static int ensure_next_mapping(struct pool *pool) if (pool->next_mapping) return 0; - pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC); + pool->next_mapping = mempool_alloc(&pool->mapping_pool, GFP_ATOMIC); return pool->next_mapping ? 0 : -ENOMEM; } @@ -2835,8 +2835,8 @@ static void __pool_destroy(struct pool *pool) destroy_workqueue(pool->wq); if (pool->next_mapping) - mempool_free(pool->next_mapping, pool->mapping_pool); - mempool_destroy(pool->mapping_pool); + mempool_free(pool->next_mapping, &pool->mapping_pool); + mempool_exit(&pool->mapping_pool); dm_deferred_set_destroy(pool->shared_read_ds); dm_deferred_set_destroy(pool->all_io_ds); kfree(pool); @@ -2931,11 +2931,11 @@ static struct pool *pool_create(struct mapped_device *pool_md, } pool->next_mapping = NULL; - pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE, - _new_mapping_cache); - if (!pool->mapping_pool) { + r = mempool_init_slab_pool(&pool->mapping_pool, MAPPING_POOL_SIZE, + _new_mapping_cache); + if (r) { *error = "Error creating pool's mapping mempool"; - err_p = ERR_PTR(-ENOMEM); + err_p = ERR_PTR(r); goto bad_mapping_pool; } @@ -2955,7 +2955,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, return pool; bad_sort_array: - mempool_destroy(pool->mapping_pool); + mempool_exit(&pool->mapping_pool); bad_mapping_pool: dm_deferred_set_destroy(pool->all_io_ds); bad_all_io_ds: diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index e13f908..8640586 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -309,13 +309,13 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) unsigned n; if (!fio->rs) - fio->rs = mempool_alloc(v->fec->rs_pool, GFP_NOIO); + fio->rs = mempool_alloc(&v->fec->rs_pool, GFP_NOIO); fec_for_each_prealloc_buffer(n) { if (fio->bufs[n]) continue; - fio->bufs[n] = mempool_alloc(v->fec->prealloc_pool, GFP_NOWAIT); + fio->bufs[n] = mempool_alloc(&v->fec->prealloc_pool, GFP_NOWAIT); if (unlikely(!fio->bufs[n])) { DMERR("failed to allocate FEC buffer"); return -ENOMEM; @@ -327,7 +327,7 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) if (fio->bufs[n]) continue; - fio->bufs[n] = mempool_alloc(v->fec->extra_pool, GFP_NOWAIT); + fio->bufs[n] = mempool_alloc(&v->fec->extra_pool, GFP_NOWAIT); /* we can manage with even one buffer if necessary */ if (unlikely(!fio->bufs[n])) break; @@ -335,7 +335,7 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) fio->nbufs = n; if (!fio->output) - fio->output = mempool_alloc(v->fec->output_pool, GFP_NOIO); + fio->output = mempool_alloc(&v->fec->output_pool, GFP_NOIO); return 0; } @@ -493,15 +493,15 @@ void verity_fec_finish_io(struct dm_verity_io *io) if (!verity_fec_is_enabled(io->v)) return; - mempool_free(fio->rs, f->rs_pool); + mempool_free(fio->rs, &f->rs_pool); fec_for_each_prealloc_buffer(n) - mempool_free(fio->bufs[n], f->prealloc_pool); + mempool_free(fio->bufs[n], &f->prealloc_pool); fec_for_each_extra_buffer(fio, n) - mempool_free(fio->bufs[n], f->extra_pool); + mempool_free(fio->bufs[n], &f->extra_pool); - mempool_free(fio->output, f->output_pool); + mempool_free(fio->output, &f->output_pool); } /* @@ -549,9 +549,9 @@ void verity_fec_dtr(struct dm_verity *v) if (!verity_fec_is_enabled(v)) goto out; - mempool_destroy(f->rs_pool); - mempool_destroy(f->prealloc_pool); - mempool_destroy(f->extra_pool); + mempool_exit(&f->rs_pool); + mempool_exit(&f->prealloc_pool); + mempool_exit(&f->extra_pool); kmem_cache_destroy(f->cache); if (f->data_bufio) @@ -675,6 +675,7 @@ int verity_fec_ctr(struct dm_verity *v) struct dm_verity_fec *f = v->fec; struct dm_target *ti = v->ti; u64 hash_blocks; + int ret; if (!verity_fec_is_enabled(v)) { verity_fec_dtr(v); @@ -770,11 +771,11 @@ int verity_fec_ctr(struct dm_verity *v) } /* Preallocate an rs_control structure for each worker thread */ - f->rs_pool = mempool_create(num_online_cpus(), fec_rs_alloc, - fec_rs_free, (void *) v); - if (!f->rs_pool) { + ret = mempool_init(&f->rs_pool, num_online_cpus(), fec_rs_alloc, + fec_rs_free, (void *) v); + if (ret) { ti->error = "Cannot allocate RS pool"; - return -ENOMEM; + return ret; } f->cache = kmem_cache_create("dm_verity_fec_buffers", @@ -786,26 +787,26 @@ int verity_fec_ctr(struct dm_verity *v) } /* Preallocate DM_VERITY_FEC_BUF_PREALLOC buffers for each thread */ - f->prealloc_pool = mempool_create_slab_pool(num_online_cpus() * - DM_VERITY_FEC_BUF_PREALLOC, - f->cache); - if (!f->prealloc_pool) { + ret = mempool_init_slab_pool(&f->prealloc_pool, num_online_cpus() * + DM_VERITY_FEC_BUF_PREALLOC, + f->cache); + if (ret) { ti->error = "Cannot allocate FEC buffer prealloc pool"; - return -ENOMEM; + return ret; } - f->extra_pool = mempool_create_slab_pool(0, f->cache); - if (!f->extra_pool) { + ret = mempool_init_slab_pool(&f->extra_pool, 0, f->cache); + if (ret) { ti->error = "Cannot allocate FEC buffer extra pool"; - return -ENOMEM; + return ret; } /* Preallocate an output buffer for each thread */ - f->output_pool = mempool_create_kmalloc_pool(num_online_cpus(), - 1 << v->data_dev_block_bits); - if (!f->output_pool) { + ret = mempool_init_kmalloc_pool(&f->output_pool, num_online_cpus(), + 1 << v->data_dev_block_bits); + if (ret) { ti->error = "Cannot allocate FEC output pool"; - return -ENOMEM; + return ret; } /* Reserve space for our per-bio data */ diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index bb31ce8..6ad803b 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -46,10 +46,10 @@ struct dm_verity_fec { sector_t hash_blocks; /* blocks covered after v->hash_start */ unsigned char roots; /* number of parity bytes, M-N of RS(M, N) */ unsigned char rsn; /* N of RS(M, N) */ - mempool_t *rs_pool; /* mempool for fio->rs */ - mempool_t *prealloc_pool; /* mempool for preallocated buffers */ - mempool_t *extra_pool; /* mempool for extra buffers */ - mempool_t *output_pool; /* mempool for output */ + mempool_t rs_pool; /* mempool for fio->rs */ + mempool_t prealloc_pool; /* mempool for preallocated buffers */ + mempool_t extra_pool; /* mempool for extra buffers */ + mempool_t output_pool; /* mempool for output */ struct kmem_cache *cache; /* cache for buffers */ }; diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index e73b077..30602d1 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -57,7 +57,7 @@ struct dmz_target { struct workqueue_struct *chunk_wq; /* For cloned BIOs to zones */ - struct bio_set *bio_set; + struct bio_set bio_set; /* For flush */ spinlock_t flush_lock; @@ -121,7 +121,7 @@ static int dmz_submit_read_bio(struct dmz_target *dmz, struct dm_zone *zone, } /* Partial BIO: we need to clone the BIO */ - clone = bio_clone_fast(bio, GFP_NOIO, dmz->bio_set); + clone = bio_clone_fast(bio, GFP_NOIO, &dmz->bio_set); if (!clone) return -ENOMEM; @@ -779,10 +779,9 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) << dev->zone_nr_sectors_shift; /* Zone BIO */ - dmz->bio_set = bioset_create(DMZ_MIN_BIOS, 0, 0); - if (!dmz->bio_set) { + ret = bioset_init(&dmz->bio_set, DMZ_MIN_BIOS, 0, 0); + if (ret) { ti->error = "Create BIO set failed"; - ret = -ENOMEM; goto err_meta; } @@ -828,7 +827,7 @@ err_cwq: destroy_workqueue(dmz->chunk_wq); err_bio: mutex_destroy(&dmz->chunk_lock); - bioset_free(dmz->bio_set); + bioset_exit(&dmz->bio_set); err_meta: dmz_dtr_metadata(dmz->metadata); err_dev: @@ -858,7 +857,7 @@ static void dmz_dtr(struct dm_target *ti) dmz_dtr_metadata(dmz->metadata); - bioset_free(dmz->bio_set); + bioset_exit(&dmz->bio_set); dmz_put_zoned_device(ti); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0a7b010..98dff36 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -148,8 +148,8 @@ static int dm_numa_node = DM_NUMA_NODE; * For mempools pre-allocation at the table loading time. */ struct dm_md_mempools { - struct bio_set *bs; - struct bio_set *io_bs; + struct bio_set bs; + struct bio_set io_bs; }; struct table_device { @@ -537,7 +537,7 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) struct dm_target_io *tio; struct bio *clone; - clone = bio_alloc_bioset(GFP_NOIO, 0, md->io_bs); + clone = bio_alloc_bioset(GFP_NOIO, 0, &md->io_bs); if (!clone) return NULL; @@ -572,7 +572,7 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *t /* the dm_target_io embedded in ci->io is available */ tio = &ci->io->tio; } else { - struct bio *clone = bio_alloc_bioset(gfp_mask, 0, ci->io->md->bs); + struct bio *clone = bio_alloc_bioset(gfp_mask, 0, &ci->io->md->bs); if (!clone) return NULL; @@ -1583,7 +1583,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, * won't be affected by this reassignment. */ struct bio *b = bio_clone_bioset(bio, GFP_NOIO, - md->queue->bio_split); + &md->queue->bio_split); ci.io->orig_bio = b; bio_advance(bio, (bio_sectors(bio) - ci.sector_count) << 9); bio_chain(b, bio); @@ -1785,10 +1785,8 @@ static void cleanup_mapped_device(struct mapped_device *md) destroy_workqueue(md->wq); if (md->kworker_task) kthread_stop(md->kworker_task); - if (md->bs) - bioset_free(md->bs); - if (md->io_bs) - bioset_free(md->io_bs); + bioset_exit(&md->bs); + bioset_exit(&md->io_bs); if (md->dax_dev) { kill_dax(md->dax_dev); @@ -1965,16 +1963,10 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t) * If so, reload bioset because front_pad may have changed * because a different table was loaded. */ - if (md->bs) { - bioset_free(md->bs); - md->bs = NULL; - } - if (md->io_bs) { - bioset_free(md->io_bs); - md->io_bs = NULL; - } + bioset_exit(&md->bs); + bioset_exit(&md->io_bs); - } else if (md->bs) { + } else if (bioset_initialized(&md->bs)) { /* * There's no need to reload with request-based dm * because the size of front_pad doesn't change. @@ -1986,12 +1978,14 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t) goto out; } - BUG_ON(!p || md->bs || md->io_bs); + BUG_ON(!p || + bioset_initialized(&md->bs) || + bioset_initialized(&md->io_bs)); md->bs = p->bs; - p->bs = NULL; + memset(&p->bs, 0, sizeof(p->bs)); md->io_bs = p->io_bs; - p->io_bs = NULL; + memset(&p->io_bs, 0, sizeof(p->io_bs)); out: /* mempool bind completed, no longer need any mempools in the table */ dm_table_free_md_mempools(t); @@ -2905,6 +2899,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); unsigned int pool_size = 0; unsigned int front_pad, io_front_pad; + int ret; if (!pools) return NULL; @@ -2916,10 +2911,10 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size); front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); io_front_pad = roundup(front_pad, __alignof__(struct dm_io)) + offsetof(struct dm_io, tio); - pools->io_bs = bioset_create(pool_size, io_front_pad, 0); - if (!pools->io_bs) + ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0); + if (ret) goto out; - if (integrity && bioset_integrity_create(pools->io_bs, pool_size)) + if (integrity && bioset_integrity_create(&pools->io_bs, pool_size)) goto out; break; case DM_TYPE_REQUEST_BASED: @@ -2932,11 +2927,11 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu BUG(); } - pools->bs = bioset_create(pool_size, front_pad, 0); - if (!pools->bs) + ret = bioset_init(&pools->bs, pool_size, front_pad, 0); + if (ret) goto out; - if (integrity && bioset_integrity_create(pools->bs, pool_size)) + if (integrity && bioset_integrity_create(&pools->bs, pool_size)) goto out; return pools; @@ -2952,10 +2947,8 @@ void dm_free_md_mempools(struct dm_md_mempools *pools) if (!pools) return; - if (pools->bs) - bioset_free(pools->bs); - if (pools->io_bs) - bioset_free(pools->io_bs); + bioset_exit(&pools->bs); + bioset_exit(&pools->io_bs); kfree(pools); } diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c index 38264b3..c2fdf89 100644 --- a/drivers/md/md-faulty.c +++ b/drivers/md/md-faulty.c @@ -214,7 +214,7 @@ static bool faulty_make_request(struct mddev *mddev, struct bio *bio) } } if (failit) { - struct bio *b = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); + struct bio *b = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); bio_set_dev(b, conf->rdev->bdev); b->bi_private = bio; diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 4964323..d45c697 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -269,7 +269,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) if (unlikely(bio_end_sector(bio) > end_sector)) { /* This bio crosses a device boundary, so we have to split it */ struct bio *split = bio_split(bio, end_sector - bio_sector, - GFP_NOIO, mddev->bio_set); + GFP_NOIO, &mddev->bio_set); bio_chain(split, bio); generic_make_request(bio); bio = split; diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index 0a7e99d..f71fcdb 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -80,7 +80,7 @@ static void multipath_end_bh_io(struct multipath_bh *mp_bh, blk_status_t status) bio->bi_status = status; bio_endio(bio); - mempool_free(mp_bh, conf->pool); + mempool_free(mp_bh, &conf->pool); } static void multipath_end_request(struct bio *bio) @@ -117,7 +117,7 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio) return true; } - mp_bh = mempool_alloc(conf->pool, GFP_NOIO); + mp_bh = mempool_alloc(&conf->pool, GFP_NOIO); mp_bh->master_bio = bio; mp_bh->mddev = mddev; @@ -125,7 +125,7 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio) mp_bh->path = multipath_map(conf); if (mp_bh->path < 0) { bio_io_error(bio); - mempool_free(mp_bh, conf->pool); + mempool_free(mp_bh, &conf->pool); return true; } multipath = conf->multipaths + mp_bh->path; @@ -378,6 +378,7 @@ static int multipath_run (struct mddev *mddev) struct multipath_info *disk; struct md_rdev *rdev; int working_disks; + int ret; if (md_check_no_bitmap(mddev)) return -EINVAL; @@ -431,9 +432,9 @@ static int multipath_run (struct mddev *mddev) } mddev->degraded = conf->raid_disks - working_disks; - conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS, - sizeof(struct multipath_bh)); - if (conf->pool == NULL) + ret = mempool_init_kmalloc_pool(&conf->pool, NR_RESERVED_BUFS, + sizeof(struct multipath_bh)); + if (ret) goto out_free_conf; mddev->thread = md_register_thread(multipathd, mddev, @@ -455,7 +456,7 @@ static int multipath_run (struct mddev *mddev) return 0; out_free_conf: - mempool_destroy(conf->pool); + mempool_exit(&conf->pool); kfree(conf->multipaths); kfree(conf); mddev->private = NULL; @@ -467,7 +468,7 @@ static void multipath_free(struct mddev *mddev, void *priv) { struct mpconf *conf = priv; - mempool_destroy(conf->pool); + mempool_exit(&conf->pool); kfree(conf->multipaths); kfree(conf); } diff --git a/drivers/md/md-multipath.h b/drivers/md/md-multipath.h index 0adb941..b3099e5 100644 --- a/drivers/md/md-multipath.h +++ b/drivers/md/md-multipath.h @@ -13,7 +13,7 @@ struct mpconf { spinlock_t device_lock; struct list_head retry_list; - mempool_t *pool; + mempool_t pool; }; /* diff --git a/drivers/md/md.c b/drivers/md/md.c index c208c01..fc692b7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -193,10 +193,10 @@ struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, { struct bio *b; - if (!mddev || !mddev->bio_set) + if (!mddev || !bioset_initialized(&mddev->bio_set)) return bio_alloc(gfp_mask, nr_iovecs); - b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set); + b = bio_alloc_bioset(gfp_mask, nr_iovecs, &mddev->bio_set); if (!b) return NULL; return b; @@ -205,10 +205,10 @@ EXPORT_SYMBOL_GPL(bio_alloc_mddev); static struct bio *md_bio_alloc_sync(struct mddev *mddev) { - if (!mddev || !mddev->sync_set) + if (!mddev || !bioset_initialized(&mddev->sync_set)) return bio_alloc(GFP_NOIO, 1); - return bio_alloc_bioset(GFP_NOIO, 1, mddev->sync_set); + return bio_alloc_bioset(GFP_NOIO, 1, &mddev->sync_set); } /* @@ -510,7 +510,10 @@ static void mddev_delayed_delete(struct work_struct *ws); static void mddev_put(struct mddev *mddev) { - struct bio_set *bs = NULL, *sync_bs = NULL; + struct bio_set bs, sync_bs; + + memset(&bs, 0, sizeof(bs)); + memset(&sync_bs, 0, sizeof(sync_bs)); if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) return; @@ -521,8 +524,8 @@ static void mddev_put(struct mddev *mddev) list_del_init(&mddev->all_mddevs); bs = mddev->bio_set; sync_bs = mddev->sync_set; - mddev->bio_set = NULL; - mddev->sync_set = NULL; + memset(&mddev->bio_set, 0, sizeof(mddev->bio_set)); + memset(&mddev->sync_set, 0, sizeof(mddev->sync_set)); if (mddev->gendisk) { /* We did a probe so need to clean up. Call * queue_work inside the spinlock so that @@ -535,10 +538,8 @@ static void mddev_put(struct mddev *mddev) kfree(mddev); } spin_unlock(&all_mddevs_lock); - if (bs) - bioset_free(bs); - if (sync_bs) - bioset_free(sync_bs); + bioset_exit(&bs); + bioset_exit(&sync_bs); } static void md_safemode_timeout(struct timer_list *t); @@ -2123,7 +2124,7 @@ int md_integrity_register(struct mddev *mddev) bdev_get_integrity(reference->bdev)); pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); - if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) { + if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE)) { pr_err("md: failed to create integrity pool for %s\n", mdname(mddev)); return -EINVAL; @@ -5497,17 +5498,15 @@ int md_run(struct mddev *mddev) sysfs_notify_dirent_safe(rdev->sysfs_state); } - if (mddev->bio_set == NULL) { - mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); - if (!mddev->bio_set) - return -ENOMEM; + if (!bioset_initialized(&mddev->bio_set)) { + err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); + if (err) + return err; } - if (mddev->sync_set == NULL) { - mddev->sync_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); - if (!mddev->sync_set) { - err = -ENOMEM; + if (!bioset_initialized(&mddev->sync_set)) { + err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); + if (err) goto abort; - } } spin_lock(&pers_lock); @@ -5668,14 +5667,8 @@ int md_run(struct mddev *mddev) return 0; abort: - if (mddev->bio_set) { - bioset_free(mddev->bio_set); - mddev->bio_set = NULL; - } - if (mddev->sync_set) { - bioset_free(mddev->sync_set); - mddev->sync_set = NULL; - } + bioset_exit(&mddev->bio_set); + bioset_exit(&mddev->sync_set); return err; } @@ -5888,14 +5881,8 @@ void md_stop(struct mddev *mddev) * This is called from dm-raid */ __md_stop(mddev); - if (mddev->bio_set) { - bioset_free(mddev->bio_set); - mddev->bio_set = NULL; - } - if (mddev->sync_set) { - bioset_free(mddev->sync_set); - mddev->sync_set = NULL; - } + bioset_exit(&mddev->bio_set); + bioset_exit(&mddev->sync_set); } EXPORT_SYMBOL_GPL(md_stop); diff --git a/drivers/md/md.h b/drivers/md/md.h index fbc925c..3507cab2 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -452,8 +452,8 @@ struct mddev { struct attribute_group *to_remove; - struct bio_set *bio_set; - struct bio_set *sync_set; /* for sync operations like + struct bio_set bio_set; + struct bio_set sync_set; /* for sync operations like * metadata and bitmap writes */ diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 584c103..65ae47a 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -479,7 +479,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) if (bio_end_sector(bio) > zone->zone_end) { struct bio *split = bio_split(bio, zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO, - mddev->bio_set); + &mddev->bio_set); bio_chain(split, bio); generic_make_request(bio); bio = split; @@ -582,7 +582,8 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) sector = bio_sector; if (sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, sectors, GFP_NOIO, mddev->bio_set); + struct bio *split = bio_split(bio, sectors, GFP_NOIO, + &mddev->bio_set); bio_chain(split, bio); generic_make_request(bio); bio = split; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e9e3308..bad2852 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -221,7 +221,7 @@ static void free_r1bio(struct r1bio *r1_bio) struct r1conf *conf = r1_bio->mddev->private; put_all_bios(conf, r1_bio); - mempool_free(r1_bio, conf->r1bio_pool); + mempool_free(r1_bio, &conf->r1bio_pool); } static void put_buf(struct r1bio *r1_bio) @@ -236,7 +236,7 @@ static void put_buf(struct r1bio *r1_bio) rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev); } - mempool_free(r1_bio, conf->r1buf_pool); + mempool_free(r1_bio, &conf->r1buf_pool); lower_barrier(conf, sect); } @@ -1178,7 +1178,7 @@ alloc_r1bio(struct mddev *mddev, struct bio *bio) struct r1conf *conf = mddev->private; struct r1bio *r1_bio; - r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); + r1_bio = mempool_alloc(&conf->r1bio_pool, GFP_NOIO); /* Ensure no bio records IO_BLOCKED */ memset(r1_bio->bios, 0, conf->raid_disks * sizeof(r1_bio->bios[0])); init_r1bio(r1_bio, mddev, bio); @@ -1268,7 +1268,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, if (max_sectors < bio_sectors(bio)) { struct bio *split = bio_split(bio, max_sectors, - gfp, conf->bio_split); + gfp, &conf->bio_split); bio_chain(split, bio); generic_make_request(bio); bio = split; @@ -1278,7 +1278,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, r1_bio->read_disk = rdisk; - read_bio = bio_clone_fast(bio, gfp, mddev->bio_set); + read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set); r1_bio->bios[rdisk] = read_bio; @@ -1439,7 +1439,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, if (max_sectors < bio_sectors(bio)) { struct bio *split = bio_split(bio, max_sectors, - GFP_NOIO, conf->bio_split); + GFP_NOIO, &conf->bio_split); bio_chain(split, bio); generic_make_request(bio); bio = split; @@ -1479,9 +1479,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, if (r1_bio->behind_master_bio) mbio = bio_clone_fast(r1_bio->behind_master_bio, - GFP_NOIO, mddev->bio_set); + GFP_NOIO, &mddev->bio_set); else - mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); + mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); if (r1_bio->behind_master_bio) { if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) @@ -1657,8 +1657,7 @@ static void close_sync(struct r1conf *conf) _allow_barrier(conf, idx); } - mempool_destroy(conf->r1buf_pool); - conf->r1buf_pool = NULL; + mempool_exit(&conf->r1buf_pool); } static int raid1_spare_active(struct mddev *mddev) @@ -2348,10 +2347,10 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { wbio = bio_clone_fast(r1_bio->behind_master_bio, GFP_NOIO, - mddev->bio_set); + &mddev->bio_set); } else { wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO, - mddev->bio_set); + &mddev->bio_set); } bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); @@ -2564,17 +2563,15 @@ static int init_resync(struct r1conf *conf) int buffs; buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; - BUG_ON(conf->r1buf_pool); - conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free, - conf->poolinfo); - if (!conf->r1buf_pool) - return -ENOMEM; - return 0; + BUG_ON(mempool_initialized(&conf->r1buf_pool)); + + return mempool_init(&conf->r1buf_pool, buffs, r1buf_pool_alloc, + r1buf_pool_free, conf->poolinfo); } static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf) { - struct r1bio *r1bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); + struct r1bio *r1bio = mempool_alloc(&conf->r1buf_pool, GFP_NOIO); struct resync_pages *rps; struct bio *bio; int i; @@ -2617,7 +2614,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, int idx = sector_to_idx(sector_nr); int page_idx = 0; - if (!conf->r1buf_pool) + if (!mempool_initialized(&conf->r1buf_pool)) if (init_resync(conf)) return 0; @@ -2953,14 +2950,13 @@ static struct r1conf *setup_conf(struct mddev *mddev) if (!conf->poolinfo) goto abort; conf->poolinfo->raid_disks = mddev->raid_disks * 2; - conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, - r1bio_pool_free, - conf->poolinfo); - if (!conf->r1bio_pool) + err = mempool_init(&conf->r1bio_pool, NR_RAID1_BIOS, r1bio_pool_alloc, + r1bio_pool_free, conf->poolinfo); + if (err) goto abort; - conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0); - if (!conf->bio_split) + err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0); + if (err) goto abort; conf->poolinfo->mddev = mddev; @@ -3033,7 +3029,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) abort: if (conf) { - mempool_destroy(conf->r1bio_pool); + mempool_exit(&conf->r1bio_pool); kfree(conf->mirrors); safe_put_page(conf->tmppage); kfree(conf->poolinfo); @@ -3041,8 +3037,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) kfree(conf->nr_waiting); kfree(conf->nr_queued); kfree(conf->barrier); - if (conf->bio_split) - bioset_free(conf->bio_split); + bioset_exit(&conf->bio_split); kfree(conf); } return ERR_PTR(err); @@ -3144,7 +3139,7 @@ static void raid1_free(struct mddev *mddev, void *priv) { struct r1conf *conf = priv; - mempool_destroy(conf->r1bio_pool); + mempool_exit(&conf->r1bio_pool); kfree(conf->mirrors); safe_put_page(conf->tmppage); kfree(conf->poolinfo); @@ -3152,8 +3147,7 @@ static void raid1_free(struct mddev *mddev, void *priv) kfree(conf->nr_waiting); kfree(conf->nr_queued); kfree(conf->barrier); - if (conf->bio_split) - bioset_free(conf->bio_split); + bioset_exit(&conf->bio_split); kfree(conf); } @@ -3199,13 +3193,17 @@ static int raid1_reshape(struct mddev *mddev) * At the same time, we "pack" the devices so that all the missing * devices have the higher raid_disk numbers. */ - mempool_t *newpool, *oldpool; + mempool_t newpool, oldpool; struct pool_info *newpoolinfo; struct raid1_info *newmirrors; struct r1conf *conf = mddev->private; int cnt, raid_disks; unsigned long flags; int d, d2; + int ret; + + memset(&newpool, 0, sizeof(newpool)); + memset(&oldpool, 0, sizeof(oldpool)); /* Cannot change chunk_size, layout, or level */ if (mddev->chunk_sectors != mddev->new_chunk_sectors || @@ -3237,17 +3235,17 @@ static int raid1_reshape(struct mddev *mddev) newpoolinfo->mddev = mddev; newpoolinfo->raid_disks = raid_disks * 2; - newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, - r1bio_pool_free, newpoolinfo); - if (!newpool) { + ret = mempool_init(&newpool, NR_RAID1_BIOS, r1bio_pool_alloc, + r1bio_pool_free, newpoolinfo); + if (ret) { kfree(newpoolinfo); - return -ENOMEM; + return ret; } newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2, GFP_KERNEL); if (!newmirrors) { kfree(newpoolinfo); - mempool_destroy(newpool); + mempool_exit(&newpool); return -ENOMEM; } @@ -3287,7 +3285,7 @@ static int raid1_reshape(struct mddev *mddev) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); - mempool_destroy(oldpool); + mempool_exit(&oldpool); return 0; } diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index eb84bc6..e7ccad89 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -118,10 +118,10 @@ struct r1conf { * mempools - it changes when the array grows or shrinks */ struct pool_info *poolinfo; - mempool_t *r1bio_pool; - mempool_t *r1buf_pool; + mempool_t r1bio_pool; + mempool_t r1buf_pool; - struct bio_set *bio_split; + struct bio_set bio_split; /* temporary buffer to synchronous IO when attempting to repair * a read error. diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 3c60774..37d4b23 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -291,14 +291,14 @@ static void free_r10bio(struct r10bio *r10_bio) struct r10conf *conf = r10_bio->mddev->private; put_all_bios(conf, r10_bio); - mempool_free(r10_bio, conf->r10bio_pool); + mempool_free(r10_bio, &conf->r10bio_pool); } static void put_buf(struct r10bio *r10_bio) { struct r10conf *conf = r10_bio->mddev->private; - mempool_free(r10_bio, conf->r10buf_pool); + mempool_free(r10_bio, &conf->r10buf_pool); lower_barrier(conf); } @@ -1204,7 +1204,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, (unsigned long long)r10_bio->sector); if (max_sectors < bio_sectors(bio)) { struct bio *split = bio_split(bio, max_sectors, - gfp, conf->bio_split); + gfp, &conf->bio_split); bio_chain(split, bio); generic_make_request(bio); bio = split; @@ -1213,7 +1213,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, } slot = r10_bio->read_slot; - read_bio = bio_clone_fast(bio, gfp, mddev->bio_set); + read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set); r10_bio->devs[slot].bio = read_bio; r10_bio->devs[slot].rdev = rdev; @@ -1261,7 +1261,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, } else rdev = conf->mirrors[devnum].rdev; - mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); + mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); if (replacement) r10_bio->devs[n_copy].repl_bio = mbio; else @@ -1509,7 +1509,7 @@ retry_write: if (r10_bio->sectors < bio_sectors(bio)) { struct bio *split = bio_split(bio, r10_bio->sectors, - GFP_NOIO, conf->bio_split); + GFP_NOIO, &conf->bio_split); bio_chain(split, bio); generic_make_request(bio); bio = split; @@ -1533,7 +1533,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) struct r10conf *conf = mddev->private; struct r10bio *r10_bio; - r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); + r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO); r10_bio->master_bio = bio; r10_bio->sectors = sectors; @@ -1732,8 +1732,7 @@ static void close_sync(struct r10conf *conf) wait_barrier(conf); allow_barrier(conf); - mempool_destroy(conf->r10buf_pool); - conf->r10buf_pool = NULL; + mempool_exit(&conf->r10buf_pool); } static int raid10_spare_active(struct mddev *mddev) @@ -2583,7 +2582,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i) if (sectors > sect_to_write) sectors = sect_to_write; /* Write at 'sector' for 'sectors' */ - wbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); + wbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors); wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector); wbio->bi_iter.bi_sector = wsector + @@ -2816,25 +2815,25 @@ static void raid10d(struct md_thread *thread) static int init_resync(struct r10conf *conf) { - int buffs; - int i; + int ret, buffs, i; buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; - BUG_ON(conf->r10buf_pool); + BUG_ON(mempool_initialized(&conf->r10buf_pool)); conf->have_replacement = 0; for (i = 0; i < conf->geo.raid_disks; i++) if (conf->mirrors[i].replacement) conf->have_replacement = 1; - conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); - if (!conf->r10buf_pool) - return -ENOMEM; + ret = mempool_init(&conf->r10buf_pool, buffs, + r10buf_pool_alloc, r10buf_pool_free, conf); + if (ret) + return ret; conf->next_resync = 0; return 0; } static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf) { - struct r10bio *r10bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); + struct r10bio *r10bio = mempool_alloc(&conf->r10buf_pool, GFP_NOIO); struct rsync_pages *rp; struct bio *bio; int nalloc; @@ -2945,7 +2944,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, sector_t chunk_mask = conf->geo.chunk_mask; int page_idx = 0; - if (!conf->r10buf_pool) + if (!mempool_initialized(&conf->r10buf_pool)) if (init_resync(conf)) return 0; @@ -3699,13 +3698,13 @@ static struct r10conf *setup_conf(struct mddev *mddev) conf->geo = geo; conf->copies = copies; - conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, - r10bio_pool_free, conf); - if (!conf->r10bio_pool) + err = mempool_init(&conf->r10bio_pool, NR_RAID10_BIOS, r10bio_pool_alloc, + r10bio_pool_free, conf); + if (err) goto out; - conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0); - if (!conf->bio_split) + err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0); + if (err) goto out; calc_sectors(conf, mddev->dev_sectors); @@ -3733,6 +3732,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) init_waitqueue_head(&conf->wait_barrier); atomic_set(&conf->nr_pending, 0); + err = -ENOMEM; conf->thread = md_register_thread(raid10d, mddev, "raid10"); if (!conf->thread) goto out; @@ -3742,11 +3742,10 @@ static struct r10conf *setup_conf(struct mddev *mddev) out: if (conf) { - mempool_destroy(conf->r10bio_pool); + mempool_exit(&conf->r10bio_pool); kfree(conf->mirrors); safe_put_page(conf->tmppage); - if (conf->bio_split) - bioset_free(conf->bio_split); + bioset_exit(&conf->bio_split); kfree(conf); } return ERR_PTR(err); @@ -3953,7 +3952,7 @@ static int raid10_run(struct mddev *mddev) out_free_conf: md_unregister_thread(&mddev->thread); - mempool_destroy(conf->r10bio_pool); + mempool_exit(&conf->r10bio_pool); safe_put_page(conf->tmppage); kfree(conf->mirrors); kfree(conf); @@ -3966,13 +3965,12 @@ static void raid10_free(struct mddev *mddev, void *priv) { struct r10conf *conf = priv; - mempool_destroy(conf->r10bio_pool); + mempool_exit(&conf->r10bio_pool); safe_put_page(conf->tmppage); kfree(conf->mirrors); kfree(conf->mirrors_old); kfree(conf->mirrors_new); - if (conf->bio_split) - bioset_free(conf->bio_split); + bioset_exit(&conf->bio_split); kfree(conf); } @@ -4543,7 +4541,7 @@ read_more: * on all the target devices. */ // FIXME - mempool_free(r10_bio, conf->r10buf_pool); + mempool_free(r10_bio, &conf->r10buf_pool); set_bit(MD_RECOVERY_INTR, &mddev->recovery); return sectors_done; } diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index e2e8840..d3eaaf3 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -93,10 +93,10 @@ struct r10conf { */ wait_queue_head_t wait_barrier; - mempool_t *r10bio_pool; - mempool_t *r10buf_pool; + mempool_t r10bio_pool; + mempool_t r10buf_pool; struct page *tmppage; - struct bio_set *bio_split; + struct bio_set bio_split; /* When taking over an array from a different personality, we store * the new thread here until we fully activate the array. diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 3c65f52..2b775ab 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -125,9 +125,9 @@ struct r5l_log { struct list_head no_mem_stripes; /* pending stripes, -ENOMEM */ struct kmem_cache *io_kc; - mempool_t *io_pool; - struct bio_set *bs; - mempool_t *meta_pool; + mempool_t io_pool; + struct bio_set bs; + mempool_t meta_pool; struct md_thread *reclaim_thread; unsigned long reclaim_target; /* number of space that need to be @@ -579,7 +579,7 @@ static void r5l_log_endio(struct bio *bio) md_error(log->rdev->mddev, log->rdev); bio_put(bio); - mempool_free(io->meta_page, log->meta_pool); + mempool_free(io->meta_page, &log->meta_pool); spin_lock_irqsave(&log->io_list_lock, flags); __r5l_set_io_unit_state(io, IO_UNIT_IO_END); @@ -748,7 +748,7 @@ static void r5l_submit_current_io(struct r5l_log *log) static struct bio *r5l_bio_alloc(struct r5l_log *log) { - struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs); + struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, &log->bs); bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio_set_dev(bio, log->rdev->bdev); @@ -780,7 +780,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) struct r5l_io_unit *io; struct r5l_meta_block *block; - io = mempool_alloc(log->io_pool, GFP_ATOMIC); + io = mempool_alloc(&log->io_pool, GFP_ATOMIC); if (!io) return NULL; memset(io, 0, sizeof(*io)); @@ -791,7 +791,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) bio_list_init(&io->flush_barriers); io->state = IO_UNIT_RUNNING; - io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO); + io->meta_page = mempool_alloc(&log->meta_pool, GFP_NOIO); block = page_address(io->meta_page); clear_page(block); block->magic = cpu_to_le32(R5LOG_MAGIC); @@ -1223,7 +1223,7 @@ static bool r5l_complete_finished_ios(struct r5l_log *log) log->next_checkpoint = io->log_start; list_del(&io->log_sibling); - mempool_free(io, log->io_pool); + mempool_free(io, &log->io_pool); r5l_run_no_mem_stripe(log); found = true; @@ -1647,7 +1647,7 @@ static int r5l_recovery_allocate_ra_pool(struct r5l_log *log, { struct page *page; - ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_PAGES, log->bs); + ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_PAGES, &log->bs); if (!ctx->ra_bio) return -ENOMEM; @@ -3066,6 +3066,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) struct request_queue *q = bdev_get_queue(rdev->bdev); struct r5l_log *log; char b[BDEVNAME_SIZE]; + int ret; pr_debug("md/raid:%s: using device %s as journal\n", mdname(conf->mddev), bdevname(rdev->bdev, b)); @@ -3111,16 +3112,16 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) if (!log->io_kc) goto io_kc; - log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc); - if (!log->io_pool) + ret = mempool_init_slab_pool(&log->io_pool, R5L_POOL_SIZE, log->io_kc); + if (ret) goto io_pool; - log->bs = bioset_create(R5L_POOL_SIZE, 0, BIOSET_NEED_BVECS); - if (!log->bs) + ret = bioset_init(&log->bs, R5L_POOL_SIZE, 0, BIOSET_NEED_BVECS); + if (ret) goto io_bs; - log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0); - if (!log->meta_pool) + ret = mempool_init_page_pool(&log->meta_pool, R5L_POOL_SIZE, 0); + if (ret) goto out_mempool; spin_lock_init(&log->tree_lock); @@ -3155,11 +3156,11 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) rcu_assign_pointer(conf->log, NULL); md_unregister_thread(&log->reclaim_thread); reclaim_thread: - mempool_destroy(log->meta_pool); + mempool_exit(&log->meta_pool); out_mempool: - bioset_free(log->bs); + bioset_exit(&log->bs); io_bs: - mempool_destroy(log->io_pool); + mempool_exit(&log->io_pool); io_pool: kmem_cache_destroy(log->io_kc); io_kc: @@ -3178,9 +3179,9 @@ void r5l_exit_log(struct r5conf *conf) wake_up(&conf->mddev->sb_wait); flush_work(&log->disable_writeback_work); md_unregister_thread(&log->reclaim_thread); - mempool_destroy(log->meta_pool); - bioset_free(log->bs); - mempool_destroy(log->io_pool); + mempool_exit(&log->meta_pool); + bioset_exit(&log->bs); + mempool_exit(&log->io_pool); kmem_cache_destroy(log->io_kc); kfree(log); } diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 42890a08..3a7c363 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -105,9 +105,9 @@ struct ppl_conf { atomic64_t seq; /* current log write sequence number */ struct kmem_cache *io_kc; - mempool_t *io_pool; - struct bio_set *bs; - struct bio_set *flush_bs; + mempool_t io_pool; + struct bio_set bs; + struct bio_set flush_bs; /* used only for recovery */ int recovered_entries; @@ -244,7 +244,7 @@ static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log, struct ppl_header *pplhdr; struct page *header_page; - io = mempool_alloc(ppl_conf->io_pool, GFP_NOWAIT); + io = mempool_alloc(&ppl_conf->io_pool, GFP_NOWAIT); if (!io) return NULL; @@ -503,7 +503,7 @@ static void ppl_submit_iounit(struct ppl_io_unit *io) struct bio *prev = bio; bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, - ppl_conf->bs); + &ppl_conf->bs); bio->bi_opf = prev->bi_opf; bio_copy_dev(bio, prev); bio->bi_iter.bi_sector = bio_end_sector(prev); @@ -570,7 +570,7 @@ static void ppl_io_unit_finished(struct ppl_io_unit *io) list_del(&io->log_sibling); spin_unlock(&log->io_list_lock); - mempool_free(io, ppl_conf->io_pool); + mempool_free(io, &ppl_conf->io_pool); spin_lock(&ppl_conf->no_mem_stripes_lock); if (!list_empty(&ppl_conf->no_mem_stripes)) { @@ -642,7 +642,7 @@ static void ppl_do_flush(struct ppl_io_unit *io) struct bio *bio; char b[BDEVNAME_SIZE]; - bio = bio_alloc_bioset(GFP_NOIO, 0, ppl_conf->flush_bs); + bio = bio_alloc_bioset(GFP_NOIO, 0, &ppl_conf->flush_bs); bio_set_dev(bio, bdev); bio->bi_private = io; bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; @@ -1246,11 +1246,9 @@ static void __ppl_exit_log(struct ppl_conf *ppl_conf) kfree(ppl_conf->child_logs); - if (ppl_conf->bs) - bioset_free(ppl_conf->bs); - if (ppl_conf->flush_bs) - bioset_free(ppl_conf->flush_bs); - mempool_destroy(ppl_conf->io_pool); + bioset_exit(&ppl_conf->bs); + bioset_exit(&ppl_conf->flush_bs); + mempool_exit(&ppl_conf->io_pool); kmem_cache_destroy(ppl_conf->io_kc); kfree(ppl_conf); @@ -1387,24 +1385,18 @@ int ppl_init_log(struct r5conf *conf) goto err; } - ppl_conf->io_pool = mempool_create(conf->raid_disks, ppl_io_pool_alloc, - ppl_io_pool_free, ppl_conf->io_kc); - if (!ppl_conf->io_pool) { - ret = -ENOMEM; + ret = mempool_init(&ppl_conf->io_pool, conf->raid_disks, ppl_io_pool_alloc, + ppl_io_pool_free, ppl_conf->io_kc); + if (ret) goto err; - } - ppl_conf->bs = bioset_create(conf->raid_disks, 0, BIOSET_NEED_BVECS); - if (!ppl_conf->bs) { - ret = -ENOMEM; + ret = bioset_init(&ppl_conf->bs, conf->raid_disks, 0, BIOSET_NEED_BVECS); + if (ret) goto err; - } - ppl_conf->flush_bs = bioset_create(conf->raid_disks, 0, 0); - if (!ppl_conf->flush_bs) { - ret = -ENOMEM; + ret = bioset_init(&ppl_conf->flush_bs, conf->raid_disks, 0, 0); + if (ret) goto err; - } ppl_conf->count = conf->raid_disks; ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log), diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index be117d0..a2e6498 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5192,7 +5192,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) /* * use bio_clone_fast to make a copy of the bio */ - align_bi = bio_clone_fast(raid_bio, GFP_NOIO, mddev->bio_set); + align_bi = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->bio_set); if (!align_bi) return 0; /* @@ -5277,7 +5277,7 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) if (sectors < bio_sectors(raid_bio)) { struct r5conf *conf = mddev->private; - split = bio_split(raid_bio, sectors, GFP_NOIO, conf->bio_split); + split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split); bio_chain(split, raid_bio); generic_make_request(raid_bio); raid_bio = split; @@ -6773,8 +6773,7 @@ static void free_conf(struct r5conf *conf) if (conf->disks[i].extra_page) put_page(conf->disks[i].extra_page); kfree(conf->disks); - if (conf->bio_split) - bioset_free(conf->bio_split); + bioset_exit(&conf->bio_split); kfree(conf->stripe_hashtbl); kfree(conf->pending_data); kfree(conf); @@ -6853,6 +6852,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) int i; int group_cnt, worker_cnt_per_group; struct r5worker_group *new_group; + int ret; if (mddev->new_level != 5 && mddev->new_level != 4 @@ -6950,8 +6950,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) goto abort; } - conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0); - if (!conf->bio_split) + ret = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0); + if (ret) goto abort; conf->mddev = mddev; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 3f8da26..72e75ba 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -669,7 +669,7 @@ struct r5conf { int pool_size; /* number of disks in stripeheads in pool */ spinlock_t device_lock; struct disk_info *disks; - struct bio_set *bio_split; + struct bio_set bio_split; /* When taking over an array from a different personality, we store * the new thread here until we fully activate the array. diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index 57b13df..a15181f 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -2094,14 +2094,9 @@ static const struct block_device_operations msb_bdops = { static int msb_init_disk(struct memstick_dev *card) { struct msb_data *msb = memstick_get_drvdata(card); - struct memstick_host *host = card->host; int rc; - u64 limit = BLK_BOUNCE_HIGH; unsigned long capacity; - if (host->dev.dma_mask && *(host->dev.dma_mask)) - limit = *(host->dev.dma_mask); - mutex_lock(&msb_disk_lock); msb->disk_id = idr_alloc(&msb_disk_idr, card, 0, 256, GFP_KERNEL); mutex_unlock(&msb_disk_lock); @@ -2123,7 +2118,6 @@ static int msb_init_disk(struct memstick_dev *card) msb->queue->queuedata = card; - blk_queue_bounce_limit(msb->queue, limit); blk_queue_max_hw_sectors(msb->queue, MS_BLOCK_MAX_PAGES); blk_queue_max_segments(msb->queue, MS_BLOCK_MAX_SEGS); blk_queue_max_segment_size(msb->queue, diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index 8897962..5ee9326 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -1170,17 +1170,12 @@ static int mspro_block_init_card(struct memstick_dev *card) static int mspro_block_init_disk(struct memstick_dev *card) { struct mspro_block_data *msb = memstick_get_drvdata(card); - struct memstick_host *host = card->host; struct mspro_devinfo *dev_info = NULL; struct mspro_sys_info *sys_info = NULL; struct mspro_sys_attr *s_attr = NULL; int rc, disk_id; - u64 limit = BLK_BOUNCE_HIGH; unsigned long capacity; - if (host->dev.dma_mask && *(host->dev.dma_mask)) - limit = *(host->dev.dma_mask); - for (rc = 0; msb->attr_group.attrs[rc]; ++rc) { s_attr = mspro_from_sysfs_attr(msb->attr_group.attrs[rc]); @@ -1219,7 +1214,6 @@ static int mspro_block_init_disk(struct memstick_dev *card) msb->queue->queuedata = card; - blk_queue_bounce_limit(msb->queue, limit); blk_queue_max_hw_sectors(msb->queue, MSPRO_BLOCK_MAX_PAGES); blk_queue_max_segments(msb->queue, MSPRO_BLOCK_MAX_SEGS); blk_queue_max_segment_size(msb->queue, diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c index 86503f6..19a5aa7 100644 --- a/drivers/message/fusion/mptsas.c +++ b/drivers/message/fusion/mptsas.c @@ -1929,7 +1929,7 @@ static enum blk_eh_timer_return mptsas_eh_timed_out(struct scsi_cmnd *sc) MPT_SCSI_HOST *hd; MPT_ADAPTER *ioc; VirtDevice *vdevice; - enum blk_eh_timer_return rc = BLK_EH_NOT_HANDLED; + enum blk_eh_timer_return rc = BLK_EH_DONE; hd = shost_priv(sc->device->host); if (hd == NULL) { diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 38a7586..d89e178 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -244,7 +244,7 @@ static ssize_t power_ro_lock_store(struct device *dev, mq = &md->queue; /* Dispatch locking to the block layer */ - req = blk_get_request(mq->queue, REQ_OP_DRV_OUT, __GFP_RECLAIM); + req = blk_get_request(mq->queue, REQ_OP_DRV_OUT, 0); if (IS_ERR(req)) { count = PTR_ERR(req); goto out_put; @@ -650,8 +650,7 @@ static int mmc_blk_ioctl_cmd(struct mmc_blk_data *md, */ mq = &md->queue; req = blk_get_request(mq->queue, - idata->ic.write_flag ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, - __GFP_RECLAIM); + idata->ic.write_flag ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); if (IS_ERR(req)) { err = PTR_ERR(req); goto cmd_done; @@ -721,8 +720,7 @@ static int mmc_blk_ioctl_multi_cmd(struct mmc_blk_data *md, */ mq = &md->queue; req = blk_get_request(mq->queue, - idata[0]->ic.write_flag ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, - __GFP_RECLAIM); + idata[0]->ic.write_flag ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); if (IS_ERR(req)) { err = PTR_ERR(req); goto cmd_err; @@ -2750,7 +2748,7 @@ static int mmc_dbg_card_status_get(void *data, u64 *val) int ret; /* Ask the block layer about the card status */ - req = blk_get_request(mq->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); + req = blk_get_request(mq->queue, REQ_OP_DRV_IN, 0); if (IS_ERR(req)) return PTR_ERR(req); req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_GET_CARD_STATUS; @@ -2786,7 +2784,7 @@ static int mmc_ext_csd_open(struct inode *inode, struct file *filp) return -ENOMEM; /* Ask the block layer for the EXT CSD */ - req = blk_get_request(mq->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); + req = blk_get_request(mq->queue, REQ_OP_DRV_IN, 0); if (IS_ERR(req)) { err = PTR_ERR(req); goto out_free; diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index 56e9a80..648eb67 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -111,8 +111,9 @@ static enum blk_eh_timer_return mmc_cqe_timed_out(struct request *req) __mmc_cqe_recovery_notifier(mq); return BLK_EH_RESET_TIMER; } - /* No timeout */ - return BLK_EH_HANDLED; + /* No timeout (XXX: huh? comment doesn't make much sense) */ + blk_mq_complete_request(req); + return BLK_EH_DONE; default: /* Timeout is handled by mmc core */ return BLK_EH_RESET_TIMER; diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 16ae4ae..29c0bfd 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -82,7 +82,6 @@ static blk_status_t do_blktrans_request(struct mtd_blktrans_ops *tr, block = blk_rq_pos(req) << 9 >> tr->blkshift; nsect = blk_rq_cur_bytes(req) >> tr->blkshift; - buf = bio_data(req->bio); if (req_op(req) == REQ_OP_FLUSH) { if (tr->flush(dev)) @@ -100,9 +99,14 @@ static blk_status_t do_blktrans_request(struct mtd_blktrans_ops *tr, return BLK_STS_IOERR; return BLK_STS_OK; case REQ_OP_READ: - for (; nsect > 0; nsect--, block++, buf += tr->blksize) - if (tr->readsect(dev, block, buf)) + buf = kmap(bio_page(req->bio)) + bio_offset(req->bio); + for (; nsect > 0; nsect--, block++, buf += tr->blksize) { + if (tr->readsect(dev, block, buf)) { + kunmap(bio_page(req->bio)); return BLK_STS_IOERR; + } + } + kunmap(bio_page(req->bio)); rq_flush_dcache_pages(req); return BLK_STS_OK; case REQ_OP_WRITE: @@ -110,9 +114,14 @@ static blk_status_t do_blktrans_request(struct mtd_blktrans_ops *tr, return BLK_STS_IOERR; rq_flush_dcache_pages(req); - for (; nsect > 0; nsect--, block++, buf += tr->blksize) - if (tr->writesect(dev, block, buf)) + buf = kmap(bio_page(req->bio)) + bio_offset(req->bio); + for (; nsect > 0; nsect--, block++, buf += tr->blksize) { + if (tr->writesect(dev, block, buf)) { + kunmap(bio_page(req->bio)); return BLK_STS_IOERR; + } + } + kunmap(bio_page(req->bio)); return BLK_STS_OK; default: return BLK_STS_IOERR; @@ -418,7 +427,6 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) new->rq->queuedata = new; blk_queue_logical_block_size(new->rq, tr->blksize); - blk_queue_bounce_limit(new->rq, BLK_BOUNCE_HIGH); blk_queue_flag_set(QUEUE_FLAG_NONROT, new->rq); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, new->rq); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b9ca782..04a20da 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -101,6 +101,15 @@ static void nvme_ns_remove(struct nvme_ns *ns); static int nvme_revalidate_disk(struct gendisk *disk); static void nvme_put_subsystem(struct nvme_subsystem *subsys); +static void nvme_queue_scan(struct nvme_ctrl *ctrl) +{ + /* + * Only new queue scan work when admin and IO queues are both alive + */ + if (ctrl->state == NVME_CTRL_LIVE) + queue_work(nvme_wq, &ctrl->scan_work); +} + int nvme_reset_ctrl(struct nvme_ctrl *ctrl) { if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) @@ -244,9 +253,6 @@ EXPORT_SYMBOL_GPL(nvme_complete_rq); void nvme_cancel_request(struct request *req, void *data, bool reserved) { - if (!blk_mq_request_started(req)) - return; - dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device, "Cancelling I/O %d", req->tag); @@ -1033,6 +1039,21 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) } EXPORT_SYMBOL_GPL(nvme_set_queue_count); +#define NVME_AEN_SUPPORTED \ + (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT) + +static void nvme_enable_aen(struct nvme_ctrl *ctrl) +{ + u32 result; + int status; + + status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT, + ctrl->oaes & NVME_AEN_SUPPORTED, NULL, 0, &result); + if (status) + dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n", + ctrl->oaes & NVME_AEN_SUPPORTED); +} + static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) { struct nvme_user_io io; @@ -1351,13 +1372,19 @@ static void nvme_set_chunk_size(struct nvme_ns *ns) blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size)); } -static void nvme_config_discard(struct nvme_ctrl *ctrl, - unsigned stream_alignment, struct request_queue *queue) +static void nvme_config_discard(struct nvme_ns *ns) { + struct nvme_ctrl *ctrl = ns->ctrl; + struct request_queue *queue = ns->queue; u32 size = queue_logical_block_size(queue); - if (stream_alignment) - size *= stream_alignment; + if (!(ctrl->oncs & NVME_CTRL_ONCS_DSM)) { + blk_queue_flag_clear(QUEUE_FLAG_DISCARD, queue); + return; + } + + if (ctrl->nr_streams && ns->sws && ns->sgs) + size *= ns->sws * ns->sgs; BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) < NVME_DSM_MAX_RANGES); @@ -1365,9 +1392,12 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, queue->limits.discard_alignment = 0; queue->limits.discard_granularity = size; + /* If discard is already enabled, don't reset queue limits */ + if (blk_queue_flag_test_and_set(QUEUE_FLAG_DISCARD, queue)) + return; + blk_queue_max_discard_sectors(queue, UINT_MAX); blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES); - blk_queue_flag_set(QUEUE_FLAG_DISCARD, queue); if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) blk_queue_max_write_zeroes_sectors(queue, UINT_MAX); @@ -1411,10 +1441,6 @@ static void nvme_update_disk_info(struct gendisk *disk, { sector_t capacity = le64_to_cpup(&id->nsze) << (ns->lba_shift - 9); unsigned short bs = 1 << ns->lba_shift; - unsigned stream_alignment = 0; - - if (ns->ctrl->nr_streams && ns->sws && ns->sgs) - stream_alignment = ns->sws * ns->sgs; blk_mq_freeze_queue(disk->queue); blk_integrity_unregister(disk); @@ -1428,10 +1454,9 @@ static void nvme_update_disk_info(struct gendisk *disk, nvme_init_integrity(disk, ns->ms, ns->pi_type); if (ns->ms && !nvme_ns_has_pi(ns) && !blk_get_integrity(disk)) capacity = 0; - set_capacity(disk, capacity); - if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM) - nvme_config_discard(ns->ctrl, stream_alignment, disk->queue); + set_capacity(disk, capacity); + nvme_config_discard(ns); blk_mq_unfreeze_queue(disk->queue); } @@ -1577,7 +1602,7 @@ static int nvme_pr_reserve(struct block_device *bdev, u64 key, static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new, enum pr_type type, bool abort) { - u32 cdw10 = nvme_pr_type(type) << 8 | abort ? 2 : 1; + u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1); return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire); } @@ -1589,7 +1614,7 @@ static int nvme_pr_clear(struct block_device *bdev, u64 key) static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type) { - u32 cdw10 = nvme_pr_type(type) << 8 | key ? 1 << 3 : 0; + u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0); return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release); } @@ -2183,7 +2208,8 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) * Verify that the subsystem actually supports multiple * controllers, else bail out. */ - if (nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) { + if (!ctrl->opts->discovery_nqn && + nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) { dev_err(ctrl->device, "ignoring ctrl due to duplicate subnqn (%s).\n", found->subnqn); @@ -2314,7 +2340,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) { ret = nvme_get_effects_log(ctrl); if (ret < 0) - return ret; + goto out_free; } if (!ctrl->identified) { @@ -2345,6 +2371,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->oacs = le16_to_cpu(id->oacs); ctrl->oncs = le16_to_cpup(&id->oncs); + ctrl->oaes = le32_to_cpu(id->oaes); atomic_set(&ctrl->abort_limit, id->acl + 1); ctrl->vwc = id->vwc; ctrl->cntlid = le16_to_cpup(&id->cntlid); @@ -3170,6 +3197,42 @@ static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn) nvme_remove_invalid_namespaces(ctrl, nn); } +static bool nvme_scan_changed_ns_log(struct nvme_ctrl *ctrl) +{ + size_t log_size = NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32); + __le32 *log; + int error, i; + bool ret = false; + + log = kzalloc(log_size, GFP_KERNEL); + if (!log) + return false; + + error = nvme_get_log(ctrl, NVME_LOG_CHANGED_NS, log, log_size); + if (error) { + dev_warn(ctrl->device, + "reading changed ns log failed: %d\n", error); + goto out_free_log; + } + + if (log[0] == cpu_to_le32(0xffffffff)) + goto out_free_log; + + for (i = 0; i < NVME_MAX_CHANGED_NAMESPACES; i++) { + u32 nsid = le32_to_cpu(log[i]); + + if (nsid == 0) + break; + dev_info(ctrl->device, "rescanning namespace %d.\n", nsid); + nvme_validate_ns(ctrl, nsid); + } + ret = true; + +out_free_log: + kfree(log); + return ret; +} + static void nvme_scan_work(struct work_struct *work) { struct nvme_ctrl *ctrl = @@ -3182,6 +3245,12 @@ static void nvme_scan_work(struct work_struct *work) WARN_ON_ONCE(!ctrl->tagset); + if (test_and_clear_bit(EVENT_NS_CHANGED, &ctrl->events)) { + if (nvme_scan_changed_ns_log(ctrl)) + goto out_sort_namespaces; + dev_info(ctrl->device, "rescanning namespaces.\n"); + } + if (nvme_identify_ctrl(ctrl, &id)) return; @@ -3189,25 +3258,16 @@ static void nvme_scan_work(struct work_struct *work) if (ctrl->vs >= NVME_VS(1, 1, 0) && !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) { if (!nvme_scan_ns_list(ctrl, nn)) - goto done; + goto out_free_id; } nvme_scan_ns_sequential(ctrl, nn); - done: +out_free_id: + kfree(id); +out_sort_namespaces: down_write(&ctrl->namespaces_rwsem); list_sort(NULL, &ctrl->namespaces, ns_cmp); up_write(&ctrl->namespaces_rwsem); - kfree(id); -} - -void nvme_queue_scan(struct nvme_ctrl *ctrl) -{ - /* - * Only new queue scan work when admin and IO queues are both alive - */ - if (ctrl->state == NVME_CTRL_LIVE) - queue_work(nvme_wq, &ctrl->scan_work); } -EXPORT_SYMBOL_GPL(nvme_queue_scan); /* * This function iterates the namespace list unlocked to allow recovery from @@ -3322,8 +3382,23 @@ static void nvme_fw_act_work(struct work_struct *work) nvme_get_fw_slot_info(ctrl); } +static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result) +{ + switch ((result & 0xff00) >> 8) { + case NVME_AER_NOTICE_NS_CHANGED: + set_bit(EVENT_NS_CHANGED, &ctrl->events); + nvme_queue_scan(ctrl); + break; + case NVME_AER_NOTICE_FW_ACT_STARTING: + queue_work(nvme_wq, &ctrl->fw_act_work); + break; + default: + dev_warn(ctrl->device, "async event result %08x\n", result); + } +} + void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, - union nvme_result *res) + volatile union nvme_result *res) { u32 result = le32_to_cpu(res->u32); @@ -3331,6 +3406,9 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, return; switch (result & 0x7) { + case NVME_AER_NOTICE: + nvme_handle_aen_notice(ctrl, result); + break; case NVME_AER_ERROR: case NVME_AER_SMART: case NVME_AER_CSS: @@ -3340,18 +3418,6 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, default: break; } - - switch (result & 0xff07) { - case NVME_AER_NOTICE_NS_CHANGED: - dev_info(ctrl->device, "rescanning\n"); - nvme_queue_scan(ctrl); - break; - case NVME_AER_NOTICE_FW_ACT_STARTING: - queue_work(nvme_wq, &ctrl->fw_act_work); - break; - default: - dev_warn(ctrl->device, "async event result %08x\n", result); - } queue_work(nvme_wq, &ctrl->async_event_work); } EXPORT_SYMBOL_GPL(nvme_complete_async_event); @@ -3374,6 +3440,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl) if (ctrl->queue_count > 1) { nvme_queue_scan(ctrl); + nvme_enable_aen(ctrl); queue_work(nvme_wq, &ctrl->async_event_work); nvme_start_queues(ctrl); } diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 7ae732a..5f5f706 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -57,7 +57,7 @@ static struct nvmf_host *nvmf_host_add(const char *hostnqn) goto out_unlock; kref_init(&host->ref); - memcpy(host->nqn, hostnqn, NVMF_NQN_SIZE); + strlcpy(host->nqn, hostnqn, NVMF_NQN_SIZE); list_add_tail(&host->list, &nvmf_hosts); out_unlock: @@ -545,71 +545,54 @@ blk_status_t nvmf_check_if_ready(struct nvme_ctrl *ctrl, struct request *rq, return BLK_STS_OK; switch (ctrl->state) { - case NVME_CTRL_DELETING: - goto reject_io; - case NVME_CTRL_NEW: case NVME_CTRL_CONNECTING: + case NVME_CTRL_DELETING: + /* + * This is the case of starting a new or deleting an association + * but connectivity was lost before it was fully created or torn + * down. We need to error the commands used to initialize the + * controller so the reconnect can go into a retry attempt. The + * commands should all be marked REQ_FAILFAST_DRIVER, which will + * hit the reject path below. Anything else will be queued while + * the state settles. + */ if (!is_connected) - /* - * This is the case of starting a new - * association but connectivity was lost - * before it was fully created. We need to - * error the commands used to initialize the - * controller so the reconnect can go into a - * retry attempt. The commands should all be - * marked REQ_FAILFAST_DRIVER, which will hit - * the reject path below. Anything else will - * be queued while the state settles. - */ - goto reject_or_queue_io; - - if ((queue_live && - !(nvme_req(rq)->flags & NVME_REQ_USERCMD)) || - (!queue_live && blk_rq_is_passthrough(rq) && - cmd->common.opcode == nvme_fabrics_command && - cmd->fabrics.fctype == nvme_fabrics_type_connect)) - /* - * If queue is live, allow only commands that - * are internally generated pass through. These - * are commands on the admin queue to initialize - * the controller. This will reject any ioctl - * admin cmds received while initializing. - * - * If the queue is not live, allow only a - * connect command. This will reject any ioctl - * admin cmd as well as initialization commands - * if the controller reverted the queue to non-live. - */ + break; + + /* + * If queue is live, allow only commands that are internally + * generated pass through. These are commands on the admin + * queue to initialize the controller. This will reject any + * ioctl admin cmds received while initializing. + */ + if (queue_live && !(nvme_req(rq)->flags & NVME_REQ_USERCMD)) return BLK_STS_OK; /* - * fall-thru to the reject_or_queue_io clause + * If the queue is not live, allow only a connect command. This + * will reject any ioctl admin cmd as well as initialization + * commands if the controller reverted the queue to non-live. */ + if (!queue_live && blk_rq_is_passthrough(rq) && + cmd->common.opcode == nvme_fabrics_command && + cmd->fabrics.fctype == nvme_fabrics_type_connect) + return BLK_STS_OK; break; - - /* these cases fall-thru - * case NVME_CTRL_LIVE: - * case NVME_CTRL_RESETTING: - */ default: break; } -reject_or_queue_io: /* - * Any other new io is something we're not in a state to send - * to the device. Default action is to busy it and retry it - * after the controller state is recovered. However, anything - * marked for failfast or nvme multipath is immediately failed. - * Note: commands used to initialize the controller will be - * marked for failfast. + * Any other new io is something we're not in a state to send to the + * device. Default action is to busy it and retry it after the + * controller state is recovered. However, anything marked for failfast + * or nvme multipath is immediately failed. Note: commands used to + * initialize the controller will be marked for failfast. * Note: nvme cli/ioctl commands are marked for failfast. */ if (!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) return BLK_STS_RESOURCE; - -reject_io: nvme_req(rq)->status = NVME_SC_ABORT_REQ; return BLK_STS_IOERR; } @@ -689,10 +672,6 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, opts->discovery_nqn = !(strcmp(opts->subsysnqn, NVME_DISC_SUBSYS_NAME)); - if (opts->discovery_nqn) { - opts->kato = 0; - opts->nr_io_queues = 0; - } break; case NVMF_OPT_TRADDR: p = match_strdup(args); @@ -851,6 +830,11 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, } } + if (opts->discovery_nqn) { + opts->kato = 0; + opts->nr_io_queues = 0; + opts->duplicate_connect = true; + } if (ctrl_loss_tmo < 0) opts->max_reconnects = -1; else @@ -983,16 +967,6 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) goto out_module_put; } - if (strcmp(ctrl->subsys->subnqn, opts->subsysnqn)) { - dev_warn(ctrl->device, - "controller returned incorrect NQN: \"%s\".\n", - ctrl->subsys->subnqn); - module_put(ops->module); - up_read(&nvmf_transports_rwsem); - nvme_delete_ctrl_sync(ctrl); - return ERR_PTR(-EINVAL); - } - module_put(ops->module); up_read(&nvmf_transports_rwsem); return ctrl; diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index ef46c91..0cf0460 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -139,7 +139,9 @@ static inline bool nvmf_ctlr_matches_baseopts(struct nvme_ctrl *ctrl, struct nvmf_ctrl_options *opts) { - if (strcmp(opts->subsysnqn, ctrl->opts->subsysnqn) || + if (ctrl->state == NVME_CTRL_DELETING || + ctrl->state == NVME_CTRL_DEAD || + strcmp(opts->subsysnqn, ctrl->opts->subsysnqn) || strcmp(opts->host->nqn, ctrl->opts->host->nqn) || memcmp(&opts->host->id, &ctrl->opts->host->id, sizeof(uuid_t))) return false; diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 6cb26bc..0bad658 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1686,16 +1686,6 @@ done: goto check_error; } - /* - * Force failures of commands if we're killing the controller - * or have an error on a command used to create an new association - */ - if (status && - (blk_queue_dying(rq->q) || - ctrl->ctrl.state == NVME_CTRL_NEW || - ctrl->ctrl.state == NVME_CTRL_CONNECTING)) - status |= cpu_to_le16(NVME_SC_DNR << 1); - __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate); nvme_end_request(rq, status, result); @@ -2403,9 +2393,6 @@ nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved) struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(req); - if (!blk_mq_request_started(req)) - return; - __nvme_fc_abort_op(ctrl, op); } @@ -3284,6 +3271,8 @@ nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) } spin_unlock_irqrestore(&nvme_fc_lock, flags); + pr_warn("%s: %s - %s combination not found\n", + __func__, opts->traddr, opts->host_traddr); return ERR_PTR(-ENOENT); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 17d2f7c..de24fe7 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -22,6 +22,7 @@ #include <linux/lightnvm.h> #include <linux/sed-opal.h> #include <linux/fault-inject.h> +#include <linux/rcupdate.h> extern unsigned int nvme_io_timeout; #define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) @@ -180,6 +181,7 @@ struct nvme_ctrl { u16 kas; u8 npss; u8 apsta; + u32 oaes; u32 aen_result; unsigned int shutdown_timeout; unsigned int kato; @@ -192,6 +194,8 @@ struct nvme_ctrl { struct delayed_work ka_work; struct nvme_command ka_cmd; struct work_struct fw_act_work; +#define EVENT_NS_CHANGED (1 << 0) + unsigned long events; /* Power saving configuration */ u64 ps_max_latency_us; @@ -398,14 +402,13 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl); void nvme_put_ctrl(struct nvme_ctrl *ctrl); int nvme_init_identify(struct nvme_ctrl *ctrl); -void nvme_queue_scan(struct nvme_ctrl *ctrl); void nvme_remove_namespaces(struct nvme_ctrl *ctrl); int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, bool send); void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, - union nvme_result *res); + volatile union nvme_result *res); void nvme_stop_queues(struct nvme_ctrl *ctrl); void nvme_start_queues(struct nvme_ctrl *ctrl); @@ -454,7 +457,7 @@ static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) { struct nvme_ns_head *head = ns->head; - if (head && ns == srcu_dereference(head->current_path, &head->srcu)) + if (head && ns == rcu_access_pointer(head->current_path)) rcu_assign_pointer(head->current_path, NULL); } struct nvme_ns *nvme_find_path(struct nvme_ns_head *head); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 17a0190..e526437 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -13,6 +13,7 @@ */ #include <linux/aer.h> +#include <linux/async.h> #include <linux/blkdev.h> #include <linux/blk-mq.h> #include <linux/blk-mq-pci.h> @@ -68,7 +69,6 @@ MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2"); struct nvme_dev; struct nvme_queue; -static void nvme_process_cq(struct nvme_queue *nvmeq); static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); /* @@ -147,9 +147,10 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl) struct nvme_queue { struct device *q_dmadev; struct nvme_dev *dev; - spinlock_t q_lock; + spinlock_t sq_lock; struct nvme_command *sq_cmds; struct nvme_command __iomem *sq_cmds_io; + spinlock_t cq_lock ____cacheline_aligned_in_smp; volatile struct nvme_completion *cqes; struct blk_mq_tags **tags; dma_addr_t sq_dma_addr; @@ -159,9 +160,9 @@ struct nvme_queue { s16 cq_vector; u16 sq_tail; u16 cq_head; + u16 last_cq_head; u16 qid; u8 cq_phase; - u8 cqe_seen; u32 *dbbuf_sq_db; u32 *dbbuf_cq_db; u32 *dbbuf_sq_ei; @@ -420,28 +421,25 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set) } /** - * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell + * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell * @nvmeq: The queue to use * @cmd: The command to send - * - * Safe to use from interrupt context */ -static void __nvme_submit_cmd(struct nvme_queue *nvmeq, - struct nvme_command *cmd) +static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) { - u16 tail = nvmeq->sq_tail; - + spin_lock(&nvmeq->sq_lock); if (nvmeq->sq_cmds_io) - memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd)); + memcpy_toio(&nvmeq->sq_cmds_io[nvmeq->sq_tail], cmd, + sizeof(*cmd)); else - memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); + memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd)); - if (++tail == nvmeq->q_depth) - tail = 0; - if (nvme_dbbuf_update_and_check_event(tail, nvmeq->dbbuf_sq_db, - nvmeq->dbbuf_sq_ei)) - writel(tail, nvmeq->q_db); - nvmeq->sq_tail = tail; + if (++nvmeq->sq_tail == nvmeq->q_depth) + nvmeq->sq_tail = 0; + if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail, + nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei)) + writel(nvmeq->sq_tail, nvmeq->q_db); + spin_unlock(&nvmeq->sq_lock); } static void **nvme_pci_iod_list(struct request *req) @@ -872,6 +870,13 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_command cmnd; blk_status_t ret; + /* + * We should not need to do this, but we're still using this to + * ensure we can drain requests on a dying queue. + */ + if (unlikely(nvmeq->cq_vector < 0)) + return BLK_STS_IOERR; + ret = nvme_setup_cmd(ns, req, &cmnd); if (ret) return ret; @@ -887,16 +892,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, } blk_mq_start_request(req); - - spin_lock_irq(&nvmeq->q_lock); - if (unlikely(nvmeq->cq_vector < 0)) { - ret = BLK_STS_IOERR; - spin_unlock_irq(&nvmeq->q_lock); - goto out_cleanup_iod; - } - __nvme_submit_cmd(nvmeq, &cmnd); - nvme_process_cq(nvmeq); - spin_unlock_irq(&nvmeq->q_lock); + nvme_submit_cmd(nvmeq, &cmnd); return BLK_STS_OK; out_cleanup_iod: nvme_free_iod(dev, req); @@ -914,10 +910,10 @@ static void nvme_pci_complete_rq(struct request *req) } /* We read the CQE phase first to check if the rest of the entry is valid */ -static inline bool nvme_cqe_valid(struct nvme_queue *nvmeq, u16 head, - u16 phase) +static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq) { - return (le16_to_cpu(nvmeq->cqes[head].status) & 1) == phase; + return (le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status) & 1) == + nvmeq->cq_phase; } static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq) @@ -931,9 +927,9 @@ static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq) } } -static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, - struct nvme_completion *cqe) +static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) { + volatile struct nvme_completion *cqe = &nvmeq->cqes[idx]; struct request *req; if (unlikely(cqe->command_id >= nvmeq->q_depth)) { @@ -956,83 +952,87 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, return; } - nvmeq->cqe_seen = 1; req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id); nvme_end_request(req, cqe->status, cqe->result); } -static inline bool nvme_read_cqe(struct nvme_queue *nvmeq, - struct nvme_completion *cqe) +static void nvme_complete_cqes(struct nvme_queue *nvmeq, u16 start, u16 end) { - if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) { - *cqe = nvmeq->cqes[nvmeq->cq_head]; + while (start != end) { + nvme_handle_cqe(nvmeq, start); + if (++start == nvmeq->q_depth) + start = 0; + } +} - if (++nvmeq->cq_head == nvmeq->q_depth) { - nvmeq->cq_head = 0; - nvmeq->cq_phase = !nvmeq->cq_phase; - } - return true; +static inline void nvme_update_cq_head(struct nvme_queue *nvmeq) +{ + if (++nvmeq->cq_head == nvmeq->q_depth) { + nvmeq->cq_head = 0; + nvmeq->cq_phase = !nvmeq->cq_phase; } - return false; } -static void nvme_process_cq(struct nvme_queue *nvmeq) +static inline bool nvme_process_cq(struct nvme_queue *nvmeq, u16 *start, + u16 *end, int tag) { - struct nvme_completion cqe; - int consumed = 0; + bool found = false; - while (nvme_read_cqe(nvmeq, &cqe)) { - nvme_handle_cqe(nvmeq, &cqe); - consumed++; + *start = nvmeq->cq_head; + while (!found && nvme_cqe_pending(nvmeq)) { + if (nvmeq->cqes[nvmeq->cq_head].command_id == tag) + found = true; + nvme_update_cq_head(nvmeq); } + *end = nvmeq->cq_head; - if (consumed) + if (*start != *end) nvme_ring_cq_doorbell(nvmeq); + return found; } static irqreturn_t nvme_irq(int irq, void *data) { - irqreturn_t result; struct nvme_queue *nvmeq = data; - spin_lock(&nvmeq->q_lock); - nvme_process_cq(nvmeq); - result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE; - nvmeq->cqe_seen = 0; - spin_unlock(&nvmeq->q_lock); - return result; + irqreturn_t ret = IRQ_NONE; + u16 start, end; + + spin_lock(&nvmeq->cq_lock); + if (nvmeq->cq_head != nvmeq->last_cq_head) + ret = IRQ_HANDLED; + nvme_process_cq(nvmeq, &start, &end, -1); + nvmeq->last_cq_head = nvmeq->cq_head; + spin_unlock(&nvmeq->cq_lock); + + if (start != end) { + nvme_complete_cqes(nvmeq, start, end); + return IRQ_HANDLED; + } + + return ret; } static irqreturn_t nvme_irq_check(int irq, void *data) { struct nvme_queue *nvmeq = data; - if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) + if (nvme_cqe_pending(nvmeq)) return IRQ_WAKE_THREAD; return IRQ_NONE; } static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag) { - struct nvme_completion cqe; - int found = 0, consumed = 0; + u16 start, end; + bool found; - if (!nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) + if (!nvme_cqe_pending(nvmeq)) return 0; - spin_lock_irq(&nvmeq->q_lock); - while (nvme_read_cqe(nvmeq, &cqe)) { - nvme_handle_cqe(nvmeq, &cqe); - consumed++; - - if (tag == cqe.command_id) { - found = 1; - break; - } - } - - if (consumed) - nvme_ring_cq_doorbell(nvmeq); - spin_unlock_irq(&nvmeq->q_lock); + spin_lock_irq(&nvmeq->cq_lock); + found = nvme_process_cq(nvmeq, &start, &end, tag); + spin_unlock_irq(&nvmeq->cq_lock); + nvme_complete_cqes(nvmeq, start, end); return found; } @@ -1052,10 +1052,7 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) memset(&c, 0, sizeof(c)); c.common.opcode = nvme_admin_async_event; c.common.command_id = NVME_AQ_BLK_MQ_DEPTH; - - spin_lock_irq(&nvmeq->q_lock); - __nvme_submit_cmd(nvmeq, &c); - spin_unlock_irq(&nvmeq->q_lock); + nvme_submit_cmd(nvmeq, &c); } static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) @@ -1070,7 +1067,7 @@ static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) } static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, - struct nvme_queue *nvmeq) + struct nvme_queue *nvmeq, s16 vector) { struct nvme_command c; int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; @@ -1085,7 +1082,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, c.create_cq.cqid = cpu_to_le16(qid); c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1); c.create_cq.cq_flags = cpu_to_le16(flags); - c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); + c.create_cq.irq_vector = cpu_to_le16(vector); return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); } @@ -1208,7 +1205,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) nvme_warn_reset(dev, csts); nvme_dev_disable(dev, false); nvme_reset_ctrl(&dev->ctrl); - return BLK_EH_HANDLED; + return BLK_EH_DONE; } /* @@ -1218,24 +1215,24 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) dev_warn(dev->ctrl.device, "I/O %d QID %d timeout, completion polled\n", req->tag, nvmeq->qid); - return BLK_EH_HANDLED; + return BLK_EH_DONE; } /* * Shutdown immediately if controller times out while starting. The * reset work will see the pci device disabled when it gets the forced * cancellation error. All outstanding requests are completed on - * shutdown, so we return BLK_EH_HANDLED. + * shutdown, so we return BLK_EH_DONE. */ switch (dev->ctrl.state) { case NVME_CTRL_CONNECTING: case NVME_CTRL_RESETTING: - dev_warn(dev->ctrl.device, + dev_warn_ratelimited(dev->ctrl.device, "I/O %d QID %d timeout, disable controller\n", req->tag, nvmeq->qid); nvme_dev_disable(dev, false); nvme_req(req)->flags |= NVME_REQ_CANCELLED; - return BLK_EH_HANDLED; + return BLK_EH_DONE; default: break; } @@ -1252,12 +1249,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) nvme_dev_disable(dev, false); nvme_reset_ctrl(&dev->ctrl); - /* - * Mark the request as handled, since the inline shutdown - * forces all outstanding requests to complete. - */ nvme_req(req)->flags |= NVME_REQ_CANCELLED; - return BLK_EH_HANDLED; + return BLK_EH_DONE; } if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) { @@ -1321,15 +1314,21 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) { int vector; - spin_lock_irq(&nvmeq->q_lock); + spin_lock_irq(&nvmeq->cq_lock); if (nvmeq->cq_vector == -1) { - spin_unlock_irq(&nvmeq->q_lock); + spin_unlock_irq(&nvmeq->cq_lock); return 1; } vector = nvmeq->cq_vector; nvmeq->dev->online_queues--; nvmeq->cq_vector = -1; - spin_unlock_irq(&nvmeq->q_lock); + spin_unlock_irq(&nvmeq->cq_lock); + + /* + * Ensure that nvme_queue_rq() sees it ->cq_vector == -1 without + * having to grab the lock. + */ + mb(); if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q); @@ -1342,15 +1341,18 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) { struct nvme_queue *nvmeq = &dev->queues[0]; + u16 start, end; if (shutdown) nvme_shutdown_ctrl(&dev->ctrl); else nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap); - spin_lock_irq(&nvmeq->q_lock); - nvme_process_cq(nvmeq); - spin_unlock_irq(&nvmeq->q_lock); + spin_lock_irq(&nvmeq->cq_lock); + nvme_process_cq(nvmeq, &start, &end, -1); + spin_unlock_irq(&nvmeq->cq_lock); + + nvme_complete_cqes(nvmeq, start, end); } static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, @@ -1408,7 +1410,8 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth) nvmeq->q_dmadev = dev->dev; nvmeq->dev = dev; - spin_lock_init(&nvmeq->q_lock); + spin_lock_init(&nvmeq->sq_lock); + spin_lock_init(&nvmeq->cq_lock); nvmeq->cq_head = 0; nvmeq->cq_phase = 1; nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; @@ -1444,7 +1447,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) { struct nvme_dev *dev = nvmeq->dev; - spin_lock_irq(&nvmeq->q_lock); + spin_lock_irq(&nvmeq->cq_lock); nvmeq->sq_tail = 0; nvmeq->cq_head = 0; nvmeq->cq_phase = 1; @@ -1452,13 +1455,14 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); nvme_dbbuf_init(dev, nvmeq, qid); dev->online_queues++; - spin_unlock_irq(&nvmeq->q_lock); + spin_unlock_irq(&nvmeq->cq_lock); } static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) { struct nvme_dev *dev = nvmeq->dev; int result; + s16 vector; if (dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { unsigned offset = (qid - 1) * roundup(SQ_SIZE(nvmeq->q_depth), @@ -1471,15 +1475,21 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) * A queue's vector matches the queue identifier unless the controller * has only one vector available. */ - nvmeq->cq_vector = dev->num_vecs == 1 ? 0 : qid; - result = adapter_alloc_cq(dev, qid, nvmeq); + vector = dev->num_vecs == 1 ? 0 : qid; + result = adapter_alloc_cq(dev, qid, nvmeq, vector); if (result < 0) - goto release_vector; + goto out; result = adapter_alloc_sq(dev, qid, nvmeq); if (result < 0) goto release_cq; + /* + * Set cq_vector after alloc cq/sq, otherwise nvme_suspend_queue will + * invoke free_irq for it and cause a 'Trying to free already-free IRQ + * xxx' warning if the create CQ/SQ command times out. + */ + nvmeq->cq_vector = vector; nvme_init_queue(nvmeq, qid); result = queue_request_irq(nvmeq); if (result < 0) @@ -1487,13 +1497,13 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) return result; - release_sq: +release_sq: + nvmeq->cq_vector = -1; dev->online_queues--; adapter_delete_sq(dev, qid); - release_cq: +release_cq: adapter_delete_cq(dev, qid); - release_vector: - nvmeq->cq_vector = -1; +out: return result; } @@ -1997,19 +2007,22 @@ static void nvme_del_queue_end(struct request *req, blk_status_t error) static void nvme_del_cq_end(struct request *req, blk_status_t error) { struct nvme_queue *nvmeq = req->end_io_data; + u16 start, end; if (!error) { unsigned long flags; /* - * We might be called with the AQ q_lock held - * and the I/O queue q_lock should always + * We might be called with the AQ cq_lock held + * and the I/O queue cq_lock should always * nest inside the AQ one. */ - spin_lock_irqsave_nested(&nvmeq->q_lock, flags, + spin_lock_irqsave_nested(&nvmeq->cq_lock, flags, SINGLE_DEPTH_NESTING); - nvme_process_cq(nvmeq); - spin_unlock_irqrestore(&nvmeq->q_lock, flags); + nvme_process_cq(nvmeq, &start, &end, -1); + spin_unlock_irqrestore(&nvmeq->cq_lock, flags); + + nvme_complete_cqes(nvmeq, start, end); } nvme_del_queue_end(req, error); @@ -2497,6 +2510,15 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev) return 0; } +static void nvme_async_probe(void *data, async_cookie_t cookie) +{ + struct nvme_dev *dev = data; + + nvme_reset_ctrl_sync(&dev->ctrl); + flush_work(&dev->ctrl.scan_work); + nvme_put_ctrl(&dev->ctrl); +} + static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { int node, result = -ENOMEM; @@ -2541,7 +2563,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); - nvme_reset_ctrl(&dev->ctrl); + nvme_get_ctrl(&dev->ctrl); + async_schedule(nvme_async_probe, dev); return 0; @@ -2685,6 +2708,9 @@ static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev) static void nvme_error_resume(struct pci_dev *pdev) { + struct nvme_dev *dev = pci_get_drvdata(pdev); + + flush_work(&dev->ctrl.reset_work); pci_cleanup_aer_uncorrect_error_status(pdev); } @@ -2714,6 +2740,8 @@ static const struct pci_device_id nvme_id_table[] = { NVME_QUIRK_MEDIUM_PRIO_SQ }, { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ .driver_data = NVME_QUIRK_IDENTIFY_CNS, }, + { PCI_DEVICE(0x1bb1, 0x0100), /* Seagate Nytro Flash Storage */ + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */ .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE(0x1c58, 0x0023), /* WDC SN200 adapter */ @@ -2728,6 +2756,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_LIGHTNVM, }, { PCI_DEVICE(0x1d1d, 0x2807), /* CNEX WL */ .driver_data = NVME_QUIRK_LIGHTNVM, }, + { PCI_DEVICE(0x1d1d, 0x2601), /* CNEX Granby */ + .driver_data = NVME_QUIRK_LIGHTNVM, }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 1eb4438..7b3f084 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -778,7 +778,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, if (error) { dev_err(ctrl->ctrl.device, "prop_get NVME_REG_CAP failed\n"); - goto out_cleanup_queue; + goto out_stop_queue; } ctrl->ctrl.sqsize = @@ -786,23 +786,25 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); if (error) - goto out_cleanup_queue; + goto out_stop_queue; ctrl->ctrl.max_hw_sectors = (ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9); error = nvme_init_identify(&ctrl->ctrl); if (error) - goto out_cleanup_queue; + goto out_stop_queue; error = nvme_rdma_alloc_qe(ctrl->queues[0].device->dev, &ctrl->async_event_sqe, sizeof(struct nvme_command), DMA_TO_DEVICE); if (error) - goto out_cleanup_queue; + goto out_stop_queue; return 0; +out_stop_queue: + nvme_rdma_stop_queue(&ctrl->queues[0]); out_cleanup_queue: if (new) blk_cleanup_queue(ctrl->ctrl.admin_q); @@ -1598,7 +1600,7 @@ nvme_rdma_timeout(struct request *rq, bool reserved) /* fail with DNR on cmd timeout */ nvme_req(rq)->status = NVME_SC_ABORT_REQ | NVME_SC_DNR; - return BLK_EH_HANDLED; + return BLK_EH_DONE; } static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h index ea91fcc..01390f0 100644 --- a/drivers/nvme/host/trace.h +++ b/drivers/nvme/host/trace.h @@ -148,8 +148,8 @@ TRACE_EVENT(nvme_complete_rq, __entry->flags = nvme_req(req)->flags; __entry->status = nvme_req(req)->status; ), - TP_printk("cmdid=%u, qid=%d, res=%llu, retries=%u, flags=0x%x, status=%u", - __entry->cid, __entry->qid, __entry->result, + TP_printk("qid=%d, cmdid=%u, res=%llu, retries=%u, flags=0x%x, status=%u", + __entry->qid, __entry->cid, __entry->result, __entry->retries, __entry->flags, __entry->status) ); diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile index 4882501..8118c93 100644 --- a/drivers/nvme/target/Makefile +++ b/drivers/nvme/target/Makefile @@ -6,8 +6,8 @@ obj-$(CONFIG_NVME_TARGET_RDMA) += nvmet-rdma.o obj-$(CONFIG_NVME_TARGET_FC) += nvmet-fc.o obj-$(CONFIG_NVME_TARGET_FCLOOP) += nvme-fcloop.o -nvmet-y += core.o configfs.o admin-cmd.o io-cmd.o fabrics-cmd.o \ - discovery.o +nvmet-y += core.o configfs.o admin-cmd.o fabrics-cmd.o \ + discovery.o io-cmd-file.o io-cmd-bdev.o nvme-loop-y += loop.o nvmet-rdma-y += rdma.o nvmet-fc-y += fc.o diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 5e0e9fc..ead8fbe 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -32,6 +32,11 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd) return len; } +static void nvmet_execute_get_log_page_noop(struct nvmet_req *req) +{ + nvmet_req_complete(req, nvmet_zero_sgl(req, 0, req->data_len)); +} + static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, struct nvme_smart_log *slog) { @@ -45,6 +50,10 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, return NVME_SC_INVALID_NS; } + /* we don't have the right data for file backed ns */ + if (!ns->bdev) + goto out; + host_reads = part_stat_read(ns->bdev->bd_part, ios[READ]); data_units_read = part_stat_read(ns->bdev->bd_part, sectors[READ]); host_writes = part_stat_read(ns->bdev->bd_part, ios[WRITE]); @@ -54,6 +63,7 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, put_unaligned_le64(data_units_read, &slog->data_units_read[0]); put_unaligned_le64(host_writes, &slog->host_writes[0]); put_unaligned_le64(data_units_written, &slog->data_units_written[0]); +out: nvmet_put_namespace(ns); return NVME_SC_SUCCESS; @@ -71,6 +81,9 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req, rcu_read_lock(); list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) { + /* we don't have the right data for file backed ns */ + if (!ns->bdev) + continue; host_reads += part_stat_read(ns->bdev->bd_part, ios[READ]); data_units_read += part_stat_read(ns->bdev->bd_part, sectors[READ]); @@ -89,74 +102,50 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req, return NVME_SC_SUCCESS; } -static u16 nvmet_get_smart_log(struct nvmet_req *req, - struct nvme_smart_log *slog) +static void nvmet_execute_get_log_page_smart(struct nvmet_req *req) { - u16 status; + struct nvme_smart_log *log; + u16 status = NVME_SC_INTERNAL; + + if (req->data_len != sizeof(*log)) + goto out; + + log = kzalloc(sizeof(*log), GFP_KERNEL); + if (!log) + goto out; - WARN_ON(req == NULL || slog == NULL); if (req->cmd->get_log_page.nsid == cpu_to_le32(NVME_NSID_ALL)) - status = nvmet_get_smart_log_all(req, slog); + status = nvmet_get_smart_log_all(req, log); else - status = nvmet_get_smart_log_nsid(req, slog); - return status; + status = nvmet_get_smart_log_nsid(req, log); + if (status) + goto out; + + status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log)); +out: + nvmet_req_complete(req, status); } -static void nvmet_execute_get_log_page(struct nvmet_req *req) +static void nvmet_execute_get_log_changed_ns(struct nvmet_req *req) { - struct nvme_smart_log *smart_log; - size_t data_len = nvmet_get_log_page_len(req->cmd); - void *buf; - u16 status = 0; + struct nvmet_ctrl *ctrl = req->sq->ctrl; + u16 status = NVME_SC_INTERNAL; + size_t len; - buf = kzalloc(data_len, GFP_KERNEL); - if (!buf) { - status = NVME_SC_INTERNAL; + if (req->data_len != NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32)) goto out; - } - switch (req->cmd->get_log_page.lid) { - case NVME_LOG_ERROR: - /* - * We currently never set the More bit in the status field, - * so all error log entries are invalid and can be zeroed out. - * This is called a minum viable implementation (TM) of this - * mandatory log page. - */ - break; - case NVME_LOG_SMART: - /* - * XXX: fill out actual smart log - * - * We might have a hard time coming up with useful values for - * many of the fields, and even when we have useful data - * available (e.g. units or commands read/written) those aren't - * persistent over power loss. - */ - if (data_len != sizeof(*smart_log)) { - status = NVME_SC_INTERNAL; - goto err; - } - smart_log = buf; - status = nvmet_get_smart_log(req, smart_log); - if (status) - goto err; - break; - case NVME_LOG_FW_SLOT: - /* - * We only support a single firmware slot which always is - * active, so we can zero out the whole firmware slot log and - * still claim to fully implement this mandatory log page. - */ - break; - default: - BUG(); - } - - status = nvmet_copy_to_sgl(req, 0, buf, data_len); - -err: - kfree(buf); + mutex_lock(&ctrl->lock); + if (ctrl->nr_changed_ns == U32_MAX) + len = sizeof(__le32); + else + len = ctrl->nr_changed_ns * sizeof(__le32); + status = nvmet_copy_to_sgl(req, 0, ctrl->changed_ns_list, len); + if (!status) + status = nvmet_zero_sgl(req, len, req->data_len - len); + ctrl->nr_changed_ns = 0; + clear_bit(NVME_AEN_CFG_NS_ATTR, &ctrl->aen_masked); + mutex_unlock(&ctrl->lock); out: nvmet_req_complete(req, status); } @@ -201,7 +190,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) id->ver = cpu_to_le32(ctrl->subsys->ver); /* XXX: figure out what to do about RTD3R/RTD3 */ - id->oaes = cpu_to_le32(1 << 8); + id->oaes = cpu_to_le32(NVMET_AEN_CFG_OPTIONAL); id->ctratt = cpu_to_le32(1 << 0); id->oacs = 0; @@ -447,6 +436,16 @@ static void nvmet_execute_set_features(struct nvmet_req *req) req->sq->ctrl->kato = DIV_ROUND_UP(val32, 1000); nvmet_set_result(req, req->sq->ctrl->kato); break; + case NVME_FEAT_ASYNC_EVENT: + val32 = le32_to_cpu(req->cmd->common.cdw10[1]); + if (val32 & ~NVMET_AEN_CFG_ALL) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + + WRITE_ONCE(req->sq->ctrl->aen_enabled, val32); + nvmet_set_result(req, val32); + break; case NVME_FEAT_HOST_ID: status = NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; break; @@ -485,9 +484,10 @@ static void nvmet_execute_get_features(struct nvmet_req *req) break; case NVME_FEAT_WRITE_ATOMIC: break; +#endif case NVME_FEAT_ASYNC_EVENT: + nvmet_set_result(req, READ_ONCE(req->sq->ctrl->aen_enabled)); break; -#endif case NVME_FEAT_VOLATILE_WC: nvmet_set_result(req, 1); break; @@ -548,8 +548,6 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) struct nvme_command *cmd = req->cmd; u16 ret; - req->ns = NULL; - ret = nvmet_check_ctrl_status(req, cmd); if (unlikely(ret)) return ret; @@ -560,9 +558,28 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) switch (cmd->get_log_page.lid) { case NVME_LOG_ERROR: + /* + * We currently never set the More bit in the status + * field, so all error log entries are invalid and can + * be zeroed out. This is called a minum viable + * implementation (TM) of this mandatory log page. + */ + req->execute = nvmet_execute_get_log_page_noop; + return 0; case NVME_LOG_SMART: + req->execute = nvmet_execute_get_log_page_smart; + return 0; case NVME_LOG_FW_SLOT: - req->execute = nvmet_execute_get_log_page; + /* + * We only support a single firmware slot which always + * is active, so we can zero out the whole firmware slot + * log and still claim to fully implement this mandatory + * log page. + */ + req->execute = nvmet_execute_get_log_page_noop; + return 0; + case NVME_LOG_CHANGED_NS: + req->execute = nvmet_execute_get_log_changed_ns; return 0; } break; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index e95424f..a03da76 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -57,6 +57,13 @@ u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len) return 0; } +u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len) +{ + if (sg_zero_buffer(req->sg, req->sg_cnt, len, off) != len) + return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR; + return 0; +} + static unsigned int nvmet_max_nsid(struct nvmet_subsys *subsys) { struct nvmet_ns *ns; @@ -137,6 +144,51 @@ static void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type, schedule_work(&ctrl->async_event_work); } +static bool nvmet_aen_disabled(struct nvmet_ctrl *ctrl, u32 aen) +{ + if (!(READ_ONCE(ctrl->aen_enabled) & aen)) + return true; + return test_and_set_bit(aen, &ctrl->aen_masked); +} + +static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid) +{ + u32 i; + + mutex_lock(&ctrl->lock); + if (ctrl->nr_changed_ns > NVME_MAX_CHANGED_NAMESPACES) + goto out_unlock; + + for (i = 0; i < ctrl->nr_changed_ns; i++) { + if (ctrl->changed_ns_list[i] == nsid) + goto out_unlock; + } + + if (ctrl->nr_changed_ns == NVME_MAX_CHANGED_NAMESPACES) { + ctrl->changed_ns_list[0] = cpu_to_le32(0xffffffff); + ctrl->nr_changed_ns = U32_MAX; + goto out_unlock; + } + + ctrl->changed_ns_list[ctrl->nr_changed_ns++] = nsid; +out_unlock: + mutex_unlock(&ctrl->lock); +} + +static void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid) +{ + struct nvmet_ctrl *ctrl; + + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { + nvmet_add_to_changed_ns_log(ctrl, cpu_to_le32(nsid)); + if (nvmet_aen_disabled(ctrl, NVME_AEN_CFG_NS_ATTR)) + continue; + nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, + NVME_AER_NOTICE_NS_CHANGED, + NVME_LOG_CHANGED_NS); + } +} + int nvmet_register_transport(const struct nvmet_fabrics_ops *ops) { int ret = 0; @@ -271,33 +323,31 @@ void nvmet_put_namespace(struct nvmet_ns *ns) percpu_ref_put(&ns->ref); } +static void nvmet_ns_dev_disable(struct nvmet_ns *ns) +{ + nvmet_bdev_ns_disable(ns); + nvmet_file_ns_disable(ns); +} + int nvmet_ns_enable(struct nvmet_ns *ns) { struct nvmet_subsys *subsys = ns->subsys; - struct nvmet_ctrl *ctrl; int ret = 0; mutex_lock(&subsys->lock); if (ns->enabled) goto out_unlock; - ns->bdev = blkdev_get_by_path(ns->device_path, FMODE_READ | FMODE_WRITE, - NULL); - if (IS_ERR(ns->bdev)) { - pr_err("failed to open block device %s: (%ld)\n", - ns->device_path, PTR_ERR(ns->bdev)); - ret = PTR_ERR(ns->bdev); - ns->bdev = NULL; + ret = nvmet_bdev_ns_enable(ns); + if (ret) + ret = nvmet_file_ns_enable(ns); + if (ret) goto out_unlock; - } - - ns->size = i_size_read(ns->bdev->bd_inode); - ns->blksize_shift = blksize_bits(bdev_logical_block_size(ns->bdev)); ret = percpu_ref_init(&ns->ref, nvmet_destroy_namespace, 0, GFP_KERNEL); if (ret) - goto out_blkdev_put; + goto out_dev_put; if (ns->nsid > subsys->max_nsid) subsys->max_nsid = ns->nsid; @@ -320,24 +370,20 @@ int nvmet_ns_enable(struct nvmet_ns *ns) list_add_tail_rcu(&ns->dev_link, &old->dev_link); } - list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) - nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, 0, 0); - + nvmet_ns_changed(subsys, ns->nsid); ns->enabled = true; ret = 0; out_unlock: mutex_unlock(&subsys->lock); return ret; -out_blkdev_put: - blkdev_put(ns->bdev, FMODE_WRITE|FMODE_READ); - ns->bdev = NULL; +out_dev_put: + nvmet_ns_dev_disable(ns); goto out_unlock; } void nvmet_ns_disable(struct nvmet_ns *ns) { struct nvmet_subsys *subsys = ns->subsys; - struct nvmet_ctrl *ctrl; mutex_lock(&subsys->lock); if (!ns->enabled) @@ -363,11 +409,8 @@ void nvmet_ns_disable(struct nvmet_ns *ns) percpu_ref_exit(&ns->ref); mutex_lock(&subsys->lock); - list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) - nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, 0, 0); - - if (ns->bdev) - blkdev_put(ns->bdev, FMODE_WRITE|FMODE_READ); + nvmet_ns_changed(subsys, ns->nsid); + nvmet_ns_dev_disable(ns); out_unlock: mutex_unlock(&subsys->lock); } @@ -499,6 +542,25 @@ int nvmet_sq_init(struct nvmet_sq *sq) } EXPORT_SYMBOL_GPL(nvmet_sq_init); +static u16 nvmet_parse_io_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + u16 ret; + + ret = nvmet_check_ctrl_status(req, cmd); + if (unlikely(ret)) + return ret; + + req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid); + if (unlikely(!req->ns)) + return NVME_SC_INVALID_NS | NVME_SC_DNR; + + if (req->ns->file) + return nvmet_file_parse_io_cmd(req); + else + return nvmet_bdev_parse_io_cmd(req); +} + bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops) { @@ -710,15 +772,14 @@ out: u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd) { if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) { - pr_err("got io cmd %d while CC.EN == 0 on qid = %d\n", + pr_err("got cmd %d while CC.EN == 0 on qid = %d\n", cmd->common.opcode, req->sq->qid); return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; } if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) { - pr_err("got io cmd %d while CSTS.RDY == 0 on qid = %d\n", + pr_err("got cmd %d while CSTS.RDY == 0 on qid = %d\n", cmd->common.opcode, req->sq->qid); - req->ns = NULL; return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; } return 0; @@ -809,12 +870,18 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, kref_init(&ctrl->ref); ctrl->subsys = subsys; + WRITE_ONCE(ctrl->aen_enabled, NVMET_AEN_CFG_OPTIONAL); + + ctrl->changed_ns_list = kmalloc_array(NVME_MAX_CHANGED_NAMESPACES, + sizeof(__le32), GFP_KERNEL); + if (!ctrl->changed_ns_list) + goto out_free_ctrl; ctrl->cqs = kcalloc(subsys->max_qid + 1, sizeof(struct nvmet_cq *), GFP_KERNEL); if (!ctrl->cqs) - goto out_free_ctrl; + goto out_free_changed_ns_list; ctrl->sqs = kcalloc(subsys->max_qid + 1, sizeof(struct nvmet_sq *), @@ -872,6 +939,8 @@ out_free_sqs: kfree(ctrl->sqs); out_free_cqs: kfree(ctrl->cqs); +out_free_changed_ns_list: + kfree(ctrl->changed_ns_list); out_free_ctrl: kfree(ctrl); out_put_subsystem: @@ -898,6 +967,7 @@ static void nvmet_ctrl_free(struct kref *ref) kfree(ctrl->sqs); kfree(ctrl->cqs); + kfree(ctrl->changed_ns_list); kfree(ctrl); nvmet_subsys_put(subsys); diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 231e04e..08656b8 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -187,8 +187,6 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) { struct nvme_command *cmd = req->cmd; - req->ns = NULL; - if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) { pr_err("got cmd %d while not ready\n", cmd->common.opcode); diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 19e9e42..d84ae004 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -77,8 +77,6 @@ u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req) { struct nvme_command *cmd = req->cmd; - req->ns = NULL; - switch (cmd->fabrics.fctype) { case nvme_fabrics_type_property_set: req->data_len = 0; @@ -242,8 +240,6 @@ u16 nvmet_parse_connect_cmd(struct nvmet_req *req) { struct nvme_command *cmd = req->cmd; - req->ns = NULL; - if (cmd->common.opcode != nvme_fabrics_command) { pr_err("invalid command 0x%x on unconnected queue.\n", cmd->fabrics.opcode); diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 33ee8d3..408279c 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -31,7 +31,7 @@ /* *************************** Data Structures/Defines ****************** */ -#define NVMET_LS_CTX_COUNT 4 +#define NVMET_LS_CTX_COUNT 256 /* for this implementation, assume small single frame rqst/rsp */ #define NVME_FC_MAX_LS_BUFFER_SIZE 2048 diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd-bdev.c index cd23441..e0b0f7d 100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -16,6 +16,34 @@ #include <linux/module.h> #include "nvmet.h" +int nvmet_bdev_ns_enable(struct nvmet_ns *ns) +{ + int ret; + + ns->bdev = blkdev_get_by_path(ns->device_path, + FMODE_READ | FMODE_WRITE, NULL); + if (IS_ERR(ns->bdev)) { + ret = PTR_ERR(ns->bdev); + if (ret != -ENOTBLK) { + pr_err("failed to open block device %s: (%ld)\n", + ns->device_path, PTR_ERR(ns->bdev)); + } + ns->bdev = NULL; + return ret; + } + ns->size = i_size_read(ns->bdev->bd_inode); + ns->blksize_shift = blksize_bits(bdev_logical_block_size(ns->bdev)); + return 0; +} + +void nvmet_bdev_ns_disable(struct nvmet_ns *ns) +{ + if (ns->bdev) { + blkdev_put(ns->bdev, FMODE_WRITE | FMODE_READ); + ns->bdev = NULL; + } +} + static void nvmet_bio_done(struct bio *bio) { struct nvmet_req *req = bio->bi_private; @@ -23,20 +51,14 @@ static void nvmet_bio_done(struct bio *bio) nvmet_req_complete(req, bio->bi_status ? NVME_SC_INTERNAL | NVME_SC_DNR : 0); - if (bio != &req->inline_bio) + if (bio != &req->b.inline_bio) bio_put(bio); } -static inline u32 nvmet_rw_len(struct nvmet_req *req) -{ - return ((u32)le16_to_cpu(req->cmd->rw.length) + 1) << - req->ns->blksize_shift; -} - -static void nvmet_execute_rw(struct nvmet_req *req) +static void nvmet_bdev_execute_rw(struct nvmet_req *req) { int sg_cnt = req->sg_cnt; - struct bio *bio = &req->inline_bio; + struct bio *bio = &req->b.inline_bio; struct scatterlist *sg; sector_t sector; blk_qc_t cookie; @@ -89,9 +111,9 @@ static void nvmet_execute_rw(struct nvmet_req *req) blk_poll(bdev_get_queue(req->ns->bdev), cookie); } -static void nvmet_execute_flush(struct nvmet_req *req) +static void nvmet_bdev_execute_flush(struct nvmet_req *req) { - struct bio *bio = &req->inline_bio; + struct bio *bio = &req->b.inline_bio; bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); bio_set_dev(bio, req->ns->bdev); @@ -102,7 +124,7 @@ static void nvmet_execute_flush(struct nvmet_req *req) submit_bio(bio); } -static u16 nvmet_discard_range(struct nvmet_ns *ns, +static u16 nvmet_bdev_discard_range(struct nvmet_ns *ns, struct nvme_dsm_range *range, struct bio **bio) { int ret; @@ -116,7 +138,7 @@ static u16 nvmet_discard_range(struct nvmet_ns *ns, return 0; } -static void nvmet_execute_discard(struct nvmet_req *req) +static void nvmet_bdev_execute_discard(struct nvmet_req *req) { struct nvme_dsm_range range; struct bio *bio = NULL; @@ -129,7 +151,7 @@ static void nvmet_execute_discard(struct nvmet_req *req) if (status) break; - status = nvmet_discard_range(req->ns, &range, &bio); + status = nvmet_bdev_discard_range(req->ns, &range, &bio); if (status) break; } @@ -148,11 +170,11 @@ static void nvmet_execute_discard(struct nvmet_req *req) } } -static void nvmet_execute_dsm(struct nvmet_req *req) +static void nvmet_bdev_execute_dsm(struct nvmet_req *req) { switch (le32_to_cpu(req->cmd->dsm.attributes)) { case NVME_DSMGMT_AD: - nvmet_execute_discard(req); + nvmet_bdev_execute_discard(req); return; case NVME_DSMGMT_IDR: case NVME_DSMGMT_IDW: @@ -163,7 +185,7 @@ static void nvmet_execute_dsm(struct nvmet_req *req) } } -static void nvmet_execute_write_zeroes(struct nvmet_req *req) +static void nvmet_bdev_execute_write_zeroes(struct nvmet_req *req) { struct nvme_write_zeroes_cmd *write_zeroes = &req->cmd->write_zeroes; struct bio *bio = NULL; @@ -189,38 +211,27 @@ static void nvmet_execute_write_zeroes(struct nvmet_req *req) } } -u16 nvmet_parse_io_cmd(struct nvmet_req *req) +u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req) { struct nvme_command *cmd = req->cmd; - u16 ret; - - ret = nvmet_check_ctrl_status(req, cmd); - if (unlikely(ret)) { - req->ns = NULL; - return ret; - } - - req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid); - if (unlikely(!req->ns)) - return NVME_SC_INVALID_NS | NVME_SC_DNR; switch (cmd->common.opcode) { case nvme_cmd_read: case nvme_cmd_write: - req->execute = nvmet_execute_rw; + req->execute = nvmet_bdev_execute_rw; req->data_len = nvmet_rw_len(req); return 0; case nvme_cmd_flush: - req->execute = nvmet_execute_flush; + req->execute = nvmet_bdev_execute_flush; req->data_len = 0; return 0; case nvme_cmd_dsm: - req->execute = nvmet_execute_dsm; + req->execute = nvmet_bdev_execute_dsm; req->data_len = (le32_to_cpu(cmd->dsm.nr) + 1) * sizeof(struct nvme_dsm_range); return 0; case nvme_cmd_write_zeroes: - req->execute = nvmet_execute_write_zeroes; + req->execute = nvmet_bdev_execute_write_zeroes; return 0; default: pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode, diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c new file mode 100644 index 0000000..8c42b3a --- /dev/null +++ b/drivers/nvme/target/io-cmd-file.c @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVMe Over Fabrics Target File I/O commands implementation. + * Copyright (c) 2017-2018 Western Digital Corporation or its + * affiliates. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/uio.h> +#include <linux/falloc.h> +#include <linux/file.h> +#include "nvmet.h" + +#define NVMET_MAX_MPOOL_BVEC 16 +#define NVMET_MIN_MPOOL_OBJ 16 + +void nvmet_file_ns_disable(struct nvmet_ns *ns) +{ + if (ns->file) { + mempool_destroy(ns->bvec_pool); + ns->bvec_pool = NULL; + kmem_cache_destroy(ns->bvec_cache); + ns->bvec_cache = NULL; + fput(ns->file); + ns->file = NULL; + } +} + +int nvmet_file_ns_enable(struct nvmet_ns *ns) +{ + int ret; + struct kstat stat; + + ns->file = filp_open(ns->device_path, + O_RDWR | O_LARGEFILE | O_DIRECT, 0); + if (IS_ERR(ns->file)) { + pr_err("failed to open file %s: (%ld)\n", + ns->device_path, PTR_ERR(ns->file)); + return PTR_ERR(ns->file); + } + + ret = vfs_getattr(&ns->file->f_path, + &stat, STATX_SIZE, AT_STATX_FORCE_SYNC); + if (ret) + goto err; + + ns->size = stat.size; + ns->blksize_shift = file_inode(ns->file)->i_blkbits; + + ns->bvec_cache = kmem_cache_create("nvmet-bvec", + NVMET_MAX_MPOOL_BVEC * sizeof(struct bio_vec), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!ns->bvec_cache) { + ret = -ENOMEM; + goto err; + } + + ns->bvec_pool = mempool_create(NVMET_MIN_MPOOL_OBJ, mempool_alloc_slab, + mempool_free_slab, ns->bvec_cache); + + if (!ns->bvec_pool) { + ret = -ENOMEM; + goto err; + } + + return ret; +err: + ns->size = 0; + ns->blksize_shift = 0; + nvmet_file_ns_disable(ns); + return ret; +} + +static void nvmet_file_init_bvec(struct bio_vec *bv, struct sg_page_iter *iter) +{ + bv->bv_page = sg_page_iter_page(iter); + bv->bv_offset = iter->sg->offset; + bv->bv_len = PAGE_SIZE - iter->sg->offset; +} + +static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos, + unsigned long nr_segs, size_t count) +{ + struct kiocb *iocb = &req->f.iocb; + ssize_t (*call_iter)(struct kiocb *iocb, struct iov_iter *iter); + struct iov_iter iter; + int ki_flags = 0, rw; + ssize_t ret; + + if (req->cmd->rw.opcode == nvme_cmd_write) { + if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA)) + ki_flags = IOCB_DSYNC; + call_iter = req->ns->file->f_op->write_iter; + rw = WRITE; + } else { + call_iter = req->ns->file->f_op->read_iter; + rw = READ; + } + + iov_iter_bvec(&iter, ITER_BVEC | rw, req->f.bvec, nr_segs, count); + + iocb->ki_pos = pos; + iocb->ki_filp = req->ns->file; + iocb->ki_flags = IOCB_DIRECT | ki_flags; + + ret = call_iter(iocb, &iter); + + if (ret != -EIOCBQUEUED && iocb->ki_complete) + iocb->ki_complete(iocb, ret, 0); + + return ret; +} + +static void nvmet_file_io_done(struct kiocb *iocb, long ret, long ret2) +{ + struct nvmet_req *req = container_of(iocb, struct nvmet_req, f.iocb); + + if (req->f.bvec != req->inline_bvec) { + if (likely(req->f.mpool_alloc == false)) + kfree(req->f.bvec); + else + mempool_free(req->f.bvec, req->ns->bvec_pool); + } + + nvmet_req_complete(req, ret != req->data_len ? + NVME_SC_INTERNAL | NVME_SC_DNR : 0); +} + +static void nvmet_file_execute_rw(struct nvmet_req *req) +{ + ssize_t nr_bvec = DIV_ROUND_UP(req->data_len, PAGE_SIZE); + struct sg_page_iter sg_pg_iter; + unsigned long bv_cnt = 0; + bool is_sync = false; + size_t len = 0, total_len = 0; + ssize_t ret = 0; + loff_t pos; + + if (!req->sg_cnt || !nr_bvec) { + nvmet_req_complete(req, 0); + return; + } + + if (nr_bvec > NVMET_MAX_INLINE_BIOVEC) + req->f.bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec), + GFP_KERNEL); + else + req->f.bvec = req->inline_bvec; + + req->f.mpool_alloc = false; + if (unlikely(!req->f.bvec)) { + /* fallback under memory pressure */ + req->f.bvec = mempool_alloc(req->ns->bvec_pool, GFP_KERNEL); + req->f.mpool_alloc = true; + if (nr_bvec > NVMET_MAX_MPOOL_BVEC) + is_sync = true; + } + + pos = le64_to_cpu(req->cmd->rw.slba) << req->ns->blksize_shift; + + memset(&req->f.iocb, 0, sizeof(struct kiocb)); + for_each_sg_page(req->sg, &sg_pg_iter, req->sg_cnt, 0) { + nvmet_file_init_bvec(&req->f.bvec[bv_cnt], &sg_pg_iter); + len += req->f.bvec[bv_cnt].bv_len; + total_len += req->f.bvec[bv_cnt].bv_len; + bv_cnt++; + + WARN_ON_ONCE((nr_bvec - 1) < 0); + + if (unlikely(is_sync) && + (nr_bvec - 1 == 0 || bv_cnt == NVMET_MAX_MPOOL_BVEC)) { + ret = nvmet_file_submit_bvec(req, pos, bv_cnt, len); + if (ret < 0) + goto out; + pos += len; + bv_cnt = 0; + len = 0; + } + nr_bvec--; + } + + if (WARN_ON_ONCE(total_len != req->data_len)) + ret = -EIO; +out: + if (unlikely(is_sync || ret)) { + nvmet_file_io_done(&req->f.iocb, ret < 0 ? ret : total_len, 0); + return; + } + req->f.iocb.ki_complete = nvmet_file_io_done; + nvmet_file_submit_bvec(req, pos, bv_cnt, total_len); +} + +static void nvmet_file_flush_work(struct work_struct *w) +{ + struct nvmet_req *req = container_of(w, struct nvmet_req, f.work); + int ret; + + ret = vfs_fsync(req->ns->file, 1); + + nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0); +} + +static void nvmet_file_execute_flush(struct nvmet_req *req) +{ + INIT_WORK(&req->f.work, nvmet_file_flush_work); + schedule_work(&req->f.work); +} + +static void nvmet_file_execute_discard(struct nvmet_req *req) +{ + int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; + struct nvme_dsm_range range; + loff_t offset; + loff_t len; + int i, ret; + + for (i = 0; i <= le32_to_cpu(req->cmd->dsm.nr); i++) { + if (nvmet_copy_from_sgl(req, i * sizeof(range), &range, + sizeof(range))) + break; + offset = le64_to_cpu(range.slba) << req->ns->blksize_shift; + len = le32_to_cpu(range.nlb) << req->ns->blksize_shift; + ret = vfs_fallocate(req->ns->file, mode, offset, len); + if (ret) + break; + } + + nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0); +} + +static void nvmet_file_dsm_work(struct work_struct *w) +{ + struct nvmet_req *req = container_of(w, struct nvmet_req, f.work); + + switch (le32_to_cpu(req->cmd->dsm.attributes)) { + case NVME_DSMGMT_AD: + nvmet_file_execute_discard(req); + return; + case NVME_DSMGMT_IDR: + case NVME_DSMGMT_IDW: + default: + /* Not supported yet */ + nvmet_req_complete(req, 0); + return; + } +} + +static void nvmet_file_execute_dsm(struct nvmet_req *req) +{ + INIT_WORK(&req->f.work, nvmet_file_dsm_work); + schedule_work(&req->f.work); +} + +static void nvmet_file_write_zeroes_work(struct work_struct *w) +{ + struct nvmet_req *req = container_of(w, struct nvmet_req, f.work); + struct nvme_write_zeroes_cmd *write_zeroes = &req->cmd->write_zeroes; + int mode = FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE; + loff_t offset; + loff_t len; + int ret; + + offset = le64_to_cpu(write_zeroes->slba) << req->ns->blksize_shift; + len = (((sector_t)le16_to_cpu(write_zeroes->length) + 1) << + req->ns->blksize_shift); + + ret = vfs_fallocate(req->ns->file, mode, offset, len); + nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0); +} + +static void nvmet_file_execute_write_zeroes(struct nvmet_req *req) +{ + INIT_WORK(&req->f.work, nvmet_file_write_zeroes_work); + schedule_work(&req->f.work); +} + +u16 nvmet_file_parse_io_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + switch (cmd->common.opcode) { + case nvme_cmd_read: + case nvme_cmd_write: + req->execute = nvmet_file_execute_rw; + req->data_len = nvmet_rw_len(req); + return 0; + case nvme_cmd_flush: + req->execute = nvmet_file_execute_flush; + req->data_len = 0; + return 0; + case nvme_cmd_dsm: + req->execute = nvmet_file_execute_dsm; + req->data_len = (le32_to_cpu(cmd->dsm.nr) + 1) * + sizeof(struct nvme_dsm_range); + return 0; + case nvme_cmd_write_zeroes: + req->execute = nvmet_file_execute_write_zeroes; + req->data_len = 0; + return 0; + default: + pr_err("unhandled cmd for file ns %d on qid %d\n", + cmd->common.opcode, req->sq->qid); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } +} diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 27a8561..1304ec3 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -45,6 +45,7 @@ struct nvme_loop_ctrl { struct nvme_ctrl ctrl; struct nvmet_ctrl *target_ctrl; + struct nvmet_port *port; }; static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl) @@ -63,7 +64,8 @@ struct nvme_loop_queue { unsigned long flags; }; -static struct nvmet_port *nvmet_loop_port; +static LIST_HEAD(nvme_loop_ports); +static DEFINE_MUTEX(nvme_loop_ports_mutex); static LIST_HEAD(nvme_loop_ctrl_list); static DEFINE_MUTEX(nvme_loop_ctrl_mutex); @@ -146,7 +148,7 @@ nvme_loop_timeout(struct request *rq, bool reserved) /* fail with DNR on admin cmd timeout */ nvme_req(rq)->status = NVME_SC_ABORT_REQ | NVME_SC_DNR; - return BLK_EH_HANDLED; + return BLK_EH_DONE; } static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, @@ -169,12 +171,12 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(req); iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; - iod->req.port = nvmet_loop_port; + iod->req.port = queue->ctrl->port; if (!nvmet_req_init(&iod->req, &queue->nvme_cq, &queue->nvme_sq, &nvme_loop_ops)) return BLK_STS_OK; - if (blk_rq_payload_bytes(req)) { + if (blk_rq_nr_phys_segments(req)) { iod->sg_table.sgl = iod->first_sgl; if (sg_alloc_table_chained(&iod->sg_table, blk_rq_nr_phys_segments(req), @@ -517,6 +519,7 @@ static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = { .free_ctrl = nvme_loop_free_ctrl, .submit_async_event = nvme_loop_submit_async_event, .delete_ctrl = nvme_loop_delete_ctrl_host, + .get_address = nvmf_get_address, }; static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) @@ -565,6 +568,23 @@ out_destroy_queues: return ret; } +static struct nvmet_port *nvme_loop_find_port(struct nvme_ctrl *ctrl) +{ + struct nvmet_port *p, *found = NULL; + + mutex_lock(&nvme_loop_ports_mutex); + list_for_each_entry(p, &nvme_loop_ports, entry) { + /* if no transport address is specified use the first port */ + if ((ctrl->opts->mask & NVMF_OPT_TRADDR) && + strcmp(ctrl->opts->traddr, p->disc_addr.traddr)) + continue; + found = p; + break; + } + mutex_unlock(&nvme_loop_ports_mutex); + return found; +} + static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) { @@ -589,6 +609,7 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, ctrl->ctrl.sqsize = opts->queue_size - 1; ctrl->ctrl.kato = opts->kato; + ctrl->port = nvme_loop_find_port(&ctrl->ctrl); ctrl->queues = kcalloc(opts->nr_io_queues + 1, sizeof(*ctrl->queues), GFP_KERNEL); @@ -646,27 +667,17 @@ out_put_ctrl: static int nvme_loop_add_port(struct nvmet_port *port) { - /* - * XXX: disalow adding more than one port so - * there is no connection rejections when a - * a subsystem is assigned to a port for which - * loop doesn't have a pointer. - * This scenario would be possible if we allowed - * more than one port to be added and a subsystem - * was assigned to a port other than nvmet_loop_port. - */ - - if (nvmet_loop_port) - return -EPERM; - - nvmet_loop_port = port; + mutex_lock(&nvme_loop_ports_mutex); + list_add_tail(&port->entry, &nvme_loop_ports); + mutex_unlock(&nvme_loop_ports_mutex); return 0; } static void nvme_loop_remove_port(struct nvmet_port *port) { - if (port == nvmet_loop_port) - nvmet_loop_port = NULL; + mutex_lock(&nvme_loop_ports_mutex); + list_del_init(&port->entry); + mutex_unlock(&nvme_loop_ports_mutex); } static const struct nvmet_fabrics_ops nvme_loop_ops = { @@ -682,6 +693,7 @@ static struct nvmf_transport_ops nvme_loop_transport = { .name = "loop", .module = THIS_MODULE, .create_ctrl = nvme_loop_create_ctrl, + .allowed_opts = NVMF_OPT_TRADDR, }; static int __init nvme_loop_init_module(void) diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 15fd84a..480dfe1 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -30,6 +30,21 @@ #define NVMET_ASYNC_EVENTS 4 #define NVMET_ERROR_LOG_SLOTS 128 + +/* + * Supported optional AENs: + */ +#define NVMET_AEN_CFG_OPTIONAL \ + NVME_AEN_CFG_NS_ATTR + +/* + * Plus mandatory SMART AENs (we'll never send them, but allow enabling them): + */ +#define NVMET_AEN_CFG_ALL \ + (NVME_SMART_CRIT_SPARE | NVME_SMART_CRIT_TEMPERATURE | \ + NVME_SMART_CRIT_RELIABILITY | NVME_SMART_CRIT_MEDIA | \ + NVME_SMART_CRIT_VOLATILE_MEMORY | NVMET_AEN_CFG_OPTIONAL) + /* Helper Macros when NVMe error is NVME_SC_CONNECT_INVALID_PARAM * The 16 bit shift is to set IATTR bit to 1, which means offending * offset starts in the data section of connect() @@ -43,6 +58,7 @@ struct nvmet_ns { struct list_head dev_link; struct percpu_ref ref; struct block_device *bdev; + struct file *file; u32 nsid; u32 blksize_shift; loff_t size; @@ -57,6 +73,8 @@ struct nvmet_ns { struct config_group group; struct completion disable_done; + mempool_t *bvec_pool; + struct kmem_cache *bvec_cache; }; static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item) @@ -82,7 +100,7 @@ struct nvmet_sq { /** * struct nvmet_port - Common structure to keep port * information for the target. - * @entry: List head for holding a list of these elements. + * @entry: Entry into referrals or transport list. * @disc_addr: Address information is stored in a format defined * for a discovery log page entry. * @group: ConfigFS group for this element's folder. @@ -120,6 +138,8 @@ struct nvmet_ctrl { u16 cntlid; u32 kato; + u32 aen_enabled; + unsigned long aen_masked; struct nvmet_req *async_event_cmds[NVMET_ASYNC_EVENTS]; unsigned int nr_async_event_cmds; struct list_head async_events; @@ -132,6 +152,9 @@ struct nvmet_ctrl { const struct nvmet_fabrics_ops *ops; + __le32 *changed_ns_list; + u32 nr_changed_ns; + char subsysnqn[NVMF_NQN_FIELD_LEN]; char hostnqn[NVMF_NQN_FIELD_LEN]; }; @@ -222,8 +245,18 @@ struct nvmet_req { struct nvmet_cq *cq; struct nvmet_ns *ns; struct scatterlist *sg; - struct bio inline_bio; struct bio_vec inline_bvec[NVMET_MAX_INLINE_BIOVEC]; + union { + struct { + struct bio inline_bio; + } b; + struct { + bool mpool_alloc; + struct kiocb iocb; + struct bio_vec *bvec; + struct work_struct work; + } f; + }; int sg_cnt; /* data length as parsed from the command: */ size_t data_len; @@ -263,7 +296,8 @@ struct nvmet_async_event { }; u16 nvmet_parse_connect_cmd(struct nvmet_req *req); -u16 nvmet_parse_io_cmd(struct nvmet_req *req); +u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req); +u16 nvmet_file_parse_io_cmd(struct nvmet_req *req); u16 nvmet_parse_admin_cmd(struct nvmet_req *req); u16 nvmet_parse_discovery_cmd(struct nvmet_req *req); u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req); @@ -316,6 +350,7 @@ u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf, size_t len); u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len); +u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len); u32 nvmet_get_log_page_len(struct nvme_command *cmd); @@ -338,4 +373,14 @@ extern struct rw_semaphore nvmet_config_sem; bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys, const char *hostnqn); +int nvmet_bdev_ns_enable(struct nvmet_ns *ns); +int nvmet_file_ns_enable(struct nvmet_ns *ns); +void nvmet_bdev_ns_disable(struct nvmet_ns *ns); +void nvmet_file_ns_disable(struct nvmet_ns *ns); + +static inline u32 nvmet_rw_len(struct nvmet_req *req) +{ + return ((u32)le16_to_cpu(req->cmd->rw.length) + 1) << + req->ns->blksize_shift; +} #endif /* _NVMET_H */ diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 02c03e4..cb9b685 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -3054,7 +3054,7 @@ out: * * Return values: * BLK_EH_RESET_TIMER if the request should be left running - * BLK_EH_NOT_HANDLED if the request is handled or terminated + * BLK_EH_DONE if the request is handled or terminated * by the driver. */ enum blk_eh_timer_return dasd_times_out(struct request *req, bool reserved) @@ -3067,7 +3067,7 @@ enum blk_eh_timer_return dasd_times_out(struct request *req, bool reserved) cqr = *((struct dasd_ccw_req **) blk_mq_rq_to_pdu(req)); if (!cqr) - return BLK_EH_NOT_HANDLED; + return BLK_EH_DONE; spin_lock_irqsave(&cqr->dq->lock, flags); device = cqr->startdev ? cqr->startdev : block->base; @@ -3126,7 +3126,7 @@ enum blk_eh_timer_return dasd_times_out(struct request *req, bool reserved) spin_unlock(&block->queue_lock); spin_unlock_irqrestore(&cqr->dq->lock, flags); - return rc ? BLK_EH_RESET_TIMER : BLK_EH_NOT_HANDLED; + return rc ? BLK_EH_RESET_TIMER : BLK_EH_DONE; } static int dasd_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, diff --git a/drivers/sbus/char/Kconfig b/drivers/sbus/char/Kconfig index bf3c5f7..89edd13 100644 --- a/drivers/sbus/char/Kconfig +++ b/drivers/sbus/char/Kconfig @@ -28,13 +28,6 @@ config TADPOLE_TS102_UCTRL events, and can also notice the attachment/detachment of external monitors and mice. -config SUN_JSFLASH - tristate "JavaStation OS Flash SIMM" - depends on SPARC32 - help - If you say Y here, you will be able to boot from your JavaStation's - Flash memory. - config BBC_I2C tristate "UltraSPARC-III bootbus i2c controller driver" depends on PCI && SPARC64 diff --git a/drivers/sbus/char/Makefile b/drivers/sbus/char/Makefile index 8c48ed9..44347c9 100644 --- a/drivers/sbus/char/Makefile +++ b/drivers/sbus/char/Makefile @@ -15,6 +15,5 @@ obj-$(CONFIG_DISPLAY7SEG) += display7seg.o obj-$(CONFIG_OBP_FLASH) += flash.o obj-$(CONFIG_SUN_OPENPROMIO) += openprom.o obj-$(CONFIG_TADPOLE_TS102_UCTRL) += uctrl.o -obj-$(CONFIG_SUN_JSFLASH) += jsflash.o obj-$(CONFIG_BBC_I2C) += bbc.o obj-$(CONFIG_ORACLE_DAX) += oradax.o diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c deleted file mode 100644 index 14f377a..0000000 --- a/drivers/sbus/char/jsflash.c +++ /dev/null @@ -1,658 +0,0 @@ -/* - * drivers/sbus/char/jsflash.c - * - * Copyright (C) 1991, 1992 Linus Torvalds (drivers/char/mem.c) - * Copyright (C) 1997 Eddie C. Dost (drivers/sbus/char/flash.c) - * Copyright (C) 1997-2000 Pavel Machek <pavel@ucw.cz> (drivers/block/nbd.c) - * Copyright (C) 1999-2000 Pete Zaitcev - * - * This driver is used to program OS into a Flash SIMM on - * Krups and Espresso platforms. - * - * TODO: do not allow erase/programming if file systems are mounted. - * TODO: Erase/program both banks of a 8MB SIMM. - * - * It is anticipated that programming an OS Flash will be a routine - * procedure. In the same time it is exceedingly dangerous because - * a user can program its OBP flash with OS image and effectively - * kill the machine. - * - * This driver uses an interface different from Eddie's flash.c - * as a silly safeguard. - * - * XXX The flash.c manipulates page caching characteristics in a certain - * dubious way; also it assumes that remap_pfn_range() can remap - * PCI bus locations, which may be false. ioremap() must be used - * instead. We should discuss this. - */ - -#include <linux/module.h> -#include <linux/mutex.h> -#include <linux/types.h> -#include <linux/errno.h> -#include <linux/miscdevice.h> -#include <linux/fcntl.h> -#include <linux/poll.h> -#include <linux/init.h> -#include <linux/string.h> -#include <linux/genhd.h> -#include <linux/blkdev.h> -#include <linux/uaccess.h> -#include <asm/pgtable.h> -#include <asm/io.h> -#include <asm/pcic.h> -#include <asm/oplib.h> - -#include <asm/jsflash.h> /* ioctl arguments. <linux/> ?? */ -#define JSFIDSZ (sizeof(struct jsflash_ident_arg)) -#define JSFPRGSZ (sizeof(struct jsflash_program_arg)) - -/* - * Our device numbers have no business in system headers. - * The only thing a user knows is the device name /dev/jsflash. - * - * Block devices are laid out like this: - * minor+0 - Bootstrap, for 8MB SIMM 0x20400000[0x800000] - * minor+1 - Filesystem to mount, normally 0x20400400[0x7ffc00] - * minor+2 - Whole flash area for any case... 0x20000000[0x01000000] - * Total 3 minors per flash device. - * - * It is easier to have static size vectors, so we define - * a total minor range JSF_MAX, which must cover all minors. - */ -/* character device */ -#define JSF_MINOR 178 /* 178 is registered with hpa */ -/* block device */ -#define JSF_MAX 3 /* 3 minors wasted total so far. */ -#define JSF_NPART 3 /* 3 minors per flash device */ -#define JSF_PART_BITS 2 /* 2 bits of minors to cover JSF_NPART */ -#define JSF_PART_MASK 0x3 /* 2 bits mask */ - -static DEFINE_MUTEX(jsf_mutex); - -/* - * Access functions. - * We could ioremap(), but it's easier this way. - */ -static unsigned int jsf_inl(unsigned long addr) -{ - unsigned long retval; - - __asm__ __volatile__("lda [%1] %2, %0\n\t" : - "=r" (retval) : - "r" (addr), "i" (ASI_M_BYPASS)); - return retval; -} - -static void jsf_outl(unsigned long addr, __u32 data) -{ - - __asm__ __volatile__("sta %0, [%1] %2\n\t" : : - "r" (data), "r" (addr), "i" (ASI_M_BYPASS) : - "memory"); -} - -/* - * soft carrier - */ - -struct jsfd_part { - unsigned long dbase; - unsigned long dsize; -}; - -struct jsflash { - unsigned long base; - unsigned long size; - unsigned long busy; /* In use? */ - struct jsflash_ident_arg id; - /* int mbase; */ /* Minor base, typically zero */ - struct jsfd_part dv[JSF_NPART]; -}; - -/* - * We do not map normal memory or obio as a safety precaution. - * But offsets are real, for ease of userland programming. - */ -#define JSF_BASE_TOP 0x30000000 -#define JSF_BASE_ALL 0x20000000 - -#define JSF_BASE_JK 0x20400000 - -/* - */ -static struct gendisk *jsfd_disk[JSF_MAX]; - -/* - * Let's pretend we may have several of these... - */ -static struct jsflash jsf0; - -/* - * Wait for AMD to finish its embedded algorithm. - * We use the Toggle bit DQ6 (0x40) because it does not - * depend on the data value as /DATA bit DQ7 does. - * - * XXX Do we need any timeout here? So far it never hanged, beware broken hw. - */ -static void jsf_wait(unsigned long p) { - unsigned int x1, x2; - - for (;;) { - x1 = jsf_inl(p); - x2 = jsf_inl(p); - if ((x1 & 0x40404040) == (x2 & 0x40404040)) return; - } -} - -/* - * Programming will only work if Flash is clean, - * we leave it to the programmer application. - * - * AMD must be programmed one byte at a time; - * thus, Simple Tech SIMM must be written 4 bytes at a time. - * - * Write waits for the chip to become ready after the write - * was finished. This is done so that application would read - * consistent data after the write is done. - */ -static void jsf_write4(unsigned long fa, u32 data) { - - jsf_outl(fa, 0xAAAAAAAA); /* Unlock 1 Write 1 */ - jsf_outl(fa, 0x55555555); /* Unlock 1 Write 2 */ - jsf_outl(fa, 0xA0A0A0A0); /* Byte Program */ - jsf_outl(fa, data); - - jsf_wait(fa); -} - -/* - */ -static void jsfd_read(char *buf, unsigned long p, size_t togo) { - union byte4 { - char s[4]; - unsigned int n; - } b; - - while (togo >= 4) { - togo -= 4; - b.n = jsf_inl(p); - memcpy(buf, b.s, 4); - p += 4; - buf += 4; - } -} - -static int jsfd_queue; - -static struct request *jsfd_next_request(void) -{ - struct request_queue *q; - struct request *rq; - int old_pos = jsfd_queue; - - do { - q = jsfd_disk[jsfd_queue]->queue; - if (++jsfd_queue == JSF_MAX) - jsfd_queue = 0; - if (q) { - rq = blk_fetch_request(q); - if (rq) - return rq; - } - } while (jsfd_queue != old_pos); - - return NULL; -} - -static void jsfd_request(void) -{ - struct request *req; - - req = jsfd_next_request(); - while (req) { - struct jsfd_part *jdp = req->rq_disk->private_data; - unsigned long offset = blk_rq_pos(req) << 9; - size_t len = blk_rq_cur_bytes(req); - blk_status_t err = BLK_STS_IOERR; - - if ((offset + len) > jdp->dsize) - goto end; - - if (rq_data_dir(req) != READ) { - printk(KERN_ERR "jsfd: write\n"); - goto end; - } - - if ((jdp->dbase & 0xff000000) != 0x20000000) { - printk(KERN_ERR "jsfd: bad base %x\n", (int)jdp->dbase); - goto end; - } - - jsfd_read(bio_data(req->bio), jdp->dbase + offset, len); - err = BLK_STS_OK; - end: - if (!__blk_end_request_cur(req, err)) - req = jsfd_next_request(); - } -} - -static void jsfd_do_request(struct request_queue *q) -{ - jsfd_request(); -} - -/* - * The memory devices use the full 32/64 bits of the offset, and so we cannot - * check against negative addresses: they are ok. The return value is weird, - * though, in that case (0). - * - * also note that seeking relative to the "end of file" isn't supported: - * it has no meaning, so it returns -EINVAL. - */ -static loff_t jsf_lseek(struct file * file, loff_t offset, int orig) -{ - loff_t ret; - - mutex_lock(&jsf_mutex); - switch (orig) { - case 0: - file->f_pos = offset; - ret = file->f_pos; - break; - case 1: - file->f_pos += offset; - ret = file->f_pos; - break; - default: - ret = -EINVAL; - } - mutex_unlock(&jsf_mutex); - return ret; -} - -/* - * OS SIMM Cannot be read in other size but a 32bits word. - */ -static ssize_t jsf_read(struct file * file, char __user * buf, - size_t togo, loff_t *ppos) -{ - unsigned long p = *ppos; - char __user *tmp = buf; - - union byte4 { - char s[4]; - unsigned int n; - } b; - - if (p < JSF_BASE_ALL || p >= JSF_BASE_TOP) { - return 0; - } - - if ((p + togo) < p /* wrap */ - || (p + togo) >= JSF_BASE_TOP) { - togo = JSF_BASE_TOP - p; - } - - if (p < JSF_BASE_ALL && togo != 0) { -#if 0 /* __bzero XXX */ - size_t x = JSF_BASE_ALL - p; - if (x > togo) x = togo; - clear_user(tmp, x); - tmp += x; - p += x; - togo -= x; -#else - /* - * Implementation of clear_user() calls __bzero - * without regard to modversions, - * so we cannot build a module. - */ - return 0; -#endif - } - - while (togo >= 4) { - togo -= 4; - b.n = jsf_inl(p); - if (copy_to_user(tmp, b.s, 4)) - return -EFAULT; - tmp += 4; - p += 4; - } - - /* - * XXX Small togo may remain if 1 byte is ordered. - * It would be nice if we did a word size read and unpacked it. - */ - - *ppos = p; - return tmp-buf; -} - -static ssize_t jsf_write(struct file * file, const char __user * buf, - size_t count, loff_t *ppos) -{ - return -ENOSPC; -} - -/* - */ -static int jsf_ioctl_erase(unsigned long arg) -{ - unsigned long p; - - /* p = jsf0.base; hits wrong bank */ - p = 0x20400000; - - jsf_outl(p, 0xAAAAAAAA); /* Unlock 1 Write 1 */ - jsf_outl(p, 0x55555555); /* Unlock 1 Write 2 */ - jsf_outl(p, 0x80808080); /* Erase setup */ - jsf_outl(p, 0xAAAAAAAA); /* Unlock 2 Write 1 */ - jsf_outl(p, 0x55555555); /* Unlock 2 Write 2 */ - jsf_outl(p, 0x10101010); /* Chip erase */ - -#if 0 - /* - * This code is ok, except that counter based timeout - * has no place in this world. Let's just drop timeouts... - */ - { - int i; - __u32 x; - for (i = 0; i < 1000000; i++) { - x = jsf_inl(p); - if ((x & 0x80808080) == 0x80808080) break; - } - if ((x & 0x80808080) != 0x80808080) { - printk("jsf0: erase timeout with 0x%08x\n", x); - } else { - printk("jsf0: erase done with 0x%08x\n", x); - } - } -#else - jsf_wait(p); -#endif - - return 0; -} - -/* - * Program a block of flash. - * Very simple because we can do it byte by byte anyway. - */ -static int jsf_ioctl_program(void __user *arg) -{ - struct jsflash_program_arg abuf; - char __user *uptr; - unsigned long p; - unsigned int togo; - union { - unsigned int n; - char s[4]; - } b; - - if (copy_from_user(&abuf, arg, JSFPRGSZ)) - return -EFAULT; - p = abuf.off; - togo = abuf.size; - if ((togo & 3) || (p & 3)) return -EINVAL; - - uptr = (char __user *) (unsigned long) abuf.data; - while (togo != 0) { - togo -= 4; - if (copy_from_user(&b.s[0], uptr, 4)) - return -EFAULT; - jsf_write4(p, b.n); - p += 4; - uptr += 4; - } - - return 0; -} - -static long jsf_ioctl(struct file *f, unsigned int cmd, unsigned long arg) -{ - mutex_lock(&jsf_mutex); - int error = -ENOTTY; - void __user *argp = (void __user *)arg; - - if (!capable(CAP_SYS_ADMIN)) { - mutex_unlock(&jsf_mutex); - return -EPERM; - } - switch (cmd) { - case JSFLASH_IDENT: - if (copy_to_user(argp, &jsf0.id, JSFIDSZ)) { - mutex_unlock(&jsf_mutex); - return -EFAULT; - } - break; - case JSFLASH_ERASE: - error = jsf_ioctl_erase(arg); - break; - case JSFLASH_PROGRAM: - error = jsf_ioctl_program(argp); - break; - } - - mutex_unlock(&jsf_mutex); - return error; -} - -static int jsf_mmap(struct file * file, struct vm_area_struct * vma) -{ - return -ENXIO; -} - -static int jsf_open(struct inode * inode, struct file * filp) -{ - mutex_lock(&jsf_mutex); - if (jsf0.base == 0) { - mutex_unlock(&jsf_mutex); - return -ENXIO; - } - if (test_and_set_bit(0, (void *)&jsf0.busy) != 0) { - mutex_unlock(&jsf_mutex); - return -EBUSY; - } - - mutex_unlock(&jsf_mutex); - return 0; /* XXX What security? */ -} - -static int jsf_release(struct inode *inode, struct file *file) -{ - jsf0.busy = 0; - return 0; -} - -static const struct file_operations jsf_fops = { - .owner = THIS_MODULE, - .llseek = jsf_lseek, - .read = jsf_read, - .write = jsf_write, - .unlocked_ioctl = jsf_ioctl, - .mmap = jsf_mmap, - .open = jsf_open, - .release = jsf_release, -}; - -static struct miscdevice jsf_dev = { JSF_MINOR, "jsflash", &jsf_fops }; - -static const struct block_device_operations jsfd_fops = { - .owner = THIS_MODULE, -}; - -static int jsflash_init(void) -{ - int rc; - struct jsflash *jsf; - phandle node; - char banner[128]; - struct linux_prom_registers reg0; - - node = prom_getchild(prom_root_node); - node = prom_searchsiblings(node, "flash-memory"); - if (node != 0 && (s32)node != -1) { - if (prom_getproperty(node, "reg", - (char *)®0, sizeof(reg0)) == -1) { - printk("jsflash: no \"reg\" property\n"); - return -ENXIO; - } - if (reg0.which_io != 0) { - printk("jsflash: bus number nonzero: 0x%x:%x\n", - reg0.which_io, reg0.phys_addr); - return -ENXIO; - } - /* - * Flash may be somewhere else, for instance on Ebus. - * So, don't do the following check for IIep flash space. - */ -#if 0 - if ((reg0.phys_addr >> 24) != 0x20) { - printk("jsflash: suspicious address: 0x%x:%x\n", - reg0.which_io, reg0.phys_addr); - return -ENXIO; - } -#endif - if ((int)reg0.reg_size <= 0) { - printk("jsflash: bad size 0x%x\n", (int)reg0.reg_size); - return -ENXIO; - } - } else { - /* XXX Remove this code once PROLL ID12 got widespread */ - printk("jsflash: no /flash-memory node, use PROLL >= 12\n"); - prom_getproperty(prom_root_node, "banner-name", banner, 128); - if (strcmp (banner, "JavaStation-NC") != 0 && - strcmp (banner, "JavaStation-E") != 0) { - return -ENXIO; - } - reg0.which_io = 0; - reg0.phys_addr = 0x20400000; - reg0.reg_size = 0x00800000; - } - - /* Let us be really paranoid for modifications to probing code. */ - if (sparc_cpu_model != sun4m) { - /* We must be on sun4m because we use MMU Bypass ASI. */ - return -ENXIO; - } - - if (jsf0.base == 0) { - jsf = &jsf0; - - jsf->base = reg0.phys_addr; - jsf->size = reg0.reg_size; - - /* XXX Redo the userland interface. */ - jsf->id.off = JSF_BASE_ALL; - jsf->id.size = 0x01000000; /* 16M - all segments */ - strcpy(jsf->id.name, "Krups_all"); - - jsf->dv[0].dbase = jsf->base; - jsf->dv[0].dsize = jsf->size; - jsf->dv[1].dbase = jsf->base + 1024; - jsf->dv[1].dsize = jsf->size - 1024; - jsf->dv[2].dbase = JSF_BASE_ALL; - jsf->dv[2].dsize = 0x01000000; - - printk("Espresso Flash @0x%lx [%d MB]\n", jsf->base, - (int) (jsf->size / (1024*1024))); - } - - if ((rc = misc_register(&jsf_dev)) != 0) { - printk(KERN_ERR "jsf: unable to get misc minor %d\n", - JSF_MINOR); - jsf0.base = 0; - return rc; - } - - return 0; -} - -static int jsfd_init(void) -{ - static DEFINE_SPINLOCK(lock); - struct jsflash *jsf; - struct jsfd_part *jdp; - int err; - int i; - - if (jsf0.base == 0) - return -ENXIO; - - err = -ENOMEM; - for (i = 0; i < JSF_MAX; i++) { - struct gendisk *disk = alloc_disk(1); - if (!disk) - goto out; - disk->queue = blk_init_queue(jsfd_do_request, &lock); - if (!disk->queue) { - put_disk(disk); - goto out; - } - blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH); - jsfd_disk[i] = disk; - } - - if (register_blkdev(JSFD_MAJOR, "jsfd")) { - err = -EIO; - goto out; - } - - for (i = 0; i < JSF_MAX; i++) { - struct gendisk *disk = jsfd_disk[i]; - if ((i & JSF_PART_MASK) >= JSF_NPART) continue; - jsf = &jsf0; /* actually, &jsfv[i >> JSF_PART_BITS] */ - jdp = &jsf->dv[i&JSF_PART_MASK]; - - disk->major = JSFD_MAJOR; - disk->first_minor = i; - sprintf(disk->disk_name, "jsfd%d", i); - disk->fops = &jsfd_fops; - set_capacity(disk, jdp->dsize >> 9); - disk->private_data = jdp; - add_disk(disk); - set_disk_ro(disk, 1); - } - return 0; -out: - while (i--) - put_disk(jsfd_disk[i]); - return err; -} - -MODULE_LICENSE("GPL"); - -static int __init jsflash_init_module(void) { - int rc; - - if ((rc = jsflash_init()) == 0) { - jsfd_init(); - return 0; - } - return rc; -} - -static void __exit jsflash_cleanup_module(void) -{ - int i; - - for (i = 0; i < JSF_MAX; i++) { - if ((i & JSF_PART_MASK) >= JSF_NPART) continue; - del_gendisk(jsfd_disk[i]); - blk_cleanup_queue(jsfd_disk[i]->queue); - put_disk(jsfd_disk[i]); - } - if (jsf0.busy) - printk("jsf0: cleaning busy unit\n"); - jsf0.base = 0; - jsf0.busy = 0; - - misc_deregister(&jsf_dev); - unregister_blkdev(JSFD_MAJOR, "jsfd"); -} - -module_init(jsflash_init_module); -module_exit(jsflash_cleanup_module); diff --git a/drivers/scsi/gdth.c b/drivers/scsi/gdth.c index c35f05c..8560479 100644 --- a/drivers/scsi/gdth.c +++ b/drivers/scsi/gdth.c @@ -3882,7 +3882,7 @@ static enum blk_eh_timer_return gdth_timed_out(struct scsi_cmnd *scp) struct gdth_cmndinfo *cmndinfo = gdth_cmnd_priv(scp); u8 b, t; unsigned long flags; - enum blk_eh_timer_return retval = BLK_EH_NOT_HANDLED; + enum blk_eh_timer_return retval = BLK_EH_DONE; TRACE(("%s() cmd 0x%x\n", scp->cmnd[0], __func__)); b = scp->device->channel; diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index 15a2fef..71bdc0b 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -1963,7 +1963,7 @@ static int iscsi_has_ping_timed_out(struct iscsi_conn *conn) enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc) { - enum blk_eh_timer_return rc = BLK_EH_NOT_HANDLED; + enum blk_eh_timer_return rc = BLK_EH_DONE; struct iscsi_task *task = NULL, *running_task; struct iscsi_cls_session *cls_session; struct iscsi_session *session; @@ -1982,7 +1982,7 @@ enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc) * Raced with completion. Blk layer has taken ownership * so let timeout code complete it now. */ - rc = BLK_EH_HANDLED; + rc = BLK_EH_DONE; goto done; } @@ -1997,7 +1997,7 @@ enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc) if (unlikely(system_state != SYSTEM_RUNNING)) { sc->result = DID_NO_CONNECT << 16; ISCSI_DBG_EH(session, "sc on shutdown, handled\n"); - rc = BLK_EH_HANDLED; + rc = BLK_EH_DONE; goto done; } /* diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index b89c6e6..ce656c4 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -2772,7 +2772,7 @@ blk_eh_timer_return megasas_reset_timer(struct scsi_cmnd *scmd) if (time_after(jiffies, scmd->jiffies_at_alloc + (scmd_timeout * 2) * HZ)) { - return BLK_EH_NOT_HANDLED; + return BLK_EH_DONE; } instance = (struct megasas_instance *)scmd->device->host->hostdata; diff --git a/drivers/scsi/mvumi.c b/drivers/scsi/mvumi.c index fe97401..afd2716 100644 --- a/drivers/scsi/mvumi.c +++ b/drivers/scsi/mvumi.c @@ -2155,7 +2155,7 @@ static enum blk_eh_timer_return mvumi_timed_out(struct scsi_cmnd *scmd) mvumi_return_cmd(mhba, cmd); spin_unlock_irqrestore(mhba->shost->host_lock, flags); - return BLK_EH_NOT_HANDLED; + return BLK_EH_DONE; } static int diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c index e188771..5a33e1a 100644 --- a/drivers/scsi/osd/osd_initiator.c +++ b/drivers/scsi/osd/osd_initiator.c @@ -99,7 +99,7 @@ static int _osd_get_print_system_info(struct osd_dev *od, int nelem = ARRAY_SIZE(get_attrs), a = 0; int ret; - or = osd_start_request(od, GFP_KERNEL); + or = osd_start_request(od); if (!or) return -ENOMEM; @@ -409,16 +409,15 @@ static void _osd_request_free(struct osd_request *or) kfree(or); } -struct osd_request *osd_start_request(struct osd_dev *dev, gfp_t gfp) +struct osd_request *osd_start_request(struct osd_dev *dev) { struct osd_request *or; - or = _osd_request_alloc(gfp); + or = _osd_request_alloc(GFP_KERNEL); if (!or) return NULL; or->osd_dev = dev; - or->alloc_flags = gfp; or->timeout = dev->def_timeout; or->retries = OSD_REQ_RETRIES; @@ -546,7 +545,7 @@ static int _osd_realloc_seg(struct osd_request *or, if (seg->alloc_size >= max_bytes) return 0; - buff = krealloc(seg->buff, max_bytes, or->alloc_flags); + buff = krealloc(seg->buff, max_bytes, GFP_KERNEL); if (!buff) { OSD_ERR("Failed to Realloc %d-bytes was-%d\n", max_bytes, seg->alloc_size); @@ -728,7 +727,7 @@ static int _osd_req_list_objects(struct osd_request *or, _osd_req_encode_olist(or, list); WARN_ON(or->in.bio); - bio = bio_map_kern(q, list, len, or->alloc_flags); + bio = bio_map_kern(q, list, len, GFP_KERNEL); if (IS_ERR(bio)) { OSD_ERR("!!! Failed to allocate list_objects BIO\n"); return PTR_ERR(bio); @@ -1190,14 +1189,14 @@ static int _req_append_segment(struct osd_request *or, pad_buff = io->pad_buff; ret = blk_rq_map_kern(io->req->q, io->req, pad_buff, padding, - or->alloc_flags); + GFP_KERNEL); if (ret) return ret; io->total_bytes += padding; } ret = blk_rq_map_kern(io->req->q, io->req, seg->buff, seg->total_bytes, - or->alloc_flags); + GFP_KERNEL); if (ret) return ret; @@ -1564,14 +1563,14 @@ static int _osd_req_finalize_data_integrity(struct osd_request *or, * osd_finalize_request and helpers */ static struct request *_make_request(struct request_queue *q, bool has_write, - struct _osd_io_info *oii, gfp_t flags) + struct _osd_io_info *oii) { struct request *req; struct bio *bio = oii->bio; int ret; req = blk_get_request(q, has_write ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, - flags); + 0); if (IS_ERR(req)) return req; @@ -1589,13 +1588,12 @@ static struct request *_make_request(struct request_queue *q, bool has_write, static int _init_blk_request(struct osd_request *or, bool has_in, bool has_out) { - gfp_t flags = or->alloc_flags; struct scsi_device *scsi_device = or->osd_dev->scsi_device; struct request_queue *q = scsi_device->request_queue; struct request *req; int ret; - req = _make_request(q, has_out, has_out ? &or->out : &or->in, flags); + req = _make_request(q, has_out, has_out ? &or->out : &or->in); if (IS_ERR(req)) { ret = PTR_ERR(req); goto out; @@ -1611,7 +1609,7 @@ static int _init_blk_request(struct osd_request *or, or->out.req = req; if (has_in) { /* allocate bidi request */ - req = _make_request(q, false, &or->in, flags); + req = _make_request(q, false, &or->in); if (IS_ERR(req)) { OSD_DEBUG("blk_get_request for bidi failed\n"); ret = PTR_ERR(req); diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c index 20ec1c0..2bbe797 100644 --- a/drivers/scsi/osst.c +++ b/drivers/scsi/osst.c @@ -368,7 +368,7 @@ static int osst_execute(struct osst_request *SRpnt, const unsigned char *cmd, int write = (data_direction == DMA_TO_DEVICE); req = blk_get_request(SRpnt->stp->device->request_queue, - write ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, GFP_KERNEL); + write ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0); if (IS_ERR(req)) return DRIVER_ERROR << 24; diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c index 94c14ce..0e13349 100644 --- a/drivers/scsi/qla4xxx/ql4_os.c +++ b/drivers/scsi/qla4xxx/ql4_os.c @@ -1848,7 +1848,7 @@ static enum blk_eh_timer_return qla4xxx_eh_cmd_timed_out(struct scsi_cmnd *sc) struct iscsi_cls_session *session; struct iscsi_session *sess; unsigned long flags; - enum blk_eh_timer_return ret = BLK_EH_NOT_HANDLED; + enum blk_eh_timer_return ret = BLK_EH_DONE; session = starget_to_session(scsi_target(sc->device)); sess = session->dd_data; diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 9460391..9c02ba2 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -282,7 +282,7 @@ void scsi_eh_scmd_add(struct scsi_cmnd *scmd) enum blk_eh_timer_return scsi_times_out(struct request *req) { struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req); - enum blk_eh_timer_return rtn = BLK_EH_NOT_HANDLED; + enum blk_eh_timer_return rtn = BLK_EH_DONE; struct Scsi_Host *host = scmd->device->host; trace_scsi_dispatch_cmd_timeout(scmd); @@ -294,7 +294,7 @@ enum blk_eh_timer_return scsi_times_out(struct request *req) if (host->hostt->eh_timed_out) rtn = host->hostt->eh_timed_out(scmd); - if (rtn == BLK_EH_NOT_HANDLED) { + if (rtn == BLK_EH_DONE) { if (scsi_abort_command(scmd) != SUCCESS) { set_host_byte(scmd, DID_TIME_OUT); scsi_eh_scmd_add(scmd); @@ -1933,11 +1933,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev) struct request *req; struct scsi_request *rq; - /* - * blk_get_request with GFP_KERNEL (__GFP_RECLAIM) sleeps until a - * request becomes available - */ - req = blk_get_request(sdev->request_queue, REQ_OP_SCSI_IN, GFP_KERNEL); + req = blk_get_request(sdev->request_queue, REQ_OP_SCSI_IN, 0); if (IS_ERR(req)) return; rq = scsi_req(req); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index e9b4f27..f125fd7 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -265,7 +265,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, struct scsi_request *rq; int ret = DRIVER_ERROR << 24; - req = blk_get_request_flags(sdev->request_queue, + req = blk_get_request(sdev->request_queue, data_direction == DMA_TO_DEVICE ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, BLK_MQ_REQ_PREEMPT); if (IS_ERR(req)) @@ -273,7 +273,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, rq = scsi_req(req); if (bufflen && blk_rq_map_kern(sdev->request_queue, req, - buffer, bufflen, __GFP_RECLAIM)) + buffer, bufflen, GFP_NOIO)) goto out; rq->cmd_len = COMMAND_SIZE(cmd[0]); diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c index be3be0f..1da3d71 100644 --- a/drivers/scsi/scsi_transport_fc.c +++ b/drivers/scsi/scsi_transport_fc.c @@ -2087,7 +2087,7 @@ fc_eh_timed_out(struct scsi_cmnd *scmd) if (rport->port_state == FC_PORTSTATE_BLOCKED) return BLK_EH_RESET_TIMER; - return BLK_EH_NOT_HANDLED; + return BLK_EH_DONE; } EXPORT_SYMBOL(fc_eh_timed_out); @@ -3591,10 +3591,9 @@ fc_bsg_job_timeout(struct request *req) } /* the blk_end_sync_io() doesn't check the error */ - if (!inflight) - return BLK_EH_NOT_HANDLED; - else - return BLK_EH_HANDLED; + if (inflight) + blk_mq_complete_request(req); + return BLK_EH_DONE; } /** @@ -3781,8 +3780,7 @@ fc_bsg_hostadd(struct Scsi_Host *shost, struct fc_host_attrs *fc_host) snprintf(bsg_name, sizeof(bsg_name), "fc_host%d", shost->host_no); - q = bsg_setup_queue(dev, bsg_name, fc_bsg_dispatch, i->f->dd_bsg_size, - NULL); + q = bsg_setup_queue(dev, bsg_name, fc_bsg_dispatch, i->f->dd_bsg_size); if (IS_ERR(q)) { dev_err(dev, "fc_host%d: bsg interface failed to initialize - setup queue\n", @@ -3827,8 +3825,8 @@ fc_bsg_rportadd(struct Scsi_Host *shost, struct fc_rport *rport) if (!i->f->bsg_request) return -ENOTSUPP; - q = bsg_setup_queue(dev, NULL, fc_bsg_dispatch, i->f->dd_bsg_size, - NULL); + q = bsg_setup_queue(dev, dev_name(dev), fc_bsg_dispatch, + i->f->dd_bsg_size); if (IS_ERR(q)) { dev_err(dev, "failed to setup bsg queue\n"); return PTR_ERR(q); diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 65f6c94..6fd2fe2 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -1542,7 +1542,7 @@ iscsi_bsg_host_add(struct Scsi_Host *shost, struct iscsi_cls_host *ihost) return -ENOTSUPP; snprintf(bsg_name, sizeof(bsg_name), "iscsi_host%d", shost->host_no); - q = bsg_setup_queue(dev, bsg_name, iscsi_bsg_host_dispatch, 0, NULL); + q = bsg_setup_queue(dev, bsg_name, iscsi_bsg_host_dispatch, 0); if (IS_ERR(q)) { shost_printk(KERN_ERR, shost, "bsg interface failed to " "initialize - no request queue\n"); diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c index 08acbab..e2953b4 100644 --- a/drivers/scsi/scsi_transport_sas.c +++ b/drivers/scsi/scsi_transport_sas.c @@ -187,16 +187,6 @@ static int sas_smp_dispatch(struct bsg_job *job) return 0; } -static void sas_host_release(struct device *dev) -{ - struct Scsi_Host *shost = dev_to_shost(dev); - struct sas_host_attrs *sas_host = to_sas_host_attrs(shost); - struct request_queue *q = sas_host->q; - - if (q) - blk_cleanup_queue(q); -} - static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy) { struct request_queue *q; @@ -208,7 +198,7 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy) if (rphy) { q = bsg_setup_queue(&rphy->dev, dev_name(&rphy->dev), - sas_smp_dispatch, 0, NULL); + sas_smp_dispatch, 0); if (IS_ERR(q)) return PTR_ERR(q); rphy->q = q; @@ -217,7 +207,7 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy) snprintf(name, sizeof(name), "sas_host%d", shost->host_no); q = bsg_setup_queue(&shost->shost_gendev, name, - sas_smp_dispatch, 0, sas_host_release); + sas_smp_dispatch, 0); if (IS_ERR(q)) return PTR_ERR(q); to_sas_host_attrs(shost)->q = q; @@ -260,8 +250,11 @@ static int sas_host_remove(struct transport_container *tc, struct device *dev, struct Scsi_Host *shost = dev_to_shost(dev); struct request_queue *q = to_sas_host_attrs(shost)->q; - if (q) + if (q) { bsg_unregister_queue(q); + blk_cleanup_queue(q); + } + return 0; } diff --git a/drivers/scsi/scsi_transport_srp.c b/drivers/scsi/scsi_transport_srp.c index 456ce9f..4e46fdb 100644 --- a/drivers/scsi/scsi_transport_srp.c +++ b/drivers/scsi/scsi_transport_srp.c @@ -604,7 +604,7 @@ EXPORT_SYMBOL(srp_reconnect_rport); * * If a timeout occurs while an rport is in the blocked state, ask the SCSI * EH to continue waiting (BLK_EH_RESET_TIMER). Otherwise let the SCSI core - * handle the timeout (BLK_EH_NOT_HANDLED). + * handle the timeout (BLK_EH_DONE). * * Note: This function is called from soft-IRQ context and with the request * queue lock held. @@ -620,7 +620,7 @@ enum blk_eh_timer_return srp_timed_out(struct scsi_cmnd *scmd) return rport && rport->fast_io_fail_tmo < 0 && rport->dev_loss_tmo < 0 && i->f->reset_timer_if_blocked && scsi_device_blocked(sdev) ? - BLK_EH_RESET_TIMER : BLK_EH_NOT_HANDLED; + BLK_EH_RESET_TIMER : BLK_EH_DONE; } EXPORT_SYMBOL(srp_timed_out); diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 5c40d80..b6f174d 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1715,7 +1715,7 @@ sg_start_req(Sg_request *srp, unsigned char *cmd) * does not sleep except under memory pressure. */ rq = blk_get_request(q, hp->dxfer_direction == SG_DXFER_TO_DEV ? - REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, GFP_KERNEL); + REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0); if (IS_ERR(rq)) { kfree(long_cmdp); return PTR_ERR(rq); diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 6c39948..a427ce9 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -545,7 +545,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd, req = blk_get_request(SRpnt->stp->device->request_queue, data_direction == DMA_TO_DEVICE ? - REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, GFP_KERNEL); + REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0); if (IS_ERR(req)) return DRIVER_ERROR << 24; rq = scsi_req(req); diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 00e7905..d0a1674 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -6497,12 +6497,12 @@ static enum blk_eh_timer_return ufshcd_eh_timed_out(struct scsi_cmnd *scmd) bool found = false; if (!scmd || !scmd->device || !scmd->device->host) - return BLK_EH_NOT_HANDLED; + return BLK_EH_DONE; host = scmd->device->host; hba = shost_priv(host); if (!hba) - return BLK_EH_NOT_HANDLED; + return BLK_EH_DONE; spin_lock_irqsave(host->host_lock, flags); @@ -6520,7 +6520,7 @@ static enum blk_eh_timer_return ufshcd_eh_timed_out(struct scsi_cmnd *scmd) * SCSI command was not actually dispatched to UFS driver, otherwise * let SCSI layer handle the error as usual. */ - return found ? BLK_EH_NOT_HANDLED : BLK_EH_RESET_TIMER; + return found ? BLK_EH_DONE : BLK_EH_RESET_TIMER; } static const struct attribute_group *ufshcd_driver_groups[] = { diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 6042901..ce1321a 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -94,8 +94,8 @@ static int iblock_configure_device(struct se_device *dev) return -EINVAL; } - ib_dev->ibd_bio_set = bioset_create(IBLOCK_BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); - if (!ib_dev->ibd_bio_set) { + ret = bioset_init(&ib_dev->ibd_bio_set, IBLOCK_BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); + if (ret) { pr_err("IBLOCK: Unable to create bioset\n"); goto out; } @@ -141,7 +141,7 @@ static int iblock_configure_device(struct se_device *dev) bi = bdev_get_integrity(bd); if (bi) { - struct bio_set *bs = ib_dev->ibd_bio_set; + struct bio_set *bs = &ib_dev->ibd_bio_set; if (!strcmp(bi->profile->name, "T10-DIF-TYPE3-IP") || !strcmp(bi->profile->name, "T10-DIF-TYPE1-IP")) { @@ -164,7 +164,7 @@ static int iblock_configure_device(struct se_device *dev) goto out_blkdev_put; } pr_debug("IBLOCK setup BIP bs->bio_integrity_pool: %p\n", - bs->bio_integrity_pool); + &bs->bio_integrity_pool); } dev->dev_attrib.hw_pi_prot_type = dev->dev_attrib.pi_prot_type; } @@ -174,8 +174,7 @@ static int iblock_configure_device(struct se_device *dev) out_blkdev_put: blkdev_put(ib_dev->ibd_bd, FMODE_WRITE|FMODE_READ|FMODE_EXCL); out_free_bioset: - bioset_free(ib_dev->ibd_bio_set); - ib_dev->ibd_bio_set = NULL; + bioset_exit(&ib_dev->ibd_bio_set); out: return ret; } @@ -199,8 +198,7 @@ static void iblock_destroy_device(struct se_device *dev) if (ib_dev->ibd_bd != NULL) blkdev_put(ib_dev->ibd_bd, FMODE_WRITE|FMODE_READ|FMODE_EXCL); - if (ib_dev->ibd_bio_set != NULL) - bioset_free(ib_dev->ibd_bio_set); + bioset_exit(&ib_dev->ibd_bio_set); } static unsigned long long iblock_emulate_read_cap_with_block_size( @@ -332,7 +330,7 @@ iblock_get_bio(struct se_cmd *cmd, sector_t lba, u32 sg_num, int op, if (sg_num > BIO_MAX_PAGES) sg_num = BIO_MAX_PAGES; - bio = bio_alloc_bioset(GFP_NOIO, sg_num, ib_dev->ibd_bio_set); + bio = bio_alloc_bioset(GFP_NOIO, sg_num, &ib_dev->ibd_bio_set); if (!bio) { pr_err("Unable to allocate memory for bio\n"); return NULL; diff --git a/drivers/target/target_core_iblock.h b/drivers/target/target_core_iblock.h index b4aeb25..9cc3843 100644 --- a/drivers/target/target_core_iblock.h +++ b/drivers/target/target_core_iblock.h @@ -22,7 +22,7 @@ struct iblock_dev { struct se_device dev; unsigned char ibd_udev_path[SE_UDEV_PATH_LEN]; u32 ibd_flags; - struct bio_set *ibd_bio_set; + struct bio_set ibd_bio_set; struct block_device *ibd_bd; bool ibd_readonly; } ____cacheline_aligned; diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 6cb933e..668934e 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -986,8 +986,7 @@ pscsi_execute_cmd(struct se_cmd *cmd) req = blk_get_request(pdv->pdv_sd->request_queue, cmd->data_direction == DMA_TO_DEVICE ? - REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, - GFP_KERNEL); + REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0); if (IS_ERR(req)) { pr_err("PSCSI: blk_get_request() failed\n"); ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; diff --git a/fs/block_dev.c b/fs/block_dev.c index 7ec920e..bef6934 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -272,7 +272,7 @@ struct blkdev_dio { struct bio bio; }; -static struct bio_set *blkdev_dio_pool __read_mostly; +static struct bio_set blkdev_dio_pool; static void blkdev_bio_end_io(struct bio *bio) { @@ -334,7 +334,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) (bdev_logical_block_size(bdev) - 1)) return -EINVAL; - bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, blkdev_dio_pool); + bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool); bio_get(bio); /* extra ref for the completion handler */ dio = container_of(bio, struct blkdev_dio, bio); @@ -432,10 +432,7 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static __init int blkdev_init(void) { - blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS); - if (!blkdev_dio_pool) - return -ENOMEM; - return 0; + return bioset_init(&blkdev_dio_pool, 4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS); } module_init(blkdev_init); @@ -1322,27 +1319,30 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty) * check_disk_size_change - checks for disk size change and adjusts bdev size. * @disk: struct gendisk to check * @bdev: struct bdev to adjust. + * @verbose: if %true log a message about a size change if there is any * * This routine checks to see if the bdev size does not match the disk size * and adjusts it if it differs. When shrinking the bdev size, its all caches * are freed. */ -void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) +void check_disk_size_change(struct gendisk *disk, struct block_device *bdev, + bool verbose) { loff_t disk_size, bdev_size; disk_size = (loff_t)get_capacity(disk) << 9; bdev_size = i_size_read(bdev->bd_inode); if (disk_size != bdev_size) { - printk(KERN_INFO - "%s: detected capacity change from %lld to %lld\n", - disk->disk_name, bdev_size, disk_size); + if (verbose) { + printk(KERN_INFO + "%s: detected capacity change from %lld to %lld\n", + disk->disk_name, bdev_size, disk_size); + } i_size_write(bdev->bd_inode, disk_size); if (bdev_size > disk_size) flush_disk(bdev, false); } } -EXPORT_SYMBOL(check_disk_size_change); /** * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back @@ -1364,7 +1364,7 @@ int revalidate_disk(struct gendisk *disk) return ret; mutex_lock(&bdev->bd_mutex); - check_disk_size_change(disk, bdev); + check_disk_size_change(disk, bdev, ret == 0); bdev->bd_invalidated = 0; mutex_unlock(&bdev->bd_mutex); bdput(bdev); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e99b329..56d32bb 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -26,7 +26,7 @@ static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; -static struct bio_set *btrfs_bioset; +static struct bio_set btrfs_bioset; static inline bool extent_state_in_tree(const struct extent_state *state) { @@ -162,20 +162,18 @@ int __init extent_io_init(void) if (!extent_buffer_cache) goto free_state_cache; - btrfs_bioset = bioset_create(BIO_POOL_SIZE, - offsetof(struct btrfs_io_bio, bio), - BIOSET_NEED_BVECS); - if (!btrfs_bioset) + if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_io_bio, bio), + BIOSET_NEED_BVECS)) goto free_buffer_cache; - if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE)) + if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE)) goto free_bioset; return 0; free_bioset: - bioset_free(btrfs_bioset); - btrfs_bioset = NULL; + bioset_exit(&btrfs_bioset); free_buffer_cache: kmem_cache_destroy(extent_buffer_cache); @@ -198,8 +196,7 @@ void __cold extent_io_exit(void) rcu_barrier(); kmem_cache_destroy(extent_state_cache); kmem_cache_destroy(extent_buffer_cache); - if (btrfs_bioset) - bioset_free(btrfs_bioset); + bioset_exit(&btrfs_bioset); } void extent_io_tree_init(struct extent_io_tree *tree, @@ -2679,7 +2676,7 @@ struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte) { struct bio *bio; - bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, btrfs_bioset); + bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset); bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = first_byte >> 9; btrfs_io_bio_init(btrfs_io_bio(bio)); @@ -2692,7 +2689,7 @@ struct bio *btrfs_bio_clone(struct bio *bio) struct bio *new; /* Bio allocation backed by a bioset does not fail */ - new = bio_clone_fast(bio, GFP_NOFS, btrfs_bioset); + new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset); btrfs_bio = btrfs_io_bio(new); btrfs_io_bio_init(btrfs_bio); btrfs_bio->iter = bio->bi_iter; @@ -2704,7 +2701,7 @@ struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs) struct bio *bio; /* Bio allocation backed by a bioset does not fail */ - bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, btrfs_bioset); + bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset); btrfs_io_bio_init(btrfs_io_bio(bio)); return bio; } @@ -2715,7 +2712,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) struct btrfs_io_bio *btrfs_bio; /* this will never fail when it's backed by a bioset */ - bio = bio_clone_fast(orig, GFP_NOFS, btrfs_bioset); + bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset); ASSERT(bio); btrfs_bio = btrfs_io_bio(bio); diff --git a/fs/direct-io.c b/fs/direct-io.c index 874607b..093fb54 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -432,8 +432,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, struct bio *bio; /* - * bio_alloc() is guaranteed to return a bio when called with - * __GFP_RECLAIM and we request a valid number of vectors. + * bio_alloc() is guaranteed to return a bio when allowed to sleep and + * we request a valid number of vectors. */ bio = bio_alloc(GFP_KERNEL, nr_vecs); diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 3c6a9c1..ddbf872 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -790,7 +790,7 @@ int ore_create(struct ore_io_state *ios) for (i = 0; i < ios->oc->numdevs; i++) { struct osd_request *or; - or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); + or = osd_start_request(_ios_od(ios, i)); if (unlikely(!or)) { ORE_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -815,7 +815,7 @@ int ore_remove(struct ore_io_state *ios) for (i = 0; i < ios->oc->numdevs; i++) { struct osd_request *or; - or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); + or = osd_start_request(_ios_od(ios, i)); if (unlikely(!or)) { ORE_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -847,7 +847,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; struct osd_request *or; - or = osd_start_request(_ios_od(ios, dev), GFP_KERNEL); + or = osd_start_request(_ios_od(ios, dev)); if (unlikely(!or)) { ORE_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -966,7 +966,7 @@ int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp) return 0; /* Just an empty slot */ first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1; - or = osd_start_request(_ios_od(ios, first_dev), GFP_KERNEL); + or = osd_start_request(_ios_od(ios, first_dev)); if (unlikely(!or)) { ORE_ERR("%s: osd_start_request failed\n", __func__); return -ENOMEM; @@ -1060,7 +1060,7 @@ static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp, struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; struct osd_request *or; - or = osd_start_request(_ios_od(ios, cur_comp), GFP_KERNEL); + or = osd_start_request(_ios_od(ios, cur_comp)); if (unlikely(!or)) { ORE_ERR("%s: osd_start_request failed\n", __func__); return -ENOMEM; diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 179cd5c..719a315 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -229,7 +229,7 @@ void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) static int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, u64 offset, void *p, unsigned length) { - struct osd_request *or = osd_start_request(od, GFP_KERNEL); + struct osd_request *or = osd_start_request(od); /* struct osd_sense_info osi = {.key = 0};*/ int ret; diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 70b8bf7..a43dfed 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -227,7 +227,7 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev, if (!buf) return -ENOMEM; - rq = blk_get_request(q, REQ_OP_SCSI_IN, GFP_KERNEL); + rq = blk_get_request(q, REQ_OP_SCSI_IN, 0); if (IS_ERR(rq)) { error = -ENOMEM; goto out_free_buf; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 0ab824f..1024635 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -594,7 +594,7 @@ xfs_alloc_ioend( struct xfs_ioend *ioend; struct bio *bio; - bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset); + bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset); xfs_init_bio_from_bh(bio, bh); ioend = container_of(bio, struct xfs_ioend, io_inline_bio); diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 69346d4..694c85b 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -18,7 +18,7 @@ #ifndef __XFS_AOPS_H__ #define __XFS_AOPS_H__ -extern struct bio_set *xfs_ioend_bioset; +extern struct bio_set xfs_ioend_bioset; /* * Types of I/O for bmap clustering and I/O completion tracking. diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index d714240..f643d76 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -63,7 +63,7 @@ #include <linux/parser.h> static const struct super_operations xfs_super_operations; -struct bio_set *xfs_ioend_bioset; +struct bio_set xfs_ioend_bioset; static struct kset *xfs_kset; /* top-level xfs sysfs dir */ #ifdef DEBUG @@ -1845,10 +1845,9 @@ MODULE_ALIAS_FS("xfs"); STATIC int __init xfs_init_zones(void) { - xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE, + if (bioset_init(&xfs_ioend_bioset, 4 * MAX_BUF_PER_PAGE, offsetof(struct xfs_ioend, io_inline_bio), - BIOSET_NEED_BVECS); - if (!xfs_ioend_bioset) + BIOSET_NEED_BVECS)) goto out; xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t), @@ -1997,7 +1996,7 @@ xfs_init_zones(void) out_destroy_log_ticket_zone: kmem_zone_destroy(xfs_log_ticket_zone); out_free_ioend_bioset: - bioset_free(xfs_ioend_bioset); + bioset_exit(&xfs_ioend_bioset); out: return -ENOMEM; } @@ -2029,7 +2028,7 @@ xfs_destroy_zones(void) kmem_zone_destroy(xfs_btree_cur_zone); kmem_zone_destroy(xfs_bmap_free_item_zone); kmem_zone_destroy(xfs_log_ticket_zone); - bioset_free(xfs_ioend_bioset); + bioset_exit(&xfs_ioend_bioset); } STATIC int __init diff --git a/include/linux/bio.h b/include/linux/bio.h index ce547a2..810a8be 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -67,8 +67,12 @@ #define bio_multiple_segments(bio) \ ((bio)->bi_iter.bi_size != bio_iovec(bio).bv_len) -#define bio_sectors(bio) ((bio)->bi_iter.bi_size >> 9) -#define bio_end_sector(bio) ((bio)->bi_iter.bi_sector + bio_sectors((bio))) + +#define bvec_iter_sectors(iter) ((iter).bi_size >> 9) +#define bvec_iter_end_sector(iter) ((iter).bi_sector + bvec_iter_sectors((iter))) + +#define bio_sectors(bio) bvec_iter_sectors((bio)->bi_iter) +#define bio_end_sector(bio) bvec_iter_end_sector((bio)->bi_iter) /* * Return the data direction, READ or WRITE. @@ -406,13 +410,13 @@ static inline struct bio *bio_next_split(struct bio *bio, int sectors, return bio_split(bio, sectors, gfp, bs); } -extern struct bio_set *bioset_create(unsigned int, unsigned int, int flags); enum { BIOSET_NEED_BVECS = BIT(0), BIOSET_NEED_RESCUER = BIT(1), }; -extern void bioset_free(struct bio_set *); -extern mempool_t *biovec_create_pool(int pool_entries); +extern int bioset_init(struct bio_set *, unsigned int, unsigned int, int flags); +extern void bioset_exit(struct bio_set *); +extern int biovec_init_pool(mempool_t *pool, int pool_entries); extern struct bio *bio_alloc_bioset(gfp_t, unsigned int, struct bio_set *); extern void bio_put(struct bio *); @@ -421,11 +425,11 @@ extern void __bio_clone_fast(struct bio *, struct bio *); extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs); -extern struct bio_set *fs_bio_set; +extern struct bio_set fs_bio_set; static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) { - return bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); + return bio_alloc_bioset(gfp_mask, nr_iovecs, &fs_bio_set); } static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) @@ -499,7 +503,10 @@ static inline void bio_flush_dcache_pages(struct bio *bi) } #endif +extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, + struct bio *src, struct bvec_iter *src_iter); extern void bio_copy_data(struct bio *dst, struct bio *src); +extern void bio_list_copy_data(struct bio *dst, struct bio *src); extern void bio_free_pages(struct bio *bio); extern struct bio *bio_copy_user_iov(struct request_queue *, @@ -507,7 +514,13 @@ extern struct bio *bio_copy_user_iov(struct request_queue *, struct iov_iter *, gfp_t); extern int bio_uncopy_user(struct bio *); -void zero_fill_bio(struct bio *bio); +void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter); + +static inline void zero_fill_bio(struct bio *bio) +{ + zero_fill_bio_iter(bio, bio->bi_iter); +} + extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *); extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int); extern unsigned int bvec_nr_vecs(unsigned short idx); @@ -722,11 +735,11 @@ struct bio_set { struct kmem_cache *bio_slab; unsigned int front_pad; - mempool_t *bio_pool; - mempool_t *bvec_pool; + mempool_t bio_pool; + mempool_t bvec_pool; #if defined(CONFIG_BLK_DEV_INTEGRITY) - mempool_t *bio_integrity_pool; - mempool_t *bvec_integrity_pool; + mempool_t bio_integrity_pool; + mempool_t bvec_integrity_pool; #endif /* @@ -745,6 +758,11 @@ struct biovec_slab { struct kmem_cache *slab; }; +static inline bool bioset_initialized(struct bio_set *bs) +{ + return bs->bio_slab != NULL; +} + /* * a small number of entries is fine, not going to be performance critical. * basically we just need to survive diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ebc34a5..fb35517 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -259,7 +259,8 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); void blk_mq_complete_request(struct request *rq); - +bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, + struct bio *bio); bool blk_mq_queue_stopped(struct request_queue *q); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 17b18b9..3c4f390 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -8,6 +8,7 @@ #include <linux/types.h> #include <linux/bvec.h> +#include <linux/ktime.h> struct bio_set; struct bio; @@ -90,10 +91,52 @@ static inline bool blk_path_error(blk_status_t error) return true; } -struct blk_issue_stat { - u64 stat; +/* + * From most significant bit: + * 1 bit: reserved for other usage, see below + * 12 bits: original size of bio + * 51 bits: issue time of bio + */ +#define BIO_ISSUE_RES_BITS 1 +#define BIO_ISSUE_SIZE_BITS 12 +#define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS) +#define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS) +#define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1) +#define BIO_ISSUE_SIZE_MASK \ + (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT) +#define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1)) + +/* Reserved bit for blk-throtl */ +#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63) + +struct bio_issue { + u64 value; }; +static inline u64 __bio_issue_time(u64 time) +{ + return time & BIO_ISSUE_TIME_MASK; +} + +static inline u64 bio_issue_time(struct bio_issue *issue) +{ + return __bio_issue_time(issue->value); +} + +static inline sector_t bio_issue_size(struct bio_issue *issue) +{ + return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT); +} + +static inline void bio_issue_init(struct bio_issue *issue, + sector_t size) +{ + size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1; + issue->value = ((issue->value & BIO_ISSUE_RES_MASK) | + (ktime_get_ns() & BIO_ISSUE_TIME_MASK) | + ((u64)size << BIO_ISSUE_SIZE_SHIFT)); +} + /* * main unit of I/O for the block layer and lower layers (ie drivers and * stacking drivers) @@ -138,7 +181,7 @@ struct bio { struct cgroup_subsys_state *bi_css; #ifdef CONFIG_BLK_DEV_THROTTLING_LOW void *bi_cg_private; - struct blk_issue_stat bi_issue_stat; + struct bio_issue bi_issue; #endif #endif union { @@ -186,6 +229,8 @@ struct bio { * throttling rules. Don't do it again. */ #define BIO_TRACE_COMPLETION 10 /* bio_endio() should trace the final completion * of this bio. */ +#define BIO_QUEUE_ENTERED 11 /* can use blk_queue_enter_live() */ + /* See BVEC_POOL_OFFSET below before adding new flags */ /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5c4eee0..bca3a92 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -125,16 +125,23 @@ typedef __u32 __bitwise req_flags_t; #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) /* The per-zone write lock is held for this request */ #define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) -/* timeout is expired */ -#define RQF_MQ_TIMEOUT_EXPIRED ((__force req_flags_t)(1 << 20)) /* already slept for hybrid poll */ -#define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 21)) +#define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 20)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD) /* + * Request state for blk-mq. + */ +enum mq_rq_state { + MQ_RQ_IDLE = 0, + MQ_RQ_IN_FLIGHT = 1, + MQ_RQ_COMPLETE = 2, +}; + +/* * Try to put the fields that are referenced together in the same cacheline. * * If you modify this structure, make sure to update blk_rq_init() and @@ -205,9 +212,20 @@ struct request { struct gendisk *rq_disk; struct hd_struct *part; - unsigned long start_time; - struct blk_issue_stat issue_stat; - /* Number of scatter-gather DMA addr+len pairs after + /* Time that I/O was submitted to the kernel. */ + u64 start_time_ns; + /* Time that I/O was submitted to the device. */ + u64 io_start_time_ns; + +#ifdef CONFIG_BLK_WBT + unsigned short wbt_flags; +#endif +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW + unsigned short throtl_size; +#endif + + /* + * Number of scatter-gather DMA addr+len pairs after * physical address coalescing is performed. */ unsigned short nr_phys_segments; @@ -219,32 +237,14 @@ struct request { unsigned short write_hint; unsigned short ioprio; - unsigned int timeout; - void *special; /* opaque pointer available for LLD use */ unsigned int extra_len; /* length of alignment and padding */ - /* - * On blk-mq, the lower bits of ->gstate (generation number and - * state) carry the MQ_RQ_* state value and the upper bits the - * generation number which is monotonically incremented and used to - * distinguish the reuse instances. - * - * ->gstate_seq allows updates to ->gstate and other fields - * (currently ->deadline) during request start to be read - * atomically from the timeout path, so that it can operate on a - * coherent set of information. - */ - seqcount_t gstate_seq; - u64 gstate; + enum mq_rq_state state; + refcount_t ref; - /* - * ->aborted_gstate is used by the timeout to claim a specific - * recycle instance of this request. See blk_mq_timeout_work(). - */ - struct u64_stats_sync aborted_gstate_sync; - u64 aborted_gstate; + unsigned int timeout; /* access through blk_rq_set_deadline, blk_rq_deadline */ unsigned long __deadline; @@ -267,8 +267,6 @@ struct request { #ifdef CONFIG_BLK_CGROUP struct request_list *rl; /* rl this rq is alloced from */ - unsigned long long start_time_ns; - unsigned long long io_start_time_ns; /* when passed to hardware */ #endif }; @@ -328,9 +326,8 @@ typedef int (init_rq_fn)(struct request_queue *, struct request *, gfp_t); typedef void (exit_rq_fn)(struct request_queue *, struct request *); enum blk_eh_timer_return { - BLK_EH_NOT_HANDLED, - BLK_EH_HANDLED, - BLK_EH_RESET_TIMER, + BLK_EH_DONE, /* drivers has completed the command */ + BLK_EH_RESET_TIMER, /* reset timer and try again */ }; typedef enum blk_eh_timer_return (rq_timed_out_fn)(struct request *); @@ -655,7 +652,7 @@ struct request_queue { struct blk_mq_tag_set *tag_set; struct list_head tag_set_list; - struct bio_set *bio_split; + struct bio_set bio_split; #ifdef CONFIG_BLK_DEBUG_FS struct dentry *debugfs_dir; @@ -967,11 +964,8 @@ extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_init_request_from_bio(struct request *req, struct bio *bio); extern void blk_put_request(struct request *); extern void __blk_put_request(struct request_queue *, struct request *); -extern struct request *blk_get_request_flags(struct request_queue *, - unsigned int op, - blk_mq_req_flags_t flags); extern struct request *blk_get_request(struct request_queue *, unsigned int op, - gfp_t gfp_mask); + blk_mq_req_flags_t flags); extern void blk_requeue_request(struct request_queue *, struct request *); extern int blk_lld_busy(struct request_queue *q); extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, @@ -1788,48 +1782,6 @@ int kblockd_schedule_work(struct work_struct *work); int kblockd_schedule_work_on(int cpu, struct work_struct *work); int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); -#ifdef CONFIG_BLK_CGROUP -/* - * This should not be using sched_clock(). A real patch is in progress - * to fix this up, until that is in place we need to disable preemption - * around sched_clock() in this function and set_io_start_time_ns(). - */ -static inline void set_start_time_ns(struct request *req) -{ - preempt_disable(); - req->start_time_ns = sched_clock(); - preempt_enable(); -} - -static inline void set_io_start_time_ns(struct request *req) -{ - preempt_disable(); - req->io_start_time_ns = sched_clock(); - preempt_enable(); -} - -static inline uint64_t rq_start_time_ns(struct request *req) -{ - return req->start_time_ns; -} - -static inline uint64_t rq_io_start_time_ns(struct request *req) -{ - return req->io_start_time_ns; -} -#else -static inline void set_start_time_ns(struct request *req) {} -static inline void set_io_start_time_ns(struct request *req) {} -static inline uint64_t rq_start_time_ns(struct request *req) -{ - return 0; -} -static inline uint64_t rq_io_start_time_ns(struct request *req) -{ - return 0; -} -#endif - #define MODULE_ALIAS_BLOCKDEV(major,minor) \ MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor)) #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \ diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h index 28a7ccc..6aeaf64 100644 --- a/include/linux/bsg-lib.h +++ b/include/linux/bsg-lib.h @@ -72,8 +72,7 @@ struct bsg_job { void bsg_job_done(struct bsg_job *job, int result, unsigned int reply_payload_rcv_len); struct request_queue *bsg_setup_queue(struct device *dev, const char *name, - bsg_job_fn *job_fn, int dd_job_size, - void (*release)(struct device *)); + bsg_job_fn *job_fn, int dd_job_size); void bsg_job_put(struct bsg_job *job); int __must_check bsg_job_get(struct bsg_job *job); diff --git a/include/linux/bsg.h b/include/linux/bsg.h index 0c7dd9c..dac37b6 100644 --- a/include/linux/bsg.h +++ b/include/linux/bsg.h @@ -17,17 +17,13 @@ struct bsg_ops { struct bsg_class_device { struct device *class_dev; - struct device *parent; int minor; struct request_queue *queue; - struct kref ref; const struct bsg_ops *ops; - void (*release)(struct device *); }; int bsg_register_queue(struct request_queue *q, struct device *parent, - const char *name, const struct bsg_ops *ops, - void (*release)(struct device *)); + const char *name, const struct bsg_ops *ops); int bsg_scsi_register_queue(struct request_queue *q, struct device *parent); void bsg_unregister_queue(struct request_queue *q); #else diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 6d9e230..a02deea 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -218,8 +218,6 @@ extern void elv_unregister(struct elevator_type *); extern ssize_t elv_iosched_show(struct request_queue *, char *); extern ssize_t elv_iosched_store(struct request_queue *, const char *, size_t); -extern int elevator_init(struct request_queue *, char *); -extern void elevator_exit(struct request_queue *, struct elevator_queue *); extern bool elv_bio_merge_ok(struct request *, struct bio *); extern struct elevator_queue *elevator_alloc(struct request_queue *, struct elevator_type *); diff --git a/include/linux/fs.h b/include/linux/fs.h index 760d8da..d8d4831 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2570,7 +2570,7 @@ extern bool is_bad_inode(struct inode *); #ifdef CONFIG_BLOCK extern void check_disk_size_change(struct gendisk *disk, - struct block_device *bdev); + struct block_device *bdev, bool verbose); extern int revalidate_disk(struct gendisk *); extern int check_disk_change(struct block_device *); extern int __invalidate_device(struct block_device *, bool); diff --git a/include/linux/libata.h b/include/linux/libata.h index 1795fec..1c11313 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1133,7 +1133,6 @@ extern int ata_sas_port_start(struct ata_port *ap); extern void ata_sas_port_stop(struct ata_port *ap); extern int ata_sas_slave_configure(struct scsi_device *, struct ata_port *); extern int ata_sas_queuecmd(struct scsi_cmnd *cmd, struct ata_port *ap); -extern enum blk_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd); extern int sata_scr_valid(struct ata_link *link); extern int sata_scr_read(struct ata_link *link, int reg, u32 *val); extern int sata_scr_write(struct ata_link *link, int reg, u32 val); @@ -1359,7 +1358,6 @@ extern struct device_attribute *ata_common_sdev_attrs[]; .proc_name = drv_name, \ .slave_configure = ata_scsi_slave_config, \ .slave_destroy = ata_scsi_slave_destroy, \ - .eh_timed_out = ata_scsi_timed_out, \ .bios_param = ata_std_bios_param, \ .unlock_native_capacity = ata_scsi_unlock_native_capacity, \ .sdev_attrs = ata_common_sdev_attrs diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 6e0859b..e9e0d1c 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -489,7 +489,7 @@ typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); typedef sector_t (nvm_tgt_capacity_fn)(void *); typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *, int flags); -typedef void (nvm_tgt_exit_fn)(void *); +typedef void (nvm_tgt_exit_fn)(void *, bool); typedef int (nvm_tgt_sysfs_init_fn)(struct gendisk *); typedef void (nvm_tgt_sysfs_exit_fn)(struct gendisk *); diff --git a/include/linux/mempool.h b/include/linux/mempool.h index b51f5c4..0c964ac 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -25,6 +25,18 @@ typedef struct mempool_s { wait_queue_head_t wait; } mempool_t; +static inline bool mempool_initialized(mempool_t *pool) +{ + return pool->elements != NULL; +} + +void mempool_exit(mempool_t *pool); +int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data, + gfp_t gfp_mask, int node_id); +int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data); + extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data); extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, @@ -43,6 +55,14 @@ extern void mempool_free(void *element, mempool_t *pool); */ void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data); void mempool_free_slab(void *element, void *pool_data); + +static inline int +mempool_init_slab_pool(mempool_t *pool, int min_nr, struct kmem_cache *kc) +{ + return mempool_init(pool, min_nr, mempool_alloc_slab, + mempool_free_slab, (void *) kc); +} + static inline mempool_t * mempool_create_slab_pool(int min_nr, struct kmem_cache *kc) { @@ -56,6 +76,13 @@ mempool_create_slab_pool(int min_nr, struct kmem_cache *kc) */ void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data); void mempool_kfree(void *element, void *pool_data); + +static inline int mempool_init_kmalloc_pool(mempool_t *pool, int min_nr, size_t size) +{ + return mempool_init(pool, min_nr, mempool_kmalloc, + mempool_kfree, (void *) size); +} + static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size) { return mempool_create(min_nr, mempool_kmalloc, mempool_kfree, @@ -68,6 +95,13 @@ static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size) */ void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data); void mempool_free_pages(void *element, void *pool_data); + +static inline int mempool_init_page_pool(mempool_t *pool, int min_nr, int order) +{ + return mempool_init(pool, min_nr, mempool_alloc_pages, + mempool_free_pages, (void *)(long)order); +} + static inline mempool_t *mempool_create_page_pool(int min_nr, int order) { return mempool_create(min_nr, mempool_alloc_pages, mempool_free_pages, diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 4112e2b..2950ce9 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -436,10 +436,19 @@ enum { enum { NVME_AER_ERROR = 0, NVME_AER_SMART = 1, + NVME_AER_NOTICE = 2, NVME_AER_CSS = 6, NVME_AER_VS = 7, - NVME_AER_NOTICE_NS_CHANGED = 0x0002, - NVME_AER_NOTICE_FW_ACT_STARTING = 0x0102, +}; + +enum { + NVME_AER_NOTICE_NS_CHANGED = 0x00, + NVME_AER_NOTICE_FW_ACT_STARTING = 0x01, +}; + +enum { + NVME_AEN_CFG_NS_ATTR = 1 << 8, + NVME_AEN_CFG_FW_ACT = 1 << 9, }; struct nvme_lba_range_type { @@ -747,6 +756,7 @@ enum { NVME_LOG_ERROR = 0x01, NVME_LOG_SMART = 0x02, NVME_LOG_FW_SLOT = 0x03, + NVME_LOG_CHANGED_NS = 0x04, NVME_LOG_CMD_EFFECTS = 0x05, NVME_LOG_DISC = 0x70, NVME_LOG_RESERVATION = 0x80, @@ -755,6 +765,8 @@ enum { NVME_FWACT_ACTV = (2 << 3), }; +#define NVME_MAX_CHANGED_NAMESPACES 1024 + struct nvme_identify { __u8 opcode; __u8 flags; diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h index 93d142a..1746015 100644 --- a/include/linux/pktcdvd.h +++ b/include/linux/pktcdvd.h @@ -186,7 +186,7 @@ struct pktcdvd_device sector_t current_sector; /* Keep track of where the elevator is */ atomic_t scan_queue; /* Set to non-zero when pkt_handle_queue */ /* needs to be run. */ - mempool_t *rb_pool; /* mempool for pkt_rb_node allocations */ + mempool_t rb_pool; /* mempool for pkt_rb_node allocations */ struct packet_iosched iosched; struct gendisk *disk; diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 841585f..e653953 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -127,6 +127,12 @@ struct sbitmap_queue { * @round_robin: Allocate bits in strict round-robin order. */ bool round_robin; + + /** + * @min_shallow_depth: The minimum shallow depth which may be passed to + * sbitmap_queue_get_shallow() or __sbitmap_queue_get_shallow(). + */ + unsigned int min_shallow_depth; }; /** @@ -390,6 +396,9 @@ int __sbitmap_queue_get(struct sbitmap_queue *sbq); * @shallow_depth: The maximum number of bits to allocate from a single word. * See sbitmap_get_shallow(). * + * If you call this, make sure to call sbitmap_queue_min_shallow_depth() after + * initializing @sbq. + * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, @@ -424,6 +433,9 @@ static inline int sbitmap_queue_get(struct sbitmap_queue *sbq, * @shallow_depth: The maximum number of bits to allocate from a single word. * See sbitmap_get_shallow(). * + * If you call this, make sure to call sbitmap_queue_min_shallow_depth() after + * initializing @sbq. + * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ static inline int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, @@ -439,6 +451,23 @@ static inline int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, } /** + * sbitmap_queue_min_shallow_depth() - Inform a &struct sbitmap_queue of the + * minimum shallow depth that will be used. + * @sbq: Bitmap queue in question. + * @min_shallow_depth: The minimum shallow depth that will be passed to + * sbitmap_queue_get_shallow() or __sbitmap_queue_get_shallow(). + * + * sbitmap_queue_clear() batches wakeups as an optimization. The batch size + * depends on the depth of the bitmap. Since the shallow allocation functions + * effectively operate with a different depth, the shallow depth must be taken + * into account when calculating the batch size. This function must be called + * with the minimum shallow depth that will be used. Failure to do so can result + * in missed wakeups. + */ +void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq, + unsigned int min_shallow_depth); + +/** * sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a * &struct sbitmap_queue. * @sbq: Bitmap to free from. @@ -484,6 +513,13 @@ static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq, void sbitmap_queue_wake_all(struct sbitmap_queue *sbq); /** + * sbitmap_queue_wake_up() - Wake up some of waiters in one waitqueue + * on a &struct sbitmap_queue. + * @sbq: Bitmap queue to wake up. + */ +void sbitmap_queue_wake_up(struct sbitmap_queue *sbq); + +/** * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct * seq_file. * @sbq: Bitmap queue to show. diff --git a/include/scsi/osd_initiator.h b/include/scsi/osd_initiator.h index a29d308..86a569d 100644 --- a/include/scsi/osd_initiator.h +++ b/include/scsi/osd_initiator.h @@ -148,7 +148,6 @@ struct osd_request { u8 *pad_buff; } out, in; - gfp_t alloc_flags; unsigned timeout; unsigned retries; unsigned sense_len; @@ -202,14 +201,11 @@ static inline bool osd_req_is_ver1(struct osd_request *or) * * @osd_dev: OSD device that holds the scsi-device and default values * that the request is associated with. - * @gfp: The allocation flags to use for request allocation, and all - * subsequent allocations. This will be stored at - * osd_request->alloc_flags, can be changed by user later * * Allocate osd_request and initialize all members to the * default/initial state. */ -struct osd_request *osd_start_request(struct osd_dev *od, gfp_t gfp); +struct osd_request *osd_start_request(struct osd_dev *od); enum osd_req_options { OSD_REQ_FUA = 0x08, /* Force Unit Access */ diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index 12f454c..53b485f 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -307,7 +307,7 @@ struct scsi_host_template { * EH_HANDLED: I fixed the error, please complete the command * EH_RESET_TIMER: I need more time, reset the timer and * begin counting again - * EH_NOT_HANDLED Begin normal error recovery + * EH_DONE: Begin normal error recovery * * Status: OPTIONAL */ diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 11b4282..1efcb5b 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -269,7 +269,7 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, struct bio *bio; int error = 0; - bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1); + bio = bio_alloc(GFP_NOIO | __GFP_HIGH, 1); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); bio_set_dev(bio, hib_resume_bdev); bio_set_op_attrs(bio, op, op_flags); @@ -376,7 +376,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) return -ENOSPC; if (hb) { - src = (void *)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN | + src = (void *)__get_free_page(GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); if (src) { copy_page(src, buf); @@ -384,7 +384,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) ret = hib_wait_io(hb); /* Free pages */ if (ret) return ret; - src = (void *)__get_free_page(__GFP_RECLAIM | + src = (void *)__get_free_page(GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); if (src) { @@ -691,7 +691,7 @@ static int save_image_lzo(struct swap_map_handle *handle, nr_threads = num_online_cpus() - 1; nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); - page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH); + page = (void *)__get_free_page(GFP_NOIO | __GFP_HIGH); if (!page) { pr_err("Failed to allocate LZO page\n"); ret = -ENOMEM; @@ -989,7 +989,7 @@ static int get_swap_reader(struct swap_map_handle *handle, last = tmp; tmp->map = (struct swap_map_page *) - __get_free_page(__GFP_RECLAIM | __GFP_HIGH); + __get_free_page(GFP_NOIO | __GFP_HIGH); if (!tmp->map) { release_swap_reader(handle); return -ENOMEM; @@ -1261,8 +1261,8 @@ static int load_image_lzo(struct swap_map_handle *handle, for (i = 0; i < read_pages; i++) { page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? - __GFP_RECLAIM | __GFP_HIGH : - __GFP_RECLAIM | __GFP_NOWARN | + GFP_NOIO | __GFP_HIGH : + GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); if (!page[i]) { diff --git a/lib/sbitmap.c b/lib/sbitmap.c index e6a9c06..6fdc626 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -270,18 +270,33 @@ void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m) } EXPORT_SYMBOL_GPL(sbitmap_bitmap_show); -static unsigned int sbq_calc_wake_batch(unsigned int depth) +static unsigned int sbq_calc_wake_batch(struct sbitmap_queue *sbq, + unsigned int depth) { unsigned int wake_batch; + unsigned int shallow_depth; /* * For each batch, we wake up one queue. We need to make sure that our - * batch size is small enough that the full depth of the bitmap is - * enough to wake up all of the queues. + * batch size is small enough that the full depth of the bitmap, + * potentially limited by a shallow depth, is enough to wake up all of + * the queues. + * + * Each full word of the bitmap has bits_per_word bits, and there might + * be a partial word. There are depth / bits_per_word full words and + * depth % bits_per_word bits left over. In bitwise arithmetic: + * + * bits_per_word = 1 << shift + * depth / bits_per_word = depth >> shift + * depth % bits_per_word = depth & ((1 << shift) - 1) + * + * Each word can be limited to sbq->min_shallow_depth bits. */ - wake_batch = SBQ_WAKE_BATCH; - if (wake_batch > depth / SBQ_WAIT_QUEUES) - wake_batch = max(1U, depth / SBQ_WAIT_QUEUES); + shallow_depth = min(1U << sbq->sb.shift, sbq->min_shallow_depth); + depth = ((depth >> sbq->sb.shift) * shallow_depth + + min(depth & ((1U << sbq->sb.shift) - 1), shallow_depth)); + wake_batch = clamp_t(unsigned int, depth / SBQ_WAIT_QUEUES, 1, + SBQ_WAKE_BATCH); return wake_batch; } @@ -307,7 +322,8 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, *per_cpu_ptr(sbq->alloc_hint, i) = prandom_u32() % depth; } - sbq->wake_batch = sbq_calc_wake_batch(depth); + sbq->min_shallow_depth = UINT_MAX; + sbq->wake_batch = sbq_calc_wake_batch(sbq, depth); atomic_set(&sbq->wake_index, 0); sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node); @@ -327,21 +343,28 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, } EXPORT_SYMBOL_GPL(sbitmap_queue_init_node); -void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth) +static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, + unsigned int depth) { - unsigned int wake_batch = sbq_calc_wake_batch(depth); + unsigned int wake_batch = sbq_calc_wake_batch(sbq, depth); int i; if (sbq->wake_batch != wake_batch) { WRITE_ONCE(sbq->wake_batch, wake_batch); /* - * Pairs with the memory barrier in sbq_wake_up() to ensure that - * the batch size is updated before the wait counts. + * Pairs with the memory barrier in sbitmap_queue_wake_up() + * to ensure that the batch size is updated before the wait + * counts. */ smp_mb__before_atomic(); for (i = 0; i < SBQ_WAIT_QUEUES; i++) atomic_set(&sbq->ws[i].wait_cnt, 1); } +} + +void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth) +{ + sbitmap_queue_update_wake_batch(sbq, depth); sbitmap_resize(&sbq->sb, depth); } EXPORT_SYMBOL_GPL(sbitmap_queue_resize); @@ -380,6 +403,8 @@ int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, unsigned int hint, depth; int nr; + WARN_ON_ONCE(shallow_depth < sbq->min_shallow_depth); + hint = this_cpu_read(*sbq->alloc_hint); depth = READ_ONCE(sbq->sb.depth); if (unlikely(hint >= depth)) { @@ -403,6 +428,14 @@ int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, } EXPORT_SYMBOL_GPL(__sbitmap_queue_get_shallow); +void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq, + unsigned int min_shallow_depth) +{ + sbq->min_shallow_depth = min_shallow_depth; + sbitmap_queue_update_wake_batch(sbq, sbq->sb.depth); +} +EXPORT_SYMBOL_GPL(sbitmap_queue_min_shallow_depth); + static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) { int i, wake_index; @@ -425,52 +458,67 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) return NULL; } -static void sbq_wake_up(struct sbitmap_queue *sbq) +static bool __sbq_wake_up(struct sbitmap_queue *sbq) { struct sbq_wait_state *ws; unsigned int wake_batch; int wait_cnt; - /* - * Pairs with the memory barrier in set_current_state() to ensure the - * proper ordering of clear_bit()/waitqueue_active() in the waker and - * test_and_set_bit_lock()/prepare_to_wait()/finish_wait() in the - * waiter. See the comment on waitqueue_active(). This is __after_atomic - * because we just did clear_bit_unlock() in the caller. - */ - smp_mb__after_atomic(); - ws = sbq_wake_ptr(sbq); if (!ws) - return; + return false; wait_cnt = atomic_dec_return(&ws->wait_cnt); if (wait_cnt <= 0) { + int ret; + wake_batch = READ_ONCE(sbq->wake_batch); + /* * Pairs with the memory barrier in sbitmap_queue_resize() to * ensure that we see the batch size update before the wait * count is reset. */ smp_mb__before_atomic(); + /* - * If there are concurrent callers to sbq_wake_up(), the last - * one to decrement the wait count below zero will bump it back - * up. If there is a concurrent resize, the count reset will - * either cause the cmpxchg to fail or overwrite after the - * cmpxchg. + * For concurrent callers of this, the one that failed the + * atomic_cmpxhcg() race should call this function again + * to wakeup a new batch on a different 'ws'. */ - atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wait_cnt + wake_batch); - sbq_index_atomic_inc(&sbq->wake_index); - wake_up_nr(&ws->wait, wake_batch); + ret = atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wake_batch); + if (ret == wait_cnt) { + sbq_index_atomic_inc(&sbq->wake_index); + wake_up_nr(&ws->wait, wake_batch); + return false; + } + + return true; } + + return false; +} + +void sbitmap_queue_wake_up(struct sbitmap_queue *sbq) +{ + while (__sbq_wake_up(sbq)) + ; } +EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up); void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, unsigned int cpu) { sbitmap_clear_bit_unlock(&sbq->sb, nr); - sbq_wake_up(sbq); + /* + * Pairs with the memory barrier in set_current_state() to ensure the + * proper ordering of clear_bit_unlock()/waitqueue_active() in the waker + * and test_and_set_bit_lock()/prepare_to_wait()/finish_wait() in the + * waiter. See the comment on waitqueue_active(). + */ + smp_mb__after_atomic(); + sbitmap_queue_wake_up(sbq); + if (likely(!sbq->round_robin && nr < sbq->sb.depth)) *per_cpu_ptr(sbq->alloc_hint, cpu) = nr; } @@ -482,7 +530,7 @@ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq) /* * Pairs with the memory barrier in set_current_state() like in - * sbq_wake_up(). + * sbitmap_queue_wake_up(). */ smp_mb(); wake_index = atomic_read(&sbq->wake_index); @@ -528,5 +576,6 @@ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m) seq_puts(m, "}\n"); seq_printf(m, "round_robin=%d\n", sbq->round_robin); + seq_printf(m, "min_shallow_depth=%u\n", sbq->min_shallow_depth); } EXPORT_SYMBOL_GPL(sbitmap_queue_show); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 7441bd9..8fe3ebd 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -412,6 +412,7 @@ static void wb_exit(struct bdi_writeback *wb) * protected. */ static DEFINE_SPINLOCK(cgwb_lock); +static struct workqueue_struct *cgwb_release_wq; /** * wb_congested_get_create - get or create a wb_congested @@ -522,7 +523,7 @@ static void cgwb_release(struct percpu_ref *refcnt) { struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback, refcnt); - schedule_work(&wb->release_work); + queue_work(cgwb_release_wq, &wb->release_work); } static void cgwb_kill(struct bdi_writeback *wb) @@ -784,6 +785,21 @@ static void cgwb_bdi_register(struct backing_dev_info *bdi) spin_unlock_irq(&cgwb_lock); } +static int __init cgwb_init(void) +{ + /* + * There can be many concurrent release work items overwhelming + * system_wq. Put them in a separate wq and limit concurrency. + * There's no point in executing many of these in parallel. + */ + cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1); + if (!cgwb_release_wq) + return -ENOMEM; + + return 0; +} +subsys_initcall(cgwb_init); + #else /* CONFIG_CGROUP_WRITEBACK */ static int cgwb_bdi_init(struct backing_dev_info *bdi) diff --git a/mm/mempool.c b/mm/mempool.c index 5c9dce3..b54f2c2 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -138,6 +138,28 @@ static void *remove_element(mempool_t *pool, gfp_t flags) } /** + * mempool_exit - exit a mempool initialized with mempool_init() + * @pool: pointer to the memory pool which was initialized with + * mempool_init(). + * + * Free all reserved elements in @pool and @pool itself. This function + * only sleeps if the free_fn() function sleeps. + * + * May be called on a zeroed but uninitialized mempool (i.e. allocated with + * kzalloc()). + */ +void mempool_exit(mempool_t *pool) +{ + while (pool->curr_nr) { + void *element = remove_element(pool, GFP_KERNEL); + pool->free(element, pool->pool_data); + } + kfree(pool->elements); + pool->elements = NULL; +} +EXPORT_SYMBOL(mempool_exit); + +/** * mempool_destroy - deallocate a memory pool * @pool: pointer to the memory pool which was allocated via * mempool_create(). @@ -150,15 +172,65 @@ void mempool_destroy(mempool_t *pool) if (unlikely(!pool)) return; - while (pool->curr_nr) { - void *element = remove_element(pool, GFP_KERNEL); - pool->free(element, pool->pool_data); - } - kfree(pool->elements); + mempool_exit(pool); kfree(pool); } EXPORT_SYMBOL(mempool_destroy); +int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data, + gfp_t gfp_mask, int node_id) +{ + spin_lock_init(&pool->lock); + pool->min_nr = min_nr; + pool->pool_data = pool_data; + pool->alloc = alloc_fn; + pool->free = free_fn; + init_waitqueue_head(&pool->wait); + + pool->elements = kmalloc_array_node(min_nr, sizeof(void *), + gfp_mask, node_id); + if (!pool->elements) + return -ENOMEM; + + /* + * First pre-allocate the guaranteed number of buffers. + */ + while (pool->curr_nr < pool->min_nr) { + void *element; + + element = pool->alloc(gfp_mask, pool->pool_data); + if (unlikely(!element)) { + mempool_exit(pool); + return -ENOMEM; + } + add_element(pool, element); + } + + return 0; +} +EXPORT_SYMBOL(mempool_init_node); + +/** + * mempool_init - initialize a memory pool + * @min_nr: the minimum number of elements guaranteed to be + * allocated for this pool. + * @alloc_fn: user-defined element-allocation function. + * @free_fn: user-defined element-freeing function. + * @pool_data: optional private data available to the user-defined functions. + * + * Like mempool_create(), but initializes the pool in (i.e. embedded in another + * structure). + */ +int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data) +{ + return mempool_init_node(pool, min_nr, alloc_fn, free_fn, + pool_data, GFP_KERNEL, NUMA_NO_NODE); + +} +EXPORT_SYMBOL(mempool_init); + /** * mempool_create - create a memory pool * @min_nr: the minimum number of elements guaranteed to be @@ -186,35 +258,17 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, gfp_t gfp_mask, int node_id) { mempool_t *pool; + pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id); if (!pool) return NULL; - pool->elements = kmalloc_array_node(min_nr, sizeof(void *), - gfp_mask, node_id); - if (!pool->elements) { + + if (mempool_init_node(pool, min_nr, alloc_fn, free_fn, pool_data, + gfp_mask, node_id)) { kfree(pool); return NULL; } - spin_lock_init(&pool->lock); - pool->min_nr = min_nr; - pool->pool_data = pool_data; - init_waitqueue_head(&pool->wait); - pool->alloc = alloc_fn; - pool->free = free_fn; - /* - * First pre-allocate the guaranteed number of buffers. - */ - while (pool->curr_nr < pool->min_nr) { - void *element; - - element = pool->alloc(gfp_mask, pool->pool_data); - if (unlikely(!element)) { - mempool_destroy(pool); - return NULL; - } - add_element(pool, element); - } return pool; } EXPORT_SYMBOL(mempool_create_node); |