diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-01-29 11:51:49 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-01-29 11:51:49 -0800 |
commit | 0a4b6e2f80aad46fb55a5cf7b1664c0aef030ee0 (patch) | |
tree | cefccd67dc1f27bb45830f6b8065dd4a1c05e83b | |
parent | 9697e9da84299d0d715d515dd2cc48f1eceb277d (diff) | |
parent | 796baeeef85a40b3495a907fb7425086e7010102 (diff) | |
download | op-kernel-dev-0a4b6e2f80aad46fb55a5cf7b1664c0aef030ee0.zip op-kernel-dev-0a4b6e2f80aad46fb55a5cf7b1664c0aef030ee0.tar.gz |
Merge branch 'for-4.16/block' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:
"This is the main pull request for block IO related changes for the
4.16 kernel. Nothing major in this pull request, but a good amount of
improvements and fixes all over the map. This contains:
- BFQ improvements, fixes, and cleanups from Angelo, Chiara, and
Paolo.
- Support for SMR zones for deadline and mq-deadline from Damien and
Christoph.
- Set of fixes for bcache by way of Michael Lyle, including fixes
from himself, Kent, Rui, Tang, and Coly.
- Series from Matias for lightnvm with fixes from Hans Holmberg,
Javier, and Matias. Mostly centered around pblk, and the removing
rrpc 1.2 in preparation for supporting 2.0.
- A couple of NVMe pull requests from Christoph. Nothing major in
here, just fixes and cleanups, and support for command tracing from
Johannes.
- Support for blk-throttle for tracking reads and writes separately.
From Joseph Qi. A few cleanups/fixes also for blk-throttle from
Weiping.
- Series from Mike Snitzer that enables dm to register its queue more
logically, something that's alwways been problematic on dm since
it's a stacked device.
- Series from Ming cleaning up some of the bio accessor use, in
preparation for supporting multipage bvecs.
- Various fixes from Ming closing up holes around queue mapping and
quiescing.
- BSD partition fix from Richard Narron, fixing a problem where we
can't mount newer (10/11) FreeBSD partitions.
- Series from Tejun reworking blk-mq timeout handling. The previous
scheme relied on atomic bits, but it had races where we would think
a request had timed out if it to reused at the wrong time.
- null_blk now supports faking timeouts, to enable us to better
exercise and test that functionality separately. From me.
- Kill the separate atomic poll bit in the request struct. After
this, we don't use the atomic bits on blk-mq anymore at all. From
me.
- sgl_alloc/free helpers from Bart.
- Heavily contended tag case scalability improvement from me.
- Various little fixes and cleanups from Arnd, Bart, Corentin,
Douglas, Eryu, Goldwyn, and myself"
* 'for-4.16/block' of git://git.kernel.dk/linux-block: (186 commits)
block: remove smart1,2.h
nvme: add tracepoint for nvme_complete_rq
nvme: add tracepoint for nvme_setup_cmd
nvme-pci: introduce RECONNECTING state to mark initializing procedure
nvme-rdma: remove redundant boolean for inline_data
nvme: don't free uuid pointer before printing it
nvme-pci: Suspend queues after deleting them
bsg: use pr_debug instead of hand crafted macros
blk-mq-debugfs: don't allow write on attributes with seq_operations set
nvme-pci: Fix queue double allocations
block: Set BIO_TRACE_COMPLETION on new bio during split
blk-throttle: use queue_is_rq_based
block: Remove kblockd_schedule_delayed_work{,_on}()
blk-mq: Avoid that blk_mq_delay_run_hw_queue() introduces unintended delays
blk-mq: Rename blk_mq_request_direct_issue() into blk_mq_request_issue_directly()
lib/scatterlist: Fix chaining support in sgl_alloc_order()
blk-throttle: track read and write request individually
block: add bdev_read_only() checks to common helpers
block: fail op_is_write() requests to read-only partitions
blk-throttle: export io_serviced_recursive, io_service_bytes_recursive
...
124 files changed, 3884 insertions, 4729 deletions
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index da1525e..d819dc7 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -775,10 +775,11 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) unsigned long flags; int i; + spin_lock_irqsave(&bfqd->lock, flags); + if (!entity) /* root group */ - return; + goto put_async_queues; - spin_lock_irqsave(&bfqd->lock, flags); /* * Empty all service_trees belonging to this group before * deactivating the group itself. @@ -809,6 +810,8 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) } __bfq_deactivate_entity(entity, false); + +put_async_queues: bfq_put_async_queues(bfqd, bfqg); spin_unlock_irqrestore(&bfqd->lock, flags); diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index bcb6d21..47e6ec7 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -166,6 +166,20 @@ static const int bfq_async_charge_factor = 10; /* Default timeout values, in jiffies, approximating CFQ defaults. */ const int bfq_timeout = HZ / 8; +/* + * Time limit for merging (see comments in bfq_setup_cooperator). Set + * to the slowest value that, in our tests, proved to be effective in + * removing false positives, while not causing true positives to miss + * queue merging. + * + * As can be deduced from the low time limit below, queue merging, if + * successful, happens at the very beggining of the I/O of the involved + * cooperating processes, as a consequence of the arrival of the very + * first requests from each cooperator. After that, there is very + * little chance to find cooperators. + */ +static const unsigned long bfq_merge_time_limit = HZ/10; + static struct kmem_cache *bfq_pool; /* Below this threshold (in ns), we consider thinktime immediate. */ @@ -178,7 +192,7 @@ static struct kmem_cache *bfq_pool; #define BFQQ_SEEK_THR (sector_t)(8 * 100) #define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) #define BFQQ_CLOSE_THR (sector_t)(8 * 1024) -#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) +#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19) /* Min number of samples required to perform peak-rate update */ #define BFQ_RATE_MIN_SAMPLES 32 @@ -195,15 +209,17 @@ static struct kmem_cache *bfq_pool; * interactive applications automatically, using the following formula: * duration = (R / r) * T, where r is the peak rate of the device, and * R and T are two reference parameters. - * In particular, R is the peak rate of the reference device (see below), - * and T is a reference time: given the systems that are likely to be - * installed on the reference device according to its speed class, T is - * about the maximum time needed, under BFQ and while reading two files in - * parallel, to load typical large applications on these systems. - * In practice, the slower/faster the device at hand is, the more/less it - * takes to load applications with respect to the reference device. - * Accordingly, the longer/shorter BFQ grants weight raising to interactive - * applications. + * In particular, R is the peak rate of the reference device (see + * below), and T is a reference time: given the systems that are + * likely to be installed on the reference device according to its + * speed class, T is about the maximum time needed, under BFQ and + * while reading two files in parallel, to load typical large + * applications on these systems (see the comments on + * max_service_from_wr below, for more details on how T is obtained). + * In practice, the slower/faster the device at hand is, the more/less + * it takes to load applications with respect to the reference device. + * Accordingly, the longer/shorter BFQ grants weight raising to + * interactive applications. * * BFQ uses four different reference pairs (R, T), depending on: * . whether the device is rotational or non-rotational; @@ -240,6 +256,60 @@ static int T_slow[2]; static int T_fast[2]; static int device_speed_thresh[2]; +/* + * BFQ uses the above-detailed, time-based weight-raising mechanism to + * privilege interactive tasks. This mechanism is vulnerable to the + * following false positives: I/O-bound applications that will go on + * doing I/O for much longer than the duration of weight + * raising. These applications have basically no benefit from being + * weight-raised at the beginning of their I/O. On the opposite end, + * while being weight-raised, these applications + * a) unjustly steal throughput to applications that may actually need + * low latency; + * b) make BFQ uselessly perform device idling; device idling results + * in loss of device throughput with most flash-based storage, and may + * increase latencies when used purposelessly. + * + * BFQ tries to reduce these problems, by adopting the following + * countermeasure. To introduce this countermeasure, we need first to + * finish explaining how the duration of weight-raising for + * interactive tasks is computed. + * + * For a bfq_queue deemed as interactive, the duration of weight + * raising is dynamically adjusted, as a function of the estimated + * peak rate of the device, so as to be equal to the time needed to + * execute the 'largest' interactive task we benchmarked so far. By + * largest task, we mean the task for which each involved process has + * to do more I/O than for any of the other tasks we benchmarked. This + * reference interactive task is the start-up of LibreOffice Writer, + * and in this task each process/bfq_queue needs to have at most ~110K + * sectors transferred. + * + * This last piece of information enables BFQ to reduce the actual + * duration of weight-raising for at least one class of I/O-bound + * applications: those doing sequential or quasi-sequential I/O. An + * example is file copy. In fact, once started, the main I/O-bound + * processes of these applications usually consume the above 110K + * sectors in much less time than the processes of an application that + * is starting, because these I/O-bound processes will greedily devote + * almost all their CPU cycles only to their target, + * throughput-friendly I/O operations. This is even more true if BFQ + * happens to be underestimating the device peak rate, and thus + * overestimating the duration of weight raising. But, according to + * our measurements, once transferred 110K sectors, these processes + * have no right to be weight-raised any longer. + * + * Basing on the last consideration, BFQ ends weight-raising for a + * bfq_queue if the latter happens to have received an amount of + * service at least equal to the following constant. The constant is + * set to slightly more than 110K, to have a minimum safety margin. + * + * This early ending of weight-raising reduces the amount of time + * during which interactive false positives cause the two problems + * described at the beginning of these comments. + */ +static const unsigned long max_service_from_wr = 120000; + #define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) @@ -403,6 +473,82 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd, } } +/* + * See the comments on bfq_limit_depth for the purpose of + * the depths set in the function. + */ +static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) +{ + bfqd->sb_shift = bt->sb.shift; + + /* + * In-word depths if no bfq_queue is being weight-raised: + * leaving 25% of tags only for sync reads. + * + * In next formulas, right-shift the value + * (1U<<bfqd->sb_shift), instead of computing directly + * (1U<<(bfqd->sb_shift - something)), to be robust against + * any possible value of bfqd->sb_shift, without having to + * limit 'something'. + */ + /* no more than 50% of tags for async I/O */ + bfqd->word_depths[0][0] = max((1U<<bfqd->sb_shift)>>1, 1U); + /* + * no more than 75% of tags for sync writes (25% extra tags + * w.r.t. async I/O, to prevent async I/O from starving sync + * writes) + */ + bfqd->word_depths[0][1] = max(((1U<<bfqd->sb_shift) * 3)>>2, 1U); + + /* + * In-word depths in case some bfq_queue is being weight- + * raised: leaving ~63% of tags for sync reads. This is the + * highest percentage for which, in our tests, application + * start-up times didn't suffer from any regression due to tag + * shortage. + */ + /* no more than ~18% of tags for async I/O */ + bfqd->word_depths[1][0] = max(((1U<<bfqd->sb_shift) * 3)>>4, 1U); + /* no more than ~37% of tags for sync writes (~20% extra tags) */ + bfqd->word_depths[1][1] = max(((1U<<bfqd->sb_shift) * 6)>>4, 1U); +} + +/* + * Async I/O can easily starve sync I/O (both sync reads and sync + * writes), by consuming all tags. Similarly, storms of sync writes, + * such as those that sync(2) may trigger, can starve sync reads. + * Limit depths of async I/O and sync writes so as to counter both + * problems. + */ +static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) +{ + struct blk_mq_tags *tags = blk_mq_tags_from_data(data); + struct bfq_data *bfqd = data->q->elevator->elevator_data; + struct sbitmap_queue *bt; + + if (op_is_sync(op) && !op_is_write(op)) + return; + + if (data->flags & BLK_MQ_REQ_RESERVED) { + if (unlikely(!tags->nr_reserved_tags)) { + WARN_ON_ONCE(1); + return; + } + bt = &tags->breserved_tags; + } else + bt = &tags->bitmap_tags; + + if (unlikely(bfqd->sb_shift != bt->sb.shift)) + bfq_update_depths(bfqd, bt); + + data->shallow_depth = + bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; + + bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", + __func__, bfqd->wr_busy_queues, op_is_sync(op), + data->shallow_depth); +} + static struct bfq_queue * bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, sector_t sector, struct rb_node **ret_parent, @@ -444,6 +590,13 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, return bfqq; } +static bool bfq_too_late_for_merging(struct bfq_queue *bfqq) +{ + return bfqq->service_from_backlogged > 0 && + time_is_before_jiffies(bfqq->first_IO_time + + bfq_merge_time_limit); +} + void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) { struct rb_node **p, *parent; @@ -454,6 +607,14 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfqq->pos_root = NULL; } + /* + * bfqq cannot be merged any longer (see comments in + * bfq_setup_cooperator): no point in adding bfqq into the + * position tree. + */ + if (bfq_too_late_for_merging(bfqq)) + return; + if (bfq_class_idle(bfqq)) return; if (!bfqq->next_rq) @@ -1247,6 +1408,7 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, if (old_wr_coeff == 1 && wr_or_deserves_wr) { /* start a weight-raising period */ if (interactive) { + bfqq->service_from_wr = 0; bfqq->wr_coeff = bfqd->bfq_wr_coeff; bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); } else { @@ -1627,6 +1789,8 @@ static void bfq_remove_request(struct request_queue *q, rb_erase(&bfqq->pos_node, bfqq->pos_root); bfqq->pos_root = NULL; } + } else { + bfq_pos_tree_add_move(bfqd, bfqq); } if (rq->cmd_flags & REQ_META) @@ -1933,6 +2097,9 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) { + if (bfq_too_late_for_merging(new_bfqq)) + return false; + if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || (bfqq->ioprio_class != new_bfqq->ioprio_class)) return false; @@ -1957,20 +2124,6 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, } /* - * If this function returns true, then bfqq cannot be merged. The idea - * is that true cooperation happens very early after processes start - * to do I/O. Usually, late cooperations are just accidental false - * positives. In case bfqq is weight-raised, such false positives - * would evidently degrade latency guarantees for bfqq. - */ -static bool wr_from_too_long(struct bfq_queue *bfqq) -{ - return bfqq->wr_coeff > 1 && - time_is_before_jiffies(bfqq->last_wr_start_finish + - msecs_to_jiffies(100)); -} - -/* * Attempt to schedule a merge of bfqq with the currently in-service * queue or with a close queue among the scheduled queues. Return * NULL if no merge was scheduled, a pointer to the shared bfq_queue @@ -1983,11 +2136,6 @@ static bool wr_from_too_long(struct bfq_queue *bfqq) * to maintain. Besides, in such a critical condition as an out of memory, * the benefits of queue merging may be little relevant, or even negligible. * - * Weight-raised queues can be merged only if their weight-raising - * period has just started. In fact cooperating processes are usually - * started together. Thus, with this filter we avoid false positives - * that would jeopardize low-latency guarantees. - * * WARNING: queue merging may impair fairness among non-weight raised * queues, for at least two reasons: 1) the original weight of a * merged queue may change during the merged state, 2) even being the @@ -2001,12 +2149,24 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, { struct bfq_queue *in_service_bfqq, *new_bfqq; + /* + * Prevent bfqq from being merged if it has been created too + * long ago. The idea is that true cooperating processes, and + * thus their associated bfq_queues, are supposed to be + * created shortly after each other. This is the case, e.g., + * for KVM/QEMU and dump I/O threads. Basing on this + * assumption, the following filtering greatly reduces the + * probability that two non-cooperating processes, which just + * happen to do close I/O for some short time interval, have + * their queues merged by mistake. + */ + if (bfq_too_late_for_merging(bfqq)) + return NULL; + if (bfqq->new_bfqq) return bfqq->new_bfqq; - if (!io_struct || - wr_from_too_long(bfqq) || - unlikely(bfqq == &bfqd->oom_bfqq)) + if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) return NULL; /* If there is only one backlogged queue, don't search. */ @@ -2015,12 +2175,9 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, in_service_bfqq = bfqd->in_service_queue; - if (!in_service_bfqq || in_service_bfqq == bfqq - || wr_from_too_long(in_service_bfqq) || - unlikely(in_service_bfqq == &bfqd->oom_bfqq)) - goto check_scheduled; - - if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && + if (in_service_bfqq && in_service_bfqq != bfqq && + likely(in_service_bfqq != &bfqd->oom_bfqq) && + bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && bfqq->entity.parent == in_service_bfqq->entity.parent && bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); @@ -2032,12 +2189,10 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, * queues. The only thing we need is that the bio/request is not * NULL, as we need it to establish whether a cooperator exists. */ -check_scheduled: new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, bfq_io_struct_pos(io_struct, request)); - if (new_bfqq && !wr_from_too_long(new_bfqq) && - likely(new_bfqq != &bfqd->oom_bfqq) && + if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && bfq_may_be_close_cooperator(bfqq, new_bfqq)) return bfq_setup_merge(bfqq, new_bfqq); @@ -2062,7 +2217,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); if (unlikely(bfq_bfqq_just_created(bfqq) && - !bfq_bfqq_in_large_burst(bfqq))) { + !bfq_bfqq_in_large_burst(bfqq) && + bfqq->bfqd->low_latency)) { /* * bfqq being merged right after being created: bfqq * would have deserved interactive weight raising, but @@ -2917,45 +3073,87 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, * whereas soft_rt_next_start is set to infinity for applications that do * not. * - * Unfortunately, even a greedy application may happen to behave in an - * isochronous way if the CPU load is high. In fact, the application may - * stop issuing requests while the CPUs are busy serving other processes, - * then restart, then stop again for a while, and so on. In addition, if - * the disk achieves a low enough throughput with the request pattern - * issued by the application (e.g., because the request pattern is random - * and/or the device is slow), then the application may meet the above - * bandwidth requirement too. To prevent such a greedy application to be - * deemed as soft real-time, a further rule is used in the computation of - * soft_rt_next_start: soft_rt_next_start must be higher than the current - * time plus the maximum time for which the arrival of a request is waited - * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. - * This filters out greedy applications, as the latter issue instead their - * next request as soon as possible after the last one has been completed - * (in contrast, when a batch of requests is completed, a soft real-time - * application spends some time processing data). + * Unfortunately, even a greedy (i.e., I/O-bound) application may + * happen to meet, occasionally or systematically, both the above + * bandwidth and isochrony requirements. This may happen at least in + * the following circumstances. First, if the CPU load is high. The + * application may stop issuing requests while the CPUs are busy + * serving other processes, then restart, then stop again for a while, + * and so on. The other circumstances are related to the storage + * device: the storage device is highly loaded or reaches a low-enough + * throughput with the I/O of the application (e.g., because the I/O + * is random and/or the device is slow). In all these cases, the + * I/O of the application may be simply slowed down enough to meet + * the bandwidth and isochrony requirements. To reduce the probability + * that greedy applications are deemed as soft real-time in these + * corner cases, a further rule is used in the computation of + * soft_rt_next_start: the return value of this function is forced to + * be higher than the maximum between the following two quantities. + * + * (a) Current time plus: (1) the maximum time for which the arrival + * of a request is waited for when a sync queue becomes idle, + * namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. We + * postpone for a moment the reason for adding a few extra + * jiffies; we get back to it after next item (b). Lower-bounding + * the return value of this function with the current time plus + * bfqd->bfq_slice_idle tends to filter out greedy applications, + * because the latter issue their next request as soon as possible + * after the last one has been completed. In contrast, a soft + * real-time application spends some time processing data, after a + * batch of its requests has been completed. * - * Unfortunately, the last filter may easily generate false positives if - * only bfqd->bfq_slice_idle is used as a reference time interval and one - * or both the following cases occur: - * 1) HZ is so low that the duration of a jiffy is comparable to or higher - * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with - * HZ=100. + * (b) Current value of bfqq->soft_rt_next_start. As pointed out + * above, greedy applications may happen to meet both the + * bandwidth and isochrony requirements under heavy CPU or + * storage-device load. In more detail, in these scenarios, these + * applications happen, only for limited time periods, to do I/O + * slowly enough to meet all the requirements described so far, + * including the filtering in above item (a). These slow-speed + * time intervals are usually interspersed between other time + * intervals during which these applications do I/O at a very high + * speed. Fortunately, exactly because of the high speed of the + * I/O in the high-speed intervals, the values returned by this + * function happen to be so high, near the end of any such + * high-speed interval, to be likely to fall *after* the end of + * the low-speed time interval that follows. These high values are + * stored in bfqq->soft_rt_next_start after each invocation of + * this function. As a consequence, if the last value of + * bfqq->soft_rt_next_start is constantly used to lower-bound the + * next value that this function may return, then, from the very + * beginning of a low-speed interval, bfqq->soft_rt_next_start is + * likely to be constantly kept so high that any I/O request + * issued during the low-speed interval is considered as arriving + * to soon for the application to be deemed as soft + * real-time. Then, in the high-speed interval that follows, the + * application will not be deemed as soft real-time, just because + * it will do I/O at a high speed. And so on. + * + * Getting back to the filtering in item (a), in the following two + * cases this filtering might be easily passed by a greedy + * application, if the reference quantity was just + * bfqd->bfq_slice_idle: + * 1) HZ is so low that the duration of a jiffy is comparable to or + * higher than bfqd->bfq_slice_idle. This happens, e.g., on slow + * devices with HZ=100. The time granularity may be so coarse + * that the approximation, in jiffies, of bfqd->bfq_slice_idle + * is rather lower than the exact value. * 2) jiffies, instead of increasing at a constant rate, may stop increasing * for a while, then suddenly 'jump' by several units to recover the lost * increments. This seems to happen, e.g., inside virtual machines. - * To address this issue, we do not use as a reference time interval just - * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In - * particular we add the minimum number of jiffies for which the filter - * seems to be quite precise also in embedded systems and KVM/QEMU virtual - * machines. + * To address this issue, in the filtering in (a) we do not use as a + * reference time interval just bfqd->bfq_slice_idle, but + * bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the + * minimum number of jiffies for which the filter seems to be quite + * precise also in embedded systems and KVM/QEMU virtual machines. */ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, struct bfq_queue *bfqq) { - return max(bfqq->last_idle_bklogged + - HZ * bfqq->service_from_backlogged / - bfqd->bfq_wr_max_softrt_rate, - jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); + return max3(bfqq->soft_rt_next_start, + bfqq->last_idle_bklogged + + HZ * bfqq->service_from_backlogged / + bfqd->bfq_wr_max_softrt_rate, + jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); } /** @@ -3000,17 +3198,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); /* - * Increase service_from_backlogged before next statement, - * because the possible next invocation of - * bfq_bfqq_charge_time would likely inflate - * entity->service. In contrast, service_from_backlogged must - * contain real service, to enable the soft real-time - * heuristic to correctly compute the bandwidth consumed by - * bfqq. - */ - bfqq->service_from_backlogged += entity->service; - - /* * As above explained, charge slow (typically seeky) and * timed-out queues with the time and not the service * received, to favor sequential workloads. @@ -3535,6 +3722,12 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfqq->entity.prio_changed = 1; } } + if (bfqq->wr_coeff > 1 && + bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time && + bfqq->service_from_wr > max_service_from_wr) { + /* see comments on max_service_from_wr */ + bfq_bfqq_end_wr(bfqq); + } } /* * To improve latency (for this or other queues), immediately @@ -3630,8 +3823,8 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) } /* - * We exploit the put_rq_private hook to decrement - * rq_in_driver, but put_rq_private will not be + * We exploit the bfq_finish_request hook to decrement + * rq_in_driver, but bfq_finish_request will not be * invoked on this request. So, to avoid unbalance, * just start this request, without incrementing * rq_in_driver. As a negative consequence, @@ -3640,14 +3833,14 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) * bfq_schedule_dispatch to be invoked uselessly. * * As for implementing an exact solution, the - * put_request hook, if defined, is probably invoked - * also on this request. So, by exploiting this hook, - * we could 1) increment rq_in_driver here, and 2) - * decrement it in put_request. Such a solution would - * let the value of the counter be always accurate, - * but it would entail using an extra interface - * function. This cost seems higher than the benefit, - * being the frequency of non-elevator-private + * bfq_finish_request hook, if defined, is probably + * invoked also on this request. So, by exploiting + * this hook, we could 1) increment rq_in_driver here, + * and 2) decrement it in bfq_finish_request. Such a + * solution would let the value of the counter be + * always accurate, but it would entail using an extra + * interface function. This cost seems higher than the + * benefit, being the frequency of non-elevator-private * requests very low. */ goto start_rq; @@ -3689,35 +3882,16 @@ exit: return rq; } -static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) -{ - struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - struct request *rq; #if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) - struct bfq_queue *in_serv_queue, *bfqq; - bool waiting_rq, idle_timer_disabled; -#endif - - spin_lock_irq(&bfqd->lock); - -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) - in_serv_queue = bfqd->in_service_queue; - waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); - - rq = __bfq_dispatch_request(hctx); - - idle_timer_disabled = - waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); - -#else - rq = __bfq_dispatch_request(hctx); -#endif - spin_unlock_irq(&bfqd->lock); +static void bfq_update_dispatch_stats(struct request_queue *q, + struct request *rq, + struct bfq_queue *in_serv_queue, + bool idle_timer_disabled) +{ + struct bfq_queue *bfqq = rq ? RQ_BFQQ(rq) : NULL; -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) - bfqq = rq ? RQ_BFQQ(rq) : NULL; if (!idle_timer_disabled && !bfqq) - return rq; + return; /* * rq and bfqq are guaranteed to exist until this function @@ -3732,7 +3906,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) * In addition, the following queue lock guarantees that * bfqq_group(bfqq) exists as well. */ - spin_lock_irq(hctx->queue->queue_lock); + spin_lock_irq(q->queue_lock); if (idle_timer_disabled) /* * Since the idle timer has been disabled, @@ -3751,9 +3925,37 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) bfqg_stats_set_start_empty_time(bfqg); bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); } - spin_unlock_irq(hctx->queue->queue_lock); + spin_unlock_irq(q->queue_lock); +} +#else +static inline void bfq_update_dispatch_stats(struct request_queue *q, + struct request *rq, + struct bfq_queue *in_serv_queue, + bool idle_timer_disabled) {} #endif +static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) +{ + struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; + struct request *rq; + struct bfq_queue *in_serv_queue; + bool waiting_rq, idle_timer_disabled; + + spin_lock_irq(&bfqd->lock); + + in_serv_queue = bfqd->in_service_queue; + waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); + + rq = __bfq_dispatch_request(hctx); + + idle_timer_disabled = + waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); + + spin_unlock_irq(&bfqd->lock); + + bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue, + idle_timer_disabled); + return rq; } @@ -4002,10 +4204,15 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqq->split_time = bfq_smallest_from_now(); /* - * Set to the value for which bfqq will not be deemed as - * soft rt when it becomes backlogged. + * To not forget the possibly high bandwidth consumed by a + * process/queue in the recent past, + * bfq_bfqq_softrt_next_start() returns a value at least equal + * to the current value of bfqq->soft_rt_next_start (see + * comments on bfq_bfqq_softrt_next_start). Set + * soft_rt_next_start to now, to mean that bfqq has consumed + * no bandwidth so far. */ - bfqq->soft_rt_next_start = bfq_greatest_from_now(); + bfqq->soft_rt_next_start = jiffies; /* first request is almost certainly seeky */ bfqq->seek_history = 1; @@ -4276,16 +4483,46 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) return idle_timer_disabled; } +#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) +static void bfq_update_insert_stats(struct request_queue *q, + struct bfq_queue *bfqq, + bool idle_timer_disabled, + unsigned int cmd_flags) +{ + if (!bfqq) + return; + + /* + * bfqq still exists, because it can disappear only after + * either it is merged with another queue, or the process it + * is associated with exits. But both actions must be taken by + * the same process currently executing this flow of + * instructions. + * + * In addition, the following queue lock guarantees that + * bfqq_group(bfqq) exists as well. + */ + spin_lock_irq(q->queue_lock); + bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); + if (idle_timer_disabled) + bfqg_stats_update_idle_time(bfqq_group(bfqq)); + spin_unlock_irq(q->queue_lock); +} +#else +static inline void bfq_update_insert_stats(struct request_queue *q, + struct bfq_queue *bfqq, + bool idle_timer_disabled, + unsigned int cmd_flags) {} +#endif + static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool at_head) { struct request_queue *q = hctx->queue; struct bfq_data *bfqd = q->elevator->elevator_data; -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) struct bfq_queue *bfqq = RQ_BFQQ(rq); bool idle_timer_disabled = false; unsigned int cmd_flags; -#endif spin_lock_irq(&bfqd->lock); if (blk_mq_sched_try_insert_merge(q, rq)) { @@ -4304,7 +4541,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, else list_add_tail(&rq->queuelist, &bfqd->dispatch); } else { -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) idle_timer_disabled = __bfq_insert_request(bfqd, rq); /* * Update bfqq, because, if a queue merge has occurred @@ -4312,9 +4548,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, * redirected into a new queue. */ bfqq = RQ_BFQQ(rq); -#else - __bfq_insert_request(bfqd, rq); -#endif if (rq_mergeable(rq)) { elv_rqhash_add(q, rq); @@ -4323,35 +4556,17 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, } } -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) /* * Cache cmd_flags before releasing scheduler lock, because rq * may disappear afterwards (for example, because of a request * merge). */ cmd_flags = rq->cmd_flags; -#endif + spin_unlock_irq(&bfqd->lock); -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) - if (!bfqq) - return; - /* - * bfqq still exists, because it can disappear only after - * either it is merged with another queue, or the process it - * is associated with exits. But both actions must be taken by - * the same process currently executing this flow of - * instruction. - * - * In addition, the following queue lock guarantees that - * bfqq_group(bfqq) exists as well. - */ - spin_lock_irq(q->queue_lock); - bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); - if (idle_timer_disabled) - bfqg_stats_update_idle_time(bfqq_group(bfqq)); - spin_unlock_irq(q->queue_lock); -#endif + bfq_update_insert_stats(q, bfqq, idle_timer_disabled, + cmd_flags); } static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, @@ -4482,7 +4697,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) bfq_schedule_dispatch(bfqd); } -static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) +static void bfq_finish_request_body(struct bfq_queue *bfqq) { bfqq->allocated--; @@ -4512,7 +4727,7 @@ static void bfq_finish_request(struct request *rq) spin_lock_irqsave(&bfqd->lock, flags); bfq_completed_request(bfqq, bfqd); - bfq_put_rq_priv_body(bfqq); + bfq_finish_request_body(bfqq); spin_unlock_irqrestore(&bfqd->lock, flags); } else { @@ -4533,7 +4748,7 @@ static void bfq_finish_request(struct request *rq) bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); } - bfq_put_rq_priv_body(bfqq); + bfq_finish_request_body(bfqq); } rq->elv.priv[0] = NULL; @@ -4818,6 +5033,9 @@ static void bfq_exit_queue(struct elevator_queue *e) hrtimer_cancel(&bfqd->idle_slice_timer); #ifdef CONFIG_BFQ_GROUP_IOSCHED + /* release oom-queue reference to root group */ + bfqg_and_blkg_put(bfqd->root_group); + blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); #else spin_lock_irq(&bfqd->lock); @@ -5206,6 +5424,7 @@ static struct elv_fs_entry bfq_attrs[] = { static struct elevator_type iosched_bfq_mq = { .ops.mq = { + .limit_depth = bfq_limit_depth, .prepare_request = bfq_prepare_request, .finish_request = bfq_finish_request, .exit_icq = bfq_exit_icq, diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 91c4390..350c39a 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -337,6 +337,11 @@ struct bfq_queue { * last transition from idle to backlogged. */ unsigned long service_from_backlogged; + /* + * Cumulative service received from the @bfq_queue since its + * last transition to weight-raised state. + */ + unsigned long service_from_wr; /* * Value of wr start time when switching to soft rt @@ -344,6 +349,8 @@ struct bfq_queue { unsigned long wr_start_at_switch_to_srt; unsigned long split_time; /* time of last split */ + + unsigned long first_IO_time; /* time of first I/O for this queue */ }; /** @@ -627,6 +634,18 @@ struct bfq_data { struct bfq_io_cq *bio_bic; /* bfqq associated with the task issuing current bio for merging */ struct bfq_queue *bio_bfqq; + + /* + * Cached sbitmap shift, used to compute depth limits in + * bfq_update_depths. + */ + unsigned int sb_shift; + + /* + * Depth limits used in bfq_limit_depth (see comments on the + * function) + */ + unsigned int word_depths[2][2]; }; enum bfqq_state_flags { diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index e495d3f..4498c43 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -835,6 +835,13 @@ void bfq_bfqq_served(struct bfq_queue *bfqq, int served) struct bfq_entity *entity = &bfqq->entity; struct bfq_service_tree *st; + if (!bfqq->service_from_backlogged) + bfqq->first_IO_time = jiffies; + + if (bfqq->wr_coeff > 1) + bfqq->service_from_wr += served; + + bfqq->service_from_backlogged += served; for_each_entity(entity) { st = bfq_entity_service_tree(entity); diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 23b42e8..9cfdd6c 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -374,7 +374,6 @@ static void bio_integrity_verify_fn(struct work_struct *work) /** * __bio_integrity_endio - Integrity I/O completion function * @bio: Protected bio - * @error: Pointer to errno * * Description: Completion for integrity I/O * diff --git a/block/bio.c b/block/bio.c index 9ef6cf3..e1708db 100644 --- a/block/bio.c +++ b/block/bio.c @@ -971,34 +971,6 @@ void bio_advance(struct bio *bio, unsigned bytes) EXPORT_SYMBOL(bio_advance); /** - * bio_alloc_pages - allocates a single page for each bvec in a bio - * @bio: bio to allocate pages for - * @gfp_mask: flags for allocation - * - * Allocates pages up to @bio->bi_vcnt. - * - * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are - * freed. - */ -int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) -{ - int i; - struct bio_vec *bv; - - bio_for_each_segment_all(bv, bio, i) { - bv->bv_page = alloc_page(gfp_mask); - if (!bv->bv_page) { - while (--bv >= bio->bi_io_vec) - __free_page(bv->bv_page); - return -ENOMEM; - } - } - - return 0; -} -EXPORT_SYMBOL(bio_alloc_pages); - -/** * bio_copy_data - copy contents of data buffers from one chain of bios to * another * @src: source bio list @@ -1838,7 +1810,7 @@ struct bio *bio_split(struct bio *bio, int sectors, bio_advance(bio, split->bi_iter.bi_size); if (bio_flagged(bio, BIO_TRACE_COMPLETION)) - bio_set_flag(bio, BIO_TRACE_COMPLETION); + bio_set_flag(split, BIO_TRACE_COMPLETION); return split; } diff --git a/block/blk-core.c b/block/blk-core.c index 3ba4326..a2005a4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -126,6 +126,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->start_time = jiffies; set_start_time_ns(rq); rq->part = NULL; + seqcount_init(&rq->gstate_seq); + u64_stats_init(&rq->aborted_gstate_sync); } EXPORT_SYMBOL(blk_rq_init); @@ -699,6 +701,15 @@ void blk_cleanup_queue(struct request_queue *q) queue_flag_set(QUEUE_FLAG_DEAD, q); spin_unlock_irq(lock); + /* + * make sure all in-progress dispatch are completed because + * blk_freeze_queue() can only complete all requests, and + * dispatch may still be in-progress since we dispatch requests + * from more than one contexts + */ + if (q->mq_ops) + blk_mq_quiesce_queue(q); + /* for synchronous bio-based driver finish in-flight integrity i/o */ blk_flush_integrity(); @@ -1646,6 +1657,7 @@ void __blk_put_request(struct request_queue *q, struct request *req) lockdep_assert_held(q->queue_lock); + blk_req_zone_write_unlock(req); blk_pm_put_request(req); elv_completed_request(q, req); @@ -2055,6 +2067,21 @@ static inline bool should_fail_request(struct hd_struct *part, #endif /* CONFIG_FAIL_MAKE_REQUEST */ +static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) +{ + if (part->policy && op_is_write(bio_op(bio))) { + char b[BDEVNAME_SIZE]; + + printk(KERN_ERR + "generic_make_request: Trying to write " + "to read-only block-device %s (partno %d)\n", + bio_devname(bio, b), part->partno); + return true; + } + + return false; +} + /* * Remap block n of partition p to block n+start(p) of the disk. */ @@ -2063,27 +2090,28 @@ static inline int blk_partition_remap(struct bio *bio) struct hd_struct *p; int ret = 0; + rcu_read_lock(); + p = __disk_get_part(bio->bi_disk, bio->bi_partno); + if (unlikely(!p || should_fail_request(p, bio->bi_iter.bi_size) || + bio_check_ro(bio, p))) { + ret = -EIO; + goto out; + } + /* * Zone reset does not include bi_size so bio_sectors() is always 0. * Include a test for the reset op code and perform the remap if needed. */ - if (!bio->bi_partno || - (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET)) - return 0; + if (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET) + goto out; - rcu_read_lock(); - p = __disk_get_part(bio->bi_disk, bio->bi_partno); - if (likely(p && !should_fail_request(p, bio->bi_iter.bi_size))) { - bio->bi_iter.bi_sector += p->start_sect; - bio->bi_partno = 0; - trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), - bio->bi_iter.bi_sector - p->start_sect); - } else { - printk("%s: fail for partition %d\n", __func__, bio->bi_partno); - ret = -EIO; - } - rcu_read_unlock(); + bio->bi_iter.bi_sector += p->start_sect; + bio->bi_partno = 0; + trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), + bio->bi_iter.bi_sector - p->start_sect); +out: + rcu_read_unlock(); return ret; } @@ -2142,15 +2170,19 @@ generic_make_request_checks(struct bio *bio) * For a REQ_NOWAIT based request, return -EOPNOTSUPP * if queue is not a request based queue. */ - if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q)) goto not_supported; if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size)) goto end_io; - if (blk_partition_remap(bio)) - goto end_io; + if (!bio->bi_partno) { + if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0))) + goto end_io; + } else { + if (blk_partition_remap(bio)) + goto end_io; + } if (bio_check_eod(bio, nr_sectors)) goto end_io; @@ -2493,8 +2525,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * * bypass a potential scheduler on the bottom device for * insert. */ - blk_mq_request_bypass_insert(rq, true); - return BLK_STS_OK; + return blk_mq_request_issue_directly(rq); } spin_lock_irqsave(q->queue_lock, flags); @@ -2846,7 +2877,7 @@ void blk_start_request(struct request *req) wbt_issue(req->q->rq_wb, &req->issue_stat); } - BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags)); + BUG_ON(blk_rq_is_complete(req)); blk_add_timer(req); } EXPORT_SYMBOL(blk_start_request); @@ -3415,20 +3446,6 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, } EXPORT_SYMBOL(kblockd_mod_delayed_work_on); -int kblockd_schedule_delayed_work(struct delayed_work *dwork, - unsigned long delay) -{ - return queue_delayed_work(kblockd_workqueue, dwork, delay); -} -EXPORT_SYMBOL(kblockd_schedule_delayed_work); - -int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, - unsigned long delay) -{ - return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay); -} -EXPORT_SYMBOL(kblockd_schedule_delayed_work_on); - /** * blk_start_plug - initialize blk_plug and track it inside the task_struct * @plug: The &struct blk_plug that needs to be initialized diff --git a/block/blk-exec.c b/block/blk-exec.c index 5c0f3dc..f7b292f 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -61,7 +61,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, * be reused after dying flag is set */ if (q->mq_ops) { - blk_mq_sched_insert_request(rq, at_head, true, false, false); + blk_mq_sched_insert_request(rq, at_head, true, false); return; } diff --git a/block/blk-lib.c b/block/blk-lib.c index 2bc544c..a676084 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -37,6 +37,9 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, if (!q) return -ENXIO; + if (bdev_read_only(bdev)) + return -EPERM; + if (flags & BLKDEV_DISCARD_SECURE) { if (!blk_queue_secure_erase(q)) return -EOPNOTSUPP; @@ -156,6 +159,9 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector, if (!q) return -ENXIO; + if (bdev_read_only(bdev)) + return -EPERM; + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; if ((sector | nr_sects) & bs_mask) return -EINVAL; @@ -233,6 +239,9 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev, if (!q) return -ENXIO; + if (bdev_read_only(bdev)) + return -EPERM; + /* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */ max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev); @@ -287,6 +296,9 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev, if (!q) return -ENXIO; + if (bdev_read_only(bdev)) + return -EPERM; + while (nr_sects != 0) { bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects), gfp_mask); diff --git a/block/blk-map.c b/block/blk-map.c index d3a9471..db9373b 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -119,7 +119,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, unsigned long align = q->dma_pad_mask | queue_dma_alignment(q); struct bio *bio = NULL; struct iov_iter i; - int ret; + int ret = -EINVAL; if (!iter_is_iovec(iter)) goto fail; @@ -148,7 +148,7 @@ unmap_rq: __blk_rq_unmap_user(bio); fail: rq->bio = NULL; - return -EINVAL; + return ret; } EXPORT_SYMBOL(blk_rq_map_user_iov); diff --git a/block/blk-merge.c b/block/blk-merge.c index f5dedd5..8452fc7 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -128,9 +128,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, nsegs++; sectors = max_sectors; } - if (sectors) - goto split; - /* Make this single bvec as the 1st segment */ + goto split; } if (bvprvp && blk_queue_cluster(q)) { @@ -146,22 +144,21 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, bvprvp = &bvprv; sectors += bv.bv_len >> 9; - if (nsegs == 1 && seg_size > front_seg_size) - front_seg_size = seg_size; continue; } new_segment: if (nsegs == queue_max_segments(q)) goto split; + if (nsegs == 1 && seg_size > front_seg_size) + front_seg_size = seg_size; + nsegs++; bvprv = bv; bvprvp = &bvprv; seg_size = bv.bv_len; sectors += bv.bv_len >> 9; - if (nsegs == 1 && seg_size > front_seg_size) - front_seg_size = seg_size; } do_split = false; @@ -174,6 +171,8 @@ split: bio = new; } + if (nsegs == 1 && seg_size > front_seg_size) + front_seg_size = seg_size; bio->bi_seg_front_size = front_seg_size; if (seg_size > bio->bi_seg_back_size) bio->bi_seg_back_size = seg_size; diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index b56a4f3..21cbc1f 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -289,17 +289,12 @@ static const char *const rqf_name[] = { RQF_NAME(HASHED), RQF_NAME(STATS), RQF_NAME(SPECIAL_PAYLOAD), + RQF_NAME(ZONE_WRITE_LOCKED), + RQF_NAME(MQ_TIMEOUT_EXPIRED), + RQF_NAME(MQ_POLL_SLEPT), }; #undef RQF_NAME -#define RQAF_NAME(name) [REQ_ATOM_##name] = #name -static const char *const rqaf_name[] = { - RQAF_NAME(COMPLETE), - RQAF_NAME(STARTED), - RQAF_NAME(POLL_SLEPT), -}; -#undef RQAF_NAME - int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) { const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; @@ -316,8 +311,7 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) seq_puts(m, ", .rq_flags="); blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name, ARRAY_SIZE(rqf_name)); - seq_puts(m, ", .atomic_flags="); - blk_flags_show(m, rq->atomic_flags, rqaf_name, ARRAY_SIZE(rqaf_name)); + seq_printf(m, ", complete=%d", blk_rq_is_complete(rq)); seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag, rq->internal_tag); if (mq_ops->show_rq) @@ -409,7 +403,7 @@ static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved) const struct show_busy_params *params = data; if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx && - test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) + blk_mq_rq_state(rq) != MQ_RQ_IDLE) __blk_mq_debugfs_rq_show(params->m, list_entry_rq(&rq->queuelist)); } @@ -703,7 +697,11 @@ static ssize_t blk_mq_debugfs_write(struct file *file, const char __user *buf, const struct blk_mq_debugfs_attr *attr = m->private; void *data = d_inode(file->f_path.dentry->d_parent)->i_private; - if (!attr->write) + /* + * Attributes that only implement .seq_ops are read-only and 'attr' is + * the same with 'data' in this case. + */ + if (attr == data || !attr->write) return -EPERM; return attr->write(data, buf, count, ppos); diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index c117bd8..55c0a74 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -172,7 +172,6 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) WRITE_ONCE(hctx->dispatch_from, ctx); } -/* return true if hw queue need to be run again */ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; @@ -428,7 +427,7 @@ done: } void blk_mq_sched_insert_request(struct request *rq, bool at_head, - bool run_queue, bool async, bool can_block) + bool run_queue, bool async) { struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index ba1d141..1e9c901 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -18,7 +18,7 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_insert_request(struct request *rq, bool at_head, - bool run_queue, bool async, bool can_block); + bool run_queue, bool async); void blk_mq_sched_insert_requests(struct request_queue *q, struct blk_mq_ctx *ctx, struct list_head *list, bool run_queue_async); diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 79969c3..a54b4b0 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -248,7 +248,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) return ret; } -static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q) +void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) { struct blk_mq_hw_ctx *hctx; int i; @@ -265,13 +265,6 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q) q->mq_sysfs_init_done = false; } -void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) -{ - mutex_lock(&q->sysfs_lock); - __blk_mq_unregister_dev(dev, q); - mutex_unlock(&q->sysfs_lock); -} - void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx) { kobject_init(&hctx->kobj, &blk_mq_hw_ktype); diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index c81b40e..336dde0 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -134,12 +134,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) ws = bt_wait_ptr(bt, data->hctx); drop_ctx = data->ctx == NULL; do { - prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE); - - tag = __blk_mq_get_tag(data, bt); - if (tag != -1) - break; - /* * We're out of tags on this hardware queue, kick any * pending IO submits before going to sleep waiting for @@ -155,6 +149,13 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) if (tag != -1) break; + prepare_to_wait_exclusive(&ws->wait, &wait, + TASK_UNINTERRUPTIBLE); + + tag = __blk_mq_get_tag(data, bt); + if (tag != -1) + break; + if (data->ctx) blk_mq_put_ctx(data->ctx); diff --git a/block/blk-mq.c b/block/blk-mq.c index 3d37973..01f271d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -95,8 +95,7 @@ static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, { struct mq_inflight *mi = priv; - if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags) && - !test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) { + if (blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) { /* * index[0] counts the specific partition that was asked * for. index[1] counts the ones that are active on the @@ -222,7 +221,7 @@ void blk_mq_quiesce_queue(struct request_queue *q) queue_for_each_hw_ctx(q, hctx, i) { if (hctx->flags & BLK_MQ_F_BLOCKING) - synchronize_srcu(hctx->queue_rq_srcu); + synchronize_srcu(hctx->srcu); else rcu = true; } @@ -272,15 +271,14 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, { struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct request *rq = tags->static_rqs[tag]; - - rq->rq_flags = 0; + req_flags_t rq_flags = 0; if (data->flags & BLK_MQ_REQ_INTERNAL) { rq->tag = -1; rq->internal_tag = tag; } else { if (blk_mq_tag_busy(data->hctx)) { - rq->rq_flags = RQF_MQ_INFLIGHT; + rq_flags = RQF_MQ_INFLIGHT; atomic_inc(&data->hctx->nr_active); } rq->tag = tag; @@ -288,27 +286,22 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, data->hctx->tags->rqs[rq->tag] = rq; } - INIT_LIST_HEAD(&rq->queuelist); /* csd/requeue_work/fifo_time is initialized before use */ rq->q = data->q; rq->mq_ctx = data->ctx; + rq->rq_flags = rq_flags; + rq->cpu = -1; rq->cmd_flags = op; if (data->flags & BLK_MQ_REQ_PREEMPT) rq->rq_flags |= RQF_PREEMPT; if (blk_queue_io_stat(data->q)) rq->rq_flags |= RQF_IO_STAT; - /* do not touch atomic flags, it needs atomic ops against the timer */ - rq->cpu = -1; + INIT_LIST_HEAD(&rq->queuelist); INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); rq->rq_disk = NULL; rq->part = NULL; rq->start_time = jiffies; -#ifdef CONFIG_BLK_CGROUP - rq->rl = NULL; - set_start_time_ns(rq); - rq->io_start_time_ns = 0; -#endif rq->nr_phys_segments = 0; #if defined(CONFIG_BLK_DEV_INTEGRITY) rq->nr_integrity_segments = 0; @@ -316,6 +309,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->special = NULL; /* tag was already set */ rq->extra_len = 0; + rq->__deadline = 0; INIT_LIST_HEAD(&rq->timeout_list); rq->timeout = 0; @@ -324,6 +318,12 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->end_io_data = NULL; rq->next_rq = NULL; +#ifdef CONFIG_BLK_CGROUP + rq->rl = NULL; + set_start_time_ns(rq); + rq->io_start_time_ns = 0; +#endif + data->ctx->rq_dispatched[op_is_sync(op)]++; return rq; } @@ -443,7 +443,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, blk_queue_exit(q); return ERR_PTR(-EXDEV); } - cpu = cpumask_first(alloc_data.hctx->cpumask); + cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask); alloc_data.ctx = __blk_mq_get_ctx(q, cpu); rq = blk_mq_get_request(q, NULL, op, &alloc_data); @@ -485,8 +485,7 @@ void blk_mq_free_request(struct request *rq) if (blk_rq_rl(rq)) blk_put_rl(blk_rq_rl(rq)); - clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); + blk_mq_rq_update_state(rq, MQ_RQ_IDLE); if (rq->tag != -1) blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); if (sched_tag != -1) @@ -532,6 +531,9 @@ static void __blk_mq_complete_request(struct request *rq) bool shared = false; int cpu; + WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT); + blk_mq_rq_update_state(rq, MQ_RQ_COMPLETE); + if (rq->internal_tag != -1) blk_mq_sched_completed_request(rq); if (rq->rq_flags & RQF_STATS) { @@ -559,6 +561,56 @@ static void __blk_mq_complete_request(struct request *rq) put_cpu(); } +static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx) + __releases(hctx->srcu) +{ + if (!(hctx->flags & BLK_MQ_F_BLOCKING)) + rcu_read_unlock(); + else + srcu_read_unlock(hctx->srcu, srcu_idx); +} + +static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) + __acquires(hctx->srcu) +{ + if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { + /* shut up gcc false positive */ + *srcu_idx = 0; + rcu_read_lock(); + } else + *srcu_idx = srcu_read_lock(hctx->srcu); +} + +static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate) +{ + unsigned long flags; + + /* + * blk_mq_rq_aborted_gstate() is used from the completion path and + * can thus be called from irq context. u64_stats_fetch in the + * middle of update on the same CPU leads to lockup. Disable irq + * while updating. + */ + local_irq_save(flags); + u64_stats_update_begin(&rq->aborted_gstate_sync); + rq->aborted_gstate = gstate; + u64_stats_update_end(&rq->aborted_gstate_sync); + local_irq_restore(flags); +} + +static u64 blk_mq_rq_aborted_gstate(struct request *rq) +{ + unsigned int start; + u64 aborted_gstate; + + do { + start = u64_stats_fetch_begin(&rq->aborted_gstate_sync); + aborted_gstate = rq->aborted_gstate; + } while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start)); + + return aborted_gstate; +} + /** * blk_mq_complete_request - end I/O on a request * @rq: the request being processed @@ -570,17 +622,33 @@ static void __blk_mq_complete_request(struct request *rq) void blk_mq_complete_request(struct request *rq) { struct request_queue *q = rq->q; + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu); + int srcu_idx; if (unlikely(blk_should_fake_timeout(q))) return; - if (!blk_mark_rq_complete(rq)) + + /* + * If @rq->aborted_gstate equals the current instance, timeout is + * claiming @rq and we lost. This is synchronized through + * hctx_lock(). See blk_mq_timeout_work() for details. + * + * Completion path never blocks and we can directly use RCU here + * instead of hctx_lock() which can be either RCU or SRCU. + * However, that would complicate paths which want to synchronize + * against us. Let stay in sync with the issue path so that + * hctx_lock() covers both issue and completion paths. + */ + hctx_lock(hctx, &srcu_idx); + if (blk_mq_rq_aborted_gstate(rq) != rq->gstate) __blk_mq_complete_request(rq); + hctx_unlock(hctx, srcu_idx); } EXPORT_SYMBOL(blk_mq_complete_request); int blk_mq_request_started(struct request *rq) { - return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + return blk_mq_rq_state(rq) != MQ_RQ_IDLE; } EXPORT_SYMBOL_GPL(blk_mq_request_started); @@ -598,34 +666,27 @@ void blk_mq_start_request(struct request *rq) wbt_issue(q->rq_wb, &rq->issue_stat); } - blk_add_timer(rq); - - WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)); + WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE); /* - * Mark us as started and clear complete. Complete might have been - * set if requeue raced with timeout, which then marked it as - * complete. So be sure to clear complete again when we start - * the request, otherwise we'll ignore the completion event. + * Mark @rq in-flight which also advances the generation number, + * and register for timeout. Protect with a seqcount to allow the + * timeout path to read both @rq->gstate and @rq->deadline + * coherently. * - * Ensure that ->deadline is visible before we set STARTED, such that - * blk_mq_check_expired() is guaranteed to observe our ->deadline when - * it observes STARTED. + * This is the only place where a request is marked in-flight. If + * the timeout path reads an in-flight @rq->gstate, the + * @rq->deadline it reads together under @rq->gstate_seq is + * guaranteed to be the matching one. */ - smp_wmb(); - set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) { - /* - * Coherence order guarantees these consecutive stores to a - * single variable propagate in the specified order. Thus the - * clear_bit() is ordered _after_ the set bit. See - * blk_mq_check_expired(). - * - * (the bits must be part of the same byte for this to be - * true). - */ - clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); - } + preempt_disable(); + write_seqcount_begin(&rq->gstate_seq); + + blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT); + blk_add_timer(rq); + + write_seqcount_end(&rq->gstate_seq); + preempt_enable(); if (q->dma_drain_size && blk_rq_bytes(rq)) { /* @@ -639,13 +700,9 @@ void blk_mq_start_request(struct request *rq) EXPORT_SYMBOL(blk_mq_start_request); /* - * When we reach here because queue is busy, REQ_ATOM_COMPLETE - * flag isn't set yet, so there may be race with timeout handler, - * but given rq->deadline is just set in .queue_rq() under - * this situation, the race won't be possible in reality because - * rq->timeout should be set as big enough to cover the window - * between blk_mq_start_request() called from .queue_rq() and - * clearing REQ_ATOM_STARTED here. + * When we reach here because queue is busy, it's safe to change the state + * to IDLE without checking @rq->aborted_gstate because we should still be + * holding the RCU read lock and thus protected against timeout. */ static void __blk_mq_requeue_request(struct request *rq) { @@ -657,7 +714,8 @@ static void __blk_mq_requeue_request(struct request *rq) wbt_requeue(q->rq_wb, &rq->issue_stat); blk_mq_sched_requeue_request(rq); - if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { + if (blk_mq_rq_state(rq) != MQ_RQ_IDLE) { + blk_mq_rq_update_state(rq, MQ_RQ_IDLE); if (q->dma_drain_size && blk_rq_bytes(rq)) rq->nr_phys_segments--; } @@ -689,13 +747,13 @@ static void blk_mq_requeue_work(struct work_struct *work) rq->rq_flags &= ~RQF_SOFTBARRIER; list_del_init(&rq->queuelist); - blk_mq_sched_insert_request(rq, true, false, false, true); + blk_mq_sched_insert_request(rq, true, false, false); } while (!list_empty(&rq_list)) { rq = list_entry(rq_list.next, struct request, queuelist); list_del_init(&rq->queuelist); - blk_mq_sched_insert_request(rq, false, false, false, true); + blk_mq_sched_insert_request(rq, false, false, false); } blk_mq_run_hw_queues(q, false); @@ -729,7 +787,7 @@ EXPORT_SYMBOL(blk_mq_add_to_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q) { - kblockd_schedule_delayed_work(&q->requeue_work, 0); + kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0); } EXPORT_SYMBOL(blk_mq_kick_requeue_list); @@ -755,24 +813,15 @@ EXPORT_SYMBOL(blk_mq_tag_to_rq); struct blk_mq_timeout_data { unsigned long next; unsigned int next_set; + unsigned int nr_expired; }; -void blk_mq_rq_timed_out(struct request *req, bool reserved) +static void blk_mq_rq_timed_out(struct request *req, bool reserved) { const struct blk_mq_ops *ops = req->q->mq_ops; enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; - /* - * We know that complete is set at this point. If STARTED isn't set - * anymore, then the request isn't active and the "timeout" should - * just be ignored. This can happen due to the bitflag ordering. - * Timeout first checks if STARTED is set, and if it is, assumes - * the request is active. But if we race with completion, then - * both flags will get cleared. So check here again, and ignore - * a timeout event with a request that isn't active. - */ - if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags)) - return; + req->rq_flags |= RQF_MQ_TIMEOUT_EXPIRED; if (ops->timeout) ret = ops->timeout(req, reserved); @@ -782,8 +831,13 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved) __blk_mq_complete_request(req); break; case BLK_EH_RESET_TIMER: + /* + * As nothing prevents from completion happening while + * ->aborted_gstate is set, this may lead to ignored + * completions and further spurious timeouts. + */ + blk_mq_rq_update_aborted_gstate(req, 0); blk_add_timer(req); - blk_clear_rq_complete(req); break; case BLK_EH_NOT_HANDLED: break; @@ -797,50 +851,51 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { struct blk_mq_timeout_data *data = priv; - unsigned long deadline; + unsigned long gstate, deadline; + int start; - if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) - return; + might_sleep(); - /* - * Ensures that if we see STARTED we must also see our - * up-to-date deadline, see blk_mq_start_request(). - */ - smp_rmb(); + if (rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) + return; - deadline = READ_ONCE(rq->deadline); + /* read coherent snapshots of @rq->state_gen and @rq->deadline */ + while (true) { + start = read_seqcount_begin(&rq->gstate_seq); + gstate = READ_ONCE(rq->gstate); + deadline = blk_rq_deadline(rq); + if (!read_seqcount_retry(&rq->gstate_seq, start)) + break; + cond_resched(); + } - /* - * The rq being checked may have been freed and reallocated - * out already here, we avoid this race by checking rq->deadline - * and REQ_ATOM_COMPLETE flag together: - * - * - if rq->deadline is observed as new value because of - * reusing, the rq won't be timed out because of timing. - * - if rq->deadline is observed as previous value, - * REQ_ATOM_COMPLETE flag won't be cleared in reuse path - * because we put a barrier between setting rq->deadline - * and clearing the flag in blk_mq_start_request(), so - * this rq won't be timed out too. - */ - if (time_after_eq(jiffies, deadline)) { - if (!blk_mark_rq_complete(rq)) { - /* - * Again coherence order ensures that consecutive reads - * from the same variable must be in that order. This - * ensures that if we see COMPLETE clear, we must then - * see STARTED set and we'll ignore this timeout. - * - * (There's also the MB implied by the test_and_clear()) - */ - blk_mq_rq_timed_out(rq, reserved); - } + /* if in-flight && overdue, mark for abortion */ + if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT && + time_after_eq(jiffies, deadline)) { + blk_mq_rq_update_aborted_gstate(rq, gstate); + data->nr_expired++; + hctx->nr_expired++; } else if (!data->next_set || time_after(data->next, deadline)) { data->next = deadline; data->next_set = 1; } } +static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx, + struct request *rq, void *priv, bool reserved) +{ + /* + * We marked @rq->aborted_gstate and waited for RCU. If there were + * completions that we lost to, they would have finished and + * updated @rq->gstate by now; otherwise, the completion path is + * now guaranteed to see @rq->aborted_gstate and yield. If + * @rq->aborted_gstate still matches @rq->gstate, @rq is ours. + */ + if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) && + READ_ONCE(rq->gstate) == rq->aborted_gstate) + blk_mq_rq_timed_out(rq, reserved); +} + static void blk_mq_timeout_work(struct work_struct *work) { struct request_queue *q = @@ -848,7 +903,9 @@ static void blk_mq_timeout_work(struct work_struct *work) struct blk_mq_timeout_data data = { .next = 0, .next_set = 0, + .nr_expired = 0, }; + struct blk_mq_hw_ctx *hctx; int i; /* A deadlock might occur if a request is stuck requiring a @@ -867,14 +924,46 @@ static void blk_mq_timeout_work(struct work_struct *work) if (!percpu_ref_tryget(&q->q_usage_counter)) return; + /* scan for the expired ones and set their ->aborted_gstate */ blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); + if (data.nr_expired) { + bool has_rcu = false; + + /* + * Wait till everyone sees ->aborted_gstate. The + * sequential waits for SRCUs aren't ideal. If this ever + * becomes a problem, we can add per-hw_ctx rcu_head and + * wait in parallel. + */ + queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx->nr_expired) + continue; + + if (!(hctx->flags & BLK_MQ_F_BLOCKING)) + has_rcu = true; + else + synchronize_srcu(hctx->srcu); + + hctx->nr_expired = 0; + } + if (has_rcu) + synchronize_rcu(); + + /* terminate the ones we won */ + blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL); + } + if (data.next_set) { data.next = blk_rq_timeout(round_jiffies_up(data.next)); mod_timer(&q->timeout, data.next); } else { - struct blk_mq_hw_ctx *hctx; - + /* + * Request timeouts are handled as a forward rolling timer. If + * we end up here it means that no requests are pending and + * also that no request has been pending for a while. Mark + * each hctx as idle. + */ queue_for_each_hw_ctx(q, hctx, i) { /* the hctx may be unmapped, so check it here */ if (blk_mq_hw_queue_mapped(hctx)) @@ -1010,66 +1099,67 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, /* * Mark us waiting for a tag. For shared tags, this involves hooking us into - * the tag wakeups. For non-shared tags, we can simply mark us nedeing a - * restart. For both caes, take care to check the condition again after + * the tag wakeups. For non-shared tags, we can simply mark us needing a + * restart. For both cases, take care to check the condition again after * marking us as waiting. */ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, struct request *rq) { struct blk_mq_hw_ctx *this_hctx = *hctx; - bool shared_tags = (this_hctx->flags & BLK_MQ_F_TAG_SHARED) != 0; struct sbq_wait_state *ws; wait_queue_entry_t *wait; bool ret; - if (!shared_tags) { + if (!(this_hctx->flags & BLK_MQ_F_TAG_SHARED)) { if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state)) set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state); - } else { - wait = &this_hctx->dispatch_wait; - if (!list_empty_careful(&wait->entry)) - return false; - spin_lock(&this_hctx->lock); - if (!list_empty(&wait->entry)) { - spin_unlock(&this_hctx->lock); - return false; - } + /* + * It's possible that a tag was freed in the window between the + * allocation failure and adding the hardware queue to the wait + * queue. + * + * Don't clear RESTART here, someone else could have set it. + * At most this will cost an extra queue run. + */ + return blk_mq_get_driver_tag(rq, hctx, false); + } - ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); - add_wait_queue(&ws->wait, wait); + wait = &this_hctx->dispatch_wait; + if (!list_empty_careful(&wait->entry)) + return false; + + spin_lock(&this_hctx->lock); + if (!list_empty(&wait->entry)) { + spin_unlock(&this_hctx->lock); + return false; } + ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); + add_wait_queue(&ws->wait, wait); + /* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait * queue. */ ret = blk_mq_get_driver_tag(rq, hctx, false); - - if (!shared_tags) { - /* - * Don't clear RESTART here, someone else could have set it. - * At most this will cost an extra queue run. - */ - return ret; - } else { - if (!ret) { - spin_unlock(&this_hctx->lock); - return false; - } - - /* - * We got a tag, remove ourselves from the wait queue to ensure - * someone else gets the wakeup. - */ - spin_lock_irq(&ws->wait.lock); - list_del_init(&wait->entry); - spin_unlock_irq(&ws->wait.lock); + if (!ret) { spin_unlock(&this_hctx->lock); - return true; + return false; } + + /* + * We got a tag, remove ourselves from the wait queue to ensure + * someone else gets the wakeup. + */ + spin_lock_irq(&ws->wait.lock); + list_del_init(&wait->entry); + spin_unlock_irq(&ws->wait.lock); + spin_unlock(&this_hctx->lock); + + return true; } bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, @@ -1206,9 +1296,27 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) /* * We should be running this queue from one of the CPUs that * are mapped to it. + * + * There are at least two related races now between setting + * hctx->next_cpu from blk_mq_hctx_next_cpu() and running + * __blk_mq_run_hw_queue(): + * + * - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(), + * but later it becomes online, then this warning is harmless + * at all + * + * - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(), + * but later it becomes offline, then the warning can't be + * triggered, and we depend on blk-mq timeout handler to + * handle dispatched requests to this hctx */ - WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && - cpu_online(hctx->next_cpu)); + if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && + cpu_online(hctx->next_cpu)) { + printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n", + raw_smp_processor_id(), + cpumask_empty(hctx->cpumask) ? "inactive": "active"); + dump_stack(); + } /* * We can't run the queue inline with ints disabled. Ensure that @@ -1216,17 +1324,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) */ WARN_ON_ONCE(in_interrupt()); - if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { - rcu_read_lock(); - blk_mq_sched_dispatch_requests(hctx); - rcu_read_unlock(); - } else { - might_sleep(); + might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); - srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); - blk_mq_sched_dispatch_requests(hctx); - srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); - } + hctx_lock(hctx, &srcu_idx); + blk_mq_sched_dispatch_requests(hctx); + hctx_unlock(hctx, srcu_idx); } /* @@ -1237,20 +1339,47 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) */ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) { + bool tried = false; + if (hctx->queue->nr_hw_queues == 1) return WORK_CPU_UNBOUND; if (--hctx->next_cpu_batch <= 0) { int next_cpu; - - next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); +select_cpu: + next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask, + cpu_online_mask); if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first(hctx->cpumask); + next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask); - hctx->next_cpu = next_cpu; + /* + * No online CPU is found, so have to make sure hctx->next_cpu + * is set correctly for not breaking workqueue. + */ + if (next_cpu >= nr_cpu_ids) + hctx->next_cpu = cpumask_first(hctx->cpumask); + else + hctx->next_cpu = next_cpu; hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } + /* + * Do unbound schedule if we can't find a online CPU for this hctx, + * and it should only happen in the path of handling CPU DEAD. + */ + if (!cpu_online(hctx->next_cpu)) { + if (!tried) { + tried = true; + goto select_cpu; + } + + /* + * Make sure to re-select CPU next time once after CPUs + * in hctx->cpumask become online again. + */ + hctx->next_cpu_batch = 1; + return WORK_CPU_UNBOUND; + } return hctx->next_cpu; } @@ -1274,9 +1403,8 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, put_cpu(); } - kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), - &hctx->run_work, - msecs_to_jiffies(msecs)); + kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, + msecs_to_jiffies(msecs)); } void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) @@ -1287,7 +1415,23 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { - if (blk_mq_hctx_has_pending(hctx)) { + int srcu_idx; + bool need_run; + + /* + * When queue is quiesced, we may be switching io scheduler, or + * updating nr_hw_queues, or other things, and we can't run queue + * any more, even __blk_mq_hctx_has_pending() can't be called safely. + * + * And queue will be rerun in blk_mq_unquiesce_queue() if it is + * quiesced. + */ + hctx_lock(hctx, &srcu_idx); + need_run = !blk_queue_quiesced(hctx->queue) && + blk_mq_hctx_has_pending(hctx); + hctx_unlock(hctx, srcu_idx); + + if (need_run) { __blk_mq_delay_run_hw_queue(hctx, async, 0); return true; } @@ -1595,9 +1739,9 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); } -static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, - struct request *rq, - blk_qc_t *cookie, bool may_sleep) +static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, + struct request *rq, + blk_qc_t *cookie) { struct request_queue *q = rq->q; struct blk_mq_queue_data bd = { @@ -1606,15 +1750,52 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, }; blk_qc_t new_cookie; blk_status_t ret; + + new_cookie = request_to_qc_t(hctx, rq); + + /* + * For OK queue, we are done. For error, caller may kill it. + * Any other error (busy), just add it to our list as we + * previously would have done. + */ + ret = q->mq_ops->queue_rq(hctx, &bd); + switch (ret) { + case BLK_STS_OK: + *cookie = new_cookie; + break; + case BLK_STS_RESOURCE: + __blk_mq_requeue_request(rq); + break; + default: + *cookie = BLK_QC_T_NONE; + break; + } + + return ret; +} + +static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, + struct request *rq, + blk_qc_t *cookie, + bool bypass_insert) +{ + struct request_queue *q = rq->q; bool run_queue = true; - /* RCU or SRCU read lock is needed before checking quiesced flag */ + /* + * RCU or SRCU read lock is needed before checking quiesced flag. + * + * When queue is stopped or quiesced, ignore 'bypass_insert' from + * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller, + * and avoid driver to try to dispatch again. + */ if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) { run_queue = false; + bypass_insert = false; goto insert; } - if (q->elevator) + if (q->elevator && !bypass_insert) goto insert; if (!blk_mq_get_driver_tag(rq, NULL, false)) @@ -1625,47 +1806,47 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, goto insert; } - new_cookie = request_to_qc_t(hctx, rq); - - /* - * For OK queue, we are done. For error, kill it. Any other - * error (busy), just add it to our list as we previously - * would have done - */ - ret = q->mq_ops->queue_rq(hctx, &bd); - switch (ret) { - case BLK_STS_OK: - *cookie = new_cookie; - return; - case BLK_STS_RESOURCE: - __blk_mq_requeue_request(rq); - goto insert; - default: - *cookie = BLK_QC_T_NONE; - blk_mq_end_request(rq, ret); - return; - } - + return __blk_mq_issue_directly(hctx, rq, cookie); insert: - blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep); + if (bypass_insert) + return BLK_STS_RESOURCE; + + blk_mq_sched_insert_request(rq, false, run_queue, false); + return BLK_STS_OK; } static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_qc_t *cookie) { - if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { - rcu_read_lock(); - __blk_mq_try_issue_directly(hctx, rq, cookie, false); - rcu_read_unlock(); - } else { - unsigned int srcu_idx; + blk_status_t ret; + int srcu_idx; - might_sleep(); + might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); - srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); - __blk_mq_try_issue_directly(hctx, rq, cookie, true); - srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); - } + hctx_lock(hctx, &srcu_idx); + + ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false); + if (ret == BLK_STS_RESOURCE) + blk_mq_sched_insert_request(rq, false, true, false); + else if (ret != BLK_STS_OK) + blk_mq_end_request(rq, ret); + + hctx_unlock(hctx, srcu_idx); +} + +blk_status_t blk_mq_request_issue_directly(struct request *rq) +{ + blk_status_t ret; + int srcu_idx; + blk_qc_t unused_cookie; + struct blk_mq_ctx *ctx = rq->mq_ctx; + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu); + + hctx_lock(hctx, &srcu_idx); + ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true); + hctx_unlock(hctx, srcu_idx); + + return ret; } static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) @@ -1776,7 +1957,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) } else if (q->elevator) { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); - blk_mq_sched_insert_request(rq, false, true, true, true); + blk_mq_sched_insert_request(rq, false, true, true); } else { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); @@ -1869,6 +2050,22 @@ static size_t order_to_size(unsigned int order) return (size_t)PAGE_SIZE << order; } +static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, int node) +{ + int ret; + + if (set->ops->init_request) { + ret = set->ops->init_request(set, rq, hctx_idx, node); + if (ret) + return ret; + } + + seqcount_init(&rq->gstate_seq); + u64_stats_init(&rq->aborted_gstate_sync); + return 0; +} + int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx, unsigned int depth) { @@ -1930,12 +2127,9 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, struct request *rq = p; tags->static_rqs[i] = rq; - if (set->ops->init_request) { - if (set->ops->init_request(set, rq, hctx_idx, - node)) { - tags->static_rqs[i] = NULL; - goto fail; - } + if (blk_mq_init_request(set, rq, hctx_idx, node)) { + tags->static_rqs[i] = NULL; + goto fail; } p += rq_size; @@ -1994,7 +2188,8 @@ static void blk_mq_exit_hctx(struct request_queue *q, { blk_mq_debugfs_unregister_hctx(hctx); - blk_mq_tag_idle(hctx); + if (blk_mq_hw_queue_mapped(hctx)) + blk_mq_tag_idle(hctx); if (set->ops->exit_request) set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); @@ -2005,7 +2200,7 @@ static void blk_mq_exit_hctx(struct request_queue *q, set->ops->exit_hctx(hctx, hctx_idx); if (hctx->flags & BLK_MQ_F_BLOCKING) - cleanup_srcu_struct(hctx->queue_rq_srcu); + cleanup_srcu_struct(hctx->srcu); blk_mq_remove_cpuhp(hctx); blk_free_flush_queue(hctx->fq); @@ -2074,13 +2269,11 @@ static int blk_mq_init_hctx(struct request_queue *q, if (!hctx->fq) goto sched_exit_hctx; - if (set->ops->init_request && - set->ops->init_request(set, hctx->fq->flush_rq, hctx_idx, - node)) + if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node)) goto free_fq; if (hctx->flags & BLK_MQ_F_BLOCKING) - init_srcu_struct(hctx->queue_rq_srcu); + init_srcu_struct(hctx->srcu); blk_mq_debugfs_register_hctx(q, hctx); @@ -2116,16 +2309,11 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, INIT_LIST_HEAD(&__ctx->rq_list); __ctx->queue = q; - /* If the cpu isn't present, the cpu is mapped to first hctx */ - if (!cpu_present(i)) - continue; - - hctx = blk_mq_map_queue(q, i); - /* * Set local node, IFF we have more than one hw queue. If * not, we remain on the home node of the device */ + hctx = blk_mq_map_queue(q, i); if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) hctx->numa_node = local_memory_node(cpu_to_node(i)); } @@ -2182,7 +2370,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) * * If the cpu isn't present, the cpu is mapped to first hctx. */ - for_each_present_cpu(i) { + for_each_possible_cpu(i) { hctx_idx = q->mq_map[i]; /* unmapped hw queue can be remapped after CPU topo changed */ if (!set->tags[hctx_idx] && @@ -2236,7 +2424,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) /* * Initialize batch roundrobin counts */ - hctx->next_cpu = cpumask_first(hctx->cpumask); + hctx->next_cpu = cpumask_first_and(hctx->cpumask, + cpu_online_mask); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } } @@ -2369,7 +2558,7 @@ static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set) { int hw_ctx_size = sizeof(struct blk_mq_hw_ctx); - BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, queue_rq_srcu), + BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu), __alignof__(struct blk_mq_hw_ctx)) != sizeof(struct blk_mq_hw_ctx)); @@ -2386,6 +2575,9 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; blk_mq_sysfs_unregister(q); + + /* protect against switching io scheduler */ + mutex_lock(&q->sysfs_lock); for (i = 0; i < set->nr_hw_queues; i++) { int node; @@ -2430,6 +2622,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, } } q->nr_hw_queues = i; + mutex_unlock(&q->sysfs_lock); blk_mq_sysfs_register(q); } @@ -2601,9 +2794,27 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) { - if (set->ops->map_queues) + if (set->ops->map_queues) { + int cpu; + /* + * transport .map_queues is usually done in the following + * way: + * + * for (queue = 0; queue < set->nr_hw_queues; queue++) { + * mask = get_cpu_mask(queue) + * for_each_cpu(cpu, mask) + * set->mq_map[cpu] = queue; + * } + * + * When we need to remap, the table has to be cleared for + * killing stale mapping since one CPU may not be mapped + * to any hw queue. + */ + for_each_possible_cpu(cpu) + set->mq_map[cpu] = 0; + return set->ops->map_queues(set); - else + } else return blk_mq_map_queues(set); } @@ -2712,6 +2923,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) return -EINVAL; blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); ret = 0; queue_for_each_hw_ctx(q, hctx, i) { @@ -2735,6 +2947,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) if (!ret) q->nr_requests = nr; + blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); return ret; @@ -2850,7 +3063,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, unsigned int nsecs; ktime_t kt; - if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags)) + if (rq->rq_flags & RQF_MQ_POLL_SLEPT) return false; /* @@ -2870,7 +3083,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, if (!nsecs) return false; - set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); + rq->rq_flags |= RQF_MQ_POLL_SLEPT; /* * This will be replaced with the stats tracking code, using @@ -2884,7 +3097,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, hrtimer_init_sleeper(&hs, current); do { - if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) + if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE) break; set_current_state(TASK_UNINTERRUPTIBLE); hrtimer_start_expires(&hs.timer, mode); @@ -2970,12 +3183,6 @@ static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) static int __init blk_mq_init(void) { - /* - * See comment in block/blk.h rq_atomic_flags enum - */ - BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) != - (REQ_ATOM_COMPLETE / BITS_PER_BYTE)); - cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, blk_mq_hctx_notify_dead); return 0; diff --git a/block/blk-mq.h b/block/blk-mq.h index 6c7c3ff..88c558f 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -27,6 +27,20 @@ struct blk_mq_ctx { struct kobject kobj; } ____cacheline_aligned_in_smp; +/* + * Bits for request->gstate. The lower two bits carry MQ_RQ_* state value + * and the upper bits the generation number. + */ +enum mq_rq_state { + MQ_RQ_IDLE = 0, + MQ_RQ_IN_FLIGHT = 1, + MQ_RQ_COMPLETE = 2, + + MQ_RQ_STATE_BITS = 2, + MQ_RQ_STATE_MASK = (1 << MQ_RQ_STATE_BITS) - 1, + MQ_RQ_GEN_INC = 1 << MQ_RQ_STATE_BITS, +}; + void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); @@ -60,6 +74,9 @@ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue); void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list); +/* Used by blk_insert_cloned_request() to issue request directly */ +blk_status_t blk_mq_request_issue_directly(struct request *rq); + /* * CPU -> queue mappings */ @@ -81,10 +98,41 @@ extern int blk_mq_sysfs_register(struct request_queue *q); extern void blk_mq_sysfs_unregister(struct request_queue *q); extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); -extern void blk_mq_rq_timed_out(struct request *req, bool reserved); - void blk_mq_release(struct request_queue *q); +/** + * blk_mq_rq_state() - read the current MQ_RQ_* state of a request + * @rq: target request. + */ +static inline int blk_mq_rq_state(struct request *rq) +{ + return READ_ONCE(rq->gstate) & MQ_RQ_STATE_MASK; +} + +/** + * blk_mq_rq_update_state() - set the current MQ_RQ_* state of a request + * @rq: target request. + * @state: new state to set. + * + * Set @rq's state to @state. The caller is responsible for ensuring that + * there are no other updaters. A request can transition into IN_FLIGHT + * only from IDLE and doing so increments the generation number. + */ +static inline void blk_mq_rq_update_state(struct request *rq, + enum mq_rq_state state) +{ + u64 old_val = READ_ONCE(rq->gstate); + u64 new_val = (old_val & ~MQ_RQ_STATE_MASK) | state; + + if (state == MQ_RQ_IN_FLIGHT) { + WARN_ON_ONCE((old_val & MQ_RQ_STATE_MASK) != MQ_RQ_IDLE); + new_val += MQ_RQ_GEN_INC; + } + + /* avoid exposing interim values */ + WRITE_ONCE(rq->gstate, new_val); +} + static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, unsigned int cpu) { diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 870484e..cbea895 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -853,6 +853,10 @@ struct kobj_type blk_queue_ktype = { .release = blk_release_queue, }; +/** + * blk_register_queue - register a block layer queue with sysfs + * @disk: Disk of which the request queue should be registered with sysfs. + */ int blk_register_queue(struct gendisk *disk) { int ret; @@ -909,11 +913,12 @@ int blk_register_queue(struct gendisk *disk) if (q->request_fn || (q->mq_ops && q->elevator)) { ret = elv_register_queue(q); if (ret) { + mutex_unlock(&q->sysfs_lock); kobject_uevent(&q->kobj, KOBJ_REMOVE); kobject_del(&q->kobj); blk_trace_remove_sysfs(dev); kobject_put(&dev->kobj); - goto unlock; + return ret; } } ret = 0; @@ -921,7 +926,15 @@ unlock: mutex_unlock(&q->sysfs_lock); return ret; } +EXPORT_SYMBOL_GPL(blk_register_queue); +/** + * blk_unregister_queue - counterpart of blk_register_queue() + * @disk: Disk of which the request queue should be unregistered from sysfs. + * + * Note: the caller is responsible for guaranteeing that this function is called + * after blk_register_queue() has finished. + */ void blk_unregister_queue(struct gendisk *disk) { struct request_queue *q = disk->queue; @@ -929,21 +942,39 @@ void blk_unregister_queue(struct gendisk *disk) if (WARN_ON(!q)) return; - mutex_lock(&q->sysfs_lock); - queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q); - mutex_unlock(&q->sysfs_lock); + /* Return early if disk->queue was never registered. */ + if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) + return; - wbt_exit(q); + /* + * Since sysfs_remove_dir() prevents adding new directory entries + * before removal of existing entries starts, protect against + * concurrent elv_iosched_store() calls. + */ + mutex_lock(&q->sysfs_lock); + spin_lock_irq(q->queue_lock); + queue_flag_clear(QUEUE_FLAG_REGISTERED, q); + spin_unlock_irq(q->queue_lock); + /* + * Remove the sysfs attributes before unregistering the queue data + * structures that can be modified through sysfs. + */ if (q->mq_ops) blk_mq_unregister_dev(disk_to_dev(disk), q); - - if (q->request_fn || (q->mq_ops && q->elevator)) - elv_unregister_queue(q); + mutex_unlock(&q->sysfs_lock); kobject_uevent(&q->kobj, KOBJ_REMOVE); kobject_del(&q->kobj); blk_trace_remove_sysfs(disk_to_dev(disk)); + + wbt_exit(q); + + mutex_lock(&q->sysfs_lock); + if (q->request_fn || (q->mq_ops && q->elevator)) + elv_unregister_queue(q); + mutex_unlock(&q->sysfs_lock); + kobject_put(&disk_to_dev(disk)->kobj); } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index d19f416..c5a1316 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -216,9 +216,9 @@ struct throtl_data unsigned int scale; - struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE]; - struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE]; - struct latency_bucket __percpu *latency_buckets; + struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE]; + struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE]; + struct latency_bucket __percpu *latency_buckets[2]; unsigned long last_calculate_time; unsigned long filtered_latency; @@ -1511,10 +1511,20 @@ static struct cftype throtl_legacy_files[] = { .seq_show = blkg_print_stat_bytes, }, { + .name = "throttle.io_service_bytes_recursive", + .private = (unsigned long)&blkcg_policy_throtl, + .seq_show = blkg_print_stat_bytes_recursive, + }, + { .name = "throttle.io_serviced", .private = (unsigned long)&blkcg_policy_throtl, .seq_show = blkg_print_stat_ios, }, + { + .name = "throttle.io_serviced_recursive", + .private = (unsigned long)&blkcg_policy_throtl, + .seq_show = blkg_print_stat_ios_recursive, + }, { } /* terminate */ }; @@ -2040,10 +2050,10 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg) #ifdef CONFIG_BLK_DEV_THROTTLING_LOW static void throtl_update_latency_buckets(struct throtl_data *td) { - struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE]; - int i, cpu; - unsigned long last_latency = 0; - unsigned long latency; + struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE]; + int i, cpu, rw; + unsigned long last_latency[2] = { 0 }; + unsigned long latency[2]; if (!blk_queue_nonrot(td->queue)) return; @@ -2052,56 +2062,67 @@ static void throtl_update_latency_buckets(struct throtl_data *td) td->last_calculate_time = jiffies; memset(avg_latency, 0, sizeof(avg_latency)); - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { - struct latency_bucket *tmp = &td->tmp_buckets[i]; - - for_each_possible_cpu(cpu) { - struct latency_bucket *bucket; - - /* this isn't race free, but ok in practice */ - bucket = per_cpu_ptr(td->latency_buckets, cpu); - tmp->total_latency += bucket[i].total_latency; - tmp->samples += bucket[i].samples; - bucket[i].total_latency = 0; - bucket[i].samples = 0; - } + for (rw = READ; rw <= WRITE; rw++) { + for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { + struct latency_bucket *tmp = &td->tmp_buckets[rw][i]; + + for_each_possible_cpu(cpu) { + struct latency_bucket *bucket; + + /* this isn't race free, but ok in practice */ + bucket = per_cpu_ptr(td->latency_buckets[rw], + cpu); + tmp->total_latency += bucket[i].total_latency; + tmp->samples += bucket[i].samples; + bucket[i].total_latency = 0; + bucket[i].samples = 0; + } - if (tmp->samples >= 32) { - int samples = tmp->samples; + if (tmp->samples >= 32) { + int samples = tmp->samples; - latency = tmp->total_latency; + latency[rw] = tmp->total_latency; - tmp->total_latency = 0; - tmp->samples = 0; - latency /= samples; - if (latency == 0) - continue; - avg_latency[i].latency = latency; + tmp->total_latency = 0; + tmp->samples = 0; + latency[rw] /= samples; + if (latency[rw] == 0) + continue; + avg_latency[rw][i].latency = latency[rw]; + } } } - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { - if (!avg_latency[i].latency) { - if (td->avg_buckets[i].latency < last_latency) - td->avg_buckets[i].latency = last_latency; - continue; - } + for (rw = READ; rw <= WRITE; rw++) { + for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { + if (!avg_latency[rw][i].latency) { + if (td->avg_buckets[rw][i].latency < last_latency[rw]) + td->avg_buckets[rw][i].latency = + last_latency[rw]; + continue; + } - if (!td->avg_buckets[i].valid) - latency = avg_latency[i].latency; - else - latency = (td->avg_buckets[i].latency * 7 + - avg_latency[i].latency) >> 3; + if (!td->avg_buckets[rw][i].valid) + latency[rw] = avg_latency[rw][i].latency; + else + latency[rw] = (td->avg_buckets[rw][i].latency * 7 + + avg_latency[rw][i].latency) >> 3; - td->avg_buckets[i].latency = max(latency, last_latency); - td->avg_buckets[i].valid = true; - last_latency = td->avg_buckets[i].latency; + td->avg_buckets[rw][i].latency = max(latency[rw], + last_latency[rw]); + td->avg_buckets[rw][i].valid = true; + last_latency[rw] = td->avg_buckets[rw][i].latency; + } } for (i = 0; i < LATENCY_BUCKET_SIZE; i++) throtl_log(&td->service_queue, - "Latency bucket %d: latency=%ld, valid=%d", i, - td->avg_buckets[i].latency, td->avg_buckets[i].valid); + "Latency bucket %d: read latency=%ld, read valid=%d, " + "write latency=%ld, write valid=%d", i, + td->avg_buckets[READ][i].latency, + td->avg_buckets[READ][i].valid, + td->avg_buckets[WRITE][i].latency, + td->avg_buckets[WRITE][i].valid); } #else static inline void throtl_update_latency_buckets(struct throtl_data *td) @@ -2242,16 +2263,17 @@ static void throtl_track_latency(struct throtl_data *td, sector_t size, struct latency_bucket *latency; int index; - if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ || + if (!td || td->limit_index != LIMIT_LOW || + !(op == REQ_OP_READ || op == REQ_OP_WRITE) || !blk_queue_nonrot(td->queue)) return; index = request_bucket_index(size); - latency = get_cpu_ptr(td->latency_buckets); + latency = get_cpu_ptr(td->latency_buckets[op]); latency[index].total_latency += time; latency[index].samples++; - put_cpu_ptr(td->latency_buckets); + put_cpu_ptr(td->latency_buckets[op]); } void blk_throtl_stat_add(struct request *rq, u64 time_ns) @@ -2270,6 +2292,7 @@ void blk_throtl_bio_endio(struct bio *bio) unsigned long finish_time; unsigned long start_time; unsigned long lat; + int rw = bio_data_dir(bio); tg = bio->bi_cg_private; if (!tg) @@ -2298,7 +2321,7 @@ void blk_throtl_bio_endio(struct bio *bio) bucket = request_bucket_index( blk_stat_size(&bio->bi_issue_stat)); - threshold = tg->td->avg_buckets[bucket].latency + + threshold = tg->td->avg_buckets[rw][bucket].latency + tg->latency_target; if (lat > threshold) tg->bad_bio_cnt++; @@ -2391,9 +2414,16 @@ int blk_throtl_init(struct request_queue *q) td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); if (!td) return -ENOMEM; - td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) * + td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) * LATENCY_BUCKET_SIZE, __alignof__(u64)); - if (!td->latency_buckets) { + if (!td->latency_buckets[READ]) { + kfree(td); + return -ENOMEM; + } + td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) * + LATENCY_BUCKET_SIZE, __alignof__(u64)); + if (!td->latency_buckets[WRITE]) { + free_percpu(td->latency_buckets[READ]); kfree(td); return -ENOMEM; } @@ -2412,7 +2442,8 @@ int blk_throtl_init(struct request_queue *q) /* activate policy */ ret = blkcg_activate_policy(q, &blkcg_policy_throtl); if (ret) { - free_percpu(td->latency_buckets); + free_percpu(td->latency_buckets[READ]); + free_percpu(td->latency_buckets[WRITE]); kfree(td); } return ret; @@ -2423,7 +2454,8 @@ void blk_throtl_exit(struct request_queue *q) BUG_ON(!q->td); throtl_shutdown_wq(q); blkcg_deactivate_policy(q, &blkcg_policy_throtl); - free_percpu(q->td->latency_buckets); + free_percpu(q->td->latency_buckets[READ]); + free_percpu(q->td->latency_buckets[WRITE]); kfree(q->td); } @@ -2441,15 +2473,17 @@ void blk_throtl_register_queue(struct request_queue *q) } else { td->throtl_slice = DFL_THROTL_SLICE_HD; td->filtered_latency = LATENCY_FILTERED_HD; - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) - td->avg_buckets[i].latency = DFL_HD_BASELINE_LATENCY; + for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { + td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY; + td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY; + } } #ifndef CONFIG_BLK_DEV_THROTTLING_LOW /* if no low limit, use previous default */ td->throtl_slice = DFL_THROTL_SLICE_HD; #endif - td->track_bio_latency = !q->mq_ops && !q->request_fn; + td->track_bio_latency = !queue_is_rq_based(q); if (!td->track_bio_latency) blk_stat_enable_accounting(q); } diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 764ecf9..a05e367 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -112,7 +112,9 @@ static void blk_rq_timed_out(struct request *req) static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, unsigned int *next_set) { - if (time_after_eq(jiffies, rq->deadline)) { + const unsigned long deadline = blk_rq_deadline(rq); + + if (time_after_eq(jiffies, deadline)) { list_del_init(&rq->timeout_list); /* @@ -120,8 +122,8 @@ static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout */ if (!blk_mark_rq_complete(rq)) blk_rq_timed_out(rq); - } else if (!*next_set || time_after(*next_timeout, rq->deadline)) { - *next_timeout = rq->deadline; + } else if (!*next_set || time_after(*next_timeout, deadline)) { + *next_timeout = deadline; *next_set = 1; } } @@ -156,12 +158,17 @@ void blk_timeout_work(struct work_struct *work) */ void blk_abort_request(struct request *req) { - if (blk_mark_rq_complete(req)) - return; - if (req->q->mq_ops) { - blk_mq_rq_timed_out(req, false); + /* + * All we need to ensure is that timeout scan takes place + * immediately and that scan sees the new timeout value. + * No need for fancy synchronizations. + */ + blk_rq_set_deadline(req, jiffies); + mod_timer(&req->q->timeout, 0); } else { + if (blk_mark_rq_complete(req)) + return; blk_delete_timer(req); blk_rq_timed_out(req); } @@ -208,7 +215,8 @@ void blk_add_timer(struct request *req) if (!req->timeout) req->timeout = q->rq_timeout; - WRITE_ONCE(req->deadline, jiffies + req->timeout); + blk_rq_set_deadline(req, jiffies + req->timeout); + req->rq_flags &= ~RQF_MQ_TIMEOUT_EXPIRED; /* * Only the non-mq case needs to add the request to a protected list. @@ -222,7 +230,7 @@ void blk_add_timer(struct request *req) * than an existing one, modify the timer. Round up to next nearest * second. */ - expiry = blk_rq_timeout(round_jiffies_up(req->deadline)); + expiry = blk_rq_timeout(round_jiffies_up(blk_rq_deadline(req))); if (!timer_pending(&q->timeout) || time_before(expiry, q->timeout.expires)) { diff --git a/block/blk-zoned.c b/block/blk-zoned.c index ff57fb5..acb7252 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -22,6 +22,48 @@ static inline sector_t blk_zone_start(struct request_queue *q, } /* + * Return true if a request is a write requests that needs zone write locking. + */ +bool blk_req_needs_zone_write_lock(struct request *rq) +{ + if (!rq->q->seq_zones_wlock) + return false; + + if (blk_rq_is_passthrough(rq)) + return false; + + switch (req_op(rq)) { + case REQ_OP_WRITE_ZEROES: + case REQ_OP_WRITE_SAME: + case REQ_OP_WRITE: + return blk_rq_zone_is_seq(rq); + default: + return false; + } +} +EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock); + +void __blk_req_zone_write_lock(struct request *rq) +{ + if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq), + rq->q->seq_zones_wlock))) + return; + + WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED); + rq->rq_flags |= RQF_ZONE_WRITE_LOCKED; +} +EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock); + +void __blk_req_zone_write_unlock(struct request *rq) +{ + rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED; + if (rq->q->seq_zones_wlock) + WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq), + rq->q->seq_zones_wlock)); +} +EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock); + +/* * Check that a zone report belongs to the partition. * If yes, fix its start sector and write pointer, copy it in the * zone information array and return true. Return false otherwise. diff --git a/block/blk.h b/block/blk.h index 442098a..46db5dc 100644 --- a/block/blk.h +++ b/block/blk.h @@ -120,33 +120,23 @@ void blk_account_io_completion(struct request *req, unsigned int bytes); void blk_account_io_done(struct request *req); /* - * Internal atomic flags for request handling - */ -enum rq_atomic_flags { - /* - * Keep these two bits first - not because we depend on the - * value of them, but we do depend on them being in the same - * byte of storage to ensure ordering on writes. Keeping them - * first will achieve that nicely. - */ - REQ_ATOM_COMPLETE = 0, - REQ_ATOM_STARTED, - - REQ_ATOM_POLL_SLEPT, -}; - -/* * EH timer and IO completion will both attempt to 'grab' the request, make - * sure that only one of them succeeds + * sure that only one of them succeeds. Steal the bottom bit of the + * __deadline field for this. */ static inline int blk_mark_rq_complete(struct request *rq) { - return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); + return test_and_set_bit(0, &rq->__deadline); } static inline void blk_clear_rq_complete(struct request *rq) { - clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); + clear_bit(0, &rq->__deadline); +} + +static inline bool blk_rq_is_complete(struct request *rq) +{ + return test_bit(0, &rq->__deadline); } /* @@ -172,6 +162,9 @@ static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq e->type->ops.sq.elevator_deactivate_req_fn(q, rq); } +int elv_register_queue(struct request_queue *q); +void elv_unregister_queue(struct request_queue *q); + struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); #ifdef CONFIG_FAIL_IO_TIMEOUT @@ -246,6 +239,21 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req) } /* + * Steal a bit from this field for legacy IO path atomic IO marking. Note that + * setting the deadline clears the bottom bit, potentially clearing the + * completed bit. The user has to be OK with this (current ones are fine). + */ +static inline void blk_rq_set_deadline(struct request *rq, unsigned long time) +{ + rq->__deadline = time & ~0x1UL; +} + +static inline unsigned long blk_rq_deadline(struct request *rq) +{ + return rq->__deadline & ~0x1UL; +} + +/* * Internal io_context interface */ void get_io_context(struct io_context *ioc); diff --git a/block/bounce.c b/block/bounce.c index 1d05c42..6a3e682 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -113,45 +113,50 @@ int init_emergency_isa_pool(void) static void copy_to_high_bio_irq(struct bio *to, struct bio *from) { unsigned char *vfrom; - struct bio_vec tovec, *fromvec = from->bi_io_vec; + struct bio_vec tovec, fromvec; struct bvec_iter iter; + /* + * The bio of @from is created by bounce, so we can iterate + * its bvec from start to end, but the @from->bi_iter can't be + * trusted because it might be changed by splitting. + */ + struct bvec_iter from_iter = BVEC_ITER_ALL_INIT; bio_for_each_segment(tovec, to, iter) { - if (tovec.bv_page != fromvec->bv_page) { + fromvec = bio_iter_iovec(from, from_iter); + if (tovec.bv_page != fromvec.bv_page) { /* * fromvec->bv_offset and fromvec->bv_len might have * been modified by the block layer, so use the original * copy, bounce_copy_vec already uses tovec->bv_len */ - vfrom = page_address(fromvec->bv_page) + + vfrom = page_address(fromvec.bv_page) + tovec.bv_offset; bounce_copy_vec(&tovec, vfrom); flush_dcache_page(tovec.bv_page); } - - fromvec++; + bio_advance_iter(from, &from_iter, tovec.bv_len); } } static void bounce_end_io(struct bio *bio, mempool_t *pool) { struct bio *bio_orig = bio->bi_private; - struct bio_vec *bvec, *org_vec; + struct bio_vec *bvec, orig_vec; int i; - int start = bio_orig->bi_iter.bi_idx; + struct bvec_iter orig_iter = bio_orig->bi_iter; /* * free up bounce indirect pages used */ bio_for_each_segment_all(bvec, bio, i) { - org_vec = bio_orig->bi_io_vec + i + start; - - if (bvec->bv_page == org_vec->bv_page) - continue; - - dec_zone_page_state(bvec->bv_page, NR_BOUNCE); - mempool_free(bvec->bv_page, pool); + orig_vec = bio_iter_iovec(bio_orig, orig_iter); + if (bvec->bv_page != orig_vec.bv_page) { + dec_zone_page_state(bvec->bv_page, NR_BOUNCE); + mempool_free(bvec->bv_page, pool); + } + bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len); } bio_orig->bi_status = bio->bi_status; diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 15d25cc..1474153 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -30,7 +30,7 @@ /** * bsg_teardown_job - routine to teardown a bsg job - * @job: bsg_job that is to be torn down + * @kref: kref inside bsg_job that is to be torn down */ static void bsg_teardown_job(struct kref *kref) { @@ -251,6 +251,7 @@ static void bsg_exit_rq(struct request_queue *q, struct request *req) * @name: device to give bsg device * @job_fn: bsg job handler * @dd_job_size: size of LLD data needed for each job + * @release: @dev release function */ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, bsg_job_fn *job_fn, int dd_job_size, diff --git a/block/bsg.c b/block/bsg.c index 452f94f..a1bcbb6 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -32,6 +32,9 @@ #define BSG_DESCRIPTION "Block layer SCSI generic (bsg) driver" #define BSG_VERSION "0.4" +#define bsg_dbg(bd, fmt, ...) \ + pr_debug("%s: " fmt, (bd)->name, ##__VA_ARGS__) + struct bsg_device { struct request_queue *queue; spinlock_t lock; @@ -55,14 +58,6 @@ enum { #define BSG_DEFAULT_CMDS 64 #define BSG_MAX_DEVS 32768 -#undef BSG_DEBUG - -#ifdef BSG_DEBUG -#define dprintk(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ##args) -#else -#define dprintk(fmt, args...) -#endif - static DEFINE_MUTEX(bsg_mutex); static DEFINE_IDR(bsg_minor_idr); @@ -123,7 +118,7 @@ static struct bsg_command *bsg_alloc_command(struct bsg_device *bd) bc->bd = bd; INIT_LIST_HEAD(&bc->list); - dprintk("%s: returning free cmd %p\n", bd->name, bc); + bsg_dbg(bd, "returning free cmd %p\n", bc); return bc; out: spin_unlock_irq(&bd->lock); @@ -222,7 +217,8 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t mode) if (!bcd->class_dev) return ERR_PTR(-ENXIO); - dprintk("map hdr %llx/%u %llx/%u\n", (unsigned long long) hdr->dout_xferp, + bsg_dbg(bd, "map hdr %llx/%u %llx/%u\n", + (unsigned long long) hdr->dout_xferp, hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp, hdr->din_xfer_len); @@ -299,8 +295,8 @@ static void bsg_rq_end_io(struct request *rq, blk_status_t status) struct bsg_device *bd = bc->bd; unsigned long flags; - dprintk("%s: finished rq %p bc %p, bio %p\n", - bd->name, rq, bc, bc->bio); + bsg_dbg(bd, "finished rq %p bc %p, bio %p\n", + rq, bc, bc->bio); bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration); @@ -333,7 +329,7 @@ static void bsg_add_command(struct bsg_device *bd, struct request_queue *q, list_add_tail(&bc->list, &bd->busy_list); spin_unlock_irq(&bd->lock); - dprintk("%s: queueing rq %p, bc %p\n", bd->name, rq, bc); + bsg_dbg(bd, "queueing rq %p, bc %p\n", rq, bc); rq->end_io_data = bc; blk_execute_rq_nowait(q, NULL, rq, at_head, bsg_rq_end_io); @@ -379,7 +375,7 @@ static struct bsg_command *bsg_get_done_cmd(struct bsg_device *bd) } } while (1); - dprintk("%s: returning done %p\n", bd->name, bc); + bsg_dbg(bd, "returning done %p\n", bc); return bc; } @@ -390,7 +386,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, struct scsi_request *req = scsi_req(rq); int ret = 0; - dprintk("rq %p bio %p 0x%x\n", rq, bio, req->result); + pr_debug("rq %p bio %p 0x%x\n", rq, bio, req->result); /* * fill in all the output members */ @@ -469,7 +465,7 @@ static int bsg_complete_all_commands(struct bsg_device *bd) struct bsg_command *bc; int ret, tret; - dprintk("%s: entered\n", bd->name); + bsg_dbg(bd, "entered\n"); /* * wait for all commands to complete @@ -572,7 +568,7 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) int ret; ssize_t bytes_read; - dprintk("%s: read %zd bytes\n", bd->name, count); + bsg_dbg(bd, "read %zd bytes\n", count); bsg_set_block(bd, file); @@ -646,7 +642,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) ssize_t bytes_written; int ret; - dprintk("%s: write %zd bytes\n", bd->name, count); + bsg_dbg(bd, "write %zd bytes\n", count); if (unlikely(uaccess_kernel())) return -EINVAL; @@ -664,7 +660,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) if (!bytes_written || err_block_err(ret)) bytes_written = ret; - dprintk("%s: returning %zd\n", bd->name, bytes_written); + bsg_dbg(bd, "returning %zd\n", bytes_written); return bytes_written; } @@ -717,7 +713,7 @@ static int bsg_put_device(struct bsg_device *bd) hlist_del(&bd->dev_list); mutex_unlock(&bsg_mutex); - dprintk("%s: tearing down\n", bd->name); + bsg_dbg(bd, "tearing down\n"); /* * close can always block @@ -744,9 +740,7 @@ static struct bsg_device *bsg_add_device(struct inode *inode, struct file *file) { struct bsg_device *bd; -#ifdef BSG_DEBUG unsigned char buf[32]; -#endif if (!blk_queue_scsi_passthrough(rq)) { WARN_ONCE(true, "Attempt to register a non-SCSI queue\n"); @@ -771,7 +765,7 @@ static struct bsg_device *bsg_add_device(struct inode *inode, hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode))); strncpy(bd->name, dev_name(rq->bsg_dev.class_dev), sizeof(bd->name) - 1); - dprintk("bound to <%s>, max queue %d\n", + bsg_dbg(bd, "bound to <%s>, max queue %d\n", format_dev_t(buf, inode->i_rdev), bd->max_queue); mutex_unlock(&bsg_mutex); diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index b83f774..9de9f15 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -50,8 +50,6 @@ struct deadline_data { int front_merges; }; -static void deadline_move_request(struct deadline_data *, struct request *); - static inline struct rb_root * deadline_rb_root(struct deadline_data *dd, struct request *rq) { @@ -100,6 +98,12 @@ deadline_add_request(struct request_queue *q, struct request *rq) struct deadline_data *dd = q->elevator->elevator_data; const int data_dir = rq_data_dir(rq); + /* + * This may be a requeue of a write request that has locked its + * target zone. If it is the case, this releases the zone lock. + */ + blk_req_zone_write_unlock(rq); + deadline_add_rq_rb(dd, rq); /* @@ -190,6 +194,12 @@ deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq) { struct request_queue *q = rq->q; + /* + * For a zoned block device, write requests must write lock their + * target zone. + */ + blk_req_zone_write_lock(rq); + deadline_remove_request(q, rq); elv_dispatch_add_tail(q, rq); } @@ -231,6 +241,69 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) } /* + * For the specified data direction, return the next request to dispatch using + * arrival ordered lists. + */ +static struct request * +deadline_fifo_request(struct deadline_data *dd, int data_dir) +{ + struct request *rq; + + if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) + return NULL; + + if (list_empty(&dd->fifo_list[data_dir])) + return NULL; + + rq = rq_entry_fifo(dd->fifo_list[data_dir].next); + if (data_dir == READ || !blk_queue_is_zoned(rq->q)) + return rq; + + /* + * Look for a write request that can be dispatched, that is one with + * an unlocked target zone. + */ + list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) { + if (blk_req_can_dispatch_to_zone(rq)) + return rq; + } + + return NULL; +} + +/* + * For the specified data direction, return the next request to dispatch using + * sector position sorted lists. + */ +static struct request * +deadline_next_request(struct deadline_data *dd, int data_dir) +{ + struct request *rq; + + if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) + return NULL; + + rq = dd->next_rq[data_dir]; + if (!rq) + return NULL; + + if (data_dir == READ || !blk_queue_is_zoned(rq->q)) + return rq; + + /* + * Look for a write request that can be dispatched, that is one with + * an unlocked target zone. + */ + while (rq) { + if (blk_req_can_dispatch_to_zone(rq)) + return rq; + rq = deadline_latter_request(rq); + } + + return NULL; +} + +/* * deadline_dispatch_requests selects the best request according to * read/write expire, fifo_batch, etc */ @@ -239,16 +312,15 @@ static int deadline_dispatch_requests(struct request_queue *q, int force) struct deadline_data *dd = q->elevator->elevator_data; const int reads = !list_empty(&dd->fifo_list[READ]); const int writes = !list_empty(&dd->fifo_list[WRITE]); - struct request *rq; + struct request *rq, *next_rq; int data_dir; /* * batches are currently reads XOR writes */ - if (dd->next_rq[WRITE]) - rq = dd->next_rq[WRITE]; - else - rq = dd->next_rq[READ]; + rq = deadline_next_request(dd, WRITE); + if (!rq) + rq = deadline_next_request(dd, READ); if (rq && dd->batching < dd->fifo_batch) /* we have a next request are still entitled to batch */ @@ -262,7 +334,8 @@ static int deadline_dispatch_requests(struct request_queue *q, int force) if (reads) { BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); - if (writes && (dd->starved++ >= dd->writes_starved)) + if (deadline_fifo_request(dd, WRITE) && + (dd->starved++ >= dd->writes_starved)) goto dispatch_writes; data_dir = READ; @@ -291,21 +364,29 @@ dispatch_find_request: /* * we are not running a batch, find best request for selected data_dir */ - if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) { + next_rq = deadline_next_request(dd, data_dir); + if (deadline_check_fifo(dd, data_dir) || !next_rq) { /* * A deadline has expired, the last request was in the other * direction, or we have run out of higher-sectored requests. * Start again from the request with the earliest expiry time. */ - rq = rq_entry_fifo(dd->fifo_list[data_dir].next); + rq = deadline_fifo_request(dd, data_dir); } else { /* * The last req was the same dir and we have a next request in * sort order. No expired requests so continue on from here. */ - rq = dd->next_rq[data_dir]; + rq = next_rq; } + /* + * For a zoned block device, if we only have writes queued and none of + * them can be dispatched, rq will be NULL. + */ + if (!rq) + return 0; + dd->batching = 0; dispatch_request: @@ -318,6 +399,16 @@ dispatch_request: return 1; } +/* + * For zoned block devices, write unlock the target zone of completed + * write requests. + */ +static void +deadline_completed_request(struct request_queue *q, struct request *rq) +{ + blk_req_zone_write_unlock(rq); +} + static void deadline_exit_queue(struct elevator_queue *e) { struct deadline_data *dd = e->elevator_data; @@ -439,6 +530,7 @@ static struct elevator_type iosched_deadline = { .elevator_merged_fn = deadline_merged_request, .elevator_merge_req_fn = deadline_merged_requests, .elevator_dispatch_fn = deadline_dispatch_requests, + .elevator_completed_req_fn = deadline_completed_request, .elevator_add_req_fn = deadline_add_request, .elevator_former_req_fn = elv_rb_former_request, .elevator_latter_req_fn = elv_rb_latter_request, diff --git a/block/elevator.c b/block/elevator.c index 7bda083..e87e9b4 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -869,6 +869,8 @@ int elv_register_queue(struct request_queue *q) struct elevator_queue *e = q->elevator; int error; + lockdep_assert_held(&q->sysfs_lock); + error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); if (!error) { struct elv_fs_entry *attr = e->type->elevator_attrs; @@ -886,10 +888,11 @@ int elv_register_queue(struct request_queue *q) } return error; } -EXPORT_SYMBOL(elv_register_queue); void elv_unregister_queue(struct request_queue *q) { + lockdep_assert_held(&q->sysfs_lock); + if (q) { struct elevator_queue *e = q->elevator; @@ -900,7 +903,6 @@ void elv_unregister_queue(struct request_queue *q) wbt_enable_default(q); } } -EXPORT_SYMBOL(elv_unregister_queue); int elv_register(struct elevator_type *e) { @@ -967,7 +969,10 @@ static int elevator_switch_mq(struct request_queue *q, { int ret; + lockdep_assert_held(&q->sysfs_lock); + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); if (q->elevator) { if (q->elevator->registered) @@ -994,6 +999,7 @@ static int elevator_switch_mq(struct request_queue *q, blk_add_trace_msg(q, "elv switch: none"); out: + blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); return ret; } @@ -1010,6 +1016,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) bool old_registered = false; int err; + lockdep_assert_held(&q->sysfs_lock); + if (q->mq_ops) return elevator_switch_mq(q, new_e); diff --git a/block/genhd.c b/block/genhd.c index 96a66f6..88a53c1 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -629,16 +629,18 @@ exit: } /** - * device_add_disk - add partitioning information to kernel list + * __device_add_disk - add disk information to kernel list * @parent: parent device for the disk * @disk: per-device partitioning information + * @register_queue: register the queue if set to true * * This function registers the partitioning information in @disk * with the kernel. * * FIXME: error handling */ -void device_add_disk(struct device *parent, struct gendisk *disk) +static void __device_add_disk(struct device *parent, struct gendisk *disk, + bool register_queue) { dev_t devt; int retval; @@ -682,7 +684,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk) exact_match, exact_lock, disk); } register_disk(parent, disk); - blk_register_queue(disk); + if (register_queue) + blk_register_queue(disk); /* * Take an extra ref on queue which will be put on disk_release() @@ -693,8 +696,19 @@ void device_add_disk(struct device *parent, struct gendisk *disk) disk_add_events(disk); blk_integrity_add(disk); } + +void device_add_disk(struct device *parent, struct gendisk *disk) +{ + __device_add_disk(parent, disk, true); +} EXPORT_SYMBOL(device_add_disk); +void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk) +{ + __device_add_disk(parent, disk, false); +} +EXPORT_SYMBOL(device_add_disk_no_queue_reg); + void del_gendisk(struct gendisk *disk) { struct disk_part_iter piter; @@ -725,7 +739,8 @@ void del_gendisk(struct gendisk *disk) * Unregister bdi before releasing device numbers (as they can * get reused and we'd get clashes in sysfs). */ - bdi_unregister(disk->queue->backing_dev_info); + if (!(disk->flags & GENHD_FL_HIDDEN)) + bdi_unregister(disk->queue->backing_dev_info); blk_unregister_queue(disk); } else { WARN_ON(1); diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 0179e48..c56f211 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -59,6 +59,7 @@ struct deadline_data { int front_merges; spinlock_t lock; + spinlock_t zone_lock; struct list_head dispatch; }; @@ -192,13 +193,83 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) } /* + * For the specified data direction, return the next request to + * dispatch using arrival ordered lists. + */ +static struct request * +deadline_fifo_request(struct deadline_data *dd, int data_dir) +{ + struct request *rq; + unsigned long flags; + + if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) + return NULL; + + if (list_empty(&dd->fifo_list[data_dir])) + return NULL; + + rq = rq_entry_fifo(dd->fifo_list[data_dir].next); + if (data_dir == READ || !blk_queue_is_zoned(rq->q)) + return rq; + + /* + * Look for a write request that can be dispatched, that is one with + * an unlocked target zone. + */ + spin_lock_irqsave(&dd->zone_lock, flags); + list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) { + if (blk_req_can_dispatch_to_zone(rq)) + goto out; + } + rq = NULL; +out: + spin_unlock_irqrestore(&dd->zone_lock, flags); + + return rq; +} + +/* + * For the specified data direction, return the next request to + * dispatch using sector position sorted lists. + */ +static struct request * +deadline_next_request(struct deadline_data *dd, int data_dir) +{ + struct request *rq; + unsigned long flags; + + if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) + return NULL; + + rq = dd->next_rq[data_dir]; + if (!rq) + return NULL; + + if (data_dir == READ || !blk_queue_is_zoned(rq->q)) + return rq; + + /* + * Look for a write request that can be dispatched, that is one with + * an unlocked target zone. + */ + spin_lock_irqsave(&dd->zone_lock, flags); + while (rq) { + if (blk_req_can_dispatch_to_zone(rq)) + break; + rq = deadline_latter_request(rq); + } + spin_unlock_irqrestore(&dd->zone_lock, flags); + + return rq; +} + +/* * deadline_dispatch_requests selects the best request according to * read/write expire, fifo_batch, etc */ -static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx) +static struct request *__dd_dispatch_request(struct deadline_data *dd) { - struct deadline_data *dd = hctx->queue->elevator->elevator_data; - struct request *rq; + struct request *rq, *next_rq; bool reads, writes; int data_dir; @@ -214,10 +285,9 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx) /* * batches are currently reads XOR writes */ - if (dd->next_rq[WRITE]) - rq = dd->next_rq[WRITE]; - else - rq = dd->next_rq[READ]; + rq = deadline_next_request(dd, WRITE); + if (!rq) + rq = deadline_next_request(dd, READ); if (rq && dd->batching < dd->fifo_batch) /* we have a next request are still entitled to batch */ @@ -231,7 +301,8 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx) if (reads) { BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); - if (writes && (dd->starved++ >= dd->writes_starved)) + if (deadline_fifo_request(dd, WRITE) && + (dd->starved++ >= dd->writes_starved)) goto dispatch_writes; data_dir = READ; @@ -260,21 +331,29 @@ dispatch_find_request: /* * we are not running a batch, find best request for selected data_dir */ - if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) { + next_rq = deadline_next_request(dd, data_dir); + if (deadline_check_fifo(dd, data_dir) || !next_rq) { /* * A deadline has expired, the last request was in the other * direction, or we have run out of higher-sectored requests. * Start again from the request with the earliest expiry time. */ - rq = rq_entry_fifo(dd->fifo_list[data_dir].next); + rq = deadline_fifo_request(dd, data_dir); } else { /* * The last req was the same dir and we have a next request in * sort order. No expired requests so continue on from here. */ - rq = dd->next_rq[data_dir]; + rq = next_rq; } + /* + * For a zoned block device, if we only have writes queued and none of + * them can be dispatched, rq will be NULL. + */ + if (!rq) + return NULL; + dd->batching = 0; dispatch_request: @@ -284,17 +363,27 @@ dispatch_request: dd->batching++; deadline_move_request(dd, rq); done: + /* + * If the request needs its target zone locked, do it. + */ + blk_req_zone_write_lock(rq); rq->rq_flags |= RQF_STARTED; return rq; } +/* + * One confusing aspect here is that we get called for a specific + * hardware queue, but we return a request that may not be for a + * different hardware queue. This is because mq-deadline has shared + * state for all hardware queues, in terms of sorting, FIFOs, etc. + */ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; struct request *rq; spin_lock(&dd->lock); - rq = __dd_dispatch_request(hctx); + rq = __dd_dispatch_request(dd); spin_unlock(&dd->lock); return rq; @@ -339,6 +428,7 @@ static int dd_init_queue(struct request_queue *q, struct elevator_type *e) dd->front_merges = 1; dd->fifo_batch = fifo_batch; spin_lock_init(&dd->lock); + spin_lock_init(&dd->zone_lock); INIT_LIST_HEAD(&dd->dispatch); q->elevator = eq; @@ -395,6 +485,12 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, struct deadline_data *dd = q->elevator->elevator_data; const int data_dir = rq_data_dir(rq); + /* + * This may be a requeue of a write request that has locked its + * target zone. If it is the case, this releases the zone lock. + */ + blk_req_zone_write_unlock(rq); + if (blk_mq_sched_try_insert_merge(q, rq)) return; @@ -439,6 +535,26 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, spin_unlock(&dd->lock); } +/* + * For zoned block devices, write unlock the target zone of + * completed write requests. Do this while holding the zone lock + * spinlock so that the zone is never unlocked while deadline_fifo_request() + * while deadline_next_request() are executing. + */ +static void dd_completed_request(struct request *rq) +{ + struct request_queue *q = rq->q; + + if (blk_queue_is_zoned(q)) { + struct deadline_data *dd = q->elevator->elevator_data; + unsigned long flags; + + spin_lock_irqsave(&dd->zone_lock, flags); + blk_req_zone_write_unlock(rq); + spin_unlock_irqrestore(&dd->zone_lock, flags); + } +} + static bool dd_has_work(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; @@ -640,6 +756,7 @@ static struct elevator_type mq_deadline = { .ops.mq = { .insert_requests = dd_insert_requests, .dispatch_request = dd_dispatch_request, + .completed_request = dd_completed_request, .next_request = elv_rb_latter_request, .former_request = elv_rb_former_request, .bio_merge = dd_bio_merge, diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 0af3a3d..82c44f7 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -301,7 +301,9 @@ static void parse_bsd(struct parsed_partitions *state, continue; bsd_start = le32_to_cpu(p->p_offset); bsd_size = le32_to_cpu(p->p_size); - if (memcmp(flavour, "bsd\0", 4) == 0) + /* FreeBSD has relative offset if C partition offset is zero */ + if (memcmp(flavour, "bsd\0", 4) == 0 && + le32_to_cpu(l->d_partitions[2].p_offset) == 0) bsd_start += offset; if (offset == bsd_start && size == bsd_size) /* full parent partition, we have it already */ diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index edcfff9..60b471f 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -384,9 +384,10 @@ out_put_request: /** * sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl - * @file: file this ioctl operates on (optional) * @q: request queue to send scsi commands down * @disk: gendisk to operate on (option) + * @mode: mode used to open the file through which the ioctl has been + * submitted * @sic: userspace structure describing the command to perform * * Send down the scsi command described by @sic to the device below @@ -415,10 +416,10 @@ out_put_request: * Positive numbers returned are the compacted SCSI error codes (4 * bytes in one int) where the lowest byte is the SCSI status. */ -#define OMAX_SB_LEN 16 /* For backward compatibility */ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, struct scsi_ioctl_command __user *sic) { + enum { OMAX_SB_LEN = 16 }; /* For backward compatibility */ struct request *rq; struct scsi_request *req; int err; @@ -692,38 +693,9 @@ int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd) if (bd && bd == bd->bd_contains) return 0; - /* Actually none of these is particularly useful on a partition, - * but they are safe. - */ - switch (cmd) { - case SCSI_IOCTL_GET_IDLUN: - case SCSI_IOCTL_GET_BUS_NUMBER: - case SCSI_IOCTL_GET_PCI: - case SCSI_IOCTL_PROBE_HOST: - case SG_GET_VERSION_NUM: - case SG_SET_TIMEOUT: - case SG_GET_TIMEOUT: - case SG_GET_RESERVED_SIZE: - case SG_SET_RESERVED_SIZE: - case SG_EMULATED_HOST: - return 0; - case CDROM_GET_CAPABILITY: - /* Keep this until we remove the printk below. udev sends it - * and we do not want to spam dmesg about it. CD-ROMs do - * not have partitions, so we get here only for disks. - */ - return -ENOIOCTLCMD; - default: - break; - } - if (capable(CAP_SYS_RAWIO)) return 0; - /* In particular, rule out all resets and host-specific ioctls. */ - printk_ratelimited(KERN_WARNING - "%s: sending ioctl %x to a partition!\n", current->comm, cmd); - return -ENOIOCTLCMD; } EXPORT_SYMBOL(scsi_verify_blk_ioctl); diff --git a/crypto/Kconfig b/crypto/Kconfig index f7911963..20360e0 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -106,6 +106,7 @@ config CRYPTO_KPP config CRYPTO_ACOMP2 tristate select CRYPTO_ALGAPI2 + select SGL_ALLOC config CRYPTO_ACOMP tristate diff --git a/crypto/scompress.c b/crypto/scompress.c index 2075e2c..968bbcf 100644 --- a/crypto/scompress.c +++ b/crypto/scompress.c @@ -140,53 +140,6 @@ static int crypto_scomp_init_tfm(struct crypto_tfm *tfm) return ret; } -static void crypto_scomp_sg_free(struct scatterlist *sgl) -{ - int i, n; - struct page *page; - - if (!sgl) - return; - - n = sg_nents(sgl); - for_each_sg(sgl, sgl, n, i) { - page = sg_page(sgl); - if (page) - __free_page(page); - } - - kfree(sgl); -} - -static struct scatterlist *crypto_scomp_sg_alloc(size_t size, gfp_t gfp) -{ - struct scatterlist *sgl; - struct page *page; - int i, n; - - n = ((size - 1) >> PAGE_SHIFT) + 1; - - sgl = kmalloc_array(n, sizeof(struct scatterlist), gfp); - if (!sgl) - return NULL; - - sg_init_table(sgl, n); - - for (i = 0; i < n; i++) { - page = alloc_page(gfp); - if (!page) - goto err; - sg_set_page(sgl + i, page, PAGE_SIZE, 0); - } - - return sgl; - -err: - sg_mark_end(sgl + i); - crypto_scomp_sg_free(sgl); - return NULL; -} - static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir) { struct crypto_acomp *tfm = crypto_acomp_reqtfm(req); @@ -220,7 +173,7 @@ static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir) scratch_dst, &req->dlen, *ctx); if (!ret) { if (!req->dst) { - req->dst = crypto_scomp_sg_alloc(req->dlen, GFP_ATOMIC); + req->dst = sgl_alloc(req->dlen, GFP_ATOMIC, NULL); if (!req->dst) goto out; } @@ -274,7 +227,7 @@ int crypto_init_scomp_ops_async(struct crypto_tfm *tfm) crt->compress = scomp_acomp_compress; crt->decompress = scomp_acomp_decompress; - crt->dst_free = crypto_scomp_sg_free; + crt->dst_free = sgl_free; crt->reqsize = sizeof(void *); return 0; diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 442e777..7280752 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -6619,43 +6619,27 @@ static void DAC960_DestroyProcEntries(DAC960_Controller_T *Controller) #ifdef DAC960_GAM_MINOR -/* - * DAC960_gam_ioctl is the ioctl function for performing RAID operations. -*/ - -static long DAC960_gam_ioctl(struct file *file, unsigned int Request, - unsigned long Argument) +static long DAC960_gam_get_controller_info(DAC960_ControllerInfo_T __user *UserSpaceControllerInfo) { - long ErrorCode = 0; - if (!capable(CAP_SYS_ADMIN)) return -EACCES; - - mutex_lock(&DAC960_mutex); - switch (Request) - { - case DAC960_IOCTL_GET_CONTROLLER_COUNT: - ErrorCode = DAC960_ControllerCount; - break; - case DAC960_IOCTL_GET_CONTROLLER_INFO: - { - DAC960_ControllerInfo_T __user *UserSpaceControllerInfo = - (DAC960_ControllerInfo_T __user *) Argument; DAC960_ControllerInfo_T ControllerInfo; DAC960_Controller_T *Controller; int ControllerNumber; + long ErrorCode; + if (UserSpaceControllerInfo == NULL) ErrorCode = -EINVAL; else ErrorCode = get_user(ControllerNumber, &UserSpaceControllerInfo->ControllerNumber); if (ErrorCode != 0) - break; + goto out; ErrorCode = -ENXIO; if (ControllerNumber < 0 || ControllerNumber > DAC960_ControllerCount - 1) { - break; + goto out; } Controller = DAC960_Controllers[ControllerNumber]; if (Controller == NULL) - break; + goto out; memset(&ControllerInfo, 0, sizeof(DAC960_ControllerInfo_T)); ControllerInfo.ControllerNumber = ControllerNumber; ControllerInfo.FirmwareType = Controller->FirmwareType; @@ -6670,12 +6654,12 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request, strcpy(ControllerInfo.FirmwareVersion, Controller->FirmwareVersion); ErrorCode = (copy_to_user(UserSpaceControllerInfo, &ControllerInfo, sizeof(DAC960_ControllerInfo_T)) ? -EFAULT : 0); - break; - } - case DAC960_IOCTL_V1_EXECUTE_COMMAND: - { - DAC960_V1_UserCommand_T __user *UserSpaceUserCommand = - (DAC960_V1_UserCommand_T __user *) Argument; +out: + return ErrorCode; +} + +static long DAC960_gam_v1_execute_command(DAC960_V1_UserCommand_T __user *UserSpaceUserCommand) +{ DAC960_V1_UserCommand_T UserCommand; DAC960_Controller_T *Controller; DAC960_Command_T *Command = NULL; @@ -6688,39 +6672,41 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request, int ControllerNumber, DataTransferLength; unsigned char *DataTransferBuffer = NULL; dma_addr_t DataTransferBufferDMA; + long ErrorCode; + if (UserSpaceUserCommand == NULL) { ErrorCode = -EINVAL; - break; + goto out; } if (copy_from_user(&UserCommand, UserSpaceUserCommand, sizeof(DAC960_V1_UserCommand_T))) { ErrorCode = -EFAULT; - break; + goto out; } ControllerNumber = UserCommand.ControllerNumber; ErrorCode = -ENXIO; if (ControllerNumber < 0 || ControllerNumber > DAC960_ControllerCount - 1) - break; + goto out; Controller = DAC960_Controllers[ControllerNumber]; if (Controller == NULL) - break; + goto out; ErrorCode = -EINVAL; if (Controller->FirmwareType != DAC960_V1_Controller) - break; + goto out; CommandOpcode = UserCommand.CommandMailbox.Common.CommandOpcode; DataTransferLength = UserCommand.DataTransferLength; if (CommandOpcode & 0x80) - break; + goto out; if (CommandOpcode == DAC960_V1_DCDB) { if (copy_from_user(&DCDB, UserCommand.DCDB, sizeof(DAC960_V1_DCDB_T))) { ErrorCode = -EFAULT; - break; + goto out; } if (DCDB.Channel >= DAC960_V1_MaxChannels) - break; + goto out; if (!((DataTransferLength == 0 && DCDB.Direction == DAC960_V1_DCDB_NoDataTransfer) || @@ -6730,15 +6716,15 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request, (DataTransferLength < 0 && DCDB.Direction == DAC960_V1_DCDB_DataTransferSystemToDevice))) - break; + goto out; if (((DCDB.TransferLengthHigh4 << 16) | DCDB.TransferLength) != abs(DataTransferLength)) - break; + goto out; DCDB_IOBUF = pci_alloc_consistent(Controller->PCIDevice, sizeof(DAC960_V1_DCDB_T), &DCDB_IOBUFDMA); if (DCDB_IOBUF == NULL) { ErrorCode = -ENOMEM; - break; + goto out; } } ErrorCode = -ENOMEM; @@ -6748,19 +6734,19 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request, DataTransferLength, &DataTransferBufferDMA); if (DataTransferBuffer == NULL) - break; + goto out; } else if (DataTransferLength < 0) { DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice, -DataTransferLength, &DataTransferBufferDMA); if (DataTransferBuffer == NULL) - break; + goto out; if (copy_from_user(DataTransferBuffer, UserCommand.DataTransferBuffer, -DataTransferLength)) { ErrorCode = -EFAULT; - break; + goto out; } } if (CommandOpcode == DAC960_V1_DCDB) @@ -6837,12 +6823,12 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request, if (DCDB_IOBUF != NULL) pci_free_consistent(Controller->PCIDevice, sizeof(DAC960_V1_DCDB_T), DCDB_IOBUF, DCDB_IOBUFDMA); - break; - } - case DAC960_IOCTL_V2_EXECUTE_COMMAND: - { - DAC960_V2_UserCommand_T __user *UserSpaceUserCommand = - (DAC960_V2_UserCommand_T __user *) Argument; + out: + return ErrorCode; +} + +static long DAC960_gam_v2_execute_command(DAC960_V2_UserCommand_T __user *UserSpaceUserCommand) +{ DAC960_V2_UserCommand_T UserCommand; DAC960_Controller_T *Controller; DAC960_Command_T *Command = NULL; @@ -6855,26 +6841,26 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request, dma_addr_t DataTransferBufferDMA; unsigned char *RequestSenseBuffer = NULL; dma_addr_t RequestSenseBufferDMA; + long ErrorCode = -EINVAL; - ErrorCode = -EINVAL; if (UserSpaceUserCommand == NULL) - break; + goto out; if (copy_from_user(&UserCommand, UserSpaceUserCommand, sizeof(DAC960_V2_UserCommand_T))) { ErrorCode = -EFAULT; - break; + goto out; } ErrorCode = -ENXIO; ControllerNumber = UserCommand.ControllerNumber; if (ControllerNumber < 0 || ControllerNumber > DAC960_ControllerCount - 1) - break; + goto out; Controller = DAC960_Controllers[ControllerNumber]; if (Controller == NULL) - break; + goto out; if (Controller->FirmwareType != DAC960_V2_Controller){ ErrorCode = -EINVAL; - break; + goto out; } DataTransferLength = UserCommand.DataTransferLength; ErrorCode = -ENOMEM; @@ -6884,14 +6870,14 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request, DataTransferLength, &DataTransferBufferDMA); if (DataTransferBuffer == NULL) - break; + goto out; } else if (DataTransferLength < 0) { DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice, -DataTransferLength, &DataTransferBufferDMA); if (DataTransferBuffer == NULL) - break; + goto out; if (copy_from_user(DataTransferBuffer, UserCommand.DataTransferBuffer, -DataTransferLength)) { @@ -7001,42 +6987,44 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request, if (RequestSenseBuffer != NULL) pci_free_consistent(Controller->PCIDevice, RequestSenseLength, RequestSenseBuffer, RequestSenseBufferDMA); - break; - } - case DAC960_IOCTL_V2_GET_HEALTH_STATUS: - { - DAC960_V2_GetHealthStatus_T __user *UserSpaceGetHealthStatus = - (DAC960_V2_GetHealthStatus_T __user *) Argument; +out: + return ErrorCode; +} + +static long DAC960_gam_v2_get_health_status(DAC960_V2_GetHealthStatus_T __user *UserSpaceGetHealthStatus) +{ DAC960_V2_GetHealthStatus_T GetHealthStatus; DAC960_V2_HealthStatusBuffer_T HealthStatusBuffer; DAC960_Controller_T *Controller; int ControllerNumber; + long ErrorCode; + if (UserSpaceGetHealthStatus == NULL) { ErrorCode = -EINVAL; - break; + goto out; } if (copy_from_user(&GetHealthStatus, UserSpaceGetHealthStatus, sizeof(DAC960_V2_GetHealthStatus_T))) { ErrorCode = -EFAULT; - break; + goto out; } ErrorCode = -ENXIO; ControllerNumber = GetHealthStatus.ControllerNumber; if (ControllerNumber < 0 || ControllerNumber > DAC960_ControllerCount - 1) - break; + goto out; Controller = DAC960_Controllers[ControllerNumber]; if (Controller == NULL) - break; + goto out; if (Controller->FirmwareType != DAC960_V2_Controller) { ErrorCode = -EINVAL; - break; + goto out; } if (copy_from_user(&HealthStatusBuffer, GetHealthStatus.HealthStatusBuffer, sizeof(DAC960_V2_HealthStatusBuffer_T))) { ErrorCode = -EFAULT; - break; + goto out; } ErrorCode = wait_event_interruptible_timeout(Controller->HealthStatusWaitQueue, !(Controller->V2.HealthStatusBuffer->StatusChangeCounter @@ -7046,7 +7034,7 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request, DAC960_MonitoringTimerInterval); if (ErrorCode == -ERESTARTSYS) { ErrorCode = -EINTR; - break; + goto out; } if (copy_to_user(GetHealthStatus.HealthStatusBuffer, Controller->V2.HealthStatusBuffer, @@ -7054,7 +7042,39 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request, ErrorCode = -EFAULT; else ErrorCode = 0; - } + +out: + return ErrorCode; +} + +/* + * DAC960_gam_ioctl is the ioctl function for performing RAID operations. +*/ + +static long DAC960_gam_ioctl(struct file *file, unsigned int Request, + unsigned long Argument) +{ + long ErrorCode = 0; + void __user *argp = (void __user *)Argument; + if (!capable(CAP_SYS_ADMIN)) return -EACCES; + + mutex_lock(&DAC960_mutex); + switch (Request) + { + case DAC960_IOCTL_GET_CONTROLLER_COUNT: + ErrorCode = DAC960_ControllerCount; + break; + case DAC960_IOCTL_GET_CONTROLLER_INFO: + ErrorCode = DAC960_gam_get_controller_info(argp); + break; + case DAC960_IOCTL_V1_EXECUTE_COMMAND: + ErrorCode = DAC960_gam_v1_execute_command(argp); + break; + case DAC960_IOCTL_V2_EXECUTE_COMMAND: + ErrorCode = DAC960_gam_v2_execute_command(argp); + break; + case DAC960_IOCTL_V2_GET_HEALTH_STATUS: + ErrorCode = DAC960_gam_v2_get_health_status(argp); break; default: ErrorCode = -ENOTTY; diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 40579d0..ad9b687 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -20,6 +20,10 @@ config BLK_DEV_NULL_BLK tristate "Null test block driver" select CONFIGFS_FS +config BLK_DEV_NULL_BLK_FAULT_INJECTION + bool "Support fault injection for Null test block driver" + depends on BLK_DEV_NULL_BLK && FAULT_INJECTION + config BLK_DEV_FD tristate "Normal floppy disk support" depends on ARCH_MAY_HAVE_PC_FDC diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index 9220f8e..c0ebda1 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -112,8 +112,7 @@ enum frame_flags { struct frame { struct list_head head; u32 tag; - struct timeval sent; /* high-res time packet was sent */ - u32 sent_jiffs; /* low-res jiffies-based sent time */ + ktime_t sent; /* high-res time packet was sent */ ulong waited; ulong waited_total; struct aoetgt *t; /* parent target I belong to */ diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 812fed0..540bb60 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -398,8 +398,7 @@ aoecmd_ata_rw(struct aoedev *d) skb = skb_clone(f->skb, GFP_ATOMIC); if (skb) { - do_gettimeofday(&f->sent); - f->sent_jiffs = (u32) jiffies; + f->sent = ktime_get(); __skb_queue_head_init(&queue); __skb_queue_tail(&queue, skb); aoenet_xmit(&queue); @@ -489,8 +488,7 @@ resend(struct aoedev *d, struct frame *f) skb = skb_clone(skb, GFP_ATOMIC); if (skb == NULL) return; - do_gettimeofday(&f->sent); - f->sent_jiffs = (u32) jiffies; + f->sent = ktime_get(); __skb_queue_head_init(&queue); __skb_queue_tail(&queue, skb); aoenet_xmit(&queue); @@ -499,33 +497,17 @@ resend(struct aoedev *d, struct frame *f) static int tsince_hr(struct frame *f) { - struct timeval now; - int n; + u64 delta = ktime_to_ns(ktime_sub(ktime_get(), f->sent)); - do_gettimeofday(&now); - n = now.tv_usec - f->sent.tv_usec; - n += (now.tv_sec - f->sent.tv_sec) * USEC_PER_SEC; + /* delta is normally under 4.2 seconds, avoid 64-bit division */ + if (likely(delta <= UINT_MAX)) + return (u32)delta / NSEC_PER_USEC; - if (n < 0) - n = -n; + /* avoid overflow after 71 minutes */ + if (delta > ((u64)INT_MAX * NSEC_PER_USEC)) + return INT_MAX; - /* For relatively long periods, use jiffies to avoid - * discrepancies caused by updates to the system time. - * - * On system with HZ of 1000, 32-bits is over 49 days - * worth of jiffies, or over 71 minutes worth of usecs. - * - * Jiffies overflow is handled by subtraction of unsigned ints: - * (gdb) print (unsigned) 2 - (unsigned) 0xfffffffe - * $3 = 4 - * (gdb) - */ - if (n > USEC_PER_SEC / 4) { - n = ((u32) jiffies) - f->sent_jiffs; - n *= USEC_PER_SEC / HZ; - } - - return n; + return div_u64(delta, NSEC_PER_USEC); } static int @@ -589,7 +571,6 @@ reassign_frame(struct frame *f) nf->waited = 0; nf->waited_total = f->waited_total; nf->sent = f->sent; - nf->sent_jiffs = f->sent_jiffs; f->skb = skb; return nf; @@ -633,8 +614,7 @@ probe(struct aoetgt *t) skb = skb_clone(f->skb, GFP_ATOMIC); if (skb) { - do_gettimeofday(&f->sent); - f->sent_jiffs = (u32) jiffies; + f->sent = ktime_get(); __skb_queue_head_init(&queue); __skb_queue_tail(&queue, skb); aoenet_xmit(&queue); @@ -1432,10 +1412,8 @@ aoecmd_ata_id(struct aoedev *d) d->timer.function = rexmit_timer; skb = skb_clone(skb, GFP_ATOMIC); - if (skb) { - do_gettimeofday(&f->sent); - f->sent_jiffs = (u32) jiffies; - } + if (skb) + f->sent = ktime_get(); return skb; } diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index bd97908..9f4e6f5 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -953,7 +953,7 @@ static void drbd_bm_endio(struct bio *bio) struct drbd_bm_aio_ctx *ctx = bio->bi_private; struct drbd_device *device = ctx->device; struct drbd_bitmap *b = device->bitmap; - unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page); + unsigned int idx = bm_page_to_idx(bio_first_page_all(bio)); if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 && !bm_test_page_unchanged(b->bm_pages[idx])) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index ad0477a..6655893 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -12,9 +12,9 @@ #include <linux/slab.h> #include <linux/blk-mq.h> #include <linux/hrtimer.h> -#include <linux/lightnvm.h> #include <linux/configfs.h> #include <linux/badblocks.h> +#include <linux/fault-inject.h> #define SECTOR_SHIFT 9 #define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) @@ -27,6 +27,10 @@ #define TICKS_PER_SEC 50ULL #define TIMER_INTERVAL (NSEC_PER_SEC / TICKS_PER_SEC) +#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION +static DECLARE_FAULT_ATTR(null_timeout_attr); +#endif + static inline u64 mb_per_tick(int mbps) { return (1 << 20) / TICKS_PER_SEC * ((u64) mbps); @@ -107,7 +111,6 @@ struct nullb_device { unsigned int hw_queue_depth; /* queue depth */ unsigned int index; /* index of the disk, only valid with a disk */ unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */ - bool use_lightnvm; /* register as a LightNVM device */ bool blocking; /* blocking blk-mq device */ bool use_per_node_hctx; /* use per-node allocation for hardware context */ bool power; /* power on/off the device */ @@ -121,7 +124,6 @@ struct nullb { unsigned int index; struct request_queue *q; struct gendisk *disk; - struct nvm_dev *ndev; struct blk_mq_tag_set *tag_set; struct blk_mq_tag_set __tag_set; unsigned int queue_depth; @@ -139,7 +141,6 @@ static LIST_HEAD(nullb_list); static struct mutex lock; static int null_major; static DEFINE_IDA(nullb_indexes); -static struct kmem_cache *ppa_cache; static struct blk_mq_tag_set tag_set; enum { @@ -166,6 +167,11 @@ static int g_home_node = NUMA_NO_NODE; module_param_named(home_node, g_home_node, int, S_IRUGO); MODULE_PARM_DESC(home_node, "Home node for the device"); +#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION +static char g_timeout_str[80]; +module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), S_IRUGO); +#endif + static int g_queue_mode = NULL_Q_MQ; static int null_param_store_val(const char *str, int *val, int min, int max) @@ -208,10 +214,6 @@ static int nr_devices = 1; module_param(nr_devices, int, S_IRUGO); MODULE_PARM_DESC(nr_devices, "Number of devices to register"); -static bool g_use_lightnvm; -module_param_named(use_lightnvm, g_use_lightnvm, bool, S_IRUGO); -MODULE_PARM_DESC(use_lightnvm, "Register as a LightNVM device"); - static bool g_blocking; module_param_named(blocking, g_blocking, bool, S_IRUGO); MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device"); @@ -345,7 +347,6 @@ NULLB_DEVICE_ATTR(blocksize, uint); NULLB_DEVICE_ATTR(irqmode, uint); NULLB_DEVICE_ATTR(hw_queue_depth, uint); NULLB_DEVICE_ATTR(index, uint); -NULLB_DEVICE_ATTR(use_lightnvm, bool); NULLB_DEVICE_ATTR(blocking, bool); NULLB_DEVICE_ATTR(use_per_node_hctx, bool); NULLB_DEVICE_ATTR(memory_backed, bool); @@ -455,7 +456,6 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_irqmode, &nullb_device_attr_hw_queue_depth, &nullb_device_attr_index, - &nullb_device_attr_use_lightnvm, &nullb_device_attr_blocking, &nullb_device_attr_use_per_node_hctx, &nullb_device_attr_power, @@ -573,7 +573,6 @@ static struct nullb_device *null_alloc_dev(void) dev->blocksize = g_bs; dev->irqmode = g_irqmode; dev->hw_queue_depth = g_hw_queue_depth; - dev->use_lightnvm = g_use_lightnvm; dev->blocking = g_blocking; dev->use_per_node_hctx = g_use_per_node_hctx; return dev; @@ -1352,6 +1351,12 @@ static blk_qc_t null_queue_bio(struct request_queue *q, struct bio *bio) return BLK_QC_T_NONE; } +static enum blk_eh_timer_return null_rq_timed_out_fn(struct request *rq) +{ + pr_info("null: rq %p timed out\n", rq); + return BLK_EH_HANDLED; +} + static int null_rq_prep_fn(struct request_queue *q, struct request *req) { struct nullb *nullb = q->queuedata; @@ -1369,6 +1374,16 @@ static int null_rq_prep_fn(struct request_queue *q, struct request *req) return BLKPREP_DEFER; } +static bool should_timeout_request(struct request *rq) +{ +#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION + if (g_timeout_str[0]) + return should_fail(&null_timeout_attr, 1); +#endif + + return false; +} + static void null_request_fn(struct request_queue *q) { struct request *rq; @@ -1376,12 +1391,20 @@ static void null_request_fn(struct request_queue *q) while ((rq = blk_fetch_request(q)) != NULL) { struct nullb_cmd *cmd = rq->special; - spin_unlock_irq(q->queue_lock); - null_handle_cmd(cmd); - spin_lock_irq(q->queue_lock); + if (!should_timeout_request(rq)) { + spin_unlock_irq(q->queue_lock); + null_handle_cmd(cmd); + spin_lock_irq(q->queue_lock); + } } } +static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res) +{ + pr_info("null: rq %p timed out\n", rq); + return BLK_EH_HANDLED; +} + static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -1399,12 +1422,16 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(bd->rq); - return null_handle_cmd(cmd); + if (!should_timeout_request(bd->rq)) + return null_handle_cmd(cmd); + + return BLK_STS_OK; } static const struct blk_mq_ops null_mq_ops = { .queue_rq = null_queue_rq, .complete = null_softirq_done_fn, + .timeout = null_timeout_rq, }; static void cleanup_queue(struct nullb_queue *nq) @@ -1423,170 +1450,6 @@ static void cleanup_queues(struct nullb *nullb) kfree(nullb->queues); } -#ifdef CONFIG_NVM - -static void null_lnvm_end_io(struct request *rq, blk_status_t status) -{ - struct nvm_rq *rqd = rq->end_io_data; - - /* XXX: lighnvm core seems to expect NVM_RSP_* values here.. */ - rqd->error = status ? -EIO : 0; - nvm_end_io(rqd); - - blk_put_request(rq); -} - -static int null_lnvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) -{ - struct request_queue *q = dev->q; - struct request *rq; - struct bio *bio = rqd->bio; - - rq = blk_mq_alloc_request(q, - op_is_write(bio_op(bio)) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); - if (IS_ERR(rq)) - return -ENOMEM; - - blk_init_request_from_bio(rq, bio); - - rq->end_io_data = rqd; - - blk_execute_rq_nowait(q, NULL, rq, 0, null_lnvm_end_io); - - return 0; -} - -static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id) -{ - struct nullb *nullb = dev->q->queuedata; - sector_t size = (sector_t)nullb->dev->size * 1024 * 1024ULL; - sector_t blksize; - struct nvm_id_group *grp; - - id->ver_id = 0x1; - id->vmnt = 0; - id->cap = 0x2; - id->dom = 0x1; - - id->ppaf.blk_offset = 0; - id->ppaf.blk_len = 16; - id->ppaf.pg_offset = 16; - id->ppaf.pg_len = 16; - id->ppaf.sect_offset = 32; - id->ppaf.sect_len = 8; - id->ppaf.pln_offset = 40; - id->ppaf.pln_len = 8; - id->ppaf.lun_offset = 48; - id->ppaf.lun_len = 8; - id->ppaf.ch_offset = 56; - id->ppaf.ch_len = 8; - - sector_div(size, nullb->dev->blocksize); /* convert size to pages */ - size >>= 8; /* concert size to pgs pr blk */ - grp = &id->grp; - grp->mtype = 0; - grp->fmtype = 0; - grp->num_ch = 1; - grp->num_pg = 256; - blksize = size; - size >>= 16; - grp->num_lun = size + 1; - sector_div(blksize, grp->num_lun); - grp->num_blk = blksize; - grp->num_pln = 1; - - grp->fpg_sz = nullb->dev->blocksize; - grp->csecs = nullb->dev->blocksize; - grp->trdt = 25000; - grp->trdm = 25000; - grp->tprt = 500000; - grp->tprm = 500000; - grp->tbet = 1500000; - grp->tbem = 1500000; - grp->mpos = 0x010101; /* single plane rwe */ - grp->cpar = nullb->dev->hw_queue_depth; - - return 0; -} - -static void *null_lnvm_create_dma_pool(struct nvm_dev *dev, char *name) -{ - mempool_t *virtmem_pool; - - virtmem_pool = mempool_create_slab_pool(64, ppa_cache); - if (!virtmem_pool) { - pr_err("null_blk: Unable to create virtual memory pool\n"); - return NULL; - } - - return virtmem_pool; -} - -static void null_lnvm_destroy_dma_pool(void *pool) -{ - mempool_destroy(pool); -} - -static void *null_lnvm_dev_dma_alloc(struct nvm_dev *dev, void *pool, - gfp_t mem_flags, dma_addr_t *dma_handler) -{ - return mempool_alloc(pool, mem_flags); -} - -static void null_lnvm_dev_dma_free(void *pool, void *entry, - dma_addr_t dma_handler) -{ - mempool_free(entry, pool); -} - -static struct nvm_dev_ops null_lnvm_dev_ops = { - .identity = null_lnvm_id, - .submit_io = null_lnvm_submit_io, - - .create_dma_pool = null_lnvm_create_dma_pool, - .destroy_dma_pool = null_lnvm_destroy_dma_pool, - .dev_dma_alloc = null_lnvm_dev_dma_alloc, - .dev_dma_free = null_lnvm_dev_dma_free, - - /* Simulate nvme protocol restriction */ - .max_phys_sect = 64, -}; - -static int null_nvm_register(struct nullb *nullb) -{ - struct nvm_dev *dev; - int rv; - - dev = nvm_alloc_dev(0); - if (!dev) - return -ENOMEM; - - dev->q = nullb->q; - memcpy(dev->name, nullb->disk_name, DISK_NAME_LEN); - dev->ops = &null_lnvm_dev_ops; - - rv = nvm_register(dev); - if (rv) { - kfree(dev); - return rv; - } - nullb->ndev = dev; - return 0; -} - -static void null_nvm_unregister(struct nullb *nullb) -{ - nvm_unregister(nullb->ndev); -} -#else -static int null_nvm_register(struct nullb *nullb) -{ - pr_err("null_blk: CONFIG_NVM needs to be enabled for LightNVM\n"); - return -EINVAL; -} -static void null_nvm_unregister(struct nullb *nullb) {} -#endif /* CONFIG_NVM */ - static void null_del_dev(struct nullb *nullb) { struct nullb_device *dev = nullb->dev; @@ -1595,10 +1458,7 @@ static void null_del_dev(struct nullb *nullb) list_del_init(&nullb->list); - if (dev->use_lightnvm) - null_nvm_unregister(nullb); - else - del_gendisk(nullb->disk); + del_gendisk(nullb->disk); if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) { hrtimer_cancel(&nullb->bw_timer); @@ -1610,8 +1470,7 @@ static void null_del_dev(struct nullb *nullb) if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set) blk_mq_free_tag_set(nullb->tag_set); - if (!dev->use_lightnvm) - put_disk(nullb->disk); + put_disk(nullb->disk); cleanup_queues(nullb); if (null_cache_active(nullb)) null_free_device_storage(nullb->dev, true); @@ -1775,11 +1634,6 @@ static void null_validate_conf(struct nullb_device *dev) { dev->blocksize = round_down(dev->blocksize, 512); dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096); - if (dev->use_lightnvm && dev->blocksize != 4096) - dev->blocksize = 4096; - - if (dev->use_lightnvm && dev->queue_mode != NULL_Q_MQ) - dev->queue_mode = NULL_Q_MQ; if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) { if (dev->submit_queues != nr_online_nodes) @@ -1805,6 +1659,20 @@ static void null_validate_conf(struct nullb_device *dev) dev->mbps = 0; } +static bool null_setup_fault(void) +{ +#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION + if (!g_timeout_str[0]) + return true; + + if (!setup_fault_attr(&null_timeout_attr, g_timeout_str)) + return false; + + null_timeout_attr.verbose = 0; +#endif + return true; +} + static int null_add_dev(struct nullb_device *dev) { struct nullb *nullb; @@ -1838,6 +1706,10 @@ static int null_add_dev(struct nullb_device *dev) if (rv) goto out_cleanup_queues; + if (!null_setup_fault()) + goto out_cleanup_queues; + + nullb->tag_set->timeout = 5 * HZ; nullb->q = blk_mq_init_queue(nullb->tag_set); if (IS_ERR(nullb->q)) { rv = -ENOMEM; @@ -1861,8 +1733,14 @@ static int null_add_dev(struct nullb_device *dev) rv = -ENOMEM; goto out_cleanup_queues; } + + if (!null_setup_fault()) + goto out_cleanup_blk_queue; + blk_queue_prep_rq(nullb->q, null_rq_prep_fn); blk_queue_softirq_done(nullb->q, null_softirq_done_fn); + blk_queue_rq_timed_out(nullb->q, null_rq_timed_out_fn); + nullb->q->rq_timeout = 5 * HZ; rv = init_driver_queues(nullb); if (rv) goto out_cleanup_blk_queue; @@ -1895,11 +1773,7 @@ static int null_add_dev(struct nullb_device *dev) sprintf(nullb->disk_name, "nullb%d", nullb->index); - if (dev->use_lightnvm) - rv = null_nvm_register(nullb); - else - rv = null_gendisk_register(nullb); - + rv = null_gendisk_register(nullb); if (rv) goto out_cleanup_blk_queue; @@ -1938,18 +1812,6 @@ static int __init null_init(void) g_bs = PAGE_SIZE; } - if (g_use_lightnvm && g_bs != 4096) { - pr_warn("null_blk: LightNVM only supports 4k block size\n"); - pr_warn("null_blk: defaults block size to 4k\n"); - g_bs = 4096; - } - - if (g_use_lightnvm && g_queue_mode != NULL_Q_MQ) { - pr_warn("null_blk: LightNVM only supported for blk-mq\n"); - pr_warn("null_blk: defaults queue mode to blk-mq\n"); - g_queue_mode = NULL_Q_MQ; - } - if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { if (g_submit_queues != nr_online_nodes) { pr_warn("null_blk: submit_queues param is set to %u.\n", @@ -1982,16 +1844,6 @@ static int __init null_init(void) goto err_conf; } - if (g_use_lightnvm) { - ppa_cache = kmem_cache_create("ppa_cache", 64 * sizeof(u64), - 0, 0, NULL); - if (!ppa_cache) { - pr_err("null_blk: unable to create ppa cache\n"); - ret = -ENOMEM; - goto err_ppa; - } - } - for (i = 0; i < nr_devices; i++) { dev = null_alloc_dev(); if (!dev) { @@ -2015,8 +1867,6 @@ err_dev: null_del_dev(nullb); null_free_dev(dev); } - kmem_cache_destroy(ppa_cache); -err_ppa: unregister_blkdev(null_major, "nullb"); err_conf: configfs_unregister_subsystem(&nullb_subsys); @@ -2047,8 +1897,6 @@ static void __exit null_exit(void) if (g_queue_mode == NULL_Q_MQ && shared_tags) blk_mq_free_tag_set(&tag_set); - - kmem_cache_destroy(ppa_cache); } module_init(null_init); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 6797479..531a091 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2579,14 +2579,14 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) bdev = bdget(dev); if (!bdev) return -ENOMEM; + ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL); + if (ret) + return ret; if (!blk_queue_scsi_passthrough(bdev_get_queue(bdev))) { WARN_ONCE(true, "Attempt to register a non-SCSI queue\n"); - bdput(bdev); + blkdev_put(bdev, FMODE_READ | FMODE_NDELAY); return -EINVAL; } - ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL); - if (ret) - return ret; /* This is safe, since we have a reference from open(). */ __module_get(THIS_MODULE); @@ -2745,7 +2745,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) pd->pkt_dev = MKDEV(pktdev_major, idx); ret = pkt_new_dev(pd, dev); if (ret) - goto out_new_dev; + goto out_mem2; /* inherit events of the host device */ disk->events = pd->bdev->bd_disk->events; @@ -2763,8 +2763,6 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) mutex_unlock(&ctl_mutex); return 0; -out_new_dev: - blk_cleanup_queue(disk->queue); out_mem2: put_disk(disk); out_mem: diff --git a/drivers/block/smart1,2.h b/drivers/block/smart1,2.h deleted file mode 100644 index e5565fb..0000000 --- a/drivers/block/smart1,2.h +++ /dev/null @@ -1,278 +0,0 @@ -/* - * Disk Array driver for Compaq SMART2 Controllers - * Copyright 1998 Compaq Computer Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Questions/Comments/Bugfixes to iss_storagedev@hp.com - * - * If you want to make changes, improve or add functionality to this - * driver, you'll probably need the Compaq Array Controller Interface - * Specificiation (Document number ECG086/1198) - */ - -/* - * This file contains the controller communication implementation for - * Compaq SMART-1 and SMART-2 controllers. To the best of my knowledge, - * this should support: - * - * PCI: - * SMART-2/P, SMART-2DH, SMART-2SL, SMART-221, SMART-3100ES, SMART-3200 - * Integerated SMART Array Controller, SMART-4200, SMART-4250ES - * - * EISA: - * SMART-2/E, SMART, IAES, IDA-2, IDA - */ - -/* - * Memory mapped FIFO interface (SMART 42xx cards) - */ -static void smart4_submit_command(ctlr_info_t *h, cmdlist_t *c) -{ - writel(c->busaddr, h->vaddr + S42XX_REQUEST_PORT_OFFSET); -} - -/* - * This card is the opposite of the other cards. - * 0 turns interrupts on... - * 0x08 turns them off... - */ -static void smart4_intr_mask(ctlr_info_t *h, unsigned long val) -{ - if (val) - { /* Turn interrupts on */ - writel(0, h->vaddr + S42XX_REPLY_INTR_MASK_OFFSET); - } else /* Turn them off */ - { - writel( S42XX_INTR_OFF, - h->vaddr + S42XX_REPLY_INTR_MASK_OFFSET); - } -} - -/* - * For older cards FIFO Full = 0. - * On this card 0 means there is room, anything else FIFO Full. - * - */ -static unsigned long smart4_fifo_full(ctlr_info_t *h) -{ - - return (!readl(h->vaddr + S42XX_REQUEST_PORT_OFFSET)); -} - -/* This type of controller returns -1 if the fifo is empty, - * Not 0 like the others. - * And we need to let it know we read a value out - */ -static unsigned long smart4_completed(ctlr_info_t *h) -{ - long register_value - = readl(h->vaddr + S42XX_REPLY_PORT_OFFSET); - - /* Fifo is empty */ - if( register_value == 0xffffffff) - return 0; - - /* Need to let it know we got the reply */ - /* We do this by writing a 0 to the port we just read from */ - writel(0, h->vaddr + S42XX_REPLY_PORT_OFFSET); - - return ((unsigned long) register_value); -} - - /* - * This hardware returns interrupt pending at a different place and - * it does not tell us if the fifo is empty, we will have check - * that by getting a 0 back from the command_completed call. - */ -static unsigned long smart4_intr_pending(ctlr_info_t *h) -{ - unsigned long register_value = - readl(h->vaddr + S42XX_INTR_STATUS); - - if( register_value & S42XX_INTR_PENDING) - return FIFO_NOT_EMPTY; - return 0 ; -} - -static struct access_method smart4_access = { - smart4_submit_command, - smart4_intr_mask, - smart4_fifo_full, - smart4_intr_pending, - smart4_completed, -}; - -/* - * Memory mapped FIFO interface (PCI SMART2 and SMART 3xxx cards) - */ -static void smart2_submit_command(ctlr_info_t *h, cmdlist_t *c) -{ - writel(c->busaddr, h->vaddr + COMMAND_FIFO); -} - -static void smart2_intr_mask(ctlr_info_t *h, unsigned long val) -{ - writel(val, h->vaddr + INTR_MASK); -} - -static unsigned long smart2_fifo_full(ctlr_info_t *h) -{ - return readl(h->vaddr + COMMAND_FIFO); -} - -static unsigned long smart2_completed(ctlr_info_t *h) -{ - return readl(h->vaddr + COMMAND_COMPLETE_FIFO); -} - -static unsigned long smart2_intr_pending(ctlr_info_t *h) -{ - return readl(h->vaddr + INTR_PENDING); -} - -static struct access_method smart2_access = { - smart2_submit_command, - smart2_intr_mask, - smart2_fifo_full, - smart2_intr_pending, - smart2_completed, -}; - -/* - * IO access for SMART-2/E cards - */ -static void smart2e_submit_command(ctlr_info_t *h, cmdlist_t *c) -{ - outl(c->busaddr, h->io_mem_addr + COMMAND_FIFO); -} - -static void smart2e_intr_mask(ctlr_info_t *h, unsigned long val) -{ - outl(val, h->io_mem_addr + INTR_MASK); -} - -static unsigned long smart2e_fifo_full(ctlr_info_t *h) -{ - return inl(h->io_mem_addr + COMMAND_FIFO); -} - -static unsigned long smart2e_completed(ctlr_info_t *h) -{ - return inl(h->io_mem_addr + COMMAND_COMPLETE_FIFO); -} - -static unsigned long smart2e_intr_pending(ctlr_info_t *h) -{ - return inl(h->io_mem_addr + INTR_PENDING); -} - -static struct access_method smart2e_access = { - smart2e_submit_command, - smart2e_intr_mask, - smart2e_fifo_full, - smart2e_intr_pending, - smart2e_completed, -}; - -/* - * IO access for older SMART-1 type cards - */ -#define SMART1_SYSTEM_MASK 0xC8E -#define SMART1_SYSTEM_DOORBELL 0xC8F -#define SMART1_LOCAL_MASK 0xC8C -#define SMART1_LOCAL_DOORBELL 0xC8D -#define SMART1_INTR_MASK 0xC89 -#define SMART1_LISTADDR 0xC90 -#define SMART1_LISTLEN 0xC94 -#define SMART1_TAG 0xC97 -#define SMART1_COMPLETE_ADDR 0xC98 -#define SMART1_LISTSTATUS 0xC9E - -#define CHANNEL_BUSY 0x01 -#define CHANNEL_CLEAR 0x02 - -static void smart1_submit_command(ctlr_info_t *h, cmdlist_t *c) -{ - /* - * This __u16 is actually a bunch of control flags on SMART - * and below. We want them all to be zero. - */ - c->hdr.size = 0; - - outb(CHANNEL_CLEAR, h->io_mem_addr + SMART1_SYSTEM_DOORBELL); - - outl(c->busaddr, h->io_mem_addr + SMART1_LISTADDR); - outw(c->size, h->io_mem_addr + SMART1_LISTLEN); - - outb(CHANNEL_BUSY, h->io_mem_addr + SMART1_LOCAL_DOORBELL); -} - -static void smart1_intr_mask(ctlr_info_t *h, unsigned long val) -{ - if (val == 1) { - outb(0xFD, h->io_mem_addr + SMART1_SYSTEM_DOORBELL); - outb(CHANNEL_BUSY, h->io_mem_addr + SMART1_LOCAL_DOORBELL); - outb(0x01, h->io_mem_addr + SMART1_INTR_MASK); - outb(0x01, h->io_mem_addr + SMART1_SYSTEM_MASK); - } else { - outb(0, h->io_mem_addr + 0xC8E); - } -} - -static unsigned long smart1_fifo_full(ctlr_info_t *h) -{ - unsigned char chan; - chan = inb(h->io_mem_addr + SMART1_SYSTEM_DOORBELL) & CHANNEL_CLEAR; - return chan; -} - -static unsigned long smart1_completed(ctlr_info_t *h) -{ - unsigned char status; - unsigned long cmd; - - if (inb(h->io_mem_addr + SMART1_SYSTEM_DOORBELL) & CHANNEL_BUSY) { - outb(CHANNEL_BUSY, h->io_mem_addr + SMART1_SYSTEM_DOORBELL); - - cmd = inl(h->io_mem_addr + SMART1_COMPLETE_ADDR); - status = inb(h->io_mem_addr + SMART1_LISTSTATUS); - - outb(CHANNEL_CLEAR, h->io_mem_addr + SMART1_LOCAL_DOORBELL); - - /* - * this is x86 (actually compaq x86) only, so it's ok - */ - if (cmd) ((cmdlist_t*)bus_to_virt(cmd))->req.hdr.rcode = status; - } else { - cmd = 0; - } - return cmd; -} - -static unsigned long smart1_intr_pending(ctlr_info_t *h) -{ - unsigned char chan; - chan = inb(h->io_mem_addr + SMART1_SYSTEM_DOORBELL) & CHANNEL_BUSY; - return chan; -} - -static struct access_method smart1_access = { - smart1_submit_command, - smart1_intr_mask, - smart1_fifo_full, - smart1_intr_pending, - smart1_completed, -}; diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index d70eba3..0afa6c8 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -430,7 +430,7 @@ static void put_entry_bdev(struct zram *zram, unsigned long entry) static void zram_page_end_io(struct bio *bio) { - struct page *page = bio->bi_io_vec[0].bv_page; + struct page *page = bio_first_page_all(bio); page_endio(page, op_is_write(bio_op(bio)), blk_status_to_errno(bio->bi_status)); diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig index 2a953ef..10c0898 100644 --- a/drivers/lightnvm/Kconfig +++ b/drivers/lightnvm/Kconfig @@ -27,13 +27,6 @@ config NVM_DEBUG It is required to create/remove targets without IOCTLs. -config NVM_RRPC - tristate "Round-robin Hybrid Open-Channel SSD target" - ---help--- - Allows an open-channel SSD to be exposed as a block device to the - host. The target is implemented using a linear mapping table and - cost-based garbage collection. It is optimized for 4K IO sizes. - config NVM_PBLK tristate "Physical Block Device Open-Channel SSD target" ---help--- diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile index 2c3fd9d..97d9d7c 100644 --- a/drivers/lightnvm/Makefile +++ b/drivers/lightnvm/Makefile @@ -4,7 +4,6 @@ # obj-$(CONFIG_NVM) := core.o -obj-$(CONFIG_NVM_RRPC) += rrpc.o obj-$(CONFIG_NVM_PBLK) += pblk.o pblk-y := pblk-init.o pblk-core.o pblk-rb.o \ pblk-write.o pblk-cache.o pblk-read.o \ diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 83249b4..dcc9e62 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -45,12 +45,6 @@ struct nvm_dev_map { int nr_chnls; }; -struct nvm_area { - struct list_head list; - sector_t begin; - sector_t end; /* end is excluded */ -}; - static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name) { struct nvm_target *tgt; @@ -62,6 +56,30 @@ static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name) return NULL; } +static bool nvm_target_exists(const char *name) +{ + struct nvm_dev *dev; + struct nvm_target *tgt; + bool ret = false; + + down_write(&nvm_lock); + list_for_each_entry(dev, &nvm_devices, devices) { + mutex_lock(&dev->mlock); + list_for_each_entry(tgt, &dev->targets, list) { + if (!strcmp(name, tgt->disk->disk_name)) { + ret = true; + mutex_unlock(&dev->mlock); + goto out; + } + } + mutex_unlock(&dev->mlock); + } + +out: + up_write(&nvm_lock); + return ret; +} + static int nvm_reserve_luns(struct nvm_dev *dev, int lun_begin, int lun_end) { int i; @@ -104,7 +122,7 @@ static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev, int clear) if (clear) { for (j = 0; j < ch_map->nr_luns; j++) { int lun = j + lun_offs[j]; - int lunid = (ch * dev->geo.luns_per_chnl) + lun; + int lunid = (ch * dev->geo.nr_luns) + lun; WARN_ON(!test_and_clear_bit(lunid, dev->lun_map)); @@ -122,7 +140,8 @@ static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev, int clear) } static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, - int lun_begin, int lun_end) + u16 lun_begin, u16 lun_end, + u16 op) { struct nvm_tgt_dev *tgt_dev = NULL; struct nvm_dev_map *dev_rmap = dev->rmap; @@ -130,10 +149,10 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, struct ppa_addr *luns; int nr_luns = lun_end - lun_begin + 1; int luns_left = nr_luns; - int nr_chnls = nr_luns / dev->geo.luns_per_chnl; - int nr_chnls_mod = nr_luns % dev->geo.luns_per_chnl; - int bch = lun_begin / dev->geo.luns_per_chnl; - int blun = lun_begin % dev->geo.luns_per_chnl; + int nr_chnls = nr_luns / dev->geo.nr_luns; + int nr_chnls_mod = nr_luns % dev->geo.nr_luns; + int bch = lun_begin / dev->geo.nr_luns; + int blun = lun_begin % dev->geo.nr_luns; int lunid = 0; int lun_balanced = 1; int prev_nr_luns; @@ -154,15 +173,15 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, if (!luns) goto err_luns; - prev_nr_luns = (luns_left > dev->geo.luns_per_chnl) ? - dev->geo.luns_per_chnl : luns_left; + prev_nr_luns = (luns_left > dev->geo.nr_luns) ? + dev->geo.nr_luns : luns_left; for (i = 0; i < nr_chnls; i++) { struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[i + bch]; int *lun_roffs = ch_rmap->lun_offs; struct nvm_ch_map *ch_map = &dev_map->chnls[i]; int *lun_offs; - int luns_in_chnl = (luns_left > dev->geo.luns_per_chnl) ? - dev->geo.luns_per_chnl : luns_left; + int luns_in_chnl = (luns_left > dev->geo.nr_luns) ? + dev->geo.nr_luns : luns_left; if (lun_balanced && prev_nr_luns != luns_in_chnl) lun_balanced = 0; @@ -199,8 +218,9 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo)); /* Target device only owns a portion of the physical device */ tgt_dev->geo.nr_chnls = nr_chnls; - tgt_dev->geo.nr_luns = nr_luns; - tgt_dev->geo.luns_per_chnl = (lun_balanced) ? prev_nr_luns : -1; + tgt_dev->geo.all_luns = nr_luns; + tgt_dev->geo.nr_luns = (lun_balanced) ? prev_nr_luns : -1; + tgt_dev->geo.op = op; tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun; tgt_dev->q = dev->q; tgt_dev->map = dev_map; @@ -226,27 +246,79 @@ static const struct block_device_operations nvm_fops = { .owner = THIS_MODULE, }; -static struct nvm_tgt_type *nvm_find_target_type(const char *name, int lock) +static struct nvm_tgt_type *__nvm_find_target_type(const char *name) { - struct nvm_tgt_type *tmp, *tt = NULL; + struct nvm_tgt_type *tt; - if (lock) - down_write(&nvm_tgtt_lock); + list_for_each_entry(tt, &nvm_tgt_types, list) + if (!strcmp(name, tt->name)) + return tt; - list_for_each_entry(tmp, &nvm_tgt_types, list) - if (!strcmp(name, tmp->name)) { - tt = tmp; - break; - } + return NULL; +} + +static struct nvm_tgt_type *nvm_find_target_type(const char *name) +{ + struct nvm_tgt_type *tt; + + down_write(&nvm_tgtt_lock); + tt = __nvm_find_target_type(name); + up_write(&nvm_tgtt_lock); - if (lock) - up_write(&nvm_tgtt_lock); return tt; } +static int nvm_config_check_luns(struct nvm_geo *geo, int lun_begin, + int lun_end) +{ + if (lun_begin > lun_end || lun_end >= geo->all_luns) { + pr_err("nvm: lun out of bound (%u:%u > %u)\n", + lun_begin, lun_end, geo->all_luns - 1); + return -EINVAL; + } + + return 0; +} + +static int __nvm_config_simple(struct nvm_dev *dev, + struct nvm_ioctl_create_simple *s) +{ + struct nvm_geo *geo = &dev->geo; + + if (s->lun_begin == -1 && s->lun_end == -1) { + s->lun_begin = 0; + s->lun_end = geo->all_luns - 1; + } + + return nvm_config_check_luns(geo, s->lun_begin, s->lun_end); +} + +static int __nvm_config_extended(struct nvm_dev *dev, + struct nvm_ioctl_create_extended *e) +{ + struct nvm_geo *geo = &dev->geo; + + if (e->lun_begin == 0xFFFF && e->lun_end == 0xFFFF) { + e->lun_begin = 0; + e->lun_end = dev->geo.all_luns - 1; + } + + /* op not set falls into target's default */ + if (e->op == 0xFFFF) + e->op = NVM_TARGET_DEFAULT_OP; + + if (e->op < NVM_TARGET_MIN_OP || + e->op > NVM_TARGET_MAX_OP) { + pr_err("nvm: invalid over provisioning value\n"); + return -EINVAL; + } + + return nvm_config_check_luns(geo, e->lun_begin, e->lun_end); +} + static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) { - struct nvm_ioctl_create_simple *s = &create->conf.s; + struct nvm_ioctl_create_extended e; struct request_queue *tqueue; struct gendisk *tdisk; struct nvm_tgt_type *tt; @@ -255,22 +327,41 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) void *targetdata; int ret; - tt = nvm_find_target_type(create->tgttype, 1); + switch (create->conf.type) { + case NVM_CONFIG_TYPE_SIMPLE: + ret = __nvm_config_simple(dev, &create->conf.s); + if (ret) + return ret; + + e.lun_begin = create->conf.s.lun_begin; + e.lun_end = create->conf.s.lun_end; + e.op = NVM_TARGET_DEFAULT_OP; + break; + case NVM_CONFIG_TYPE_EXTENDED: + ret = __nvm_config_extended(dev, &create->conf.e); + if (ret) + return ret; + + e = create->conf.e; + break; + default: + pr_err("nvm: config type not valid\n"); + return -EINVAL; + } + + tt = nvm_find_target_type(create->tgttype); if (!tt) { pr_err("nvm: target type %s not found\n", create->tgttype); return -EINVAL; } - mutex_lock(&dev->mlock); - t = nvm_find_target(dev, create->tgtname); - if (t) { - pr_err("nvm: target name already exists.\n"); - mutex_unlock(&dev->mlock); + if (nvm_target_exists(create->tgtname)) { + pr_err("nvm: target name already exists (%s)\n", + create->tgtname); return -EINVAL; } - mutex_unlock(&dev->mlock); - ret = nvm_reserve_luns(dev, s->lun_begin, s->lun_end); + ret = nvm_reserve_luns(dev, e.lun_begin, e.lun_end); if (ret) return ret; @@ -280,7 +371,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) goto err_reserve; } - tgt_dev = nvm_create_tgt_dev(dev, s->lun_begin, s->lun_end); + tgt_dev = nvm_create_tgt_dev(dev, e.lun_begin, e.lun_end, e.op); if (!tgt_dev) { pr_err("nvm: could not create target device\n"); ret = -ENOMEM; @@ -350,7 +441,7 @@ err_dev: err_t: kfree(t); err_reserve: - nvm_release_luns_err(dev, s->lun_begin, s->lun_end); + nvm_release_luns_err(dev, e.lun_begin, e.lun_end); return ret; } @@ -420,7 +511,7 @@ static int nvm_register_map(struct nvm_dev *dev) for (i = 0; i < dev->geo.nr_chnls; i++) { struct nvm_ch_map *ch_rmap; int *lun_roffs; - int luns_in_chnl = dev->geo.luns_per_chnl; + int luns_in_chnl = dev->geo.nr_luns; ch_rmap = &rmap->chnls[i]; @@ -524,41 +615,12 @@ static void nvm_rq_dev_to_tgt(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) nvm_ppa_dev_to_tgt(tgt_dev, rqd->ppa_list, rqd->nr_ppas); } -void nvm_part_to_tgt(struct nvm_dev *dev, sector_t *entries, - int len) -{ - struct nvm_geo *geo = &dev->geo; - struct nvm_dev_map *dev_rmap = dev->rmap; - u64 i; - - for (i = 0; i < len; i++) { - struct nvm_ch_map *ch_rmap; - int *lun_roffs; - struct ppa_addr gaddr; - u64 pba = le64_to_cpu(entries[i]); - u64 diff; - - if (!pba) - continue; - - gaddr = linear_to_generic_addr(geo, pba); - ch_rmap = &dev_rmap->chnls[gaddr.g.ch]; - lun_roffs = ch_rmap->lun_offs; - - diff = ((ch_rmap->ch_off * geo->luns_per_chnl) + - (lun_roffs[gaddr.g.lun])) * geo->sec_per_lun; - - entries[i] -= cpu_to_le64(diff); - } -} -EXPORT_SYMBOL(nvm_part_to_tgt); - int nvm_register_tgt_type(struct nvm_tgt_type *tt) { int ret = 0; down_write(&nvm_tgtt_lock); - if (nvm_find_target_type(tt->name, 0)) + if (__nvm_find_target_type(tt->name)) ret = -EEXIST; else list_add(&tt->list, &nvm_tgt_types); @@ -726,112 +788,6 @@ int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) } EXPORT_SYMBOL(nvm_submit_io_sync); -int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, - int nr_ppas) -{ - struct nvm_geo *geo = &tgt_dev->geo; - struct nvm_rq rqd; - int ret; - - memset(&rqd, 0, sizeof(struct nvm_rq)); - - rqd.opcode = NVM_OP_ERASE; - rqd.flags = geo->plane_mode >> 1; - - ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas); - if (ret) - return ret; - - ret = nvm_submit_io_sync(tgt_dev, &rqd); - if (ret) { - pr_err("rrpr: erase I/O submission failed: %d\n", ret); - goto free_ppa_list; - } - -free_ppa_list: - nvm_free_rqd_ppalist(tgt_dev, &rqd); - - return ret; -} -EXPORT_SYMBOL(nvm_erase_sync); - -int nvm_get_l2p_tbl(struct nvm_tgt_dev *tgt_dev, u64 slba, u32 nlb, - nvm_l2p_update_fn *update_l2p, void *priv) -{ - struct nvm_dev *dev = tgt_dev->parent; - - if (!dev->ops->get_l2p_tbl) - return 0; - - return dev->ops->get_l2p_tbl(dev, slba, nlb, update_l2p, priv); -} -EXPORT_SYMBOL(nvm_get_l2p_tbl); - -int nvm_get_area(struct nvm_tgt_dev *tgt_dev, sector_t *lba, sector_t len) -{ - struct nvm_dev *dev = tgt_dev->parent; - struct nvm_geo *geo = &dev->geo; - struct nvm_area *area, *prev, *next; - sector_t begin = 0; - sector_t max_sectors = (geo->sec_size * dev->total_secs) >> 9; - - if (len > max_sectors) - return -EINVAL; - - area = kmalloc(sizeof(struct nvm_area), GFP_KERNEL); - if (!area) - return -ENOMEM; - - prev = NULL; - - spin_lock(&dev->lock); - list_for_each_entry(next, &dev->area_list, list) { - if (begin + len > next->begin) { - begin = next->end; - prev = next; - continue; - } - break; - } - - if ((begin + len) > max_sectors) { - spin_unlock(&dev->lock); - kfree(area); - return -EINVAL; - } - - area->begin = *lba = begin; - area->end = begin + len; - - if (prev) /* insert into sorted order */ - list_add(&area->list, &prev->list); - else - list_add(&area->list, &dev->area_list); - spin_unlock(&dev->lock); - - return 0; -} -EXPORT_SYMBOL(nvm_get_area); - -void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin) -{ - struct nvm_dev *dev = tgt_dev->parent; - struct nvm_area *area; - - spin_lock(&dev->lock); - list_for_each_entry(area, &dev->area_list, list) { - if (area->begin != begin) - continue; - - list_del(&area->list); - spin_unlock(&dev->lock); - kfree(area); - return; - } - spin_unlock(&dev->lock); -} -EXPORT_SYMBOL(nvm_put_area); - void nvm_end_io(struct nvm_rq *rqd) { struct nvm_tgt_dev *tgt_dev = rqd->dev; @@ -858,10 +814,10 @@ int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks) struct nvm_geo *geo = &dev->geo; int blk, offset, pl, blktype; - if (nr_blks != geo->blks_per_lun * geo->plane_mode) + if (nr_blks != geo->nr_chks * geo->plane_mode) return -EINVAL; - for (blk = 0; blk < geo->blks_per_lun; blk++) { + for (blk = 0; blk < geo->nr_chks; blk++) { offset = blk * geo->plane_mode; blktype = blks[offset]; @@ -877,7 +833,7 @@ int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks) blks[blk] = blktype; } - return geo->blks_per_lun; + return geo->nr_chks; } EXPORT_SYMBOL(nvm_bb_tbl_fold); @@ -892,53 +848,6 @@ int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr ppa, } EXPORT_SYMBOL(nvm_get_tgt_bb_tbl); -static int nvm_init_slc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp) -{ - struct nvm_geo *geo = &dev->geo; - int i; - - dev->lps_per_blk = geo->pgs_per_blk; - dev->lptbl = kcalloc(dev->lps_per_blk, sizeof(int), GFP_KERNEL); - if (!dev->lptbl) - return -ENOMEM; - - /* Just a linear array */ - for (i = 0; i < dev->lps_per_blk; i++) - dev->lptbl[i] = i; - - return 0; -} - -static int nvm_init_mlc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp) -{ - int i, p; - struct nvm_id_lp_mlc *mlc = &grp->lptbl.mlc; - - if (!mlc->num_pairs) - return 0; - - dev->lps_per_blk = mlc->num_pairs; - dev->lptbl = kcalloc(dev->lps_per_blk, sizeof(int), GFP_KERNEL); - if (!dev->lptbl) - return -ENOMEM; - - /* The lower page table encoding consists of a list of bytes, where each - * has a lower and an upper half. The first half byte maintains the - * increment value and every value after is an offset added to the - * previous incrementation value - */ - dev->lptbl[0] = mlc->pairs[0] & 0xF; - for (i = 1; i < dev->lps_per_blk; i++) { - p = mlc->pairs[i >> 1]; - if (i & 0x1) /* upper */ - dev->lptbl[i] = dev->lptbl[i - 1] + ((p & 0xF0) >> 4); - else /* lower */ - dev->lptbl[i] = dev->lptbl[i - 1] + (p & 0xF); - } - - return 0; -} - static int nvm_core_init(struct nvm_dev *dev) { struct nvm_id *id = &dev->identity; @@ -946,66 +855,44 @@ static int nvm_core_init(struct nvm_dev *dev) struct nvm_geo *geo = &dev->geo; int ret; + memcpy(&geo->ppaf, &id->ppaf, sizeof(struct nvm_addr_format)); + + if (grp->mtype != 0) { + pr_err("nvm: memory type not supported\n"); + return -EINVAL; + } + /* Whole device values */ geo->nr_chnls = grp->num_ch; - geo->luns_per_chnl = grp->num_lun; - - /* Generic device values */ - geo->pgs_per_blk = grp->num_pg; - geo->blks_per_lun = grp->num_blk; - geo->nr_planes = grp->num_pln; - geo->fpg_size = grp->fpg_sz; - geo->pfpg_size = grp->fpg_sz * grp->num_pln; + geo->nr_luns = grp->num_lun; + + /* Generic device geometry values */ + geo->ws_min = grp->ws_min; + geo->ws_opt = grp->ws_opt; + geo->ws_seq = grp->ws_seq; + geo->ws_per_chk = grp->ws_per_chk; + geo->nr_chks = grp->num_chk; geo->sec_size = grp->csecs; geo->oob_size = grp->sos; - geo->sec_per_pg = grp->fpg_sz / grp->csecs; geo->mccap = grp->mccap; - memcpy(&geo->ppaf, &id->ppaf, sizeof(struct nvm_addr_format)); - - geo->plane_mode = NVM_PLANE_SINGLE; geo->max_rq_size = dev->ops->max_phys_sect * geo->sec_size; - if (grp->mpos & 0x020202) - geo->plane_mode = NVM_PLANE_DOUBLE; - if (grp->mpos & 0x040404) - geo->plane_mode = NVM_PLANE_QUAD; + geo->sec_per_chk = grp->clba; + geo->sec_per_lun = geo->sec_per_chk * geo->nr_chks; + geo->all_luns = geo->nr_luns * geo->nr_chnls; - if (grp->mtype != 0) { - pr_err("nvm: memory type not supported\n"); - return -EINVAL; - } - - /* calculated values */ + /* 1.2 spec device geometry values */ + geo->plane_mode = 1 << geo->ws_seq; + geo->nr_planes = geo->ws_opt / geo->ws_min; + geo->sec_per_pg = geo->ws_min; geo->sec_per_pl = geo->sec_per_pg * geo->nr_planes; - geo->sec_per_blk = geo->sec_per_pl * geo->pgs_per_blk; - geo->sec_per_lun = geo->sec_per_blk * geo->blks_per_lun; - geo->nr_luns = geo->luns_per_chnl * geo->nr_chnls; - dev->total_secs = geo->nr_luns * geo->sec_per_lun; - dev->lun_map = kcalloc(BITS_TO_LONGS(geo->nr_luns), + dev->total_secs = geo->all_luns * geo->sec_per_lun; + dev->lun_map = kcalloc(BITS_TO_LONGS(geo->all_luns), sizeof(unsigned long), GFP_KERNEL); if (!dev->lun_map) return -ENOMEM; - switch (grp->fmtype) { - case NVM_ID_FMTYPE_SLC: - if (nvm_init_slc_tbl(dev, grp)) { - ret = -ENOMEM; - goto err_fmtype; - } - break; - case NVM_ID_FMTYPE_MLC: - if (nvm_init_mlc_tbl(dev, grp)) { - ret = -ENOMEM; - goto err_fmtype; - } - break; - default: - pr_err("nvm: flash type not supported\n"); - ret = -EINVAL; - goto err_fmtype; - } - INIT_LIST_HEAD(&dev->area_list); INIT_LIST_HEAD(&dev->targets); mutex_init(&dev->mlock); @@ -1031,7 +918,6 @@ static void nvm_free(struct nvm_dev *dev) dev->ops->destroy_dma_pool(dev->dma_pool); nvm_unregister_map(dev); - kfree(dev->lptbl); kfree(dev->lun_map); kfree(dev); } @@ -1062,8 +948,8 @@ static int nvm_init(struct nvm_dev *dev) pr_info("nvm: registered %s [%u/%u/%u/%u/%u/%u]\n", dev->name, geo->sec_per_pg, geo->nr_planes, - geo->pgs_per_blk, geo->blks_per_lun, - geo->nr_luns, geo->nr_chnls); + geo->ws_per_chk, geo->nr_chks, + geo->all_luns, geo->nr_chnls); return 0; err: pr_err("nvm: failed to initialize nvm\n"); @@ -1135,7 +1021,6 @@ EXPORT_SYMBOL(nvm_unregister); static int __nvm_configure_create(struct nvm_ioctl_create *create) { struct nvm_dev *dev; - struct nvm_ioctl_create_simple *s; down_write(&nvm_lock); dev = nvm_find_nvm_dev(create->dev); @@ -1146,23 +1031,6 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create) return -EINVAL; } - if (create->conf.type != NVM_CONFIG_TYPE_SIMPLE) { - pr_err("nvm: config type not valid\n"); - return -EINVAL; - } - s = &create->conf.s; - - if (s->lun_begin == -1 && s->lun_end == -1) { - s->lun_begin = 0; - s->lun_end = dev->geo.nr_luns - 1; - } - - if (s->lun_begin > s->lun_end || s->lun_end >= dev->geo.nr_luns) { - pr_err("nvm: lun out of bound (%u:%u > %u)\n", - s->lun_begin, s->lun_end, dev->geo.nr_luns - 1); - return -EINVAL; - } - return nvm_create_tgt(dev, create); } @@ -1262,6 +1130,12 @@ static long nvm_ioctl_dev_create(struct file *file, void __user *arg) if (copy_from_user(&create, arg, sizeof(struct nvm_ioctl_create))) return -EFAULT; + if (create.conf.type == NVM_CONFIG_TYPE_EXTENDED && + create.conf.e.rsv != 0) { + pr_err("nvm: reserved config field in use\n"); + return -EINVAL; + } + create.dev[DISK_NAME_LEN - 1] = '\0'; create.tgttype[NVM_TTYPE_NAME_MAX - 1] = '\0'; create.tgtname[DISK_NAME_LEN - 1] = '\0'; diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c index 0d227ef..000fcad 100644 --- a/drivers/lightnvm/pblk-cache.c +++ b/drivers/lightnvm/pblk-cache.c @@ -19,12 +19,16 @@ int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags) { + struct request_queue *q = pblk->dev->q; struct pblk_w_ctx w_ctx; sector_t lba = pblk_get_lba(bio); + unsigned long start_time = jiffies; unsigned int bpos, pos; int nr_entries = pblk_get_secs(bio); int i, ret; + generic_start_io_acct(q, WRITE, bio_sectors(bio), &pblk->disk->part0); + /* Update the write buffer head (mem) with the entries that we can * write. The write in itself cannot fail, so there is no need to * rollback from here on. @@ -67,6 +71,7 @@ retry: pblk_rl_inserted(&pblk->rl, nr_entries); out: + generic_end_io_acct(q, WRITE, &pblk->disk->part0, start_time); pblk_write_should_kick(pblk); return ret; } diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 76516ee..0487b93 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -32,8 +32,8 @@ static void pblk_line_mark_bb(struct work_struct *work) struct pblk_line *line; int pos; - line = &pblk->lines[pblk_dev_ppa_to_line(*ppa)]; - pos = pblk_dev_ppa_to_pos(&dev->geo, *ppa); + line = &pblk->lines[pblk_ppa_to_line(*ppa)]; + pos = pblk_ppa_to_pos(&dev->geo, *ppa); pr_err("pblk: failed to mark bb, line:%d, pos:%d\n", line->id, pos); @@ -48,7 +48,7 @@ static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line, { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; - int pos = pblk_dev_ppa_to_pos(geo, *ppa); + int pos = pblk_ppa_to_pos(geo, *ppa); pr_debug("pblk: erase failed: line:%d, pos:%d\n", line->id, pos); atomic_long_inc(&pblk->erase_failed); @@ -66,7 +66,7 @@ static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd) { struct pblk_line *line; - line = &pblk->lines[pblk_dev_ppa_to_line(rqd->ppa_addr)]; + line = &pblk->lines[pblk_ppa_to_line(rqd->ppa_addr)]; atomic_dec(&line->left_seblks); if (rqd->error) { @@ -144,7 +144,7 @@ void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa) BUG_ON(pblk_ppa_empty(ppa)); #endif - line_id = pblk_tgt_ppa_to_line(ppa); + line_id = pblk_ppa_to_line(ppa); line = &pblk->lines[line_id]; paddr = pblk_dev_ppa_to_line_addr(pblk, ppa); @@ -650,7 +650,7 @@ next_rq: } else { for (i = 0; i < rqd.nr_ppas; ) { struct ppa_addr ppa = addr_to_gen_ppa(pblk, paddr, id); - int pos = pblk_dev_ppa_to_pos(geo, ppa); + int pos = pblk_ppa_to_pos(geo, ppa); int read_type = PBLK_READ_RANDOM; if (pblk_io_aligned(pblk, rq_ppas)) @@ -668,7 +668,7 @@ next_rq: } ppa = addr_to_gen_ppa(pblk, paddr, id); - pos = pblk_dev_ppa_to_pos(geo, ppa); + pos = pblk_ppa_to_pos(geo, ppa); } if (pblk_boundary_paddr_checks(pblk, paddr + min)) { @@ -742,7 +742,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, cmd_op = NVM_OP_PWRITE; flags = pblk_set_progr_mode(pblk, PBLK_WRITE); lba_list = emeta_to_lbas(pblk, line->emeta->buf); - } else if (dir == PBLK_READ) { + } else if (dir == PBLK_READ_RECOV || dir == PBLK_READ) { bio_op = REQ_OP_READ; cmd_op = NVM_OP_PREAD; flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); @@ -802,7 +802,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, if (rqd.error) { if (dir == PBLK_WRITE) pblk_log_write_err(pblk, &rqd); - else + else if (dir == PBLK_READ) pblk_log_read_err(pblk, &rqd); } @@ -816,7 +816,7 @@ int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line) { u64 bpaddr = pblk_line_smeta_start(pblk, line); - return pblk_line_submit_smeta_io(pblk, line, bpaddr, PBLK_READ); + return pblk_line_submit_smeta_io(pblk, line, bpaddr, PBLK_READ_RECOV); } int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line, @@ -854,8 +854,8 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa) struct nvm_geo *geo = &dev->geo; pr_err("pblk: could not sync erase line:%d,blk:%d\n", - pblk_dev_ppa_to_line(ppa), - pblk_dev_ppa_to_pos(geo, ppa)); + pblk_ppa_to_line(ppa), + pblk_ppa_to_pos(geo, ppa)); rqd.error = ret; goto out; @@ -979,7 +979,7 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line, /* Start metadata */ smeta_buf->seq_nr = cpu_to_le64(line->seq_nr); - smeta_buf->window_wr_lun = cpu_to_le32(geo->nr_luns); + smeta_buf->window_wr_lun = cpu_to_le32(geo->all_luns); /* Fill metadata among lines */ if (cur) { @@ -1032,7 +1032,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line, lm->sec_per_line); bitmap_or(line->map_bitmap, line->map_bitmap, l_mg->bb_aux, lm->sec_per_line); - line->sec_in_line -= geo->sec_per_blk; + line->sec_in_line -= geo->sec_per_chk; if (bit >= lm->emeta_bb) nr_bb++; } @@ -1145,7 +1145,7 @@ int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line) } spin_unlock(&l_mg->free_lock); - pblk_rl_free_lines_dec(&pblk->rl, line); + pblk_rl_free_lines_dec(&pblk->rl, line, true); if (!pblk_line_init_bb(pblk, line, 0)) { list_add(&line->list, &l_mg->free_list); @@ -1233,7 +1233,7 @@ retry: l_mg->data_line = retry_line; spin_unlock(&l_mg->free_lock); - pblk_rl_free_lines_dec(&pblk->rl, retry_line); + pblk_rl_free_lines_dec(&pblk->rl, line, false); if (pblk_line_erase(pblk, retry_line)) goto retry; @@ -1252,7 +1252,6 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line *line; - int is_next = 0; spin_lock(&l_mg->free_lock); line = pblk_line_get(pblk); @@ -1280,7 +1279,6 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk) } else { l_mg->data_next->seq_nr = l_mg->d_seq_nr++; l_mg->data_next->type = PBLK_LINETYPE_DATA; - is_next = 1; } spin_unlock(&l_mg->free_lock); @@ -1290,10 +1288,6 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk) return NULL; } - pblk_rl_free_lines_dec(&pblk->rl, line); - if (is_next) - pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next); - retry_setup: if (!pblk_line_init_metadata(pblk, line, NULL)) { line = pblk_line_retry(pblk, line); @@ -1311,6 +1305,8 @@ retry_setup: goto retry_setup; } + pblk_rl_free_lines_dec(&pblk->rl, line, true); + return line; } @@ -1395,7 +1391,6 @@ struct pblk_line *pblk_line_replace_data(struct pblk *pblk) struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line *cur, *new = NULL; unsigned int left_seblks; - int is_next = 0; cur = l_mg->data_line; new = l_mg->data_next; @@ -1444,6 +1439,8 @@ retry_setup: goto retry_setup; } + pblk_rl_free_lines_dec(&pblk->rl, new, true); + /* Allocate next line for preparation */ spin_lock(&l_mg->free_lock); l_mg->data_next = pblk_line_get(pblk); @@ -1457,13 +1454,9 @@ retry_setup: } else { l_mg->data_next->seq_nr = l_mg->d_seq_nr++; l_mg->data_next->type = PBLK_LINETYPE_DATA; - is_next = 1; } spin_unlock(&l_mg->free_lock); - if (is_next) - pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next); - out: return new; } @@ -1561,8 +1554,8 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa) struct nvm_geo *geo = &dev->geo; pr_err("pblk: could not async erase line:%d,blk:%d\n", - pblk_dev_ppa_to_line(ppa), - pblk_dev_ppa_to_pos(geo, ppa)); + pblk_ppa_to_line(ppa), + pblk_ppa_to_pos(geo, ppa)); } return err; @@ -1746,7 +1739,7 @@ void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; struct pblk_lun *rlun; - int nr_luns = geo->nr_luns; + int nr_luns = geo->all_luns; int bit = -1; while ((bit = find_next_bit(lun_bitmap, nr_luns, bit + 1)) < nr_luns) { @@ -1884,7 +1877,7 @@ void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, /* If the L2P entry maps to a line, the reference is valid */ if (!pblk_ppa_empty(ppa) && !pblk_addr_in_cache(ppa)) { - int line_id = pblk_dev_ppa_to_line(ppa); + int line_id = pblk_ppa_to_line(ppa); struct pblk_line *line = &pblk->lines[line_id]; kref_get(&line->ref); diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index 9c8e114..3d89938 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -169,7 +169,14 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work) * the line untouched. TODO: Implement a recovery routine that scans and * moves all sectors on the line. */ - lba_list = pblk_recov_get_lba_list(pblk, emeta_buf); + + ret = pblk_recov_check_emeta(pblk, emeta_buf); + if (ret) { + pr_err("pblk: inconsistent emeta (line %d)\n", line->id); + goto fail_free_emeta; + } + + lba_list = emeta_to_lbas(pblk, emeta_buf); if (!lba_list) { pr_err("pblk: could not interpret emeta (line %d)\n", line->id); goto fail_free_emeta; @@ -519,22 +526,12 @@ void pblk_gc_should_start(struct pblk *pblk) } } -/* - * If flush_wq == 1 then no lock should be held by the caller since - * flush_workqueue can sleep - */ -static void pblk_gc_stop(struct pblk *pblk, int flush_wq) -{ - pblk->gc.gc_active = 0; - pr_debug("pblk: gc stop\n"); -} - void pblk_gc_should_stop(struct pblk *pblk) { struct pblk_gc *gc = &pblk->gc; if (gc->gc_active && !gc->gc_forced) - pblk_gc_stop(pblk, 0); + gc->gc_active = 0; } void pblk_gc_should_kick(struct pblk *pblk) @@ -660,7 +657,7 @@ void pblk_gc_exit(struct pblk *pblk) gc->gc_enabled = 0; del_timer_sync(&gc->gc_timer); - pblk_gc_stop(pblk, 1); + gc->gc_active = 0; if (gc->gc_ts) kthread_stop(gc->gc_ts); diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 695826a..93d671c 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -169,8 +169,8 @@ static int pblk_set_ppaf(struct pblk *pblk) } ppaf.ch_len = power_len; - power_len = get_count_order(geo->luns_per_chnl); - if (1 << power_len != geo->luns_per_chnl) { + power_len = get_count_order(geo->nr_luns); + if (1 << power_len != geo->nr_luns) { pr_err("pblk: supports only power-of-two LUN config.\n"); return -EINVAL; } @@ -254,7 +254,7 @@ static int pblk_core_init(struct pblk *pblk) struct nvm_geo *geo = &dev->geo; pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg * - geo->nr_planes * geo->nr_luns; + geo->nr_planes * geo->all_luns; if (pblk_init_global_caches(pblk)) return -ENOMEM; @@ -270,21 +270,22 @@ static int pblk_core_init(struct pblk *pblk) if (!pblk->gen_ws_pool) goto free_page_bio_pool; - pblk->rec_pool = mempool_create_slab_pool(geo->nr_luns, pblk_rec_cache); + pblk->rec_pool = mempool_create_slab_pool(geo->all_luns, + pblk_rec_cache); if (!pblk->rec_pool) goto free_gen_ws_pool; - pblk->r_rq_pool = mempool_create_slab_pool(geo->nr_luns, + pblk->r_rq_pool = mempool_create_slab_pool(geo->all_luns, pblk_g_rq_cache); if (!pblk->r_rq_pool) goto free_rec_pool; - pblk->e_rq_pool = mempool_create_slab_pool(geo->nr_luns, + pblk->e_rq_pool = mempool_create_slab_pool(geo->all_luns, pblk_g_rq_cache); if (!pblk->e_rq_pool) goto free_r_rq_pool; - pblk->w_rq_pool = mempool_create_slab_pool(geo->nr_luns, + pblk->w_rq_pool = mempool_create_slab_pool(geo->all_luns, pblk_w_rq_cache); if (!pblk->w_rq_pool) goto free_e_rq_pool; @@ -354,6 +355,8 @@ static void pblk_core_free(struct pblk *pblk) mempool_destroy(pblk->e_rq_pool); mempool_destroy(pblk->w_rq_pool); + pblk_rwb_free(pblk); + pblk_free_global_caches(pblk); } @@ -409,7 +412,7 @@ static int pblk_bb_discovery(struct nvm_tgt_dev *dev, struct pblk_lun *rlun) u8 *blks; int nr_blks, ret; - nr_blks = geo->blks_per_lun * geo->plane_mode; + nr_blks = geo->nr_chks * geo->plane_mode; blks = kmalloc(nr_blks, GFP_KERNEL); if (!blks) return -ENOMEM; @@ -482,20 +485,21 @@ static int pblk_luns_init(struct pblk *pblk, struct ppa_addr *luns) int i, ret; /* TODO: Implement unbalanced LUN support */ - if (geo->luns_per_chnl < 0) { + if (geo->nr_luns < 0) { pr_err("pblk: unbalanced LUN config.\n"); return -EINVAL; } - pblk->luns = kcalloc(geo->nr_luns, sizeof(struct pblk_lun), GFP_KERNEL); + pblk->luns = kcalloc(geo->all_luns, sizeof(struct pblk_lun), + GFP_KERNEL); if (!pblk->luns) return -ENOMEM; - for (i = 0; i < geo->nr_luns; i++) { + for (i = 0; i < geo->all_luns; i++) { /* Stripe across channels */ int ch = i % geo->nr_chnls; int lun_raw = i / geo->nr_chnls; - int lunid = lun_raw + ch * geo->luns_per_chnl; + int lunid = lun_raw + ch * geo->nr_luns; rlun = &pblk->luns[i]; rlun->bppa = luns[lunid]; @@ -577,22 +581,37 @@ static unsigned int calc_emeta_len(struct pblk *pblk) static void pblk_set_provision(struct pblk *pblk, long nr_free_blks) { struct nvm_tgt_dev *dev = pblk->dev; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line_meta *lm = &pblk->lm; struct nvm_geo *geo = &dev->geo; sector_t provisioned; + int sec_meta, blk_meta; - pblk->over_pct = 20; + if (geo->op == NVM_TARGET_DEFAULT_OP) + pblk->op = PBLK_DEFAULT_OP; + else + pblk->op = geo->op; provisioned = nr_free_blks; - provisioned *= (100 - pblk->over_pct); + provisioned *= (100 - pblk->op); sector_div(provisioned, 100); + pblk->op_blks = nr_free_blks - provisioned; + /* Internally pblk manages all free blocks, but all calculations based * on user capacity consider only provisioned blocks */ pblk->rl.total_blocks = nr_free_blks; - pblk->rl.nr_secs = nr_free_blks * geo->sec_per_blk; - pblk->capacity = provisioned * geo->sec_per_blk; + pblk->rl.nr_secs = nr_free_blks * geo->sec_per_chk; + + /* Consider sectors used for metadata */ + sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines; + blk_meta = DIV_ROUND_UP(sec_meta, geo->sec_per_chk); + + pblk->capacity = (provisioned - blk_meta) * geo->sec_per_chk; + atomic_set(&pblk->rl.free_blocks, nr_free_blks); + atomic_set(&pblk->rl.free_user_blocks, nr_free_blks); } static int pblk_lines_alloc_metadata(struct pblk *pblk) @@ -683,7 +702,7 @@ static int pblk_lines_init(struct pblk *pblk) int i, ret; pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE); - max_write_ppas = pblk->min_write_pgs * geo->nr_luns; + max_write_ppas = pblk->min_write_pgs * geo->all_luns; pblk->max_write_pgs = (max_write_ppas < nvm_max_phys_sects(dev)) ? max_write_ppas : nvm_max_phys_sects(dev); pblk_set_sec_per_write(pblk, pblk->min_write_pgs); @@ -693,26 +712,26 @@ static int pblk_lines_init(struct pblk *pblk) return -EINVAL; } - div_u64_rem(geo->sec_per_blk, pblk->min_write_pgs, &mod); + div_u64_rem(geo->sec_per_chk, pblk->min_write_pgs, &mod); if (mod) { pr_err("pblk: bad configuration of sectors/pages\n"); return -EINVAL; } - l_mg->nr_lines = geo->blks_per_lun; + l_mg->nr_lines = geo->nr_chks; l_mg->log_line = l_mg->data_line = NULL; l_mg->l_seq_nr = l_mg->d_seq_nr = 0; l_mg->nr_free_lines = 0; bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES); - lm->sec_per_line = geo->sec_per_blk * geo->nr_luns; - lm->blk_per_line = geo->nr_luns; - lm->blk_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long); + lm->sec_per_line = geo->sec_per_chk * geo->all_luns; + lm->blk_per_line = geo->all_luns; + lm->blk_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long); lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long); - lm->lun_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long); + lm->lun_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long); lm->mid_thrs = lm->sec_per_line / 2; lm->high_thrs = lm->sec_per_line / 4; - lm->meta_distance = (geo->nr_luns / 2) * pblk->min_write_pgs; + lm->meta_distance = (geo->all_luns / 2) * pblk->min_write_pgs; /* Calculate necessary pages for smeta. See comment over struct * line_smeta definition @@ -742,12 +761,12 @@ add_emeta_page: goto add_emeta_page; } - lm->emeta_bb = geo->nr_luns > i ? geo->nr_luns - i : 0; + lm->emeta_bb = geo->all_luns > i ? geo->all_luns - i : 0; lm->min_blk_line = 1; - if (geo->nr_luns > 1) + if (geo->all_luns > 1) lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec + - lm->emeta_sec[0], geo->sec_per_blk); + lm->emeta_sec[0], geo->sec_per_chk); if (lm->min_blk_line > lm->blk_per_line) { pr_err("pblk: config. not supported. Min. LUN in line:%d\n", @@ -772,7 +791,7 @@ add_emeta_page: goto fail_free_bb_template; } - bb_distance = (geo->nr_luns) * geo->sec_per_pl; + bb_distance = (geo->all_luns) * geo->sec_per_pl; for (i = 0; i < lm->sec_per_line; i += bb_distance) bitmap_set(l_mg->bb_template, i, geo->sec_per_pl); @@ -844,7 +863,7 @@ add_emeta_page: pblk_set_provision(pblk, nr_free_blks); /* Cleanup per-LUN bad block lists - managed within lines on run-time */ - for (i = 0; i < geo->nr_luns; i++) + for (i = 0; i < geo->all_luns; i++) kfree(pblk->luns[i].bb_list); return 0; @@ -858,7 +877,7 @@ fail_free_bb_template: fail_free_meta: pblk_line_meta_free(pblk); fail: - for (i = 0; i < geo->nr_luns; i++) + for (i = 0; i < geo->all_luns; i++) kfree(pblk->luns[i].bb_list); return ret; @@ -866,15 +885,19 @@ fail: static int pblk_writer_init(struct pblk *pblk) { - timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0); - mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100)); - pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t"); if (IS_ERR(pblk->writer_ts)) { - pr_err("pblk: could not allocate writer kthread\n"); - return PTR_ERR(pblk->writer_ts); + int err = PTR_ERR(pblk->writer_ts); + + if (err != -EINTR) + pr_err("pblk: could not allocate writer kthread (%d)\n", + err); + return err; } + timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0); + mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100)); + return 0; } @@ -910,7 +933,6 @@ static void pblk_tear_down(struct pblk *pblk) pblk_pipeline_stop(pblk); pblk_writer_stop(pblk); pblk_rb_sync_l2p(&pblk->rwb); - pblk_rwb_free(pblk); pblk_rl_free(&pblk->rl); pr_debug("pblk: consistent tear down\n"); @@ -1025,7 +1047,8 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, ret = pblk_writer_init(pblk); if (ret) { - pr_err("pblk: could not initialize write thread\n"); + if (ret != -EINTR) + pr_err("pblk: could not initialize write thread\n"); goto fail_free_lines; } @@ -1041,13 +1064,14 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, blk_queue_write_cache(tqueue, true, false); - tqueue->limits.discard_granularity = geo->pgs_per_blk * geo->pfpg_size; + tqueue->limits.discard_granularity = geo->sec_per_chk * geo->sec_size; tqueue->limits.discard_alignment = 0; blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9); queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, tqueue); - pr_info("pblk init: luns:%u, lines:%d, secs:%llu, buf entries:%u\n", - geo->nr_luns, pblk->l_mg.nr_lines, + pr_info("pblk(%s): luns:%u, lines:%d, secs:%llu, buf entries:%u\n", + tdisk->disk_name, + geo->all_luns, pblk->l_mg.nr_lines, (unsigned long long)pblk->rl.nr_secs, pblk->rwb.nr_entries); diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c index 6f3ecde..7445e64 100644 --- a/drivers/lightnvm/pblk-map.c +++ b/drivers/lightnvm/pblk-map.c @@ -146,7 +146,7 @@ void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, return; /* Erase blocks that are bad in this line but might not be in next */ - if (unlikely(ppa_empty(*erase_ppa)) && + if (unlikely(pblk_ppa_empty(*erase_ppa)) && bitmap_weight(d_line->blk_bitmap, lm->blk_per_line)) { int bit = -1; diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c index b8f78e4..ec8fc31 100644 --- a/drivers/lightnvm/pblk-rb.c +++ b/drivers/lightnvm/pblk-rb.c @@ -54,7 +54,7 @@ int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base, rb->seg_size = (1 << power_seg_sz); rb->nr_entries = (1 << power_size); rb->mem = rb->subm = rb->sync = rb->l2p_update = 0; - rb->sync_point = EMPTY_ENTRY; + rb->flush_point = EMPTY_ENTRY; spin_lock_init(&rb->w_lock); spin_lock_init(&rb->s_lock); @@ -112,7 +112,7 @@ int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base, up_write(&pblk_rb_lock); #ifdef CONFIG_NVM_DEBUG - atomic_set(&rb->inflight_sync_point, 0); + atomic_set(&rb->inflight_flush_point, 0); #endif /* @@ -226,7 +226,7 @@ static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int to_update) pblk_update_map_dev(pblk, w_ctx->lba, w_ctx->ppa, entry->cacheline); - line = &pblk->lines[pblk_tgt_ppa_to_line(w_ctx->ppa)]; + line = &pblk->lines[pblk_ppa_to_line(w_ctx->ppa)]; kref_put(&line->ref, pblk_line_put); clean_wctx(w_ctx); rb->l2p_update = (rb->l2p_update + 1) & (rb->nr_entries - 1); @@ -349,35 +349,35 @@ void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, smp_store_release(&entry->w_ctx.flags, flags); } -static int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio, +static int pblk_rb_flush_point_set(struct pblk_rb *rb, struct bio *bio, unsigned int pos) { struct pblk_rb_entry *entry; - unsigned int subm, sync_point; + unsigned int sync, flush_point; - subm = READ_ONCE(rb->subm); + sync = READ_ONCE(rb->sync); + + if (pos == sync) + return 0; #ifdef CONFIG_NVM_DEBUG - atomic_inc(&rb->inflight_sync_point); + atomic_inc(&rb->inflight_flush_point); #endif - if (pos == subm) - return 0; + flush_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1); + entry = &rb->entries[flush_point]; - sync_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1); - entry = &rb->entries[sync_point]; + pblk_rb_sync_init(rb, NULL); - /* Protect syncs */ - smp_store_release(&rb->sync_point, sync_point); + /* Protect flush points */ + smp_store_release(&rb->flush_point, flush_point); - if (!bio) - return 0; + if (bio) + bio_list_add(&entry->w_ctx.bios, bio); - spin_lock_irq(&rb->s_lock); - bio_list_add(&entry->w_ctx.bios, bio); - spin_unlock_irq(&rb->s_lock); + pblk_rb_sync_end(rb, NULL); - return 1; + return bio ? 1 : 0; } static int __pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries, @@ -416,7 +416,7 @@ void pblk_rb_flush(struct pblk_rb *rb) struct pblk *pblk = container_of(rb, struct pblk, rwb); unsigned int mem = READ_ONCE(rb->mem); - if (pblk_rb_sync_point_set(rb, NULL, mem)) + if (pblk_rb_flush_point_set(rb, NULL, mem)) return; pblk_write_should_kick(pblk); @@ -440,7 +440,7 @@ static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries, #ifdef CONFIG_NVM_DEBUG atomic_long_inc(&pblk->nr_flush); #endif - if (pblk_rb_sync_point_set(&pblk->rwb, bio, mem)) + if (pblk_rb_flush_point_set(&pblk->rwb, bio, mem)) *io_ret = NVM_IO_OK; } @@ -606,21 +606,6 @@ try: return NVM_IO_ERR; } - if (flags & PBLK_FLUSH_ENTRY) { - unsigned int sync_point; - - sync_point = READ_ONCE(rb->sync_point); - if (sync_point == pos) { - /* Protect syncs */ - smp_store_release(&rb->sync_point, EMPTY_ENTRY); - } - - flags &= ~PBLK_FLUSH_ENTRY; -#ifdef CONFIG_NVM_DEBUG - atomic_dec(&rb->inflight_sync_point); -#endif - } - flags &= ~PBLK_WRITTEN_DATA; flags |= PBLK_SUBMITTED_ENTRY; @@ -730,15 +715,24 @@ void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags) unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries) { - unsigned int sync; - unsigned int i; - + unsigned int sync, flush_point; lockdep_assert_held(&rb->s_lock); sync = READ_ONCE(rb->sync); + flush_point = READ_ONCE(rb->flush_point); - for (i = 0; i < nr_entries; i++) - sync = (sync + 1) & (rb->nr_entries - 1); + if (flush_point != EMPTY_ENTRY) { + unsigned int secs_to_flush; + + secs_to_flush = pblk_rb_ring_count(flush_point, sync, + rb->nr_entries); + if (secs_to_flush < nr_entries) { + /* Protect flush points */ + smp_store_release(&rb->flush_point, EMPTY_ENTRY); + } + } + + sync = (sync + nr_entries) & (rb->nr_entries - 1); /* Protect from counts */ smp_store_release(&rb->sync, sync); @@ -746,22 +740,27 @@ unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries) return sync; } -unsigned int pblk_rb_sync_point_count(struct pblk_rb *rb) +/* Calculate how many sectors to submit up to the current flush point. */ +unsigned int pblk_rb_flush_point_count(struct pblk_rb *rb) { - unsigned int subm, sync_point; - unsigned int count; + unsigned int subm, sync, flush_point; + unsigned int submitted, to_flush; - /* Protect syncs */ - sync_point = smp_load_acquire(&rb->sync_point); - if (sync_point == EMPTY_ENTRY) + /* Protect flush points */ + flush_point = smp_load_acquire(&rb->flush_point); + if (flush_point == EMPTY_ENTRY) return 0; + /* Protect syncs */ + sync = smp_load_acquire(&rb->sync); + subm = READ_ONCE(rb->subm); + submitted = pblk_rb_ring_count(subm, sync, rb->nr_entries); /* The sync point itself counts as a sector to sync */ - count = pblk_rb_ring_count(sync_point, subm, rb->nr_entries) + 1; + to_flush = pblk_rb_ring_count(flush_point, sync, rb->nr_entries) + 1; - return count; + return (submitted < to_flush) ? (to_flush - submitted) : 0; } /* @@ -801,7 +800,7 @@ int pblk_rb_tear_down_check(struct pblk_rb *rb) if ((rb->mem == rb->subm) && (rb->subm == rb->sync) && (rb->sync == rb->l2p_update) && - (rb->sync_point == EMPTY_ENTRY)) { + (rb->flush_point == EMPTY_ENTRY)) { goto out; } @@ -848,7 +847,7 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf) queued_entries++; spin_unlock_irq(&rb->s_lock); - if (rb->sync_point != EMPTY_ENTRY) + if (rb->flush_point != EMPTY_ENTRY) offset = scnprintf(buf, PAGE_SIZE, "%u\t%u\t%u\t%u\t%u\t%u\t%u - %u/%u/%u - %d\n", rb->nr_entries, @@ -857,14 +856,14 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf) rb->sync, rb->l2p_update, #ifdef CONFIG_NVM_DEBUG - atomic_read(&rb->inflight_sync_point), + atomic_read(&rb->inflight_flush_point), #else 0, #endif - rb->sync_point, + rb->flush_point, pblk_rb_read_count(rb), pblk_rb_space(rb), - pblk_rb_sync_point_count(rb), + pblk_rb_flush_point_count(rb), queued_entries); else offset = scnprintf(buf, PAGE_SIZE, @@ -875,13 +874,13 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf) rb->sync, rb->l2p_update, #ifdef CONFIG_NVM_DEBUG - atomic_read(&rb->inflight_sync_point), + atomic_read(&rb->inflight_flush_point), #else 0, #endif pblk_rb_read_count(rb), pblk_rb_space(rb), - pblk_rb_sync_point_count(rb), + pblk_rb_flush_point_count(rb), queued_entries); return offset; diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index ca79d8f..2f76128 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -141,7 +141,7 @@ static void pblk_read_put_rqd_kref(struct pblk *pblk, struct nvm_rq *rqd) struct ppa_addr ppa = ppa_list[i]; struct pblk_line *line; - line = &pblk->lines[pblk_dev_ppa_to_line(ppa)]; + line = &pblk->lines[pblk_ppa_to_line(ppa)]; kref_put(&line->ref, pblk_line_put_wq); } } @@ -158,8 +158,12 @@ static void pblk_end_user_read(struct bio *bio) static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd, bool put_line) { + struct nvm_tgt_dev *dev = pblk->dev; struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); struct bio *bio = rqd->bio; + unsigned long start_time = r_ctx->start_time; + + generic_end_io_acct(dev->q, READ, &pblk->disk->part0, start_time); if (rqd->error) pblk_log_read_err(pblk, rqd); @@ -193,9 +197,9 @@ static void pblk_end_io_read(struct nvm_rq *rqd) __pblk_end_io_read(pblk, rqd, true); } -static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, - unsigned int bio_init_idx, - unsigned long *read_bitmap) +static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, + unsigned int bio_init_idx, + unsigned long *read_bitmap) { struct bio *new_bio, *bio = rqd->bio; struct pblk_sec_meta *meta_list = rqd->meta_list; @@ -270,7 +274,7 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, i = 0; hole = find_first_zero_bit(read_bitmap, nr_secs); do { - int line_id = pblk_dev_ppa_to_line(rqd->ppa_list[i]); + int line_id = pblk_ppa_to_line(rqd->ppa_list[i]); struct pblk_line *line = &pblk->lines[line_id]; kref_put(&line->ref, pblk_line_put); @@ -306,6 +310,8 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, return NVM_IO_OK; err: + pr_err("pblk: failed to perform partial read\n"); + /* Free allocated pages in new bio */ pblk_bio_free_pages(pblk, bio, 0, new_bio->bi_vcnt); __pblk_end_io_read(pblk, rqd, false); @@ -357,6 +363,7 @@ retry: int pblk_submit_read(struct pblk *pblk, struct bio *bio) { struct nvm_tgt_dev *dev = pblk->dev; + struct request_queue *q = dev->q; sector_t blba = pblk_get_lba(bio); unsigned int nr_secs = pblk_get_secs(bio); struct pblk_g_ctx *r_ctx; @@ -372,6 +379,8 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) return NVM_IO_ERR; } + generic_start_io_acct(q, READ, bio_sectors(bio), &pblk->disk->part0); + bitmap_zero(&read_bitmap, nr_secs); rqd = pblk_alloc_rqd(pblk, PBLK_READ); @@ -383,6 +392,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) rqd->end_io = pblk_end_io_read; r_ctx = nvm_rq_to_pdu(rqd); + r_ctx->start_time = jiffies; r_ctx->lba = blba; /* Save the index for this bio's start. This is needed in case @@ -422,7 +432,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) int_bio = bio_clone_fast(bio, GFP_KERNEL, pblk_bio_set); if (!int_bio) { pr_err("pblk: could not clone read bio\n"); - return NVM_IO_ERR; + goto fail_end_io; } rqd->bio = int_bio; @@ -433,7 +443,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) pr_err("pblk: read IO submission failed\n"); if (int_bio) bio_put(int_bio); - return ret; + goto fail_end_io; } return NVM_IO_OK; @@ -442,17 +452,14 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) /* The read bio request could be partially filled by the write buffer, * but there are some holes that need to be read from the drive. */ - ret = pblk_fill_partial_read_bio(pblk, rqd, bio_init_idx, &read_bitmap); - if (ret) { - pr_err("pblk: failed to perform partial read\n"); - return ret; - } - - return NVM_IO_OK; + return pblk_partial_read_bio(pblk, rqd, bio_init_idx, &read_bitmap); fail_rqd_free: pblk_free_rqd(pblk, rqd, PBLK_READ); return ret; +fail_end_io: + __pblk_end_io_read(pblk, rqd, false); + return ret; } static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index eadb3eb..1d5e961 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -111,18 +111,18 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, return 0; } -__le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta_buf) +int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta_buf) { u32 crc; crc = pblk_calc_emeta_crc(pblk, emeta_buf); if (le32_to_cpu(emeta_buf->crc) != crc) - return NULL; + return 1; if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC) - return NULL; + return 1; - return emeta_to_lbas(pblk, emeta_buf); + return 0; } static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) @@ -137,7 +137,7 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) u64 nr_valid_lbas, nr_lbas = 0; u64 i; - lba_list = pblk_recov_get_lba_list(pblk, emeta_buf); + lba_list = emeta_to_lbas(pblk, emeta_buf); if (!lba_list) return 1; @@ -149,7 +149,7 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) struct ppa_addr ppa; int pos; - ppa = addr_to_pblk_ppa(pblk, i, line->id); + ppa = addr_to_gen_ppa(pblk, i, line->id); pos = pblk_ppa_to_pos(geo, ppa); /* Do not update bad blocks */ @@ -188,7 +188,7 @@ static int pblk_calc_sec_in_line(struct pblk *pblk, struct pblk_line *line) int nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line); return lm->sec_per_line - lm->smeta_sec - lm->emeta_sec[0] - - nr_bb * geo->sec_per_blk; + nr_bb * geo->sec_per_chk; } struct pblk_recov_alloc { @@ -263,12 +263,12 @@ next_read_rq: int pos; ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id); - pos = pblk_dev_ppa_to_pos(geo, ppa); + pos = pblk_ppa_to_pos(geo, ppa); while (test_bit(pos, line->blk_bitmap)) { r_ptr_int += pblk->min_write_pgs; ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id); - pos = pblk_dev_ppa_to_pos(geo, ppa); + pos = pblk_ppa_to_pos(geo, ppa); } for (j = 0; j < pblk->min_write_pgs; j++, i++, r_ptr_int++) @@ -288,7 +288,7 @@ next_read_rq: /* At this point, the read should not fail. If it does, it is a problem * we cannot recover from here. Need FTL log. */ - if (rqd->error) { + if (rqd->error && rqd->error != NVM_RSP_WARN_HIGHECC) { pr_err("pblk: L2P recovery failed (%d)\n", rqd->error); return -EINTR; } @@ -411,12 +411,12 @@ next_pad_rq: int pos; w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs); - ppa = addr_to_pblk_ppa(pblk, w_ptr, line->id); + ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); pos = pblk_ppa_to_pos(geo, ppa); while (test_bit(pos, line->blk_bitmap)) { w_ptr += pblk->min_write_pgs; - ppa = addr_to_pblk_ppa(pblk, w_ptr, line->id); + ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); pos = pblk_ppa_to_pos(geo, ppa); } @@ -541,12 +541,12 @@ next_rq: w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs); ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); - pos = pblk_dev_ppa_to_pos(geo, ppa); + pos = pblk_ppa_to_pos(geo, ppa); while (test_bit(pos, line->blk_bitmap)) { w_ptr += pblk->min_write_pgs; ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); - pos = pblk_dev_ppa_to_pos(geo, ppa); + pos = pblk_ppa_to_pos(geo, ppa); } for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++) @@ -672,12 +672,12 @@ next_rq: paddr = pblk_alloc_page(pblk, line, pblk->min_write_pgs); ppa = addr_to_gen_ppa(pblk, paddr, line->id); - pos = pblk_dev_ppa_to_pos(geo, ppa); + pos = pblk_ppa_to_pos(geo, ppa); while (test_bit(pos, line->blk_bitmap)) { paddr += pblk->min_write_pgs; ppa = addr_to_gen_ppa(pblk, paddr, line->id); - pos = pblk_dev_ppa_to_pos(geo, ppa); + pos = pblk_ppa_to_pos(geo, ppa); } for (j = 0; j < pblk->min_write_pgs; j++, i++, paddr++) @@ -817,7 +817,7 @@ static u64 pblk_line_emeta_start(struct pblk *pblk, struct pblk_line *line) while (emeta_secs) { emeta_start--; - ppa = addr_to_pblk_ppa(pblk, emeta_start, line->id); + ppa = addr_to_gen_ppa(pblk, emeta_start, line->id); pos = pblk_ppa_to_pos(geo, ppa); if (!test_bit(pos, line->blk_bitmap)) emeta_secs--; @@ -938,6 +938,11 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) goto next; } + if (pblk_recov_check_emeta(pblk, line->emeta->buf)) { + pblk_recov_l2p_from_oob(pblk, line); + goto next; + } + if (pblk_recov_l2p_from_emeta(pblk, line)) pblk_recov_l2p_from_oob(pblk, line); @@ -984,10 +989,8 @@ next: } spin_unlock(&l_mg->free_lock); - if (is_next) { + if (is_next) pblk_line_erase(pblk, l_mg->data_next); - pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next); - } out: if (found_lines != recovered_lines) diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c index dacc719..0d457b1 100644 --- a/drivers/lightnvm/pblk-rl.c +++ b/drivers/lightnvm/pblk-rl.c @@ -89,17 +89,15 @@ unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl) return atomic_read(&rl->free_blocks); } -/* - * We check for (i) the number of free blocks in the current LUN and (ii) the - * total number of free blocks in the pblk instance. This is to even out the - * number of free blocks on each LUN when GC kicks in. - * - * Only the total number of free blocks is used to configure the rate limiter. - */ -void pblk_rl_update_rates(struct pblk_rl *rl) +unsigned long pblk_rl_nr_user_free_blks(struct pblk_rl *rl) +{ + return atomic_read(&rl->free_user_blocks); +} + +static void __pblk_rl_update_rates(struct pblk_rl *rl, + unsigned long free_blocks) { struct pblk *pblk = container_of(rl, struct pblk, rl); - unsigned long free_blocks = pblk_rl_nr_free_blks(rl); int max = rl->rb_budget; if (free_blocks >= rl->high) { @@ -132,20 +130,37 @@ void pblk_rl_update_rates(struct pblk_rl *rl) pblk_gc_should_stop(pblk); } +void pblk_rl_update_rates(struct pblk_rl *rl) +{ + __pblk_rl_update_rates(rl, pblk_rl_nr_user_free_blks(rl)); +} + void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line) { int blk_in_line = atomic_read(&line->blk_in_line); + int free_blocks; atomic_add(blk_in_line, &rl->free_blocks); - pblk_rl_update_rates(rl); + free_blocks = atomic_add_return(blk_in_line, &rl->free_user_blocks); + + __pblk_rl_update_rates(rl, free_blocks); } -void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line) +void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line, + bool used) { int blk_in_line = atomic_read(&line->blk_in_line); + int free_blocks; atomic_sub(blk_in_line, &rl->free_blocks); - pblk_rl_update_rates(rl); + + if (used) + free_blocks = atomic_sub_return(blk_in_line, + &rl->free_user_blocks); + else + free_blocks = atomic_read(&rl->free_user_blocks); + + __pblk_rl_update_rates(rl, free_blocks); } int pblk_rl_high_thrs(struct pblk_rl *rl) @@ -174,16 +189,21 @@ void pblk_rl_free(struct pblk_rl *rl) void pblk_rl_init(struct pblk_rl *rl, int budget) { struct pblk *pblk = container_of(rl, struct pblk, rl); + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line_meta *lm = &pblk->lm; int min_blocks = lm->blk_per_line * PBLK_GC_RSV_LINE; + int sec_meta, blk_meta; + unsigned int rb_windows; - rl->high = rl->total_blocks / PBLK_USER_HIGH_THRS; - rl->high_pw = get_count_order(rl->high); + /* Consider sectors used for metadata */ + sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines; + blk_meta = DIV_ROUND_UP(sec_meta, geo->sec_per_chk); - rl->low = rl->total_blocks / PBLK_USER_LOW_THRS; - if (rl->low < min_blocks) - rl->low = min_blocks; + rl->high = pblk->op_blks - blk_meta - lm->blk_per_line; + rl->high_pw = get_count_order(rl->high); rl->rsv_blocks = min_blocks; diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c index cd49e88..620bab8 100644 --- a/drivers/lightnvm/pblk-sysfs.c +++ b/drivers/lightnvm/pblk-sysfs.c @@ -28,7 +28,7 @@ static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page) ssize_t sz = 0; int i; - for (i = 0; i < geo->nr_luns; i++) { + for (i = 0; i < geo->all_luns; i++) { int active = 1; rlun = &pblk->luns[i]; @@ -49,11 +49,12 @@ static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page) static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page) { - int free_blocks, total_blocks; + int free_blocks, free_user_blocks, total_blocks; int rb_user_max, rb_user_cnt; int rb_gc_max, rb_gc_cnt, rb_budget, rb_state; - free_blocks = atomic_read(&pblk->rl.free_blocks); + free_blocks = pblk_rl_nr_free_blks(&pblk->rl); + free_user_blocks = pblk_rl_nr_user_free_blks(&pblk->rl); rb_user_max = pblk->rl.rb_user_max; rb_user_cnt = atomic_read(&pblk->rl.rb_user_cnt); rb_gc_max = pblk->rl.rb_gc_max; @@ -64,16 +65,16 @@ static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page) total_blocks = pblk->rl.total_blocks; return snprintf(page, PAGE_SIZE, - "u:%u/%u,gc:%u/%u(%u/%u)(stop:<%u,full:>%u,free:%d/%d)-%d\n", + "u:%u/%u,gc:%u/%u(%u)(stop:<%u,full:>%u,free:%d/%d/%d)-%d\n", rb_user_cnt, rb_user_max, rb_gc_cnt, rb_gc_max, rb_state, rb_budget, - pblk->rl.low, pblk->rl.high, free_blocks, + free_user_blocks, total_blocks, READ_ONCE(pblk->rl.rb_user_active)); } @@ -238,7 +239,7 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) sz = snprintf(page, PAGE_SIZE - sz, "line: nluns:%d, nblks:%d, nsecs:%d\n", - geo->nr_luns, lm->blk_per_line, lm->sec_per_line); + geo->all_luns, lm->blk_per_line, lm->sec_per_line); sz += snprintf(page + sz, PAGE_SIZE - sz, "lines:d:%d,l:%d-f:%d,m:%d/%d,c:%d,b:%d,co:%d(d:%d,l:%d)t:%d\n", @@ -287,7 +288,7 @@ static ssize_t pblk_sysfs_lines_info(struct pblk *pblk, char *page) "blk_line:%d, sec_line:%d, sec_blk:%d\n", lm->blk_per_line, lm->sec_per_line, - geo->sec_per_blk); + geo->sec_per_chk); return sz; } diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index 6c1cafa..aae86ed 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -21,13 +21,28 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, struct pblk_c_ctx *c_ctx) { struct bio *original_bio; + struct pblk_rb *rwb = &pblk->rwb; unsigned long ret; int i; for (i = 0; i < c_ctx->nr_valid; i++) { struct pblk_w_ctx *w_ctx; + int pos = c_ctx->sentry + i; + int flags; + + w_ctx = pblk_rb_w_ctx(rwb, pos); + flags = READ_ONCE(w_ctx->flags); + + if (flags & PBLK_FLUSH_ENTRY) { + flags &= ~PBLK_FLUSH_ENTRY; + /* Release flags on context. Protect from writes */ + smp_store_release(&w_ctx->flags, flags); + +#ifdef CONFIG_NVM_DEBUG + atomic_dec(&rwb->inflight_flush_point); +#endif + } - w_ctx = pblk_rb_w_ctx(&pblk->rwb, c_ctx->sentry + i); while ((original_bio = bio_list_pop(&w_ctx->bios))) bio_endio(original_bio); } @@ -439,7 +454,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd) struct pblk_line *meta_line; int err; - ppa_set_empty(&erase_ppa); + pblk_ppa_set_empty(&erase_ppa); /* Assign lbas to ppas and populate request structure */ err = pblk_setup_w_rq(pblk, rqd, &erase_ppa); @@ -457,7 +472,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd) return NVM_IO_ERR; } - if (!ppa_empty(erase_ppa)) { + if (!pblk_ppa_empty(erase_ppa)) { /* Submit erase for next data line */ if (pblk_blk_erase_async(pblk, erase_ppa)) { struct pblk_line *e_line = pblk_line_get_erase(pblk); @@ -508,7 +523,7 @@ static int pblk_submit_write(struct pblk *pblk) if (!secs_avail) return 1; - secs_to_flush = pblk_rb_sync_point_count(&pblk->rwb); + secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb); if (!secs_to_flush && secs_avail < pblk->min_write_pgs) return 1; diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 59a64d4..8c357fb 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -51,17 +51,16 @@ #define NR_PHY_IN_LOG (PBLK_EXPOSED_PAGE_SIZE / PBLK_SECTOR) -#define pblk_for_each_lun(pblk, rlun, i) \ - for ((i) = 0, rlun = &(pblk)->luns[0]; \ - (i) < (pblk)->nr_luns; (i)++, rlun = &(pblk)->luns[(i)]) - /* Static pool sizes */ #define PBLK_GEN_WS_POOL_SIZE (2) +#define PBLK_DEFAULT_OP (11) + enum { PBLK_READ = READ, PBLK_WRITE = WRITE,/* Write from write buffer */ PBLK_WRITE_INT, /* Internal write - no write buffer */ + PBLK_READ_RECOV, /* Recovery read - errors allowed */ PBLK_ERASE, }; @@ -114,6 +113,7 @@ struct pblk_c_ctx { /* read context */ struct pblk_g_ctx { void *private; + unsigned long start_time; u64 lba; }; @@ -170,7 +170,7 @@ struct pblk_rb { * the last submitted entry that has * been successfully persisted to media */ - unsigned int sync_point; /* Sync point - last entry that must be + unsigned int flush_point; /* Sync point - last entry that must be * flushed to the media. Used with * REQ_FLUSH and REQ_FUA */ @@ -193,7 +193,7 @@ struct pblk_rb { spinlock_t s_lock; /* Sync lock */ #ifdef CONFIG_NVM_DEBUG - atomic_t inflight_sync_point; /* Not served REQ_FLUSH | REQ_FUA */ + atomic_t inflight_flush_point; /* Not served REQ_FLUSH | REQ_FUA */ #endif }; @@ -256,9 +256,6 @@ struct pblk_rl { unsigned int high; /* Upper threshold for rate limiter (free run - * user I/O rate limiter */ - unsigned int low; /* Lower threshold for rate limiter (user I/O - * rate limiter - stall) - */ unsigned int high_pw; /* High rounded up as a power of 2 */ #define PBLK_USER_HIGH_THRS 8 /* Begin write limit at 12% available blks */ @@ -292,7 +289,9 @@ struct pblk_rl { unsigned long long nr_secs; unsigned long total_blocks; - atomic_t free_blocks; + + atomic_t free_blocks; /* Total number of free blocks (+ OP) */ + atomic_t free_user_blocks; /* Number of user free blocks (no OP) */ }; #define PBLK_LINE_EMPTY (~0U) @@ -583,7 +582,9 @@ struct pblk { */ sector_t capacity; /* Device capacity when bad blocks are subtracted */ - int over_pct; /* Percentage of device used for over-provisioning */ + + int op; /* Percentage of device used for over-provisioning */ + int op_blks; /* Number of blocks used for over-provisioning */ /* pblk provisioning values. Used by rate limiter */ struct pblk_rl rl; @@ -691,7 +692,7 @@ unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries); struct pblk_rb_entry *pblk_rb_sync_scan_entry(struct pblk_rb *rb, struct ppa_addr *ppa); void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags); -unsigned int pblk_rb_sync_point_count(struct pblk_rb *rb); +unsigned int pblk_rb_flush_point_count(struct pblk_rb *rb); unsigned int pblk_rb_read_count(struct pblk_rb *rb); unsigned int pblk_rb_sync_count(struct pblk_rb *rb); @@ -812,7 +813,7 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq); void pblk_submit_rec(struct work_struct *work); struct pblk_line *pblk_recov_l2p(struct pblk *pblk); int pblk_recov_pad(struct pblk *pblk); -__le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta); +int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta); int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, struct pblk_rec_ctx *recovery, u64 *comp_bits, unsigned int comp); @@ -843,6 +844,7 @@ void pblk_rl_free(struct pblk_rl *rl); void pblk_rl_update_rates(struct pblk_rl *rl); int pblk_rl_high_thrs(struct pblk_rl *rl); unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl); +unsigned long pblk_rl_nr_user_free_blks(struct pblk_rl *rl); int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries); void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries); void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries); @@ -851,7 +853,8 @@ void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries); void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc); int pblk_rl_max_io(struct pblk_rl *rl); void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line); -void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line); +void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line, + bool used); int pblk_rl_is_limit(struct pblk_rl *rl); /* @@ -907,28 +910,47 @@ static inline int pblk_pad_distance(struct pblk *pblk) struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; - return NVM_MEM_PAGE_WRITE * geo->nr_luns * geo->sec_per_pl; + return NVM_MEM_PAGE_WRITE * geo->all_luns * geo->sec_per_pl; } -static inline int pblk_dev_ppa_to_line(struct ppa_addr p) +static inline int pblk_ppa_to_line(struct ppa_addr p) { return p.g.blk; } -static inline int pblk_tgt_ppa_to_line(struct ppa_addr p) +static inline int pblk_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p) { - return p.g.blk; + return p.g.lun * geo->nr_chnls + p.g.ch; } -static inline int pblk_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p) +static inline struct ppa_addr addr_to_gen_ppa(struct pblk *pblk, u64 paddr, + u64 line_id) { - return p.g.lun * geo->nr_chnls + p.g.ch; + struct ppa_addr ppa; + + ppa.ppa = 0; + ppa.g.blk = line_id; + ppa.g.pg = (paddr & pblk->ppaf.pg_mask) >> pblk->ppaf.pg_offset; + ppa.g.lun = (paddr & pblk->ppaf.lun_mask) >> pblk->ppaf.lun_offset; + ppa.g.ch = (paddr & pblk->ppaf.ch_mask) >> pblk->ppaf.ch_offset; + ppa.g.pl = (paddr & pblk->ppaf.pln_mask) >> pblk->ppaf.pln_offset; + ppa.g.sec = (paddr & pblk->ppaf.sec_mask) >> pblk->ppaf.sec_offset; + + return ppa; } -/* A block within a line corresponds to the lun */ -static inline int pblk_dev_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p) +static inline u64 pblk_dev_ppa_to_line_addr(struct pblk *pblk, + struct ppa_addr p) { - return p.g.lun * geo->nr_chnls + p.g.ch; + u64 paddr; + + paddr = (u64)p.g.pg << pblk->ppaf.pg_offset; + paddr |= (u64)p.g.lun << pblk->ppaf.lun_offset; + paddr |= (u64)p.g.ch << pblk->ppaf.ch_offset; + paddr |= (u64)p.g.pl << pblk->ppaf.pln_offset; + paddr |= (u64)p.g.sec << pblk->ppaf.sec_offset; + + return paddr; } static inline struct ppa_addr pblk_ppa32_to_ppa64(struct pblk *pblk, u32 ppa32) @@ -960,24 +982,6 @@ static inline struct ppa_addr pblk_ppa32_to_ppa64(struct pblk *pblk, u32 ppa32) return ppa64; } -static inline struct ppa_addr pblk_trans_map_get(struct pblk *pblk, - sector_t lba) -{ - struct ppa_addr ppa; - - if (pblk->ppaf_bitsize < 32) { - u32 *map = (u32 *)pblk->trans_map; - - ppa = pblk_ppa32_to_ppa64(pblk, map[lba]); - } else { - struct ppa_addr *map = (struct ppa_addr *)pblk->trans_map; - - ppa = map[lba]; - } - - return ppa; -} - static inline u32 pblk_ppa64_to_ppa32(struct pblk *pblk, struct ppa_addr ppa64) { u32 ppa32 = 0; @@ -999,33 +1003,36 @@ static inline u32 pblk_ppa64_to_ppa32(struct pblk *pblk, struct ppa_addr ppa64) return ppa32; } -static inline void pblk_trans_map_set(struct pblk *pblk, sector_t lba, - struct ppa_addr ppa) +static inline struct ppa_addr pblk_trans_map_get(struct pblk *pblk, + sector_t lba) { + struct ppa_addr ppa; + if (pblk->ppaf_bitsize < 32) { u32 *map = (u32 *)pblk->trans_map; - map[lba] = pblk_ppa64_to_ppa32(pblk, ppa); + ppa = pblk_ppa32_to_ppa64(pblk, map[lba]); } else { - u64 *map = (u64 *)pblk->trans_map; + struct ppa_addr *map = (struct ppa_addr *)pblk->trans_map; - map[lba] = ppa.ppa; + ppa = map[lba]; } + + return ppa; } -static inline u64 pblk_dev_ppa_to_line_addr(struct pblk *pblk, - struct ppa_addr p) +static inline void pblk_trans_map_set(struct pblk *pblk, sector_t lba, + struct ppa_addr ppa) { - u64 paddr; + if (pblk->ppaf_bitsize < 32) { + u32 *map = (u32 *)pblk->trans_map; - paddr = 0; - paddr |= (u64)p.g.pg << pblk->ppaf.pg_offset; - paddr |= (u64)p.g.lun << pblk->ppaf.lun_offset; - paddr |= (u64)p.g.ch << pblk->ppaf.ch_offset; - paddr |= (u64)p.g.pl << pblk->ppaf.pln_offset; - paddr |= (u64)p.g.sec << pblk->ppaf.sec_offset; + map[lba] = pblk_ppa64_to_ppa32(pblk, ppa); + } else { + u64 *map = (u64 *)pblk->trans_map; - return paddr; + map[lba] = ppa.ppa; + } } static inline int pblk_ppa_empty(struct ppa_addr ppa_addr) @@ -1040,10 +1047,7 @@ static inline void pblk_ppa_set_empty(struct ppa_addr *ppa_addr) static inline bool pblk_ppa_comp(struct ppa_addr lppa, struct ppa_addr rppa) { - if (lppa.ppa == rppa.ppa) - return true; - - return false; + return (lppa.ppa == rppa.ppa); } static inline int pblk_addr_in_cache(struct ppa_addr ppa) @@ -1066,32 +1070,6 @@ static inline struct ppa_addr pblk_cacheline_to_addr(int addr) return p; } -static inline struct ppa_addr addr_to_gen_ppa(struct pblk *pblk, u64 paddr, - u64 line_id) -{ - struct ppa_addr ppa; - - ppa.ppa = 0; - ppa.g.blk = line_id; - ppa.g.pg = (paddr & pblk->ppaf.pg_mask) >> pblk->ppaf.pg_offset; - ppa.g.lun = (paddr & pblk->ppaf.lun_mask) >> pblk->ppaf.lun_offset; - ppa.g.ch = (paddr & pblk->ppaf.ch_mask) >> pblk->ppaf.ch_offset; - ppa.g.pl = (paddr & pblk->ppaf.pln_mask) >> pblk->ppaf.pln_offset; - ppa.g.sec = (paddr & pblk->ppaf.sec_mask) >> pblk->ppaf.sec_offset; - - return ppa; -} - -static inline struct ppa_addr addr_to_pblk_ppa(struct pblk *pblk, u64 paddr, - u64 line_id) -{ - struct ppa_addr ppa; - - ppa = addr_to_gen_ppa(pblk, paddr, line_id); - - return ppa; -} - static inline u32 pblk_calc_meta_header_crc(struct pblk *pblk, struct line_header *header) { @@ -1212,10 +1190,10 @@ static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev, if (!ppa->c.is_cached && ppa->g.ch < geo->nr_chnls && - ppa->g.lun < geo->luns_per_chnl && + ppa->g.lun < geo->nr_luns && ppa->g.pl < geo->nr_planes && - ppa->g.blk < geo->blks_per_lun && - ppa->g.pg < geo->pgs_per_blk && + ppa->g.blk < geo->nr_chks && + ppa->g.pg < geo->ws_per_chk && ppa->g.sec < geo->sec_per_pg) continue; @@ -1245,7 +1223,7 @@ static inline int pblk_check_io(struct pblk *pblk, struct nvm_rq *rqd) for (i = 0; i < rqd->nr_ppas; i++) { ppa = ppa_list[i]; - line = &pblk->lines[pblk_dev_ppa_to_line(ppa)]; + line = &pblk->lines[pblk_ppa_to_line(ppa)]; spin_lock(&line->lock); if (line->state != PBLK_LINESTATE_OPEN) { @@ -1288,11 +1266,6 @@ static inline unsigned int pblk_get_secs(struct bio *bio) return bio->bi_iter.bi_size / PBLK_EXPOSED_PAGE_SIZE; } -static inline sector_t pblk_get_sector(sector_t lba) -{ - return lba * NR_PHY_IN_LOG; -} - static inline void pblk_setup_uuid(struct pblk *pblk) { uuid_le uuid; diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c deleted file mode 100644 index 0993c14..0000000 --- a/drivers/lightnvm/rrpc.c +++ /dev/null @@ -1,1625 +0,0 @@ -/* - * Copyright (C) 2015 IT University of Copenhagen - * Initial release: Matias Bjorling <m@bjorling.me> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * Implementation of a Round-robin page-based Hybrid FTL for Open-channel SSDs. - */ - -#include "rrpc.h" - -static struct kmem_cache *rrpc_gcb_cache, *rrpc_rq_cache; -static DECLARE_RWSEM(rrpc_lock); - -static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio, - struct nvm_rq *rqd, unsigned long flags); - -#define rrpc_for_each_lun(rrpc, rlun, i) \ - for ((i) = 0, rlun = &(rrpc)->luns[0]; \ - (i) < (rrpc)->nr_luns; (i)++, rlun = &(rrpc)->luns[(i)]) - -static void rrpc_page_invalidate(struct rrpc *rrpc, struct rrpc_addr *a) -{ - struct nvm_tgt_dev *dev = rrpc->dev; - struct rrpc_block *rblk = a->rblk; - unsigned int pg_offset; - - lockdep_assert_held(&rrpc->rev_lock); - - if (a->addr == ADDR_EMPTY || !rblk) - return; - - spin_lock(&rblk->lock); - - div_u64_rem(a->addr, dev->geo.sec_per_blk, &pg_offset); - WARN_ON(test_and_set_bit(pg_offset, rblk->invalid_pages)); - rblk->nr_invalid_pages++; - - spin_unlock(&rblk->lock); - - rrpc->rev_trans_map[a->addr].addr = ADDR_EMPTY; -} - -static void rrpc_invalidate_range(struct rrpc *rrpc, sector_t slba, - unsigned int len) -{ - sector_t i; - - spin_lock(&rrpc->rev_lock); - for (i = slba; i < slba + len; i++) { - struct rrpc_addr *gp = &rrpc->trans_map[i]; - - rrpc_page_invalidate(rrpc, gp); - gp->rblk = NULL; - } - spin_unlock(&rrpc->rev_lock); -} - -static struct nvm_rq *rrpc_inflight_laddr_acquire(struct rrpc *rrpc, - sector_t laddr, unsigned int pages) -{ - struct nvm_rq *rqd; - struct rrpc_inflight_rq *inf; - - rqd = mempool_alloc(rrpc->rq_pool, GFP_ATOMIC); - if (!rqd) - return ERR_PTR(-ENOMEM); - - inf = rrpc_get_inflight_rq(rqd); - if (rrpc_lock_laddr(rrpc, laddr, pages, inf)) { - mempool_free(rqd, rrpc->rq_pool); - return NULL; - } - - return rqd; -} - -static void rrpc_inflight_laddr_release(struct rrpc *rrpc, struct nvm_rq *rqd) -{ - struct rrpc_inflight_rq *inf = rrpc_get_inflight_rq(rqd); - - rrpc_unlock_laddr(rrpc, inf); - - mempool_free(rqd, rrpc->rq_pool); -} - -static void rrpc_discard(struct rrpc *rrpc, struct bio *bio) -{ - sector_t slba = bio->bi_iter.bi_sector / NR_PHY_IN_LOG; - sector_t len = bio->bi_iter.bi_size / RRPC_EXPOSED_PAGE_SIZE; - struct nvm_rq *rqd; - - while (1) { - rqd = rrpc_inflight_laddr_acquire(rrpc, slba, len); - if (rqd) - break; - - schedule(); - } - - if (IS_ERR(rqd)) { - pr_err("rrpc: unable to acquire inflight IO\n"); - bio_io_error(bio); - return; - } - - rrpc_invalidate_range(rrpc, slba, len); - rrpc_inflight_laddr_release(rrpc, rqd); -} - -static int block_is_full(struct rrpc *rrpc, struct rrpc_block *rblk) -{ - struct nvm_tgt_dev *dev = rrpc->dev; - - return (rblk->next_page == dev->geo.sec_per_blk); -} - -/* Calculate relative addr for the given block, considering instantiated LUNs */ -static u64 block_to_rel_addr(struct rrpc *rrpc, struct rrpc_block *rblk) -{ - struct nvm_tgt_dev *dev = rrpc->dev; - struct rrpc_lun *rlun = rblk->rlun; - - return rlun->id * dev->geo.sec_per_blk; -} - -static struct ppa_addr rrpc_ppa_to_gaddr(struct nvm_tgt_dev *dev, - struct rrpc_addr *gp) -{ - struct rrpc_block *rblk = gp->rblk; - struct rrpc_lun *rlun = rblk->rlun; - u64 addr = gp->addr; - struct ppa_addr paddr; - - paddr.ppa = addr; - paddr = rrpc_linear_to_generic_addr(&dev->geo, paddr); - paddr.g.ch = rlun->bppa.g.ch; - paddr.g.lun = rlun->bppa.g.lun; - paddr.g.blk = rblk->id; - - return paddr; -} - -/* requires lun->lock taken */ -static void rrpc_set_lun_cur(struct rrpc_lun *rlun, struct rrpc_block *new_rblk, - struct rrpc_block **cur_rblk) -{ - struct rrpc *rrpc = rlun->rrpc; - - if (*cur_rblk) { - spin_lock(&(*cur_rblk)->lock); - WARN_ON(!block_is_full(rrpc, *cur_rblk)); - spin_unlock(&(*cur_rblk)->lock); - } - *cur_rblk = new_rblk; -} - -static struct rrpc_block *__rrpc_get_blk(struct rrpc *rrpc, - struct rrpc_lun *rlun) -{ - struct rrpc_block *rblk = NULL; - - if (list_empty(&rlun->free_list)) - goto out; - - rblk = list_first_entry(&rlun->free_list, struct rrpc_block, list); - - list_move_tail(&rblk->list, &rlun->used_list); - rblk->state = NVM_BLK_ST_TGT; - rlun->nr_free_blocks--; - -out: - return rblk; -} - -static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun *rlun, - unsigned long flags) -{ - struct nvm_tgt_dev *dev = rrpc->dev; - struct rrpc_block *rblk; - int is_gc = flags & NVM_IOTYPE_GC; - - spin_lock(&rlun->lock); - if (!is_gc && rlun->nr_free_blocks < rlun->reserved_blocks) { - pr_err("nvm: rrpc: cannot give block to non GC request\n"); - spin_unlock(&rlun->lock); - return NULL; - } - - rblk = __rrpc_get_blk(rrpc, rlun); - if (!rblk) { - pr_err("nvm: rrpc: cannot get new block\n"); - spin_unlock(&rlun->lock); - return NULL; - } - spin_unlock(&rlun->lock); - - bitmap_zero(rblk->invalid_pages, dev->geo.sec_per_blk); - rblk->next_page = 0; - rblk->nr_invalid_pages = 0; - atomic_set(&rblk->data_cmnt_size, 0); - - return rblk; -} - -static void rrpc_put_blk(struct rrpc *rrpc, struct rrpc_block *rblk) -{ - struct rrpc_lun *rlun = rblk->rlun; - - spin_lock(&rlun->lock); - if (rblk->state & NVM_BLK_ST_TGT) { - list_move_tail(&rblk->list, &rlun->free_list); - rlun->nr_free_blocks++; - rblk->state = NVM_BLK_ST_FREE; - } else if (rblk->state & NVM_BLK_ST_BAD) { - list_move_tail(&rblk->list, &rlun->bb_list); - rblk->state = NVM_BLK_ST_BAD; - } else { - WARN_ON_ONCE(1); - pr_err("rrpc: erroneous type (ch:%d,lun:%d,blk%d-> %u)\n", - rlun->bppa.g.ch, rlun->bppa.g.lun, - rblk->id, rblk->state); - list_move_tail(&rblk->list, &rlun->bb_list); - } - spin_unlock(&rlun->lock); -} - -static void rrpc_put_blks(struct rrpc *rrpc) -{ - struct rrpc_lun *rlun; - int i; - - for (i = 0; i < rrpc->nr_luns; i++) { - rlun = &rrpc->luns[i]; - if (rlun->cur) - rrpc_put_blk(rrpc, rlun->cur); - if (rlun->gc_cur) - rrpc_put_blk(rrpc, rlun->gc_cur); - } -} - -static struct rrpc_lun *get_next_lun(struct rrpc *rrpc) -{ - int next = atomic_inc_return(&rrpc->next_lun); - - return &rrpc->luns[next % rrpc->nr_luns]; -} - -static void rrpc_gc_kick(struct rrpc *rrpc) -{ - struct rrpc_lun *rlun; - unsigned int i; - - for (i = 0; i < rrpc->nr_luns; i++) { - rlun = &rrpc->luns[i]; - queue_work(rrpc->krqd_wq, &rlun->ws_gc); - } -} - -/* - * timed GC every interval. - */ -static void rrpc_gc_timer(struct timer_list *t) -{ - struct rrpc *rrpc = from_timer(rrpc, t, gc_timer); - - rrpc_gc_kick(rrpc); - mod_timer(&rrpc->gc_timer, jiffies + msecs_to_jiffies(10)); -} - -static void rrpc_end_sync_bio(struct bio *bio) -{ - struct completion *waiting = bio->bi_private; - - if (bio->bi_status) - pr_err("nvm: gc request failed (%u).\n", bio->bi_status); - - complete(waiting); -} - -/* - * rrpc_move_valid_pages -- migrate live data off the block - * @rrpc: the 'rrpc' structure - * @block: the block from which to migrate live pages - * - * Description: - * GC algorithms may call this function to migrate remaining live - * pages off the block prior to erasing it. This function blocks - * further execution until the operation is complete. - */ -static int rrpc_move_valid_pages(struct rrpc *rrpc, struct rrpc_block *rblk) -{ - struct nvm_tgt_dev *dev = rrpc->dev; - struct request_queue *q = dev->q; - struct rrpc_rev_addr *rev; - struct nvm_rq *rqd; - struct bio *bio; - struct page *page; - int slot; - int nr_sec_per_blk = dev->geo.sec_per_blk; - u64 phys_addr; - DECLARE_COMPLETION_ONSTACK(wait); - - if (bitmap_full(rblk->invalid_pages, nr_sec_per_blk)) - return 0; - - bio = bio_alloc(GFP_NOIO, 1); - if (!bio) { - pr_err("nvm: could not alloc bio to gc\n"); - return -ENOMEM; - } - - page = mempool_alloc(rrpc->page_pool, GFP_NOIO); - - while ((slot = find_first_zero_bit(rblk->invalid_pages, - nr_sec_per_blk)) < nr_sec_per_blk) { - - /* Lock laddr */ - phys_addr = rrpc_blk_to_ppa(rrpc, rblk) + slot; - -try: - spin_lock(&rrpc->rev_lock); - /* Get logical address from physical to logical table */ - rev = &rrpc->rev_trans_map[phys_addr]; - /* already updated by previous regular write */ - if (rev->addr == ADDR_EMPTY) { - spin_unlock(&rrpc->rev_lock); - continue; - } - - rqd = rrpc_inflight_laddr_acquire(rrpc, rev->addr, 1); - if (IS_ERR_OR_NULL(rqd)) { - spin_unlock(&rrpc->rev_lock); - schedule(); - goto try; - } - - spin_unlock(&rrpc->rev_lock); - - /* Perform read to do GC */ - bio->bi_iter.bi_sector = rrpc_get_sector(rev->addr); - bio_set_op_attrs(bio, REQ_OP_READ, 0); - bio->bi_private = &wait; - bio->bi_end_io = rrpc_end_sync_bio; - - /* TODO: may fail when EXP_PG_SIZE > PAGE_SIZE */ - bio_add_pc_page(q, bio, page, RRPC_EXPOSED_PAGE_SIZE, 0); - - if (rrpc_submit_io(rrpc, bio, rqd, NVM_IOTYPE_GC)) { - pr_err("rrpc: gc read failed.\n"); - rrpc_inflight_laddr_release(rrpc, rqd); - goto finished; - } - wait_for_completion_io(&wait); - if (bio->bi_status) { - rrpc_inflight_laddr_release(rrpc, rqd); - goto finished; - } - - bio_reset(bio); - reinit_completion(&wait); - - bio->bi_iter.bi_sector = rrpc_get_sector(rev->addr); - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - bio->bi_private = &wait; - bio->bi_end_io = rrpc_end_sync_bio; - - bio_add_pc_page(q, bio, page, RRPC_EXPOSED_PAGE_SIZE, 0); - - /* turn the command around and write the data back to a new - * address - */ - if (rrpc_submit_io(rrpc, bio, rqd, NVM_IOTYPE_GC)) { - pr_err("rrpc: gc write failed.\n"); - rrpc_inflight_laddr_release(rrpc, rqd); - goto finished; - } - wait_for_completion_io(&wait); - - rrpc_inflight_laddr_release(rrpc, rqd); - if (bio->bi_status) - goto finished; - - bio_reset(bio); - } - -finished: - mempool_free(page, rrpc->page_pool); - bio_put(bio); - - if (!bitmap_full(rblk->invalid_pages, nr_sec_per_blk)) { - pr_err("nvm: failed to garbage collect block\n"); - return -EIO; - } - - return 0; -} - -static void rrpc_block_gc(struct work_struct *work) -{ - struct rrpc_block_gc *gcb = container_of(work, struct rrpc_block_gc, - ws_gc); - struct rrpc *rrpc = gcb->rrpc; - struct rrpc_block *rblk = gcb->rblk; - struct rrpc_lun *rlun = rblk->rlun; - struct ppa_addr ppa; - - mempool_free(gcb, rrpc->gcb_pool); - pr_debug("nvm: block 'ch:%d,lun:%d,blk:%d' being reclaimed\n", - rlun->bppa.g.ch, rlun->bppa.g.lun, - rblk->id); - - if (rrpc_move_valid_pages(rrpc, rblk)) - goto put_back; - - ppa.ppa = 0; - ppa.g.ch = rlun->bppa.g.ch; - ppa.g.lun = rlun->bppa.g.lun; - ppa.g.blk = rblk->id; - - if (nvm_erase_sync(rrpc->dev, &ppa, 1)) - goto put_back; - - rrpc_put_blk(rrpc, rblk); - - return; - -put_back: - spin_lock(&rlun->lock); - list_add_tail(&rblk->prio, &rlun->prio_list); - spin_unlock(&rlun->lock); -} - -/* the block with highest number of invalid pages, will be in the beginning - * of the list - */ -static struct rrpc_block *rblk_max_invalid(struct rrpc_block *ra, - struct rrpc_block *rb) -{ - if (ra->nr_invalid_pages == rb->nr_invalid_pages) - return ra; - - return (ra->nr_invalid_pages < rb->nr_invalid_pages) ? rb : ra; -} - -/* linearly find the block with highest number of invalid pages - * requires lun->lock - */ -static struct rrpc_block *block_prio_find_max(struct rrpc_lun *rlun) -{ - struct list_head *prio_list = &rlun->prio_list; - struct rrpc_block *rblk, *max; - - BUG_ON(list_empty(prio_list)); - - max = list_first_entry(prio_list, struct rrpc_block, prio); - list_for_each_entry(rblk, prio_list, prio) - max = rblk_max_invalid(max, rblk); - - return max; -} - -static void rrpc_lun_gc(struct work_struct *work) -{ - struct rrpc_lun *rlun = container_of(work, struct rrpc_lun, ws_gc); - struct rrpc *rrpc = rlun->rrpc; - struct nvm_tgt_dev *dev = rrpc->dev; - struct rrpc_block_gc *gcb; - unsigned int nr_blocks_need; - - nr_blocks_need = dev->geo.blks_per_lun / GC_LIMIT_INVERSE; - - if (nr_blocks_need < rrpc->nr_luns) - nr_blocks_need = rrpc->nr_luns; - - spin_lock(&rlun->lock); - while (nr_blocks_need > rlun->nr_free_blocks && - !list_empty(&rlun->prio_list)) { - struct rrpc_block *rblk = block_prio_find_max(rlun); - - if (!rblk->nr_invalid_pages) - break; - - gcb = mempool_alloc(rrpc->gcb_pool, GFP_ATOMIC); - if (!gcb) - break; - - list_del_init(&rblk->prio); - - WARN_ON(!block_is_full(rrpc, rblk)); - - pr_debug("rrpc: selected block 'ch:%d,lun:%d,blk:%d' for GC\n", - rlun->bppa.g.ch, rlun->bppa.g.lun, - rblk->id); - - gcb->rrpc = rrpc; - gcb->rblk = rblk; - INIT_WORK(&gcb->ws_gc, rrpc_block_gc); - - queue_work(rrpc->kgc_wq, &gcb->ws_gc); - - nr_blocks_need--; - } - spin_unlock(&rlun->lock); - - /* TODO: Hint that request queue can be started again */ -} - -static void rrpc_gc_queue(struct work_struct *work) -{ - struct rrpc_block_gc *gcb = container_of(work, struct rrpc_block_gc, - ws_gc); - struct rrpc *rrpc = gcb->rrpc; - struct rrpc_block *rblk = gcb->rblk; - struct rrpc_lun *rlun = rblk->rlun; - - spin_lock(&rlun->lock); - list_add_tail(&rblk->prio, &rlun->prio_list); - spin_unlock(&rlun->lock); - - mempool_free(gcb, rrpc->gcb_pool); - pr_debug("nvm: block 'ch:%d,lun:%d,blk:%d' full, allow GC (sched)\n", - rlun->bppa.g.ch, rlun->bppa.g.lun, - rblk->id); -} - -static const struct block_device_operations rrpc_fops = { - .owner = THIS_MODULE, -}; - -static struct rrpc_lun *rrpc_get_lun_rr(struct rrpc *rrpc, int is_gc) -{ - unsigned int i; - struct rrpc_lun *rlun, *max_free; - - if (!is_gc) - return get_next_lun(rrpc); - - /* during GC, we don't care about RR, instead we want to make - * sure that we maintain evenness between the block luns. - */ - max_free = &rrpc->luns[0]; - /* prevent GC-ing lun from devouring pages of a lun with - * little free blocks. We don't take the lock as we only need an - * estimate. - */ - rrpc_for_each_lun(rrpc, rlun, i) { - if (rlun->nr_free_blocks > max_free->nr_free_blocks) - max_free = rlun; - } - - return max_free; -} - -static struct rrpc_addr *rrpc_update_map(struct rrpc *rrpc, sector_t laddr, - struct rrpc_block *rblk, u64 paddr) -{ - struct rrpc_addr *gp; - struct rrpc_rev_addr *rev; - - BUG_ON(laddr >= rrpc->nr_sects); - - gp = &rrpc->trans_map[laddr]; - spin_lock(&rrpc->rev_lock); - if (gp->rblk) - rrpc_page_invalidate(rrpc, gp); - - gp->addr = paddr; - gp->rblk = rblk; - - rev = &rrpc->rev_trans_map[gp->addr]; - rev->addr = laddr; - spin_unlock(&rrpc->rev_lock); - - return gp; -} - -static u64 rrpc_alloc_addr(struct rrpc *rrpc, struct rrpc_block *rblk) -{ - u64 addr = ADDR_EMPTY; - - spin_lock(&rblk->lock); - if (block_is_full(rrpc, rblk)) - goto out; - - addr = rblk->next_page; - - rblk->next_page++; -out: - spin_unlock(&rblk->lock); - return addr; -} - -/* Map logical address to a physical page. The mapping implements a round robin - * approach and allocates a page from the next lun available. - * - * Returns rrpc_addr with the physical address and block. Returns NULL if no - * blocks in the next rlun are available. - */ -static struct ppa_addr rrpc_map_page(struct rrpc *rrpc, sector_t laddr, - int is_gc) -{ - struct nvm_tgt_dev *tgt_dev = rrpc->dev; - struct rrpc_lun *rlun; - struct rrpc_block *rblk, **cur_rblk; - struct rrpc_addr *p; - struct ppa_addr ppa; - u64 paddr; - int gc_force = 0; - - ppa.ppa = ADDR_EMPTY; - rlun = rrpc_get_lun_rr(rrpc, is_gc); - - if (!is_gc && rlun->nr_free_blocks < rrpc->nr_luns * 4) - return ppa; - - /* - * page allocation steps: - * 1. Try to allocate new page from current rblk - * 2a. If succeed, proceed to map it in and return - * 2b. If fail, first try to allocate a new block from media manger, - * and then retry step 1. Retry until the normal block pool is - * exhausted. - * 3. If exhausted, and garbage collector is requesting the block, - * go to the reserved block and retry step 1. - * In the case that this fails as well, or it is not GC - * requesting, report not able to retrieve a block and let the - * caller handle further processing. - */ - - spin_lock(&rlun->lock); - cur_rblk = &rlun->cur; - rblk = rlun->cur; -retry: - paddr = rrpc_alloc_addr(rrpc, rblk); - - if (paddr != ADDR_EMPTY) - goto done; - - if (!list_empty(&rlun->wblk_list)) { -new_blk: - rblk = list_first_entry(&rlun->wblk_list, struct rrpc_block, - prio); - rrpc_set_lun_cur(rlun, rblk, cur_rblk); - list_del(&rblk->prio); - goto retry; - } - spin_unlock(&rlun->lock); - - rblk = rrpc_get_blk(rrpc, rlun, gc_force); - if (rblk) { - spin_lock(&rlun->lock); - list_add_tail(&rblk->prio, &rlun->wblk_list); - /* - * another thread might already have added a new block, - * Therefore, make sure that one is used, instead of the - * one just added. - */ - goto new_blk; - } - - if (unlikely(is_gc) && !gc_force) { - /* retry from emergency gc block */ - cur_rblk = &rlun->gc_cur; - rblk = rlun->gc_cur; - gc_force = 1; - spin_lock(&rlun->lock); - goto retry; - } - - pr_err("rrpc: failed to allocate new block\n"); - return ppa; -done: - spin_unlock(&rlun->lock); - p = rrpc_update_map(rrpc, laddr, rblk, paddr); - if (!p) - return ppa; - - /* return global address */ - return rrpc_ppa_to_gaddr(tgt_dev, p); -} - -static void rrpc_run_gc(struct rrpc *rrpc, struct rrpc_block *rblk) -{ - struct rrpc_block_gc *gcb; - - gcb = mempool_alloc(rrpc->gcb_pool, GFP_ATOMIC); - if (!gcb) { - pr_err("rrpc: unable to queue block for gc."); - return; - } - - gcb->rrpc = rrpc; - gcb->rblk = rblk; - - INIT_WORK(&gcb->ws_gc, rrpc_gc_queue); - queue_work(rrpc->kgc_wq, &gcb->ws_gc); -} - -static struct rrpc_lun *rrpc_ppa_to_lun(struct rrpc *rrpc, struct ppa_addr p) -{ - struct rrpc_lun *rlun = NULL; - int i; - - for (i = 0; i < rrpc->nr_luns; i++) { - if (rrpc->luns[i].bppa.g.ch == p.g.ch && - rrpc->luns[i].bppa.g.lun == p.g.lun) { - rlun = &rrpc->luns[i]; - break; - } - } - - return rlun; -} - -static void __rrpc_mark_bad_block(struct rrpc *rrpc, struct ppa_addr ppa) -{ - struct nvm_tgt_dev *dev = rrpc->dev; - struct rrpc_lun *rlun; - struct rrpc_block *rblk; - - rlun = rrpc_ppa_to_lun(rrpc, ppa); - rblk = &rlun->blocks[ppa.g.blk]; - rblk->state = NVM_BLK_ST_BAD; - - nvm_set_tgt_bb_tbl(dev, &ppa, 1, NVM_BLK_T_GRWN_BAD); -} - -static void rrpc_mark_bad_block(struct rrpc *rrpc, struct nvm_rq *rqd) -{ - void *comp_bits = &rqd->ppa_status; - struct ppa_addr ppa, prev_ppa; - int nr_ppas = rqd->nr_ppas; - int bit; - - if (rqd->nr_ppas == 1) - __rrpc_mark_bad_block(rrpc, rqd->ppa_addr); - - ppa_set_empty(&prev_ppa); - bit = -1; - while ((bit = find_next_bit(comp_bits, nr_ppas, bit + 1)) < nr_ppas) { - ppa = rqd->ppa_list[bit]; - if (ppa_cmp_blk(ppa, prev_ppa)) - continue; - - __rrpc_mark_bad_block(rrpc, ppa); - } -} - -static void rrpc_end_io_write(struct rrpc *rrpc, struct rrpc_rq *rrqd, - sector_t laddr, uint8_t npages) -{ - struct nvm_tgt_dev *dev = rrpc->dev; - struct rrpc_addr *p; - struct rrpc_block *rblk; - int cmnt_size, i; - - for (i = 0; i < npages; i++) { - p = &rrpc->trans_map[laddr + i]; - rblk = p->rblk; - - cmnt_size = atomic_inc_return(&rblk->data_cmnt_size); - if (unlikely(cmnt_size == dev->geo.sec_per_blk)) - rrpc_run_gc(rrpc, rblk); - } -} - -static void rrpc_end_io(struct nvm_rq *rqd) -{ - struct rrpc *rrpc = rqd->private; - struct nvm_tgt_dev *dev = rrpc->dev; - struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd); - uint8_t npages = rqd->nr_ppas; - sector_t laddr = rrpc_get_laddr(rqd->bio) - npages; - - if (bio_data_dir(rqd->bio) == WRITE) { - if (rqd->error == NVM_RSP_ERR_FAILWRITE) - rrpc_mark_bad_block(rrpc, rqd); - - rrpc_end_io_write(rrpc, rrqd, laddr, npages); - } - - bio_put(rqd->bio); - - if (rrqd->flags & NVM_IOTYPE_GC) - return; - - rrpc_unlock_rq(rrpc, rqd); - - if (npages > 1) - nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list); - - mempool_free(rqd, rrpc->rq_pool); -} - -static int rrpc_read_ppalist_rq(struct rrpc *rrpc, struct bio *bio, - struct nvm_rq *rqd, unsigned long flags, int npages) -{ - struct nvm_tgt_dev *dev = rrpc->dev; - struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd); - struct rrpc_addr *gp; - sector_t laddr = rrpc_get_laddr(bio); - int is_gc = flags & NVM_IOTYPE_GC; - int i; - - if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd)) { - nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list); - return NVM_IO_REQUEUE; - } - - for (i = 0; i < npages; i++) { - /* We assume that mapping occurs at 4KB granularity */ - BUG_ON(!(laddr + i < rrpc->nr_sects)); - gp = &rrpc->trans_map[laddr + i]; - - if (gp->rblk) { - rqd->ppa_list[i] = rrpc_ppa_to_gaddr(dev, gp); - } else { - BUG_ON(is_gc); - rrpc_unlock_laddr(rrpc, r); - nvm_dev_dma_free(dev->parent, rqd->ppa_list, - rqd->dma_ppa_list); - return NVM_IO_DONE; - } - } - - rqd->opcode = NVM_OP_HBREAD; - - return NVM_IO_OK; -} - -static int rrpc_read_rq(struct rrpc *rrpc, struct bio *bio, struct nvm_rq *rqd, - unsigned long flags) -{ - int is_gc = flags & NVM_IOTYPE_GC; - sector_t laddr = rrpc_get_laddr(bio); - struct rrpc_addr *gp; - - if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd)) - return NVM_IO_REQUEUE; - - BUG_ON(!(laddr < rrpc->nr_sects)); - gp = &rrpc->trans_map[laddr]; - - if (gp->rblk) { - rqd->ppa_addr = rrpc_ppa_to_gaddr(rrpc->dev, gp); - } else { - BUG_ON(is_gc); - rrpc_unlock_rq(rrpc, rqd); - return NVM_IO_DONE; - } - - rqd->opcode = NVM_OP_HBREAD; - - return NVM_IO_OK; -} - -static int rrpc_write_ppalist_rq(struct rrpc *rrpc, struct bio *bio, - struct nvm_rq *rqd, unsigned long flags, int npages) -{ - struct nvm_tgt_dev *dev = rrpc->dev; - struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd); - struct ppa_addr p; - sector_t laddr = rrpc_get_laddr(bio); - int is_gc = flags & NVM_IOTYPE_GC; - int i; - - if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd)) { - nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list); - return NVM_IO_REQUEUE; - } - - for (i = 0; i < npages; i++) { - /* We assume that mapping occurs at 4KB granularity */ - p = rrpc_map_page(rrpc, laddr + i, is_gc); - if (p.ppa == ADDR_EMPTY) { - BUG_ON(is_gc); - rrpc_unlock_laddr(rrpc, r); - nvm_dev_dma_free(dev->parent, rqd->ppa_list, - rqd->dma_ppa_list); - rrpc_gc_kick(rrpc); - return NVM_IO_REQUEUE; - } - - rqd->ppa_list[i] = p; - } - - rqd->opcode = NVM_OP_HBWRITE; - - return NVM_IO_OK; -} - -static int rrpc_write_rq(struct rrpc *rrpc, struct bio *bio, - struct nvm_rq *rqd, unsigned long flags) -{ - struct ppa_addr p; - int is_gc = flags & NVM_IOTYPE_GC; - sector_t laddr = rrpc_get_laddr(bio); - - if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd)) - return NVM_IO_REQUEUE; - - p = rrpc_map_page(rrpc, laddr, is_gc); - if (p.ppa == ADDR_EMPTY) { - BUG_ON(is_gc); - rrpc_unlock_rq(rrpc, rqd); - rrpc_gc_kick(rrpc); - return NVM_IO_REQUEUE; - } - - rqd->ppa_addr = p; - rqd->opcode = NVM_OP_HBWRITE; - - return NVM_IO_OK; -} - -static int rrpc_setup_rq(struct rrpc *rrpc, struct bio *bio, - struct nvm_rq *rqd, unsigned long flags, uint8_t npages) -{ - struct nvm_tgt_dev *dev = rrpc->dev; - - if (npages > 1) { - rqd->ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, - &rqd->dma_ppa_list); - if (!rqd->ppa_list) { - pr_err("rrpc: not able to allocate ppa list\n"); - return NVM_IO_ERR; - } - - if (bio_op(bio) == REQ_OP_WRITE) - return rrpc_write_ppalist_rq(rrpc, bio, rqd, flags, - npages); - - return rrpc_read_ppalist_rq(rrpc, bio, rqd, flags, npages); - } - - if (bio_op(bio) == REQ_OP_WRITE) - return rrpc_write_rq(rrpc, bio, rqd, flags); - - return rrpc_read_rq(rrpc, bio, rqd, flags); -} - -static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio, - struct nvm_rq *rqd, unsigned long flags) -{ - struct nvm_tgt_dev *dev = rrpc->dev; - struct rrpc_rq *rrq = nvm_rq_to_pdu(rqd); - uint8_t nr_pages = rrpc_get_pages(bio); - int bio_size = bio_sectors(bio) << 9; - int err; - - if (bio_size < dev->geo.sec_size) - return NVM_IO_ERR; - else if (bio_size > dev->geo.max_rq_size) - return NVM_IO_ERR; - - err = rrpc_setup_rq(rrpc, bio, rqd, flags, nr_pages); - if (err) - return err; - - bio_get(bio); - rqd->bio = bio; - rqd->private = rrpc; - rqd->nr_ppas = nr_pages; - rqd->end_io = rrpc_end_io; - rrq->flags = flags; - - err = nvm_submit_io(dev, rqd); - if (err) { - pr_err("rrpc: I/O submission failed: %d\n", err); - bio_put(bio); - if (!(flags & NVM_IOTYPE_GC)) { - rrpc_unlock_rq(rrpc, rqd); - if (rqd->nr_ppas > 1) - nvm_dev_dma_free(dev->parent, rqd->ppa_list, - rqd->dma_ppa_list); - } - return NVM_IO_ERR; - } - - return NVM_IO_OK; -} - -static blk_qc_t rrpc_make_rq(struct request_queue *q, struct bio *bio) -{ - struct rrpc *rrpc = q->queuedata; - struct nvm_rq *rqd; - int err; - - blk_queue_split(q, &bio); - - if (bio_op(bio) == REQ_OP_DISCARD) { - rrpc_discard(rrpc, bio); - return BLK_QC_T_NONE; - } - - rqd = mempool_alloc(rrpc->rq_pool, GFP_KERNEL); - memset(rqd, 0, sizeof(struct nvm_rq)); - - err = rrpc_submit_io(rrpc, bio, rqd, NVM_IOTYPE_NONE); - switch (err) { - case NVM_IO_OK: - return BLK_QC_T_NONE; - case NVM_IO_ERR: - bio_io_error(bio); - break; - case NVM_IO_DONE: - bio_endio(bio); - break; - case NVM_IO_REQUEUE: - spin_lock(&rrpc->bio_lock); - bio_list_add(&rrpc->requeue_bios, bio); - spin_unlock(&rrpc->bio_lock); - queue_work(rrpc->kgc_wq, &rrpc->ws_requeue); - break; - } - - mempool_free(rqd, rrpc->rq_pool); - return BLK_QC_T_NONE; -} - -static void rrpc_requeue(struct work_struct *work) -{ - struct rrpc *rrpc = container_of(work, struct rrpc, ws_requeue); - struct bio_list bios; - struct bio *bio; - - bio_list_init(&bios); - - spin_lock(&rrpc->bio_lock); - bio_list_merge(&bios, &rrpc->requeue_bios); - bio_list_init(&rrpc->requeue_bios); - spin_unlock(&rrpc->bio_lock); - - while ((bio = bio_list_pop(&bios))) - rrpc_make_rq(rrpc->disk->queue, bio); -} - -static void rrpc_gc_free(struct rrpc *rrpc) -{ - if (rrpc->krqd_wq) - destroy_workqueue(rrpc->krqd_wq); - - if (rrpc->kgc_wq) - destroy_workqueue(rrpc->kgc_wq); -} - -static int rrpc_gc_init(struct rrpc *rrpc) -{ - rrpc->krqd_wq = alloc_workqueue("rrpc-lun", WQ_MEM_RECLAIM|WQ_UNBOUND, - rrpc->nr_luns); - if (!rrpc->krqd_wq) - return -ENOMEM; - - rrpc->kgc_wq = alloc_workqueue("rrpc-bg", WQ_MEM_RECLAIM, 1); - if (!rrpc->kgc_wq) - return -ENOMEM; - - timer_setup(&rrpc->gc_timer, rrpc_gc_timer, 0); - - return 0; -} - -static void rrpc_map_free(struct rrpc *rrpc) -{ - vfree(rrpc->rev_trans_map); - vfree(rrpc->trans_map); -} - -static int rrpc_l2p_update(u64 slba, u32 nlb, __le64 *entries, void *private) -{ - struct rrpc *rrpc = (struct rrpc *)private; - struct nvm_tgt_dev *dev = rrpc->dev; - struct rrpc_addr *addr = rrpc->trans_map + slba; - struct rrpc_rev_addr *raddr = rrpc->rev_trans_map; - struct rrpc_lun *rlun; - struct rrpc_block *rblk; - u64 i; - - for (i = 0; i < nlb; i++) { - struct ppa_addr gaddr; - u64 pba = le64_to_cpu(entries[i]); - unsigned int mod; - - /* LNVM treats address-spaces as silos, LBA and PBA are - * equally large and zero-indexed. - */ - if (unlikely(pba >= dev->total_secs && pba != U64_MAX)) { - pr_err("nvm: L2P data entry is out of bounds!\n"); - pr_err("nvm: Maybe loaded an old target L2P\n"); - return -EINVAL; - } - - /* Address zero is a special one. The first page on a disk is - * protected. As it often holds internal device boot - * information. - */ - if (!pba) - continue; - - div_u64_rem(pba, rrpc->nr_sects, &mod); - - gaddr = rrpc_recov_addr(dev, pba); - rlun = rrpc_ppa_to_lun(rrpc, gaddr); - if (!rlun) { - pr_err("rrpc: l2p corruption on lba %llu\n", - slba + i); - return -EINVAL; - } - - rblk = &rlun->blocks[gaddr.g.blk]; - if (!rblk->state) { - /* at this point, we don't know anything about the - * block. It's up to the FTL on top to re-etablish the - * block state. The block is assumed to be open. - */ - list_move_tail(&rblk->list, &rlun->used_list); - rblk->state = NVM_BLK_ST_TGT; - rlun->nr_free_blocks--; - } - - addr[i].addr = pba; - addr[i].rblk = rblk; - raddr[mod].addr = slba + i; - } - - return 0; -} - -static int rrpc_map_init(struct rrpc *rrpc) -{ - struct nvm_tgt_dev *dev = rrpc->dev; - sector_t i; - int ret; - - rrpc->trans_map = vzalloc(sizeof(struct rrpc_addr) * rrpc->nr_sects); - if (!rrpc->trans_map) - return -ENOMEM; - - rrpc->rev_trans_map = vmalloc(sizeof(struct rrpc_rev_addr) - * rrpc->nr_sects); - if (!rrpc->rev_trans_map) - return -ENOMEM; - - for (i = 0; i < rrpc->nr_sects; i++) { - struct rrpc_addr *p = &rrpc->trans_map[i]; - struct rrpc_rev_addr *r = &rrpc->rev_trans_map[i]; - - p->addr = ADDR_EMPTY; - r->addr = ADDR_EMPTY; - } - - /* Bring up the mapping table from device */ - ret = nvm_get_l2p_tbl(dev, rrpc->soffset, rrpc->nr_sects, - rrpc_l2p_update, rrpc); - if (ret) { - pr_err("nvm: rrpc: could not read L2P table.\n"); - return -EINVAL; - } - - return 0; -} - -/* Minimum pages needed within a lun */ -#define PAGE_POOL_SIZE 16 -#define ADDR_POOL_SIZE 64 - -static int rrpc_core_init(struct rrpc *rrpc) -{ - down_write(&rrpc_lock); - if (!rrpc_gcb_cache) { - rrpc_gcb_cache = kmem_cache_create("rrpc_gcb", - sizeof(struct rrpc_block_gc), 0, 0, NULL); - if (!rrpc_gcb_cache) { - up_write(&rrpc_lock); - return -ENOMEM; - } - - rrpc_rq_cache = kmem_cache_create("rrpc_rq", - sizeof(struct nvm_rq) + sizeof(struct rrpc_rq), - 0, 0, NULL); - if (!rrpc_rq_cache) { - kmem_cache_destroy(rrpc_gcb_cache); - up_write(&rrpc_lock); - return -ENOMEM; - } - } - up_write(&rrpc_lock); - - rrpc->page_pool = mempool_create_page_pool(PAGE_POOL_SIZE, 0); - if (!rrpc->page_pool) - return -ENOMEM; - - rrpc->gcb_pool = mempool_create_slab_pool(rrpc->dev->geo.nr_luns, - rrpc_gcb_cache); - if (!rrpc->gcb_pool) - return -ENOMEM; - - rrpc->rq_pool = mempool_create_slab_pool(64, rrpc_rq_cache); - if (!rrpc->rq_pool) - return -ENOMEM; - - spin_lock_init(&rrpc->inflights.lock); - INIT_LIST_HEAD(&rrpc->inflights.reqs); - - return 0; -} - -static void rrpc_core_free(struct rrpc *rrpc) -{ - mempool_destroy(rrpc->page_pool); - mempool_destroy(rrpc->gcb_pool); - mempool_destroy(rrpc->rq_pool); -} - -static void rrpc_luns_free(struct rrpc *rrpc) -{ - struct rrpc_lun *rlun; - int i; - - if (!rrpc->luns) - return; - - for (i = 0; i < rrpc->nr_luns; i++) { - rlun = &rrpc->luns[i]; - vfree(rlun->blocks); - } - - kfree(rrpc->luns); -} - -static int rrpc_bb_discovery(struct nvm_tgt_dev *dev, struct rrpc_lun *rlun) -{ - struct nvm_geo *geo = &dev->geo; - struct rrpc_block *rblk; - struct ppa_addr ppa; - u8 *blks; - int nr_blks; - int i; - int ret; - - if (!dev->parent->ops->get_bb_tbl) - return 0; - - nr_blks = geo->blks_per_lun * geo->plane_mode; - blks = kmalloc(nr_blks, GFP_KERNEL); - if (!blks) - return -ENOMEM; - - ppa.ppa = 0; - ppa.g.ch = rlun->bppa.g.ch; - ppa.g.lun = rlun->bppa.g.lun; - - ret = nvm_get_tgt_bb_tbl(dev, ppa, blks); - if (ret) { - pr_err("rrpc: could not get BB table\n"); - goto out; - } - - nr_blks = nvm_bb_tbl_fold(dev->parent, blks, nr_blks); - if (nr_blks < 0) { - ret = nr_blks; - goto out; - } - - for (i = 0; i < nr_blks; i++) { - if (blks[i] == NVM_BLK_T_FREE) - continue; - - rblk = &rlun->blocks[i]; - list_move_tail(&rblk->list, &rlun->bb_list); - rblk->state = NVM_BLK_ST_BAD; - rlun->nr_free_blocks--; - } - -out: - kfree(blks); - return ret; -} - -static void rrpc_set_lun_ppa(struct rrpc_lun *rlun, struct ppa_addr ppa) -{ - rlun->bppa.ppa = 0; - rlun->bppa.g.ch = ppa.g.ch; - rlun->bppa.g.lun = ppa.g.lun; -} - -static int rrpc_luns_init(struct rrpc *rrpc, struct ppa_addr *luns) -{ - struct nvm_tgt_dev *dev = rrpc->dev; - struct nvm_geo *geo = &dev->geo; - struct rrpc_lun *rlun; - int i, j, ret = -EINVAL; - - if (geo->sec_per_blk > MAX_INVALID_PAGES_STORAGE * BITS_PER_LONG) { - pr_err("rrpc: number of pages per block too high."); - return -EINVAL; - } - - spin_lock_init(&rrpc->rev_lock); - - rrpc->luns = kcalloc(rrpc->nr_luns, sizeof(struct rrpc_lun), - GFP_KERNEL); - if (!rrpc->luns) - return -ENOMEM; - - /* 1:1 mapping */ - for (i = 0; i < rrpc->nr_luns; i++) { - rlun = &rrpc->luns[i]; - rlun->id = i; - rrpc_set_lun_ppa(rlun, luns[i]); - rlun->blocks = vzalloc(sizeof(struct rrpc_block) * - geo->blks_per_lun); - if (!rlun->blocks) { - ret = -ENOMEM; - goto err; - } - - INIT_LIST_HEAD(&rlun->free_list); - INIT_LIST_HEAD(&rlun->used_list); - INIT_LIST_HEAD(&rlun->bb_list); - - for (j = 0; j < geo->blks_per_lun; j++) { - struct rrpc_block *rblk = &rlun->blocks[j]; - - rblk->id = j; - rblk->rlun = rlun; - rblk->state = NVM_BLK_T_FREE; - INIT_LIST_HEAD(&rblk->prio); - INIT_LIST_HEAD(&rblk->list); - spin_lock_init(&rblk->lock); - - list_add_tail(&rblk->list, &rlun->free_list); - } - - rlun->rrpc = rrpc; - rlun->nr_free_blocks = geo->blks_per_lun; - rlun->reserved_blocks = 2; /* for GC only */ - - INIT_LIST_HEAD(&rlun->prio_list); - INIT_LIST_HEAD(&rlun->wblk_list); - - INIT_WORK(&rlun->ws_gc, rrpc_lun_gc); - spin_lock_init(&rlun->lock); - - if (rrpc_bb_discovery(dev, rlun)) - goto err; - - } - - return 0; -err: - return ret; -} - -/* returns 0 on success and stores the beginning address in *begin */ -static int rrpc_area_init(struct rrpc *rrpc, sector_t *begin) -{ - struct nvm_tgt_dev *dev = rrpc->dev; - sector_t size = rrpc->nr_sects * dev->geo.sec_size; - int ret; - - size >>= 9; - - ret = nvm_get_area(dev, begin, size); - if (!ret) - *begin >>= (ilog2(dev->geo.sec_size) - 9); - - return ret; -} - -static void rrpc_area_free(struct rrpc *rrpc) -{ - struct nvm_tgt_dev *dev = rrpc->dev; - sector_t begin = rrpc->soffset << (ilog2(dev->geo.sec_size) - 9); - - nvm_put_area(dev, begin); -} - -static void rrpc_free(struct rrpc *rrpc) -{ - rrpc_gc_free(rrpc); - rrpc_map_free(rrpc); - rrpc_core_free(rrpc); - rrpc_luns_free(rrpc); - rrpc_area_free(rrpc); - - kfree(rrpc); -} - -static void rrpc_exit(void *private) -{ - struct rrpc *rrpc = private; - - del_timer(&rrpc->gc_timer); - - flush_workqueue(rrpc->krqd_wq); - flush_workqueue(rrpc->kgc_wq); - - rrpc_free(rrpc); -} - -static sector_t rrpc_capacity(void *private) -{ - struct rrpc *rrpc = private; - struct nvm_tgt_dev *dev = rrpc->dev; - sector_t reserved, provisioned; - - /* cur, gc, and two emergency blocks for each lun */ - reserved = rrpc->nr_luns * dev->geo.sec_per_blk * 4; - provisioned = rrpc->nr_sects - reserved; - - if (reserved > rrpc->nr_sects) { - pr_err("rrpc: not enough space available to expose storage.\n"); - return 0; - } - - sector_div(provisioned, 10); - return provisioned * 9 * NR_PHY_IN_LOG; -} - -/* - * Looks up the logical address from reverse trans map and check if its valid by - * comparing the logical to physical address with the physical address. - * Returns 0 on free, otherwise 1 if in use - */ -static void rrpc_block_map_update(struct rrpc *rrpc, struct rrpc_block *rblk) -{ - struct nvm_tgt_dev *dev = rrpc->dev; - int offset; - struct rrpc_addr *laddr; - u64 bpaddr, paddr, pladdr; - - bpaddr = block_to_rel_addr(rrpc, rblk); - for (offset = 0; offset < dev->geo.sec_per_blk; offset++) { - paddr = bpaddr + offset; - - pladdr = rrpc->rev_trans_map[paddr].addr; - if (pladdr == ADDR_EMPTY) - continue; - - laddr = &rrpc->trans_map[pladdr]; - - if (paddr == laddr->addr) { - laddr->rblk = rblk; - } else { - set_bit(offset, rblk->invalid_pages); - rblk->nr_invalid_pages++; - } - } -} - -static int rrpc_blocks_init(struct rrpc *rrpc) -{ - struct nvm_tgt_dev *dev = rrpc->dev; - struct rrpc_lun *rlun; - struct rrpc_block *rblk; - int lun_iter, blk_iter; - - for (lun_iter = 0; lun_iter < rrpc->nr_luns; lun_iter++) { - rlun = &rrpc->luns[lun_iter]; - - for (blk_iter = 0; blk_iter < dev->geo.blks_per_lun; - blk_iter++) { - rblk = &rlun->blocks[blk_iter]; - rrpc_block_map_update(rrpc, rblk); - } - } - - return 0; -} - -static int rrpc_luns_configure(struct rrpc *rrpc) -{ - struct rrpc_lun *rlun; - struct rrpc_block *rblk; - int i; - - for (i = 0; i < rrpc->nr_luns; i++) { - rlun = &rrpc->luns[i]; - - rblk = rrpc_get_blk(rrpc, rlun, 0); - if (!rblk) - goto err; - rrpc_set_lun_cur(rlun, rblk, &rlun->cur); - - /* Emergency gc block */ - rblk = rrpc_get_blk(rrpc, rlun, 1); - if (!rblk) - goto err; - rrpc_set_lun_cur(rlun, rblk, &rlun->gc_cur); - } - - return 0; -err: - rrpc_put_blks(rrpc); - return -EINVAL; -} - -static struct nvm_tgt_type tt_rrpc; - -static void *rrpc_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, - int flags) -{ - struct request_queue *bqueue = dev->q; - struct request_queue *tqueue = tdisk->queue; - struct nvm_geo *geo = &dev->geo; - struct rrpc *rrpc; - sector_t soffset; - int ret; - - if (!(dev->identity.dom & NVM_RSP_L2P)) { - pr_err("nvm: rrpc: device does not support l2p (%x)\n", - dev->identity.dom); - return ERR_PTR(-EINVAL); - } - - rrpc = kzalloc(sizeof(struct rrpc), GFP_KERNEL); - if (!rrpc) - return ERR_PTR(-ENOMEM); - - rrpc->dev = dev; - rrpc->disk = tdisk; - - bio_list_init(&rrpc->requeue_bios); - spin_lock_init(&rrpc->bio_lock); - INIT_WORK(&rrpc->ws_requeue, rrpc_requeue); - - rrpc->nr_luns = geo->nr_luns; - rrpc->nr_sects = (unsigned long long)geo->sec_per_lun * rrpc->nr_luns; - - /* simple round-robin strategy */ - atomic_set(&rrpc->next_lun, -1); - - ret = rrpc_area_init(rrpc, &soffset); - if (ret < 0) { - pr_err("nvm: rrpc: could not initialize area\n"); - return ERR_PTR(ret); - } - rrpc->soffset = soffset; - - ret = rrpc_luns_init(rrpc, dev->luns); - if (ret) { - pr_err("nvm: rrpc: could not initialize luns\n"); - goto err; - } - - ret = rrpc_core_init(rrpc); - if (ret) { - pr_err("nvm: rrpc: could not initialize core\n"); - goto err; - } - - ret = rrpc_map_init(rrpc); - if (ret) { - pr_err("nvm: rrpc: could not initialize maps\n"); - goto err; - } - - ret = rrpc_blocks_init(rrpc); - if (ret) { - pr_err("nvm: rrpc: could not initialize state for blocks\n"); - goto err; - } - - ret = rrpc_luns_configure(rrpc); - if (ret) { - pr_err("nvm: rrpc: not enough blocks available in LUNs.\n"); - goto err; - } - - ret = rrpc_gc_init(rrpc); - if (ret) { - pr_err("nvm: rrpc: could not initialize gc\n"); - goto err; - } - - /* inherit the size from the underlying device */ - blk_queue_logical_block_size(tqueue, queue_physical_block_size(bqueue)); - blk_queue_max_hw_sectors(tqueue, queue_max_hw_sectors(bqueue)); - - pr_info("nvm: rrpc initialized with %u luns and %llu pages.\n", - rrpc->nr_luns, (unsigned long long)rrpc->nr_sects); - - mod_timer(&rrpc->gc_timer, jiffies + msecs_to_jiffies(10)); - - return rrpc; -err: - rrpc_free(rrpc); - return ERR_PTR(ret); -} - -/* round robin, page-based FTL, and cost-based GC */ -static struct nvm_tgt_type tt_rrpc = { - .name = "rrpc", - .version = {1, 0, 0}, - - .make_rq = rrpc_make_rq, - .capacity = rrpc_capacity, - - .init = rrpc_init, - .exit = rrpc_exit, -}; - -static int __init rrpc_module_init(void) -{ - return nvm_register_tgt_type(&tt_rrpc); -} - -static void rrpc_module_exit(void) -{ - nvm_unregister_tgt_type(&tt_rrpc); -} - -module_init(rrpc_module_init); -module_exit(rrpc_module_exit); -MODULE_LICENSE("GPL v2"); -MODULE_DESCRIPTION("Block-Device Target for Open-Channel SSDs"); diff --git a/drivers/lightnvm/rrpc.h b/drivers/lightnvm/rrpc.h deleted file mode 100644 index fdb6ff9..0000000 --- a/drivers/lightnvm/rrpc.h +++ /dev/null @@ -1,290 +0,0 @@ -/* - * Copyright (C) 2015 IT University of Copenhagen - * Initial release: Matias Bjorling <m@bjorling.me> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * Implementation of a Round-robin page-based Hybrid FTL for Open-channel SSDs. - */ - -#ifndef RRPC_H_ -#define RRPC_H_ - -#include <linux/blkdev.h> -#include <linux/blk-mq.h> -#include <linux/bio.h> -#include <linux/module.h> -#include <linux/kthread.h> -#include <linux/vmalloc.h> - -#include <linux/lightnvm.h> - -/* Run only GC if less than 1/X blocks are free */ -#define GC_LIMIT_INVERSE 10 -#define GC_TIME_SECS 100 - -#define RRPC_SECTOR (512) -#define RRPC_EXPOSED_PAGE_SIZE (4096) - -#define NR_PHY_IN_LOG (RRPC_EXPOSED_PAGE_SIZE / RRPC_SECTOR) - -struct rrpc_inflight { - struct list_head reqs; - spinlock_t lock; -}; - -struct rrpc_inflight_rq { - struct list_head list; - sector_t l_start; - sector_t l_end; -}; - -struct rrpc_rq { - struct rrpc_inflight_rq inflight_rq; - unsigned long flags; -}; - -struct rrpc_block { - int id; /* id inside of LUN */ - struct rrpc_lun *rlun; - - struct list_head prio; /* LUN CG list */ - struct list_head list; /* LUN free, used, bb list */ - -#define MAX_INVALID_PAGES_STORAGE 8 - /* Bitmap for invalid page intries */ - unsigned long invalid_pages[MAX_INVALID_PAGES_STORAGE]; - /* points to the next writable page within a block */ - unsigned int next_page; - /* number of pages that are invalid, wrt host page size */ - unsigned int nr_invalid_pages; - - int state; - - spinlock_t lock; - atomic_t data_cmnt_size; /* data pages committed to stable storage */ -}; - -struct rrpc_lun { - struct rrpc *rrpc; - - int id; - struct ppa_addr bppa; - - struct rrpc_block *cur, *gc_cur; - struct rrpc_block *blocks; /* Reference to block allocation */ - - struct list_head prio_list; /* Blocks that may be GC'ed */ - struct list_head wblk_list; /* Queued blocks to be written to */ - - /* lun block lists */ - struct list_head used_list; /* In-use blocks */ - struct list_head free_list; /* Not used blocks i.e. released - * and ready for use - */ - struct list_head bb_list; /* Bad blocks. Mutually exclusive with - * free_list and used_list - */ - unsigned int nr_free_blocks; /* Number of unused blocks */ - - struct work_struct ws_gc; - - int reserved_blocks; - - spinlock_t lock; -}; - -struct rrpc { - struct nvm_tgt_dev *dev; - struct gendisk *disk; - - sector_t soffset; /* logical sector offset */ - - int nr_luns; - struct rrpc_lun *luns; - - /* calculated values */ - unsigned long long nr_sects; - - /* Write strategy variables. Move these into each for structure for each - * strategy - */ - atomic_t next_lun; /* Whenever a page is written, this is updated - * to point to the next write lun - */ - - spinlock_t bio_lock; - struct bio_list requeue_bios; - struct work_struct ws_requeue; - - /* Simple translation map of logical addresses to physical addresses. - * The logical addresses is known by the host system, while the physical - * addresses are used when writing to the disk block device. - */ - struct rrpc_addr *trans_map; - /* also store a reverse map for garbage collection */ - struct rrpc_rev_addr *rev_trans_map; - spinlock_t rev_lock; - - struct rrpc_inflight inflights; - - mempool_t *addr_pool; - mempool_t *page_pool; - mempool_t *gcb_pool; - mempool_t *rq_pool; - - struct timer_list gc_timer; - struct workqueue_struct *krqd_wq; - struct workqueue_struct *kgc_wq; -}; - -struct rrpc_block_gc { - struct rrpc *rrpc; - struct rrpc_block *rblk; - struct work_struct ws_gc; -}; - -/* Logical to physical mapping */ -struct rrpc_addr { - u64 addr; - struct rrpc_block *rblk; -}; - -/* Physical to logical mapping */ -struct rrpc_rev_addr { - u64 addr; -}; - -static inline struct ppa_addr rrpc_linear_to_generic_addr(struct nvm_geo *geo, - struct ppa_addr r) -{ - struct ppa_addr l; - int secs, pgs; - sector_t ppa = r.ppa; - - l.ppa = 0; - - div_u64_rem(ppa, geo->sec_per_pg, &secs); - l.g.sec = secs; - - sector_div(ppa, geo->sec_per_pg); - div_u64_rem(ppa, geo->pgs_per_blk, &pgs); - l.g.pg = pgs; - - return l; -} - -static inline struct ppa_addr rrpc_recov_addr(struct nvm_tgt_dev *dev, u64 pba) -{ - return linear_to_generic_addr(&dev->geo, pba); -} - -static inline u64 rrpc_blk_to_ppa(struct rrpc *rrpc, struct rrpc_block *rblk) -{ - struct nvm_tgt_dev *dev = rrpc->dev; - struct nvm_geo *geo = &dev->geo; - struct rrpc_lun *rlun = rblk->rlun; - - return (rlun->id * geo->sec_per_lun) + (rblk->id * geo->sec_per_blk); -} - -static inline sector_t rrpc_get_laddr(struct bio *bio) -{ - return bio->bi_iter.bi_sector / NR_PHY_IN_LOG; -} - -static inline unsigned int rrpc_get_pages(struct bio *bio) -{ - return bio->bi_iter.bi_size / RRPC_EXPOSED_PAGE_SIZE; -} - -static inline sector_t rrpc_get_sector(sector_t laddr) -{ - return laddr * NR_PHY_IN_LOG; -} - -static inline int request_intersects(struct rrpc_inflight_rq *r, - sector_t laddr_start, sector_t laddr_end) -{ - return (laddr_end >= r->l_start) && (laddr_start <= r->l_end); -} - -static int __rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr, - unsigned int pages, struct rrpc_inflight_rq *r) -{ - sector_t laddr_end = laddr + pages - 1; - struct rrpc_inflight_rq *rtmp; - - WARN_ON(irqs_disabled()); - - spin_lock_irq(&rrpc->inflights.lock); - list_for_each_entry(rtmp, &rrpc->inflights.reqs, list) { - if (unlikely(request_intersects(rtmp, laddr, laddr_end))) { - /* existing, overlapping request, come back later */ - spin_unlock_irq(&rrpc->inflights.lock); - return 1; - } - } - - r->l_start = laddr; - r->l_end = laddr_end; - - list_add_tail(&r->list, &rrpc->inflights.reqs); - spin_unlock_irq(&rrpc->inflights.lock); - return 0; -} - -static inline int rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr, - unsigned int pages, - struct rrpc_inflight_rq *r) -{ - BUG_ON((laddr + pages) > rrpc->nr_sects); - - return __rrpc_lock_laddr(rrpc, laddr, pages, r); -} - -static inline struct rrpc_inflight_rq *rrpc_get_inflight_rq(struct nvm_rq *rqd) -{ - struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd); - - return &rrqd->inflight_rq; -} - -static inline int rrpc_lock_rq(struct rrpc *rrpc, struct bio *bio, - struct nvm_rq *rqd) -{ - sector_t laddr = rrpc_get_laddr(bio); - unsigned int pages = rrpc_get_pages(bio); - struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd); - - return rrpc_lock_laddr(rrpc, laddr, pages, r); -} - -static inline void rrpc_unlock_laddr(struct rrpc *rrpc, - struct rrpc_inflight_rq *r) -{ - unsigned long flags; - - spin_lock_irqsave(&rrpc->inflights.lock, flags); - list_del_init(&r->list); - spin_unlock_irqrestore(&rrpc->inflights.lock, flags); -} - -static inline void rrpc_unlock_rq(struct rrpc *rrpc, struct nvm_rq *rqd) -{ - struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd); - uint8_t pages = rqd->nr_ppas; - - BUG_ON((r->l_start + pages) > rrpc->nr_sects); - - rrpc_unlock_laddr(rrpc, r); -} - -#endif /* RRPC_H_ */ diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index a0cc1bc..6cc6c0f 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -525,15 +525,21 @@ struct open_bucket { /* * We keep multiple buckets open for writes, and try to segregate different - * write streams for better cache utilization: first we look for a bucket where - * the last write to it was sequential with the current write, and failing that - * we look for a bucket that was last used by the same task. + * write streams for better cache utilization: first we try to segregate flash + * only volume write streams from cached devices, secondly we look for a bucket + * where the last write to it was sequential with the current write, and + * failing that we look for a bucket that was last used by the same task. * * The ideas is if you've got multiple tasks pulling data into the cache at the * same time, you'll get better cache utilization if you try to segregate their * data and preserve locality. * - * For example, say you've starting Firefox at the same time you're copying a + * For example, dirty sectors of flash only volume is not reclaimable, if their + * dirty sectors mixed with dirty sectors of cached device, such buckets will + * be marked as dirty and won't be reclaimed, though the dirty data of cached + * device have been written back to backend device. + * + * And say you've starting Firefox at the same time you're copying a * bunch of files. Firefox will likely end up being fairly hot and stay in the * cache awhile, but the data you copied might not be; if you wrote all that * data to the same buckets it'd get invalidated at the same time. @@ -550,7 +556,10 @@ static struct open_bucket *pick_data_bucket(struct cache_set *c, struct open_bucket *ret, *ret_task = NULL; list_for_each_entry_reverse(ret, &c->data_buckets, list) - if (!bkey_cmp(&ret->key, search)) + if (UUID_FLASH_ONLY(&c->uuids[KEY_INODE(&ret->key)]) != + UUID_FLASH_ONLY(&c->uuids[KEY_INODE(search)])) + continue; + else if (!bkey_cmp(&ret->key, search)) goto found; else if (ret->last_write_point == write_point) ret_task = ret; diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 843877e..5e2d4e8 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -320,14 +320,15 @@ struct cached_dev { */ atomic_t has_dirty; - struct bch_ratelimit writeback_rate; - struct delayed_work writeback_rate_update; - /* - * Internal to the writeback code, so read_dirty() can keep track of - * where it's at. + * Set to zero by things that touch the backing volume-- except + * writeback. Incremented by writeback. Used to determine when to + * accelerate idle writeback. */ - sector_t last_read; + atomic_t backing_idle; + + struct bch_ratelimit writeback_rate; + struct delayed_work writeback_rate_update; /* Limit number of writeback bios in flight */ struct semaphore in_flight; @@ -336,6 +337,14 @@ struct cached_dev { struct keybuf writeback_keys; + /* + * Order the write-half of writeback operations strongly in dispatch + * order. (Maintain LBA order; don't allow reads completing out of + * order to re-order the writes...) + */ + struct closure_waitlist writeback_ordering_wait; + atomic_t writeback_sequence_next; + /* For tracking sequential IO */ #define RECENT_IO_BITS 7 #define RECENT_IO (1 << RECENT_IO_BITS) @@ -488,6 +497,7 @@ struct cache_set { int caches_loaded; struct bcache_device **devices; + unsigned devices_max_used; struct list_head cached_devs; uint64_t cached_dev_sectors; struct closure caching; @@ -852,7 +862,7 @@ static inline void wake_up_allocators(struct cache_set *c) /* Forward declarations */ -void bch_count_io_errors(struct cache *, blk_status_t, const char *); +void bch_count_io_errors(struct cache *, blk_status_t, int, const char *); void bch_bbio_count_io_errors(struct cache_set *, struct bio *, blk_status_t, const char *); void bch_bbio_endio(struct cache_set *, struct bio *, blk_status_t, diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 81e8dc3..bf3a48a 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -419,7 +419,7 @@ static void do_btree_node_write(struct btree *b) SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_sector_offset(&b->keys, i)); - if (!bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { + if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { int j; struct bio_vec *bv; void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); @@ -432,6 +432,7 @@ static void do_btree_node_write(struct btree *b) continue_at(cl, btree_node_write_done, NULL); } else { + /* No problem for multipage bvec since the bio is just allocated */ b->bio->bi_vcnt = 0; bch_bio_map(b->bio, i); @@ -1678,7 +1679,7 @@ static void bch_btree_gc_finish(struct cache_set *c) /* don't reclaim buckets to which writeback keys point */ rcu_read_lock(); - for (i = 0; i < c->nr_uuids; i++) { + for (i = 0; i < c->devices_max_used; i++) { struct bcache_device *d = c->devices[i]; struct cached_dev *dc; struct keybuf_key *w, *n; @@ -1803,10 +1804,7 @@ static int bch_gc_thread(void *arg) int bch_gc_thread_start(struct cache_set *c) { c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc"); - if (IS_ERR(c->gc_thread)) - return PTR_ERR(c->gc_thread); - - return 0; + return PTR_ERR_OR_ZERO(c->gc_thread); } /* Initial partial gc */ diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index 1841d03..7f12920 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -8,6 +8,7 @@ #include <linux/debugfs.h> #include <linux/module.h> #include <linux/seq_file.h> +#include <linux/sched/debug.h> #include "closure.h" @@ -18,10 +19,6 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) BUG_ON(flags & CLOSURE_GUARD_MASK); BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); - /* Must deliver precisely one wakeup */ - if (r == 1 && (flags & CLOSURE_SLEEPING)) - wake_up_process(cl->task); - if (!r) { if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { atomic_set(&cl->remaining, @@ -100,28 +97,34 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) } EXPORT_SYMBOL(closure_wait); -/** - * closure_sync - sleep until a closure has nothing left to wait on - * - * Sleeps until the refcount hits 1 - the thread that's running the closure owns - * the last refcount. - */ -void closure_sync(struct closure *cl) +struct closure_syncer { + struct task_struct *task; + int done; +}; + +static void closure_sync_fn(struct closure *cl) { - while (1) { - __closure_start_sleep(cl); - closure_set_ret_ip(cl); + cl->s->done = 1; + wake_up_process(cl->s->task); +} - if ((atomic_read(&cl->remaining) & - CLOSURE_REMAINING_MASK) == 1) - break; +void __sched __closure_sync(struct closure *cl) +{ + struct closure_syncer s = { .task = current }; + cl->s = &s; + continue_at(cl, closure_sync_fn, NULL); + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (s.done) + break; schedule(); } - __closure_end_sleep(cl); + __set_current_state(TASK_RUNNING); } -EXPORT_SYMBOL(closure_sync); +EXPORT_SYMBOL(__closure_sync); #ifdef CONFIG_BCACHE_CLOSURES_DEBUG @@ -168,12 +171,10 @@ static int debug_seq_show(struct seq_file *f, void *data) cl, (void *) cl->ip, cl->fn, cl->parent, r & CLOSURE_REMAINING_MASK); - seq_printf(f, "%s%s%s%s\n", + seq_printf(f, "%s%s\n", test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&cl->work)) ? "Q" : "", - r & CLOSURE_RUNNING ? "R" : "", - r & CLOSURE_STACK ? "S" : "", - r & CLOSURE_SLEEPING ? "Sl" : ""); + r & CLOSURE_RUNNING ? "R" : ""); if (r & CLOSURE_WAITING) seq_printf(f, " W %pF\n", diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index ccfbea6..3b9dfc9 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -103,6 +103,7 @@ */ struct closure; +struct closure_syncer; typedef void (closure_fn) (struct closure *); struct closure_waitlist { @@ -115,10 +116,6 @@ enum closure_state { * the thread that owns the closure, and cleared by the thread that's * waking up the closure. * - * CLOSURE_SLEEPING: Must be set before a thread uses a closure to sleep - * - indicates that cl->task is valid and closure_put() may wake it up. - * Only set or cleared by the thread that owns the closure. - * * The rest are for debugging and don't affect behaviour: * * CLOSURE_RUNNING: Set when a closure is running (i.e. by @@ -128,22 +125,16 @@ enum closure_state { * continue_at() and closure_return() clear it for you, if you're doing * something unusual you can use closure_set_dead() which also helps * annotate where references are being transferred. - * - * CLOSURE_STACK: Sanity check - remaining should never hit 0 on a - * closure with this flag set */ - CLOSURE_BITS_START = (1 << 23), - CLOSURE_DESTRUCTOR = (1 << 23), - CLOSURE_WAITING = (1 << 25), - CLOSURE_SLEEPING = (1 << 27), - CLOSURE_RUNNING = (1 << 29), - CLOSURE_STACK = (1 << 31), + CLOSURE_BITS_START = (1U << 26), + CLOSURE_DESTRUCTOR = (1U << 26), + CLOSURE_WAITING = (1U << 28), + CLOSURE_RUNNING = (1U << 30), }; #define CLOSURE_GUARD_MASK \ - ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_SLEEPING| \ - CLOSURE_RUNNING|CLOSURE_STACK) << 1) + ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1) #define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) #define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) @@ -152,7 +143,7 @@ struct closure { union { struct { struct workqueue_struct *wq; - struct task_struct *task; + struct closure_syncer *s; struct llist_node list; closure_fn *fn; }; @@ -178,7 +169,19 @@ void closure_sub(struct closure *cl, int v); void closure_put(struct closure *cl); void __closure_wake_up(struct closure_waitlist *list); bool closure_wait(struct closure_waitlist *list, struct closure *cl); -void closure_sync(struct closure *cl); +void __closure_sync(struct closure *cl); + +/** + * closure_sync - sleep until a closure a closure has nothing left to wait on + * + * Sleeps until the refcount hits 1 - the thread that's running the closure owns + * the last refcount. + */ +static inline void closure_sync(struct closure *cl) +{ + if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1) + __closure_sync(cl); +} #ifdef CONFIG_BCACHE_CLOSURES_DEBUG @@ -215,24 +218,6 @@ static inline void closure_set_waiting(struct closure *cl, unsigned long f) #endif } -static inline void __closure_end_sleep(struct closure *cl) -{ - __set_current_state(TASK_RUNNING); - - if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING) - atomic_sub(CLOSURE_SLEEPING, &cl->remaining); -} - -static inline void __closure_start_sleep(struct closure *cl) -{ - closure_set_ip(cl); - cl->task = current; - set_current_state(TASK_UNINTERRUPTIBLE); - - if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING)) - atomic_add(CLOSURE_SLEEPING, &cl->remaining); -} - static inline void closure_set_stopped(struct closure *cl) { atomic_sub(CLOSURE_RUNNING, &cl->remaining); @@ -241,7 +226,6 @@ static inline void closure_set_stopped(struct closure *cl) static inline void set_closure_fn(struct closure *cl, closure_fn *fn, struct workqueue_struct *wq) { - BUG_ON(object_is_on_stack(cl)); closure_set_ip(cl); cl->fn = fn; cl->wq = wq; @@ -300,7 +284,7 @@ static inline void closure_init(struct closure *cl, struct closure *parent) static inline void closure_init_stack(struct closure *cl) { memset(cl, 0, sizeof(struct closure)); - atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK); + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); } /** @@ -322,6 +306,8 @@ static inline void closure_wake_up(struct closure_waitlist *list) * This is because after calling continue_at() you no longer have a ref on @cl, * and whatever @cl owns may be freed out from under you - a running closure fn * has a ref on its own closure which continue_at() drops. + * + * Note you are expected to immediately return after using this macro. */ #define continue_at(_cl, _fn, _wq) \ do { \ diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index c7a02c4..af89408 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -116,7 +116,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) return; check->bi_opf = REQ_OP_READ; - if (bio_alloc_pages(check, GFP_NOIO)) + if (bch_bio_alloc_pages(check, GFP_NOIO)) goto out_put; submit_bio_wait(check); @@ -251,8 +251,7 @@ void bch_debug_exit(void) int __init bch_debug_init(struct kobject *kobj) { - int ret = 0; - debug = debugfs_create_dir("bcache", NULL); - return ret; + + return IS_ERR_OR_NULL(debug); } diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index fac97ec..a783c5a 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -51,7 +51,10 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c, /* IO errors */ -void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m) +void bch_count_io_errors(struct cache *ca, + blk_status_t error, + int is_read, + const char *m) { /* * The halflife of an error is: @@ -94,8 +97,9 @@ void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m) errors >>= IO_ERROR_SHIFT; if (errors < ca->set->error_limit) - pr_err("%s: IO error on %s, recovering", - bdevname(ca->bdev, buf), m); + pr_err("%s: IO error on %s%s", + bdevname(ca->bdev, buf), m, + is_read ? ", recovering." : "."); else bch_cache_set_error(ca->set, "%s: too many IO errors %s", @@ -108,6 +112,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, { struct bbio *b = container_of(bio, struct bbio, bio); struct cache *ca = PTR_CACHE(c, &b->key, 0); + int is_read = (bio_data_dir(bio) == READ ? 1 : 0); unsigned threshold = op_is_write(bio_op(bio)) ? c->congested_write_threshold_us @@ -129,7 +134,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, atomic_inc(&c->congested); } - bch_count_io_errors(ca, error, m); + bch_count_io_errors(ca, error, is_read, m); } void bch_bbio_endio(struct cache_set *c, struct bio *bio, diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index d50c1c9..a24c3a9 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -162,7 +162,7 @@ static void read_moving(struct cache_set *c) bio_set_op_attrs(bio, REQ_OP_READ, 0); bio->bi_end_io = read_moving_endio; - if (bio_alloc_pages(bio, GFP_KERNEL)) + if (bch_bio_alloc_pages(bio, GFP_KERNEL)) goto err; trace_bcache_gc_copy(&w->key); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 643c3021..1a46b41 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -576,6 +576,7 @@ static void cache_lookup(struct closure *cl) { struct search *s = container_of(cl, struct search, iop.cl); struct bio *bio = &s->bio.bio; + struct cached_dev *dc; int ret; bch_btree_op_init(&s->op, -1); @@ -588,6 +589,27 @@ static void cache_lookup(struct closure *cl) return; } + /* + * We might meet err when searching the btree, If that happens, we will + * get negative ret, in this scenario we should not recover data from + * backing device (when cache device is dirty) because we don't know + * whether bkeys the read request covered are all clean. + * + * And after that happened, s->iop.status is still its initial value + * before we submit s->bio.bio + */ + if (ret < 0) { + BUG_ON(ret == -EINTR); + if (s->d && s->d->c && + !UUID_FLASH_ONLY(&s->d->c->uuids[s->d->id])) { + dc = container_of(s->d, struct cached_dev, disk); + if (dc && atomic_read(&dc->has_dirty)) + s->recoverable = false; + } + if (!s->iop.status) + s->iop.status = BLK_STS_IOERR; + } + closure_return(cl); } @@ -611,8 +633,8 @@ static void request_endio(struct bio *bio) static void bio_complete(struct search *s) { if (s->orig_bio) { - struct request_queue *q = s->orig_bio->bi_disk->queue; - generic_end_io_acct(q, bio_data_dir(s->orig_bio), + generic_end_io_acct(s->d->disk->queue, + bio_data_dir(s->orig_bio), &s->d->disk->part0, s->start_time); trace_bcache_request_end(s->d, s->orig_bio); @@ -841,7 +863,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, cache_bio->bi_private = &s->cl; bch_bio_map(cache_bio, NULL); - if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO)) + if (bch_bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO)) goto out_put; if (reada) @@ -974,6 +996,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, struct cached_dev *dc = container_of(d, struct cached_dev, disk); int rw = bio_data_dir(bio); + atomic_set(&dc->backing_idle, 0); generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); bio_set_dev(bio, dc->bdev); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index b4d2892..133b812 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -211,7 +211,7 @@ static void write_bdev_super_endio(struct bio *bio) static void __write_super(struct cache_sb *sb, struct bio *bio) { - struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page); + struct cache_sb *out = page_address(bio_first_page_all(bio)); unsigned i; bio->bi_iter.bi_sector = SB_SECTOR; @@ -274,7 +274,9 @@ static void write_super_endio(struct bio *bio) { struct cache *ca = bio->bi_private; - bch_count_io_errors(ca, bio->bi_status, "writing superblock"); + /* is_read = 0 */ + bch_count_io_errors(ca, bio->bi_status, 0, + "writing superblock"); closure_put(&ca->set->sb_write); } @@ -721,6 +723,9 @@ static void bcache_device_attach(struct bcache_device *d, struct cache_set *c, d->c = c; c->devices[id] = d; + if (id >= c->devices_max_used) + c->devices_max_used = id + 1; + closure_get(&c->caching); } @@ -906,6 +911,12 @@ static void cached_dev_detach_finish(struct work_struct *w) mutex_lock(&bch_register_lock); + cancel_delayed_work_sync(&dc->writeback_rate_update); + if (!IS_ERR_OR_NULL(dc->writeback_thread)) { + kthread_stop(dc->writeback_thread); + dc->writeback_thread = NULL; + } + memset(&dc->sb.set_uuid, 0, 16); SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE); @@ -1166,7 +1177,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page, dc->bdev->bd_holder = dc; bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1); - dc->sb_bio.bi_io_vec[0].bv_page = sb_page; + bio_first_bvec_all(&dc->sb_bio)->bv_page = sb_page; get_page(sb_page); if (cached_dev_init(dc, sb->block_size << 9)) @@ -1261,7 +1272,7 @@ static int flash_devs_run(struct cache_set *c) struct uuid_entry *u; for (u = c->uuids; - u < c->uuids + c->nr_uuids && !ret; + u < c->uuids + c->devices_max_used && !ret; u++) if (UUID_FLASH_ONLY(u)) ret = flash_dev_run(c, u); @@ -1427,7 +1438,7 @@ static void __cache_set_unregister(struct closure *cl) mutex_lock(&bch_register_lock); - for (i = 0; i < c->nr_uuids; i++) + for (i = 0; i < c->devices_max_used; i++) if (c->devices[i]) { if (!UUID_FLASH_ONLY(&c->uuids[i]) && test_bit(CACHE_SET_UNREGISTERING, &c->flags)) { @@ -1490,7 +1501,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->bucket_bits = ilog2(sb->bucket_size); c->block_bits = ilog2(sb->block_size); c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); - + c->devices_max_used = 0; c->btree_pages = bucket_pages(c); if (c->btree_pages > BTREE_MAX_PAGES) c->btree_pages = max_t(int, c->btree_pages / 4, @@ -1810,7 +1821,7 @@ void bch_cache_release(struct kobject *kobj) free_fifo(&ca->free[i]); if (ca->sb_bio.bi_inline_vecs[0].bv_page) - put_page(ca->sb_bio.bi_io_vec[0].bv_page); + put_page(bio_first_page_all(&ca->sb_bio)); if (!IS_ERR_OR_NULL(ca->bdev)) blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); @@ -1864,7 +1875,7 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, ca->bdev->bd_holder = ca; bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1); - ca->sb_bio.bi_io_vec[0].bv_page = sb_page; + bio_first_bvec_all(&ca->sb_bio)->bv_page = sb_page; get_page(sb_page); if (blk_queue_discard(bdev_get_queue(ca->bdev))) diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index e548b8b..a23cd6a 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -249,6 +249,13 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) : 0; } +/* + * Generally it isn't good to access .bi_io_vec and .bi_vcnt directly, + * the preferred way is bio_add_page, but in this case, bch_bio_map() + * supposes that the bvec table is empty, so it is safe to access + * .bi_vcnt & .bi_io_vec in this way even after multipage bvec is + * supported. + */ void bch_bio_map(struct bio *bio, void *base) { size_t size = bio->bi_iter.bi_size; @@ -276,6 +283,33 @@ start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset, } } +/** + * bch_bio_alloc_pages - allocates a single page for each bvec in a bio + * @bio: bio to allocate pages for + * @gfp_mask: flags for allocation + * + * Allocates pages up to @bio->bi_vcnt. + * + * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are + * freed. + */ +int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) +{ + int i; + struct bio_vec *bv; + + bio_for_each_segment_all(bv, bio, i) { + bv->bv_page = alloc_page(gfp_mask); + if (!bv->bv_page) { + while (--bv >= bio->bi_io_vec) + __free_page(bv->bv_page); + return -ENOMEM; + } + } + + return 0; +} + /* * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any * use permitted, subject to terms of PostgreSQL license; see.) diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index ed5e8a4..4df4c5c 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -558,6 +558,7 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) } void bch_bio_map(struct bio *bio, void *base); +int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask); static inline sector_t bdev_sectors(struct block_device *bdev) { diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 56a3788..51306a1 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -18,17 +18,39 @@ #include <trace/events/bcache.h> /* Rate limiting */ - -static void __update_writeback_rate(struct cached_dev *dc) +static uint64_t __calc_target_rate(struct cached_dev *dc) { struct cache_set *c = dc->disk.c; + + /* + * This is the size of the cache, minus the amount used for + * flash-only devices + */ uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size - bcache_flash_devs_sectors_dirty(c); + + /* + * Unfortunately there is no control of global dirty data. If the + * user states that they want 10% dirty data in the cache, and has, + * e.g., 5 backing volumes of equal size, we try and ensure each + * backing volume uses about 2% of the cache for dirty data. + */ + uint32_t bdev_share = + div64_u64(bdev_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT, + c->cached_dev_sectors); + uint64_t cache_dirty_target = div_u64(cache_sectors * dc->writeback_percent, 100); - int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), - c->cached_dev_sectors); + /* Ensure each backing dev gets at least one dirty share */ + if (bdev_share < 1) + bdev_share = 1; + + return (cache_dirty_target * bdev_share) >> WRITEBACK_SHARE_SHIFT; +} + +static void __update_writeback_rate(struct cached_dev *dc) +{ /* * PI controller: * Figures out the amount that should be written per second. @@ -49,6 +71,7 @@ static void __update_writeback_rate(struct cached_dev *dc) * This acts as a slow, long-term average that is not subject to * variations in usage like the p term. */ + int64_t target = __calc_target_rate(dc); int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); int64_t error = dirty - target; int64_t proportional_scaled = @@ -116,6 +139,7 @@ static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) struct dirty_io { struct closure cl; struct cached_dev *dc; + uint16_t sequence; struct bio bio; }; @@ -194,6 +218,27 @@ static void write_dirty(struct closure *cl) { struct dirty_io *io = container_of(cl, struct dirty_io, cl); struct keybuf_key *w = io->bio.bi_private; + struct cached_dev *dc = io->dc; + + uint16_t next_sequence; + + if (atomic_read(&dc->writeback_sequence_next) != io->sequence) { + /* Not our turn to write; wait for a write to complete */ + closure_wait(&dc->writeback_ordering_wait, cl); + + if (atomic_read(&dc->writeback_sequence_next) == io->sequence) { + /* + * Edge case-- it happened in indeterminate order + * relative to when we were added to wait list.. + */ + closure_wake_up(&dc->writeback_ordering_wait); + } + + continue_at(cl, write_dirty, io->dc->writeback_write_wq); + return; + } + + next_sequence = io->sequence + 1; /* * IO errors are signalled using the dirty bit on the key. @@ -211,6 +256,9 @@ static void write_dirty(struct closure *cl) closure_bio_submit(&io->bio, cl); } + atomic_set(&dc->writeback_sequence_next, next_sequence); + closure_wake_up(&dc->writeback_ordering_wait); + continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); } @@ -219,8 +267,10 @@ static void read_dirty_endio(struct bio *bio) struct keybuf_key *w = bio->bi_private; struct dirty_io *io = w->private; + /* is_read = 1 */ bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), - bio->bi_status, "reading dirty data from cache"); + bio->bi_status, 1, + "reading dirty data from cache"); dirty_endio(bio); } @@ -237,10 +287,15 @@ static void read_dirty_submit(struct closure *cl) static void read_dirty(struct cached_dev *dc) { unsigned delay = 0; - struct keybuf_key *w; + struct keybuf_key *next, *keys[MAX_WRITEBACKS_IN_PASS], *w; + size_t size; + int nk, i; struct dirty_io *io; struct closure cl; + uint16_t sequence = 0; + BUG_ON(!llist_empty(&dc->writeback_ordering_wait.list)); + atomic_set(&dc->writeback_sequence_next, sequence); closure_init_stack(&cl); /* @@ -248,45 +303,109 @@ static void read_dirty(struct cached_dev *dc) * mempools. */ - while (!kthread_should_stop()) { - - w = bch_keybuf_next(&dc->writeback_keys); - if (!w) - break; - - BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); - - if (KEY_START(&w->key) != dc->last_read || - jiffies_to_msecs(delay) > 50) - while (!kthread_should_stop() && delay) - delay = schedule_timeout_interruptible(delay); - - dc->last_read = KEY_OFFSET(&w->key); - - io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec) - * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), - GFP_KERNEL); - if (!io) - goto err; - - w->private = io; - io->dc = dc; - - dirty_init(w); - bio_set_op_attrs(&io->bio, REQ_OP_READ, 0); - io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0); - bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev); - io->bio.bi_end_io = read_dirty_endio; - - if (bio_alloc_pages(&io->bio, GFP_KERNEL)) - goto err_free; + next = bch_keybuf_next(&dc->writeback_keys); + + while (!kthread_should_stop() && next) { + size = 0; + nk = 0; + + do { + BUG_ON(ptr_stale(dc->disk.c, &next->key, 0)); + + /* + * Don't combine too many operations, even if they + * are all small. + */ + if (nk >= MAX_WRITEBACKS_IN_PASS) + break; + + /* + * If the current operation is very large, don't + * further combine operations. + */ + if (size >= MAX_WRITESIZE_IN_PASS) + break; + + /* + * Operations are only eligible to be combined + * if they are contiguous. + * + * TODO: add a heuristic willing to fire a + * certain amount of non-contiguous IO per pass, + * so that we can benefit from backing device + * command queueing. + */ + if ((nk != 0) && bkey_cmp(&keys[nk-1]->key, + &START_KEY(&next->key))) + break; + + size += KEY_SIZE(&next->key); + keys[nk++] = next; + } while ((next = bch_keybuf_next(&dc->writeback_keys))); + + /* Now we have gathered a set of 1..5 keys to write back. */ + for (i = 0; i < nk; i++) { + w = keys[i]; + + io = kzalloc(sizeof(struct dirty_io) + + sizeof(struct bio_vec) * + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + GFP_KERNEL); + if (!io) + goto err; + + w->private = io; + io->dc = dc; + io->sequence = sequence++; + + dirty_init(w); + bio_set_op_attrs(&io->bio, REQ_OP_READ, 0); + io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0); + bio_set_dev(&io->bio, + PTR_CACHE(dc->disk.c, &w->key, 0)->bdev); + io->bio.bi_end_io = read_dirty_endio; + + if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) + goto err_free; + + trace_bcache_writeback(&w->key); + + down(&dc->in_flight); + + /* We've acquired a semaphore for the maximum + * simultaneous number of writebacks; from here + * everything happens asynchronously. + */ + closure_call(&io->cl, read_dirty_submit, NULL, &cl); + } - trace_bcache_writeback(&w->key); + delay = writeback_delay(dc, size); - down(&dc->in_flight); - closure_call(&io->cl, read_dirty_submit, NULL, &cl); + /* If the control system would wait for at least half a + * second, and there's been no reqs hitting the backing disk + * for awhile: use an alternate mode where we have at most + * one contiguous set of writebacks in flight at a time. If + * someone wants to do IO it will be quick, as it will only + * have to contend with one operation in flight, and we'll + * be round-tripping data to the backing disk as quickly as + * it can accept it. + */ + if (delay >= HZ / 2) { + /* 3 means at least 1.5 seconds, up to 7.5 if we + * have slowed way down. + */ + if (atomic_inc_return(&dc->backing_idle) >= 3) { + /* Wait for current I/Os to finish */ + closure_sync(&cl); + /* And immediately launch a new set. */ + delay = 0; + } + } - delay = writeback_delay(dc, KEY_SIZE(&w->key)); + while (!kthread_should_stop() && delay) { + schedule_timeout_interruptible(delay); + delay = writeback_delay(dc, 0); + } } if (0) { diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index a9e3ffb..66f1c52 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -5,6 +5,16 @@ #define CUTOFF_WRITEBACK 40 #define CUTOFF_WRITEBACK_SYNC 70 +#define MAX_WRITEBACKS_IN_PASS 5 +#define MAX_WRITESIZE_IN_PASS 5000 /* *512b */ + +/* + * 14 (16384ths) is chosen here as something that each backing device + * should be a reasonable fraction of the share, and not to blow up + * until individual backing devices are a petabyte. + */ +#define WRITEBACK_SHARE_SHIFT 14 + static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) { uint64_t i, ret = 0; @@ -21,7 +31,7 @@ static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c) mutex_lock(&bch_register_lock); - for (i = 0; i < c->nr_uuids; i++) { + for (i = 0; i < c->devices_max_used; i++) { struct bcache_device *d = c->devices[i]; if (!d || !UUID_FLASH_ONLY(&c->uuids[i])) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 554d603..2ad4291 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1446,7 +1446,6 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone) bio_for_each_segment_all(bv, clone, i) { BUG_ON(!bv->bv_page); mempool_free(bv->bv_page, cc->page_pool); - bv->bv_page = NULL; } } diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index f7810cc..ef57c6d 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1475,21 +1475,6 @@ static void activate_path_work(struct work_struct *work) activate_or_offline_path(pgpath); } -static int noretry_error(blk_status_t error) -{ - switch (error) { - case BLK_STS_NOTSUPP: - case BLK_STS_NOSPC: - case BLK_STS_TARGET: - case BLK_STS_NEXUS: - case BLK_STS_MEDIUM: - return 1; - } - - /* Anything else could be a path failure, so should be retried */ - return 0; -} - static int multipath_end_io(struct dm_target *ti, struct request *clone, blk_status_t error, union map_info *map_context) { @@ -1508,7 +1493,7 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, * request into dm core, which will remake a clone request and * clone bios for it and resubmit it later. */ - if (error && !noretry_error(error)) { + if (error && blk_path_error(error)) { struct multipath *m = ti->private; r = DM_ENDIO_REQUEUE; @@ -1544,7 +1529,7 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, unsigned long flags; int r = DM_ENDIO_DONE; - if (!*error || noretry_error(*error)) + if (!*error || !blk_path_error(*error)) goto done; if (pgpath) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 9d32f25..b7d175e 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -395,7 +395,7 @@ static void end_clone_request(struct request *clone, blk_status_t error) dm_complete_request(tio->orig, error); } -static void dm_dispatch_clone_request(struct request *clone, struct request *rq) +static blk_status_t dm_dispatch_clone_request(struct request *clone, struct request *rq) { blk_status_t r; @@ -404,9 +404,10 @@ static void dm_dispatch_clone_request(struct request *clone, struct request *rq) clone->start_time = jiffies; r = blk_insert_cloned_request(clone->q, clone); - if (r) + if (r != BLK_STS_OK && r != BLK_STS_RESOURCE) /* must complete clone in terms of original request */ dm_complete_request(rq, r); + return r; } static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, @@ -476,8 +477,10 @@ static int map_request(struct dm_rq_target_io *tio) struct mapped_device *md = tio->md; struct request *rq = tio->orig; struct request *clone = NULL; + blk_status_t ret; r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone); +check_again: switch (r) { case DM_MAPIO_SUBMITTED: /* The target has taken the I/O to submit by itself later */ @@ -492,7 +495,17 @@ static int map_request(struct dm_rq_target_io *tio) /* The target has remapped the I/O so dispatch it */ trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), blk_rq_pos(rq)); - dm_dispatch_clone_request(clone, rq); + ret = dm_dispatch_clone_request(clone, rq); + if (ret == BLK_STS_RESOURCE) { + blk_rq_unprep_clone(clone); + tio->ti->type->release_clone_rq(clone); + tio->clone = NULL; + if (!rq->q->mq_ops) + r = DM_MAPIO_DELAY_REQUEUE; + else + r = DM_MAPIO_REQUEUE; + goto check_again; + } break; case DM_MAPIO_REQUEUE: /* The target wants to requeue the I/O */ @@ -713,8 +726,6 @@ int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t) return error; } - elv_register_queue(md->queue); - return 0; } @@ -812,15 +823,8 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) } dm_init_md_queue(md); - /* backfill 'mq' sysfs registration normally done in blk_register_queue */ - err = blk_mq_register_dev(disk_to_dev(md->disk), q); - if (err) - goto out_cleanup_queue; - return 0; -out_cleanup_queue: - blk_cleanup_queue(q); out_tag_set: blk_mq_free_tag_set(md->tag_set); out_kfree_tag_set: diff --git a/drivers/md/dm.c b/drivers/md/dm.c index de17b71..8c26bfc 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -920,7 +920,15 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) return -EINVAL; } - ti->max_io_len = (uint32_t) len; + /* + * BIO based queue uses its own splitting. When multipage bvecs + * is switched on, size of the incoming bio may be too big to + * be handled in some targets, such as crypt. + * + * When these targets are ready for the big bio, we can remove + * the limit. + */ + ti->max_io_len = min_t(uint32_t, len, BIO_MAX_PAGES * PAGE_SIZE); return 0; } @@ -1753,7 +1761,7 @@ static struct mapped_device *alloc_dev(int minor) goto bad; md->dax_dev = dax_dev; - add_disk(md->disk); + add_disk_no_queue_reg(md->disk); format_dev_t(md->name, MKDEV(_major, minor)); md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0); @@ -2013,6 +2021,7 @@ EXPORT_SYMBOL_GPL(dm_get_queue_limits); int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) { int r; + struct queue_limits limits; enum dm_queue_mode type = dm_get_md_type(md); switch (type) { @@ -2049,6 +2058,14 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) break; } + r = dm_calculate_queue_limits(t, &limits); + if (r) { + DMERR("Cannot calculate initial queue limits"); + return r; + } + dm_table_set_restrictions(t, md->queue, &limits); + blk_register_queue(md->disk); + return 0; } diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index a25fd43..441e67e 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -1,4 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 + +ccflags-y += -I$(src) + obj-$(CONFIG_NVME_CORE) += nvme-core.o obj-$(CONFIG_BLK_DEV_NVME) += nvme.o obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o @@ -6,6 +9,7 @@ obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o obj-$(CONFIG_NVME_FC) += nvme-fc.o nvme-core-y := core.o +nvme-core-$(CONFIG_TRACING) += trace.o nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o nvme-core-$(CONFIG_NVM) += lightnvm.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 839650e..e810487 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -29,6 +29,9 @@ #include <linux/pm_qos.h> #include <asm/unaligned.h> +#define CREATE_TRACE_POINTS +#include "trace.h" + #include "nvme.h" #include "fabrics.h" @@ -65,9 +68,26 @@ static bool streams; module_param(streams, bool, 0644); MODULE_PARM_DESC(streams, "turn on support for Streams write directives"); +/* + * nvme_wq - hosts nvme related works that are not reset or delete + * nvme_reset_wq - hosts nvme reset works + * nvme_delete_wq - hosts nvme delete works + * + * nvme_wq will host works such are scan, aen handling, fw activation, + * keep-alive error recovery, periodic reconnects etc. nvme_reset_wq + * runs reset works which also flush works hosted on nvme_wq for + * serialization purposes. nvme_delete_wq host controller deletion + * works which flush reset works for serialization. + */ struct workqueue_struct *nvme_wq; EXPORT_SYMBOL_GPL(nvme_wq); +struct workqueue_struct *nvme_reset_wq; +EXPORT_SYMBOL_GPL(nvme_reset_wq); + +struct workqueue_struct *nvme_delete_wq; +EXPORT_SYMBOL_GPL(nvme_delete_wq); + static DEFINE_IDA(nvme_subsystems_ida); static LIST_HEAD(nvme_subsystems); static DEFINE_MUTEX(nvme_subsystems_lock); @@ -89,13 +109,13 @@ int nvme_reset_ctrl(struct nvme_ctrl *ctrl) { if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) return -EBUSY; - if (!queue_work(nvme_wq, &ctrl->reset_work)) + if (!queue_work(nvme_reset_wq, &ctrl->reset_work)) return -EBUSY; return 0; } EXPORT_SYMBOL_GPL(nvme_reset_ctrl); -static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) +int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) { int ret; @@ -104,6 +124,7 @@ static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) flush_work(&ctrl->reset_work); return ret; } +EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync); static void nvme_delete_ctrl_work(struct work_struct *work) { @@ -122,7 +143,7 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl) { if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) return -EBUSY; - if (!queue_work(nvme_wq, &ctrl->delete_work)) + if (!queue_work(nvme_delete_wq, &ctrl->delete_work)) return -EBUSY; return 0; } @@ -157,13 +178,20 @@ static blk_status_t nvme_error_status(struct request *req) return BLK_STS_OK; case NVME_SC_CAP_EXCEEDED: return BLK_STS_NOSPC; + case NVME_SC_LBA_RANGE: + return BLK_STS_TARGET; + case NVME_SC_BAD_ATTRIBUTES: case NVME_SC_ONCS_NOT_SUPPORTED: + case NVME_SC_INVALID_OPCODE: + case NVME_SC_INVALID_FIELD: + case NVME_SC_INVALID_NS: return BLK_STS_NOTSUPP; case NVME_SC_WRITE_FAULT: case NVME_SC_READ_ERROR: case NVME_SC_UNWRITTEN_BLOCK: case NVME_SC_ACCESS_DENIED: case NVME_SC_READ_ONLY: + case NVME_SC_COMPARE_FAILED: return BLK_STS_MEDIUM; case NVME_SC_GUARD_CHECK: case NVME_SC_APPTAG_CHECK: @@ -190,8 +218,12 @@ static inline bool nvme_req_needs_retry(struct request *req) void nvme_complete_rq(struct request *req) { - if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) { - if (nvme_req_needs_failover(req)) { + blk_status_t status = nvme_error_status(req); + + trace_nvme_complete_rq(req); + + if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) { + if (nvme_req_needs_failover(req, status)) { nvme_failover_req(req); return; } @@ -202,8 +234,7 @@ void nvme_complete_rq(struct request *req) return; } } - - blk_mq_end_request(req, nvme_error_status(req)); + blk_mq_end_request(req, status); } EXPORT_SYMBOL_GPL(nvme_complete_rq); @@ -232,6 +263,15 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, old_state = ctrl->state; switch (new_state) { + case NVME_CTRL_ADMIN_ONLY: + switch (old_state) { + case NVME_CTRL_RECONNECTING: + changed = true; + /* FALLTHRU */ + default: + break; + } + break; case NVME_CTRL_LIVE: switch (old_state) { case NVME_CTRL_NEW: @@ -247,6 +287,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, switch (old_state) { case NVME_CTRL_NEW: case NVME_CTRL_LIVE: + case NVME_CTRL_ADMIN_ONLY: changed = true; /* FALLTHRU */ default: @@ -266,6 +307,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, case NVME_CTRL_DELETING: switch (old_state) { case NVME_CTRL_LIVE: + case NVME_CTRL_ADMIN_ONLY: case NVME_CTRL_RESETTING: case NVME_CTRL_RECONNECTING: changed = true; @@ -591,6 +633,10 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, } cmd->common.command_id = req->tag; + if (ns) + trace_nvme_setup_nvm_cmd(req->q->id, cmd); + else + trace_nvme_setup_admin_cmd(cmd); return ret; } EXPORT_SYMBOL_GPL(nvme_setup_cmd); @@ -1217,16 +1263,27 @@ static int nvme_open(struct block_device *bdev, fmode_t mode) #ifdef CONFIG_NVME_MULTIPATH /* should never be called due to GENHD_FL_HIDDEN */ if (WARN_ON_ONCE(ns->head->disk)) - return -ENXIO; + goto fail; #endif if (!kref_get_unless_zero(&ns->kref)) - return -ENXIO; + goto fail; + if (!try_module_get(ns->ctrl->ops->module)) + goto fail_put_ns; + return 0; + +fail_put_ns: + nvme_put_ns(ns); +fail: + return -ENXIO; } static void nvme_release(struct gendisk *disk, fmode_t mode) { - nvme_put_ns(disk->private_data); + struct nvme_ns *ns = disk->private_data; + + module_put(ns->ctrl->ops->module); + nvme_put_ns(ns); } static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) @@ -2052,6 +2109,22 @@ static const struct attribute_group *nvme_subsys_attrs_groups[] = { NULL, }; +static int nvme_active_ctrls(struct nvme_subsystem *subsys) +{ + int count = 0; + struct nvme_ctrl *ctrl; + + mutex_lock(&subsys->lock); + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { + if (ctrl->state != NVME_CTRL_DELETING && + ctrl->state != NVME_CTRL_DEAD) + count++; + } + mutex_unlock(&subsys->lock); + + return count; +} + static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) { struct nvme_subsystem *subsys, *found; @@ -2090,7 +2163,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) * Verify that the subsystem actually supports multiple * controllers, else bail out. */ - if (!(id->cmic & (1 << 1))) { + if (nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) { dev_err(ctrl->device, "ignoring ctrl due to duplicate subnqn (%s).\n", found->subnqn); @@ -2257,7 +2330,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) shutdown_timeout, 60); if (ctrl->shutdown_timeout != shutdown_timeout) - dev_warn(ctrl->device, + dev_info(ctrl->device, "Shutdown timeout set to %u seconds\n", ctrl->shutdown_timeout); } else @@ -2341,8 +2414,14 @@ static int nvme_dev_open(struct inode *inode, struct file *file) struct nvme_ctrl *ctrl = container_of(inode->i_cdev, struct nvme_ctrl, cdev); - if (ctrl->state != NVME_CTRL_LIVE) + switch (ctrl->state) { + case NVME_CTRL_LIVE: + case NVME_CTRL_ADMIN_ONLY: + break; + default: return -EWOULDBLOCK; + } + file->private_data = ctrl; return 0; } @@ -2606,6 +2685,7 @@ static ssize_t nvme_sysfs_show_state(struct device *dev, static const char *const state_name[] = { [NVME_CTRL_NEW] = "new", [NVME_CTRL_LIVE] = "live", + [NVME_CTRL_ADMIN_ONLY] = "only-admin", [NVME_CTRL_RESETTING] = "resetting", [NVME_CTRL_RECONNECTING]= "reconnecting", [NVME_CTRL_DELETING] = "deleting", @@ -3079,6 +3159,8 @@ static void nvme_scan_work(struct work_struct *work) if (ctrl->state != NVME_CTRL_LIVE) return; + WARN_ON_ONCE(!ctrl->tagset); + if (nvme_identify_ctrl(ctrl, &id)) return; @@ -3099,8 +3181,7 @@ static void nvme_scan_work(struct work_struct *work) void nvme_queue_scan(struct nvme_ctrl *ctrl) { /* - * Do not queue new scan work when a controller is reset during - * removal. + * Only new queue scan work when admin and IO queues are both alive */ if (ctrl->state == NVME_CTRL_LIVE) queue_work(nvme_wq, &ctrl->scan_work); @@ -3477,16 +3558,26 @@ EXPORT_SYMBOL_GPL(nvme_reinit_tagset); int __init nvme_core_init(void) { - int result; + int result = -ENOMEM; nvme_wq = alloc_workqueue("nvme-wq", WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); if (!nvme_wq) - return -ENOMEM; + goto out; + + nvme_reset_wq = alloc_workqueue("nvme-reset-wq", + WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); + if (!nvme_reset_wq) + goto destroy_wq; + + nvme_delete_wq = alloc_workqueue("nvme-delete-wq", + WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); + if (!nvme_delete_wq) + goto destroy_reset_wq; result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme"); if (result < 0) - goto destroy_wq; + goto destroy_delete_wq; nvme_class = class_create(THIS_MODULE, "nvme"); if (IS_ERR(nvme_class)) { @@ -3505,8 +3596,13 @@ destroy_class: class_destroy(nvme_class); unregister_chrdev: unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); +destroy_delete_wq: + destroy_workqueue(nvme_delete_wq); +destroy_reset_wq: + destroy_workqueue(nvme_reset_wq); destroy_wq: destroy_workqueue(nvme_wq); +out: return result; } @@ -3516,6 +3612,8 @@ void nvme_core_exit(void) class_destroy(nvme_subsys_class); class_destroy(nvme_class); unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); + destroy_workqueue(nvme_delete_wq); + destroy_workqueue(nvme_reset_wq); destroy_workqueue(nvme_wq); } diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 894c2cc..5dd4cee 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -493,7 +493,7 @@ EXPORT_SYMBOL_GPL(nvmf_should_reconnect); */ int nvmf_register_transport(struct nvmf_transport_ops *ops) { - if (!ops->create_ctrl) + if (!ops->create_ctrl || !ops->module) return -EINVAL; down_write(&nvmf_transports_rwsem); @@ -739,11 +739,14 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, ret = -ENOMEM; goto out; } - if (uuid_parse(p, &hostid)) { + ret = uuid_parse(p, &hostid); + if (ret) { pr_err("Invalid hostid %s\n", p); ret = -EINVAL; + kfree(p); goto out; } + kfree(p); break; case NVMF_OPT_DUP_CONNECT: opts->duplicate_connect = true; @@ -869,32 +872,41 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) goto out_unlock; } + if (!try_module_get(ops->module)) { + ret = -EBUSY; + goto out_unlock; + } + ret = nvmf_check_required_opts(opts, ops->required_opts); if (ret) - goto out_unlock; + goto out_module_put; ret = nvmf_check_allowed_opts(opts, NVMF_ALLOWED_OPTS | ops->allowed_opts | ops->required_opts); if (ret) - goto out_unlock; + goto out_module_put; ctrl = ops->create_ctrl(dev, opts); if (IS_ERR(ctrl)) { ret = PTR_ERR(ctrl); - goto out_unlock; + goto out_module_put; } if (strcmp(ctrl->subsys->subnqn, opts->subsysnqn)) { dev_warn(ctrl->device, "controller returned incorrect NQN: \"%s\".\n", ctrl->subsys->subnqn); + module_put(ops->module); up_read(&nvmf_transports_rwsem); nvme_delete_ctrl_sync(ctrl); return ERR_PTR(-EINVAL); } + module_put(ops->module); up_read(&nvmf_transports_rwsem); return ctrl; +out_module_put: + module_put(ops->module); out_unlock: up_read(&nvmf_transports_rwsem); out_free_opts: diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 9ba6149..25b19f7 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -108,6 +108,7 @@ struct nvmf_ctrl_options { * fabric implementation of NVMe fabrics. * @entry: Used by the fabrics library to add the new * registration entry to its linked-list internal tree. + * @module: Transport module reference * @name: Name of the NVMe fabric driver implementation. * @required_opts: sysfs command-line options that must be specified * when adding a new NVMe controller. @@ -126,6 +127,7 @@ struct nvmf_ctrl_options { */ struct nvmf_transport_ops { struct list_head entry; + struct module *module; const char *name; int required_opts; int allowed_opts; diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 794e66e..99bf51c 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2921,6 +2921,9 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0); nvme_fc_free_queue(&ctrl->queues[0]); + /* re-enable the admin_q so anything new can fast fail */ + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_fc_ctlr_inactive_on_rport(ctrl); } @@ -2935,6 +2938,9 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) * waiting for io to terminate */ nvme_fc_delete_association(ctrl); + + /* resume the io queues so that things will fast fail */ + nvme_start_queues(nctrl); } static void @@ -3380,6 +3386,7 @@ nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) static struct nvmf_transport_ops nvme_fc_transport = { .name = "fc", + .module = THIS_MODULE, .required_opts = NVMF_OPT_TRADDR | NVMF_OPT_HOST_TRADDR, .allowed_opts = NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_CTRL_LOSS_TMO, .create_ctrl = nvme_fc_create_ctrl, diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index ba3d7f3..50ef71ee 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -31,27 +31,10 @@ enum nvme_nvm_admin_opcode { nvme_nvm_admin_identity = 0xe2, - nvme_nvm_admin_get_l2p_tbl = 0xea, nvme_nvm_admin_get_bb_tbl = 0xf2, nvme_nvm_admin_set_bb_tbl = 0xf1, }; -struct nvme_nvm_hb_rw { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd2; - __le64 metadata; - __le64 prp1; - __le64 prp2; - __le64 spba; - __le16 length; - __le16 control; - __le32 dsmgmt; - __le64 slba; -}; - struct nvme_nvm_ph_rw { __u8 opcode; __u8 flags; @@ -80,19 +63,6 @@ struct nvme_nvm_identity { __u32 rsvd11[5]; }; -struct nvme_nvm_l2ptbl { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __le32 cdw2[4]; - __le64 prp1; - __le64 prp2; - __le64 slba; - __le32 nlb; - __le16 cdw14[6]; -}; - struct nvme_nvm_getbbtbl { __u8 opcode; __u8 flags; @@ -139,9 +109,7 @@ struct nvme_nvm_command { union { struct nvme_common_command common; struct nvme_nvm_identity identity; - struct nvme_nvm_hb_rw hb_rw; struct nvme_nvm_ph_rw ph_rw; - struct nvme_nvm_l2ptbl l2p; struct nvme_nvm_getbbtbl get_bb; struct nvme_nvm_setbbtbl set_bb; struct nvme_nvm_erase_blk erase; @@ -167,7 +135,7 @@ struct nvme_nvm_id_group { __u8 num_lun; __u8 num_pln; __u8 rsvd1; - __le16 num_blk; + __le16 num_chk; __le16 num_pg; __le16 fpg_sz; __le16 csecs; @@ -234,11 +202,9 @@ struct nvme_nvm_bb_tbl { static inline void _nvme_nvm_check_size(void) { BUILD_BUG_ON(sizeof(struct nvme_nvm_identity) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_hb_rw) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_ph_rw) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_getbbtbl) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_setbbtbl) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_l2ptbl) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_id_group) != 960); BUILD_BUG_ON(sizeof(struct nvme_nvm_addr_format) != 16); @@ -249,51 +215,58 @@ static inline void _nvme_nvm_check_size(void) static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) { struct nvme_nvm_id_group *src; - struct nvm_id_group *dst; + struct nvm_id_group *grp; + int sec_per_pg, sec_per_pl, pg_per_blk; if (nvme_nvm_id->cgrps != 1) return -EINVAL; src = &nvme_nvm_id->groups[0]; - dst = &nvm_id->grp; - - dst->mtype = src->mtype; - dst->fmtype = src->fmtype; - dst->num_ch = src->num_ch; - dst->num_lun = src->num_lun; - dst->num_pln = src->num_pln; - - dst->num_pg = le16_to_cpu(src->num_pg); - dst->num_blk = le16_to_cpu(src->num_blk); - dst->fpg_sz = le16_to_cpu(src->fpg_sz); - dst->csecs = le16_to_cpu(src->csecs); - dst->sos = le16_to_cpu(src->sos); - - dst->trdt = le32_to_cpu(src->trdt); - dst->trdm = le32_to_cpu(src->trdm); - dst->tprt = le32_to_cpu(src->tprt); - dst->tprm = le32_to_cpu(src->tprm); - dst->tbet = le32_to_cpu(src->tbet); - dst->tbem = le32_to_cpu(src->tbem); - dst->mpos = le32_to_cpu(src->mpos); - dst->mccap = le32_to_cpu(src->mccap); - - dst->cpar = le16_to_cpu(src->cpar); - - if (dst->fmtype == NVM_ID_FMTYPE_MLC) { - memcpy(dst->lptbl.id, src->lptbl.id, 8); - dst->lptbl.mlc.num_pairs = - le16_to_cpu(src->lptbl.mlc.num_pairs); - - if (dst->lptbl.mlc.num_pairs > NVME_NVM_LP_MLC_PAIRS) { - pr_err("nvm: number of MLC pairs not supported\n"); - return -EINVAL; - } + grp = &nvm_id->grp; + + grp->mtype = src->mtype; + grp->fmtype = src->fmtype; + + grp->num_ch = src->num_ch; + grp->num_lun = src->num_lun; + + grp->num_chk = le16_to_cpu(src->num_chk); + grp->csecs = le16_to_cpu(src->csecs); + grp->sos = le16_to_cpu(src->sos); + + pg_per_blk = le16_to_cpu(src->num_pg); + sec_per_pg = le16_to_cpu(src->fpg_sz) / grp->csecs; + sec_per_pl = sec_per_pg * src->num_pln; + grp->clba = sec_per_pl * pg_per_blk; + grp->ws_per_chk = pg_per_blk; - memcpy(dst->lptbl.mlc.pairs, src->lptbl.mlc.pairs, - dst->lptbl.mlc.num_pairs); + grp->mpos = le32_to_cpu(src->mpos); + grp->cpar = le16_to_cpu(src->cpar); + grp->mccap = le32_to_cpu(src->mccap); + + grp->ws_opt = grp->ws_min = sec_per_pg; + grp->ws_seq = NVM_IO_SNGL_ACCESS; + + if (grp->mpos & 0x020202) { + grp->ws_seq = NVM_IO_DUAL_ACCESS; + grp->ws_opt <<= 1; + } else if (grp->mpos & 0x040404) { + grp->ws_seq = NVM_IO_QUAD_ACCESS; + grp->ws_opt <<= 2; } + grp->trdt = le32_to_cpu(src->trdt); + grp->trdm = le32_to_cpu(src->trdm); + grp->tprt = le32_to_cpu(src->tprt); + grp->tprm = le32_to_cpu(src->tprm); + grp->tbet = le32_to_cpu(src->tbet); + grp->tbem = le32_to_cpu(src->tbem); + + /* 1.2 compatibility */ + grp->num_pln = src->num_pln; + grp->num_pg = le16_to_cpu(src->num_pg); + grp->fpg_sz = le16_to_cpu(src->fpg_sz); + return 0; } @@ -332,62 +305,6 @@ out: return ret; } -static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb, - nvm_l2p_update_fn *update_l2p, void *priv) -{ - struct nvme_ns *ns = nvmdev->q->queuedata; - struct nvme_nvm_command c = {}; - u32 len = queue_max_hw_sectors(ns->ctrl->admin_q) << 9; - u32 nlb_pr_rq = len / sizeof(u64); - u64 cmd_slba = slba; - void *entries; - int ret = 0; - - c.l2p.opcode = nvme_nvm_admin_get_l2p_tbl; - c.l2p.nsid = cpu_to_le32(ns->head->ns_id); - entries = kmalloc(len, GFP_KERNEL); - if (!entries) - return -ENOMEM; - - while (nlb) { - u32 cmd_nlb = min(nlb_pr_rq, nlb); - u64 elba = slba + cmd_nlb; - - c.l2p.slba = cpu_to_le64(cmd_slba); - c.l2p.nlb = cpu_to_le32(cmd_nlb); - - ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, - (struct nvme_command *)&c, entries, len); - if (ret) { - dev_err(ns->ctrl->device, - "L2P table transfer failed (%d)\n", ret); - ret = -EIO; - goto out; - } - - if (unlikely(elba > nvmdev->total_secs)) { - pr_err("nvm: L2P data from device is out of bounds!\n"); - ret = -EINVAL; - goto out; - } - - /* Transform physical address to target address space */ - nvm_part_to_tgt(nvmdev, entries, cmd_nlb); - - if (update_l2p(cmd_slba, cmd_nlb, entries, priv)) { - ret = -EINTR; - goto out; - } - - cmd_slba += cmd_nlb; - nlb -= cmd_nlb; - } - -out: - kfree(entries); - return ret; -} - static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, u8 *blks) { @@ -397,7 +314,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, struct nvme_ctrl *ctrl = ns->ctrl; struct nvme_nvm_command c = {}; struct nvme_nvm_bb_tbl *bb_tbl; - int nr_blks = geo->blks_per_lun * geo->plane_mode; + int nr_blks = geo->nr_chks * geo->plane_mode; int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blks; int ret = 0; @@ -438,7 +355,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, goto out; } - memcpy(blks, bb_tbl->blk, geo->blks_per_lun * geo->plane_mode); + memcpy(blks, bb_tbl->blk, geo->nr_chks * geo->plane_mode); out: kfree(bb_tbl); return ret; @@ -474,10 +391,6 @@ static inline void nvme_nvm_rqtocmd(struct nvm_rq *rqd, struct nvme_ns *ns, c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list); c->ph_rw.control = cpu_to_le16(rqd->flags); c->ph_rw.length = cpu_to_le16(rqd->nr_ppas - 1); - - if (rqd->opcode == NVM_OP_HBWRITE || rqd->opcode == NVM_OP_HBREAD) - c->hb_rw.slba = cpu_to_le64(nvme_block_nr(ns, - rqd->bio->bi_iter.bi_sector)); } static void nvme_nvm_end_io(struct request *rq, blk_status_t status) @@ -597,8 +510,6 @@ static void nvme_nvm_dev_dma_free(void *pool, void *addr, static struct nvm_dev_ops nvme_nvm_dev_ops = { .identity = nvme_nvm_identity, - .get_l2p_tbl = nvme_nvm_get_l2p_tbl, - .get_bb_tbl = nvme_nvm_get_bb_tbl, .set_bb_tbl = nvme_nvm_set_bb_tbl, @@ -883,7 +794,7 @@ static ssize_t nvm_dev_attr_show(struct device *dev, } else if (strcmp(attr->name, "num_planes") == 0) { return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pln); } else if (strcmp(attr->name, "num_blocks") == 0) { /* u16 */ - return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_blk); + return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_chk); } else if (strcmp(attr->name, "num_pages") == 0) { return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pg); } else if (strcmp(attr->name, "page_size") == 0) { diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 1218a9f..3b211d9 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -33,51 +33,11 @@ void nvme_failover_req(struct request *req) kblockd_schedule_work(&ns->head->requeue_work); } -bool nvme_req_needs_failover(struct request *req) +bool nvme_req_needs_failover(struct request *req, blk_status_t error) { if (!(req->cmd_flags & REQ_NVME_MPATH)) return false; - - switch (nvme_req(req)->status & 0x7ff) { - /* - * Generic command status: - */ - case NVME_SC_INVALID_OPCODE: - case NVME_SC_INVALID_FIELD: - case NVME_SC_INVALID_NS: - case NVME_SC_LBA_RANGE: - case NVME_SC_CAP_EXCEEDED: - case NVME_SC_RESERVATION_CONFLICT: - return false; - - /* - * I/O command set specific error. Unfortunately these values are - * reused for fabrics commands, but those should never get here. - */ - case NVME_SC_BAD_ATTRIBUTES: - case NVME_SC_INVALID_PI: - case NVME_SC_READ_ONLY: - case NVME_SC_ONCS_NOT_SUPPORTED: - WARN_ON_ONCE(nvme_req(req)->cmd->common.opcode == - nvme_fabrics_command); - return false; - - /* - * Media and Data Integrity Errors: - */ - case NVME_SC_WRITE_FAULT: - case NVME_SC_READ_ERROR: - case NVME_SC_GUARD_CHECK: - case NVME_SC_APPTAG_CHECK: - case NVME_SC_REFTAG_CHECK: - case NVME_SC_COMPARE_FAILED: - case NVME_SC_ACCESS_DENIED: - case NVME_SC_UNWRITTEN_BLOCK: - return false; - } - - /* Everything else could be a path failure, so should be retried */ - return true; + return blk_path_error(error); } void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index a00eabd..8e4550f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -32,6 +32,8 @@ extern unsigned int admin_timeout; #define NVME_KATO_GRACE 10 extern struct workqueue_struct *nvme_wq; +extern struct workqueue_struct *nvme_reset_wq; +extern struct workqueue_struct *nvme_delete_wq; enum { NVME_NS_LBA = 0, @@ -119,6 +121,7 @@ static inline struct nvme_request *nvme_req(struct request *req) enum nvme_ctrl_state { NVME_CTRL_NEW, NVME_CTRL_LIVE, + NVME_CTRL_ADMIN_ONLY, /* Only admin queue live */ NVME_CTRL_RESETTING, NVME_CTRL_RECONNECTING, NVME_CTRL_DELETING, @@ -393,6 +396,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_start_keep_alive(struct nvme_ctrl *ctrl); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); int nvme_reset_ctrl(struct nvme_ctrl *ctrl); +int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); int nvme_delete_ctrl(struct nvme_ctrl *ctrl); int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); @@ -401,7 +405,7 @@ extern const struct block_device_operations nvme_ns_head_ops; #ifdef CONFIG_NVME_MULTIPATH void nvme_failover_req(struct request *req); -bool nvme_req_needs_failover(struct request *req); +bool nvme_req_needs_failover(struct request *req, blk_status_t error); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); void nvme_mpath_add_disk(struct nvme_ns_head *head); @@ -430,7 +434,8 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) static inline void nvme_failover_req(struct request *req) { } -static inline bool nvme_req_needs_failover(struct request *req) +static inline bool nvme_req_needs_failover(struct request *req, + blk_status_t error) { return false; } diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4276ebf..6fe7af0 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -75,7 +75,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); * Represents an NVM Express device. Each nvme_dev is a PCI function. */ struct nvme_dev { - struct nvme_queue **queues; + struct nvme_queue *queues; struct blk_mq_tag_set tagset; struct blk_mq_tag_set admin_tagset; u32 __iomem *dbs; @@ -365,7 +365,7 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { struct nvme_dev *dev = data; - struct nvme_queue *nvmeq = dev->queues[0]; + struct nvme_queue *nvmeq = &dev->queues[0]; WARN_ON(hctx_idx != 0); WARN_ON(dev->admin_tagset.tags[0] != hctx->tags); @@ -387,7 +387,7 @@ static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { struct nvme_dev *dev = data; - struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; + struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1]; if (!nvmeq->tags) nvmeq->tags = &dev->tagset.tags[hctx_idx]; @@ -403,7 +403,7 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req, struct nvme_dev *dev = set->driver_data; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0; - struct nvme_queue *nvmeq = dev->queues[queue_idx]; + struct nvme_queue *nvmeq = &dev->queues[queue_idx]; BUG_ON(!nvmeq); iod->nvmeq = nvmeq; @@ -1044,7 +1044,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) { struct nvme_dev *dev = to_nvme_dev(ctrl); - struct nvme_queue *nvmeq = dev->queues[0]; + struct nvme_queue *nvmeq = &dev->queues[0]; struct nvme_command c; memset(&c, 0, sizeof(c)); @@ -1138,9 +1138,14 @@ static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) */ bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO); - /* If there is a reset ongoing, we shouldn't reset again. */ - if (dev->ctrl.state == NVME_CTRL_RESETTING) + /* If there is a reset/reinit ongoing, we shouldn't reset again. */ + switch (dev->ctrl.state) { + case NVME_CTRL_RESETTING: + case NVME_CTRL_RECONNECTING: return false; + default: + break; + } /* We shouldn't reset unless the controller is on fatal error state * _or_ if we lost the communication with it. @@ -1280,7 +1285,6 @@ static void nvme_free_queue(struct nvme_queue *nvmeq) if (nvmeq->sq_cmds) dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), nvmeq->sq_cmds, nvmeq->sq_dma_addr); - kfree(nvmeq); } static void nvme_free_queues(struct nvme_dev *dev, int lowest) @@ -1288,10 +1292,8 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest) int i; for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) { - struct nvme_queue *nvmeq = dev->queues[i]; dev->ctrl.queue_count--; - dev->queues[i] = NULL; - nvme_free_queue(nvmeq); + nvme_free_queue(&dev->queues[i]); } } @@ -1323,12 +1325,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) { - struct nvme_queue *nvmeq = dev->queues[0]; - - if (!nvmeq) - return; - if (nvme_suspend_queue(nvmeq)) - return; + struct nvme_queue *nvmeq = &dev->queues[0]; if (shutdown) nvme_shutdown_ctrl(&dev->ctrl); @@ -1367,7 +1364,7 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, int qid, int depth) { - if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) { + if (qid && dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth), dev->ctrl.page_size); nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset; @@ -1382,13 +1379,13 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, return 0; } -static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, - int depth, int node) +static int nvme_alloc_queue(struct nvme_dev *dev, int qid, + int depth, int node) { - struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL, - node); - if (!nvmeq) - return NULL; + struct nvme_queue *nvmeq = &dev->queues[qid]; + + if (dev->ctrl.queue_count > qid) + return 0; nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth), &nvmeq->cq_dma_addr, GFP_KERNEL); @@ -1407,17 +1404,15 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, nvmeq->q_depth = depth; nvmeq->qid = qid; nvmeq->cq_vector = -1; - dev->queues[qid] = nvmeq; dev->ctrl.queue_count++; - return nvmeq; + return 0; free_cqdma: dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes, nvmeq->cq_dma_addr); free_nvmeq: - kfree(nvmeq); - return NULL; + return -ENOMEM; } static int queue_request_irq(struct nvme_queue *nvmeq) @@ -1590,14 +1585,12 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) if (result < 0) return result; - nvmeq = dev->queues[0]; - if (!nvmeq) { - nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH, - dev_to_node(dev->dev)); - if (!nvmeq) - return -ENOMEM; - } + result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH, + dev_to_node(dev->dev)); + if (result) + return result; + nvmeq = &dev->queues[0]; aqa = nvmeq->q_depth - 1; aqa |= aqa << 16; @@ -1627,7 +1620,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev) for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) { /* vector == qid - 1, match nvme_create_queue */ - if (!nvme_alloc_queue(dev, i, dev->q_depth, + if (nvme_alloc_queue(dev, i, dev->q_depth, pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) { ret = -ENOMEM; break; @@ -1636,15 +1629,15 @@ static int nvme_create_io_queues(struct nvme_dev *dev) max = min(dev->max_qid, dev->ctrl.queue_count - 1); for (i = dev->online_queues; i <= max; i++) { - ret = nvme_create_queue(dev->queues[i], i); + ret = nvme_create_queue(&dev->queues[i], i); if (ret) break; } /* * Ignore failing Create SQ/CQ commands, we can continue with less - * than the desired aount of queues, and even a controller without - * I/O queues an still be used to issue admin commands. This might + * than the desired amount of queues, and even a controller without + * I/O queues can still be used to issue admin commands. This might * be useful to upgrade a buggy firmware for example. */ return ret >= 0 ? 0 : ret; @@ -1661,30 +1654,40 @@ static ssize_t nvme_cmb_show(struct device *dev, } static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL); -static void __iomem *nvme_map_cmb(struct nvme_dev *dev) +static u64 nvme_cmb_size_unit(struct nvme_dev *dev) { - u64 szu, size, offset; + u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK; + + return 1ULL << (12 + 4 * szu); +} + +static u32 nvme_cmb_size(struct nvme_dev *dev) +{ + return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK; +} + +static void nvme_map_cmb(struct nvme_dev *dev) +{ + u64 size, offset; resource_size_t bar_size; struct pci_dev *pdev = to_pci_dev(dev->dev); - void __iomem *cmb; int bar; dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ); - if (!(NVME_CMB_SZ(dev->cmbsz))) - return NULL; + if (!dev->cmbsz) + return; dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC); if (!use_cmb_sqes) - return NULL; + return; - szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz)); - size = szu * NVME_CMB_SZ(dev->cmbsz); - offset = szu * NVME_CMB_OFST(dev->cmbloc); + size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev); + offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc); bar = NVME_CMB_BIR(dev->cmbloc); bar_size = pci_resource_len(pdev, bar); if (offset > bar_size) - return NULL; + return; /* * Controllers may support a CMB size larger than their BAR, @@ -1694,13 +1697,16 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev) if (size > bar_size - offset) size = bar_size - offset; - cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size); - if (!cmb) - return NULL; - + dev->cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size); + if (!dev->cmb) + return; dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset; dev->cmb_size = size; - return cmb; + + if (sysfs_add_file_to_group(&dev->ctrl.device->kobj, + &dev_attr_cmb.attr, NULL)) + dev_warn(dev->ctrl.device, + "failed to add sysfs attribute for CMB\n"); } static inline void nvme_release_cmb(struct nvme_dev *dev) @@ -1768,7 +1774,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred, dma_addr_t descs_dma; int i = 0; void **bufs; - u64 size = 0, tmp; + u64 size, tmp; tmp = (preferred + chunk_size - 1); do_div(tmp, chunk_size); @@ -1851,7 +1857,7 @@ static int nvme_setup_host_mem(struct nvme_dev *dev) u64 preferred = (u64)dev->ctrl.hmpre * 4096; u64 min = (u64)dev->ctrl.hmmin * 4096; u32 enable_bits = NVME_HOST_MEM_ENABLE; - int ret = 0; + int ret; preferred = min(preferred, max); if (min > max) { @@ -1892,7 +1898,7 @@ static int nvme_setup_host_mem(struct nvme_dev *dev) static int nvme_setup_io_queues(struct nvme_dev *dev) { - struct nvme_queue *adminq = dev->queues[0]; + struct nvme_queue *adminq = &dev->queues[0]; struct pci_dev *pdev = to_pci_dev(dev->dev); int result, nr_io_queues; unsigned long size; @@ -1905,7 +1911,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (nr_io_queues == 0) return 0; - if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) { + if (dev->cmb && (dev->cmbsz & NVME_CMBSZ_SQS)) { result = nvme_cmb_qdepth(dev, nr_io_queues, sizeof(struct nvme_command)); if (result > 0) @@ -2005,9 +2011,9 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) return 0; } -static void nvme_disable_io_queues(struct nvme_dev *dev, int queues) +static void nvme_disable_io_queues(struct nvme_dev *dev) { - int pass; + int pass, queues = dev->online_queues - 1; unsigned long timeout; u8 opcode = nvme_admin_delete_sq; @@ -2018,7 +2024,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev, int queues) retry: timeout = ADMIN_TIMEOUT; for (; i > 0; i--, sent++) - if (nvme_delete_queue(dev->queues[i], opcode)) + if (nvme_delete_queue(&dev->queues[i], opcode)) break; while (sent--) { @@ -2033,13 +2039,12 @@ static void nvme_disable_io_queues(struct nvme_dev *dev, int queues) } /* - * Return: error value if an error occurred setting up the queues or calling - * Identify Device. 0 if these succeeded, even if adding some of the - * namespaces failed. At the moment, these failures are silent. TBD which - * failures should be reported. + * return error value only when tagset allocation failed */ static int nvme_dev_add(struct nvme_dev *dev) { + int ret; + if (!dev->ctrl.tagset) { dev->tagset.ops = &nvme_mq_ops; dev->tagset.nr_hw_queues = dev->online_queues - 1; @@ -2055,8 +2060,12 @@ static int nvme_dev_add(struct nvme_dev *dev) dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; dev->tagset.driver_data = dev; - if (blk_mq_alloc_tag_set(&dev->tagset)) - return 0; + ret = blk_mq_alloc_tag_set(&dev->tagset); + if (ret) { + dev_warn(dev->ctrl.device, + "IO queues tagset allocation failed %d\n", ret); + return ret; + } dev->ctrl.tagset = &dev->tagset; nvme_dbbuf_set(dev); @@ -2122,22 +2131,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) "set queue depth=%u\n", dev->q_depth); } - /* - * CMBs can currently only exist on >=1.2 PCIe devices. We only - * populate sysfs if a CMB is implemented. Since nvme_dev_attrs_group - * has no name we can pass NULL as final argument to - * sysfs_add_file_to_group. - */ - - if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2, 0)) { - dev->cmb = nvme_map_cmb(dev); - if (dev->cmb) { - if (sysfs_add_file_to_group(&dev->ctrl.device->kobj, - &dev_attr_cmb.attr, NULL)) - dev_warn(dev->ctrl.device, - "failed to add sysfs attribute for CMB\n"); - } - } + nvme_map_cmb(dev); pci_enable_pcie_error_reporting(pdev); pci_save_state(pdev); @@ -2170,7 +2164,7 @@ static void nvme_pci_disable(struct nvme_dev *dev) static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) { - int i, queues; + int i; bool dead = true; struct pci_dev *pdev = to_pci_dev(dev->dev); @@ -2205,21 +2199,13 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) } nvme_stop_queues(&dev->ctrl); - queues = dev->online_queues - 1; - for (i = dev->ctrl.queue_count - 1; i > 0; i--) - nvme_suspend_queue(dev->queues[i]); - - if (dead) { - /* A device might become IO incapable very soon during - * probe, before the admin queue is configured. Thus, - * queue_count can be 0 here. - */ - if (dev->ctrl.queue_count) - nvme_suspend_queue(dev->queues[0]); - } else { - nvme_disable_io_queues(dev, queues); + if (!dead) { + nvme_disable_io_queues(dev); nvme_disable_admin_queue(dev, shutdown); } + for (i = dev->ctrl.queue_count - 1; i >= 0; i--) + nvme_suspend_queue(&dev->queues[i]); + nvme_pci_disable(dev); blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl); @@ -2289,6 +2275,7 @@ static void nvme_reset_work(struct work_struct *work) container_of(work, struct nvme_dev, ctrl.reset_work); bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL); int result = -ENODEV; + enum nvme_ctrl_state new_state = NVME_CTRL_LIVE; if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) goto out; @@ -2300,6 +2287,16 @@ static void nvme_reset_work(struct work_struct *work) if (dev->ctrl.ctrl_config & NVME_CC_ENABLE) nvme_dev_disable(dev, false); + /* + * Introduce RECONNECTING state from nvme-fc/rdma transports to mark the + * initializing procedure here. + */ + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RECONNECTING)) { + dev_warn(dev->ctrl.device, + "failed to mark controller RECONNECTING\n"); + goto out; + } + result = nvme_pci_enable(dev); if (result) goto out; @@ -2352,15 +2349,23 @@ static void nvme_reset_work(struct work_struct *work) dev_warn(dev->ctrl.device, "IO queues not created\n"); nvme_kill_queues(&dev->ctrl); nvme_remove_namespaces(&dev->ctrl); + new_state = NVME_CTRL_ADMIN_ONLY; } else { nvme_start_queues(&dev->ctrl); nvme_wait_freeze(&dev->ctrl); - nvme_dev_add(dev); + /* hit this only when allocate tagset fails */ + if (nvme_dev_add(dev)) + new_state = NVME_CTRL_ADMIN_ONLY; nvme_unfreeze(&dev->ctrl); } - if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) { - dev_warn(dev->ctrl.device, "failed to mark controller live\n"); + /* + * If only admin queue live, keep it to do further investigation or + * recovery. + */ + if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) { + dev_warn(dev->ctrl.device, + "failed to mark controller state %d\n", new_state); goto out; } @@ -2468,8 +2473,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); if (!dev) return -ENOMEM; - dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *), - GFP_KERNEL, node); + + dev->queues = kcalloc_node(num_possible_cpus() + 1, + sizeof(struct nvme_queue), GFP_KERNEL, node); if (!dev->queues) goto free; @@ -2496,10 +2502,10 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto release_pools; - nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING); dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); - queue_work(nvme_wq, &dev->ctrl.reset_work); + nvme_reset_ctrl(&dev->ctrl); + return 0; release_pools: @@ -2523,7 +2529,7 @@ static void nvme_reset_prepare(struct pci_dev *pdev) static void nvme_reset_done(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); - nvme_reset_ctrl(&dev->ctrl); + nvme_reset_ctrl_sync(&dev->ctrl); } static void nvme_shutdown(struct pci_dev *pdev) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 2a0bba7..2bc059f 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -66,7 +66,6 @@ struct nvme_rdma_request { struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS]; u32 num_sge; int nents; - bool inline_data; struct ib_reg_wr reg_wr; struct ib_cqe reg_cqe; struct nvme_rdma_queue *queue; @@ -1092,7 +1091,6 @@ static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue, sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl)); sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET; - req->inline_data = true; req->num_sge++; return 0; } @@ -1164,7 +1162,6 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, int count, ret; req->num_sge = 1; - req->inline_data = false; refcount_set(&req->ref, 2); /* send and recv completions */ c->common.flags |= NVME_CMD_SGL_METABUF; @@ -2018,6 +2015,7 @@ out_free_ctrl: static struct nvmf_transport_ops nvme_rdma_transport = { .name = "rdma", + .module = THIS_MODULE, .required_opts = NVMF_OPT_TRADDR, .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO, @@ -2040,7 +2038,7 @@ static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data) } mutex_unlock(&nvme_rdma_ctrl_mutex); - flush_workqueue(nvme_wq); + flush_workqueue(nvme_delete_wq); } static struct ib_client nvme_rdma_ib_client = { diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c new file mode 100644 index 0000000..41944bb --- /dev/null +++ b/drivers/nvme/host/trace.c @@ -0,0 +1,130 @@ +/* + * NVM Express device driver tracepoints + * Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <asm/unaligned.h> +#include "trace.h" + +static const char *nvme_trace_create_sq(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u16 sqid = get_unaligned_le16(cdw10); + u16 qsize = get_unaligned_le16(cdw10 + 2); + u16 sq_flags = get_unaligned_le16(cdw10 + 4); + u16 cqid = get_unaligned_le16(cdw10 + 6); + + + trace_seq_printf(p, "sqid=%u, qsize=%u, sq_flags=0x%x, cqid=%u", + sqid, qsize, sq_flags, cqid); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_create_cq(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u16 cqid = get_unaligned_le16(cdw10); + u16 qsize = get_unaligned_le16(cdw10 + 2); + u16 cq_flags = get_unaligned_le16(cdw10 + 4); + u16 irq_vector = get_unaligned_le16(cdw10 + 6); + + trace_seq_printf(p, "cqid=%u, qsize=%u, cq_flags=0x%x, irq_vector=%u", + cqid, qsize, cq_flags, irq_vector); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_admin_identify(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 cns = cdw10[0]; + u16 ctrlid = get_unaligned_le16(cdw10 + 2); + + trace_seq_printf(p, "cns=%u, ctrlid=%u", cns, ctrlid); + trace_seq_putc(p, 0); + + return ret; +} + + + +static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u64 slba = get_unaligned_le64(cdw10); + u16 length = get_unaligned_le16(cdw10 + 8); + u16 control = get_unaligned_le16(cdw10 + 10); + u32 dsmgmt = get_unaligned_le32(cdw10 + 12); + u32 reftag = get_unaligned_le32(cdw10 + 16); + + trace_seq_printf(p, + "slba=%llu, len=%u, ctrl=0x%x, dsmgmt=%u, reftag=%u", + slba, length, control, dsmgmt, reftag); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_dsm(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + + trace_seq_printf(p, "nr=%u, attributes=%u", + get_unaligned_le32(cdw10), + get_unaligned_le32(cdw10 + 4)); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_common(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + + trace_seq_printf(p, "cdw10=%*ph", 24, cdw10); + trace_seq_putc(p, 0); + + return ret; +} + +const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, + u8 opcode, u8 *cdw10) +{ + switch (opcode) { + case nvme_admin_create_sq: + return nvme_trace_create_sq(p, cdw10); + case nvme_admin_create_cq: + return nvme_trace_create_cq(p, cdw10); + case nvme_admin_identify: + return nvme_trace_admin_identify(p, cdw10); + default: + return nvme_trace_common(p, cdw10); + } +} + +const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, + u8 opcode, u8 *cdw10) +{ + switch (opcode) { + case nvme_cmd_read: + case nvme_cmd_write: + case nvme_cmd_write_zeroes: + return nvme_trace_read_write(p, cdw10); + case nvme_cmd_dsm: + return nvme_trace_dsm(p, cdw10); + default: + return nvme_trace_common(p, cdw10); + } +} diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h new file mode 100644 index 0000000..ea91fcc --- /dev/null +++ b/drivers/nvme/host/trace.h @@ -0,0 +1,165 @@ +/* + * NVM Express device driver tracepoints + * Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM nvme + +#if !defined(_TRACE_NVME_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NVME_H + +#include <linux/nvme.h> +#include <linux/tracepoint.h> +#include <linux/trace_seq.h> + +#include "nvme.h" + +#define nvme_admin_opcode_name(opcode) { opcode, #opcode } +#define show_admin_opcode_name(val) \ + __print_symbolic(val, \ + nvme_admin_opcode_name(nvme_admin_delete_sq), \ + nvme_admin_opcode_name(nvme_admin_create_sq), \ + nvme_admin_opcode_name(nvme_admin_get_log_page), \ + nvme_admin_opcode_name(nvme_admin_delete_cq), \ + nvme_admin_opcode_name(nvme_admin_create_cq), \ + nvme_admin_opcode_name(nvme_admin_identify), \ + nvme_admin_opcode_name(nvme_admin_abort_cmd), \ + nvme_admin_opcode_name(nvme_admin_set_features), \ + nvme_admin_opcode_name(nvme_admin_get_features), \ + nvme_admin_opcode_name(nvme_admin_async_event), \ + nvme_admin_opcode_name(nvme_admin_ns_mgmt), \ + nvme_admin_opcode_name(nvme_admin_activate_fw), \ + nvme_admin_opcode_name(nvme_admin_download_fw), \ + nvme_admin_opcode_name(nvme_admin_ns_attach), \ + nvme_admin_opcode_name(nvme_admin_keep_alive), \ + nvme_admin_opcode_name(nvme_admin_directive_send), \ + nvme_admin_opcode_name(nvme_admin_directive_recv), \ + nvme_admin_opcode_name(nvme_admin_dbbuf), \ + nvme_admin_opcode_name(nvme_admin_format_nvm), \ + nvme_admin_opcode_name(nvme_admin_security_send), \ + nvme_admin_opcode_name(nvme_admin_security_recv), \ + nvme_admin_opcode_name(nvme_admin_sanitize_nvm)) + +const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode, + u8 *cdw10); +#define __parse_nvme_admin_cmd(opcode, cdw10) \ + nvme_trace_parse_admin_cmd(p, opcode, cdw10) + +#define nvme_opcode_name(opcode) { opcode, #opcode } +#define show_opcode_name(val) \ + __print_symbolic(val, \ + nvme_opcode_name(nvme_cmd_flush), \ + nvme_opcode_name(nvme_cmd_write), \ + nvme_opcode_name(nvme_cmd_read), \ + nvme_opcode_name(nvme_cmd_write_uncor), \ + nvme_opcode_name(nvme_cmd_compare), \ + nvme_opcode_name(nvme_cmd_write_zeroes), \ + nvme_opcode_name(nvme_cmd_dsm), \ + nvme_opcode_name(nvme_cmd_resv_register), \ + nvme_opcode_name(nvme_cmd_resv_report), \ + nvme_opcode_name(nvme_cmd_resv_acquire), \ + nvme_opcode_name(nvme_cmd_resv_release)) + +const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode, + u8 *cdw10); +#define __parse_nvme_cmd(opcode, cdw10) \ + nvme_trace_parse_nvm_cmd(p, opcode, cdw10) + +TRACE_EVENT(nvme_setup_admin_cmd, + TP_PROTO(struct nvme_command *cmd), + TP_ARGS(cmd), + TP_STRUCT__entry( + __field(u8, opcode) + __field(u8, flags) + __field(u16, cid) + __field(u64, metadata) + __array(u8, cdw10, 24) + ), + TP_fast_assign( + __entry->opcode = cmd->common.opcode; + __entry->flags = cmd->common.flags; + __entry->cid = cmd->common.command_id; + __entry->metadata = le64_to_cpu(cmd->common.metadata); + memcpy(__entry->cdw10, cmd->common.cdw10, + sizeof(__entry->cdw10)); + ), + TP_printk(" cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", + __entry->cid, __entry->flags, __entry->metadata, + show_admin_opcode_name(__entry->opcode), + __parse_nvme_admin_cmd(__entry->opcode, __entry->cdw10)) +); + + +TRACE_EVENT(nvme_setup_nvm_cmd, + TP_PROTO(int qid, struct nvme_command *cmd), + TP_ARGS(qid, cmd), + TP_STRUCT__entry( + __field(int, qid) + __field(u8, opcode) + __field(u8, flags) + __field(u16, cid) + __field(u32, nsid) + __field(u64, metadata) + __array(u8, cdw10, 24) + ), + TP_fast_assign( + __entry->qid = qid; + __entry->opcode = cmd->common.opcode; + __entry->flags = cmd->common.flags; + __entry->cid = cmd->common.command_id; + __entry->nsid = le32_to_cpu(cmd->common.nsid); + __entry->metadata = le64_to_cpu(cmd->common.metadata); + memcpy(__entry->cdw10, cmd->common.cdw10, + sizeof(__entry->cdw10)); + ), + TP_printk("qid=%d, nsid=%u, cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", + __entry->qid, __entry->nsid, __entry->cid, + __entry->flags, __entry->metadata, + show_opcode_name(__entry->opcode), + __parse_nvme_cmd(__entry->opcode, __entry->cdw10)) +); + +TRACE_EVENT(nvme_complete_rq, + TP_PROTO(struct request *req), + TP_ARGS(req), + TP_STRUCT__entry( + __field(int, qid) + __field(int, cid) + __field(u64, result) + __field(u8, retries) + __field(u8, flags) + __field(u16, status) + ), + TP_fast_assign( + __entry->qid = req->q->id; + __entry->cid = req->tag; + __entry->result = le64_to_cpu(nvme_req(req)->result.u64); + __entry->retries = nvme_req(req)->retries; + __entry->flags = nvme_req(req)->flags; + __entry->status = nvme_req(req)->status; + ), + TP_printk("cmdid=%u, qid=%d, res=%llu, retries=%u, flags=0x%x, status=%u", + __entry->cid, __entry->qid, __entry->result, + __entry->retries, __entry->flags, __entry->status) + +); + +#endif /* _TRACE_NVME_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig index 03e4ab6..5f4f8b1 100644 --- a/drivers/nvme/target/Kconfig +++ b/drivers/nvme/target/Kconfig @@ -29,6 +29,7 @@ config NVME_TARGET_RDMA tristate "NVMe over Fabrics RDMA target support" depends on INFINIBAND depends on NVME_TARGET + select SGL_ALLOC help This enables the NVMe RDMA target support, which allows exporting NVMe devices over RDMA. @@ -39,6 +40,7 @@ config NVME_TARGET_FC tristate "NVMe over Fabrics FC target driver" depends on NVME_TARGET depends on HAS_DMA + select SGL_ALLOC help This enables the NVMe FC target support, which allows exporting NVMe devices over FC. diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index b54748a..0bd7371 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -512,6 +512,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, req->sg_cnt = 0; req->transfer_len = 0; req->rsp->status = 0; + req->ns = NULL; /* no support for fused commands yet */ if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) { @@ -557,6 +558,8 @@ EXPORT_SYMBOL_GPL(nvmet_req_init); void nvmet_req_uninit(struct nvmet_req *req) { percpu_ref_put(&req->sq->ref); + if (req->ns) + nvmet_put_namespace(req->ns); } EXPORT_SYMBOL_GPL(nvmet_req_uninit); @@ -830,7 +833,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, /* Don't accept keep-alive timeout for discovery controllers */ if (kato) { status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; - goto out_free_sqs; + goto out_remove_ida; } /* @@ -860,6 +863,8 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, *ctrlp = ctrl; return 0; +out_remove_ida: + ida_simple_remove(&cntlid_ida, ctrl->cntlid); out_free_sqs: kfree(ctrl->sqs); out_free_cqs: @@ -877,21 +882,22 @@ static void nvmet_ctrl_free(struct kref *ref) struct nvmet_ctrl *ctrl = container_of(ref, struct nvmet_ctrl, ref); struct nvmet_subsys *subsys = ctrl->subsys; - nvmet_stop_keep_alive_timer(ctrl); - mutex_lock(&subsys->lock); list_del(&ctrl->subsys_entry); mutex_unlock(&subsys->lock); + nvmet_stop_keep_alive_timer(ctrl); + flush_work(&ctrl->async_event_work); cancel_work_sync(&ctrl->fatal_err_work); ida_simple_remove(&cntlid_ida, ctrl->cntlid); - nvmet_subsys_put(subsys); kfree(ctrl->sqs); kfree(ctrl->cqs); kfree(ctrl); + + nvmet_subsys_put(subsys); } void nvmet_ctrl_put(struct nvmet_ctrl *ctrl) diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index db3bf6b..19e9e42 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -225,7 +225,7 @@ static void nvmet_execute_io_connect(struct nvmet_req *req) goto out_ctrl_put; } - pr_info("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid); + pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid); out: kfree(d); diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 5fd8603..9b39a6c 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1697,31 +1697,12 @@ static int nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_fcp_iod *fod) { struct scatterlist *sg; - struct page *page; unsigned int nent; - u32 page_len, length; - int i = 0; - length = fod->req.transfer_len; - nent = DIV_ROUND_UP(length, PAGE_SIZE); - sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL); + sg = sgl_alloc(fod->req.transfer_len, GFP_KERNEL, &nent); if (!sg) goto out; - sg_init_table(sg, nent); - - while (length) { - page_len = min_t(u32, length, PAGE_SIZE); - - page = alloc_page(GFP_KERNEL); - if (!page) - goto out_free_pages; - - sg_set_page(&sg[i], page, page_len, 0); - length -= page_len; - i++; - } - fod->data_sg = sg; fod->data_sg_cnt = nent; fod->data_sg_cnt = fc_dma_map_sg(fod->tgtport->dev, sg, nent, @@ -1731,14 +1712,6 @@ nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_fcp_iod *fod) return 0; -out_free_pages: - while (i > 0) { - i--; - __free_page(sg_page(&sg[i])); - } - kfree(sg); - fod->data_sg = NULL; - fod->data_sg_cnt = 0; out: return NVME_SC_INTERNAL; } @@ -1746,18 +1719,13 @@ out: static void nvmet_fc_free_tgt_pgs(struct nvmet_fc_fcp_iod *fod) { - struct scatterlist *sg; - int count; - if (!fod->data_sg || !fod->data_sg_cnt) return; fc_dma_unmap_sg(fod->tgtport->dev, fod->data_sg, fod->data_sg_cnt, ((fod->io_dir == NVMET_FCP_WRITE) ? DMA_FROM_DEVICE : DMA_TO_DEVICE)); - for_each_sg(fod->data_sg, sg, fod->data_sg_cnt, count) - __free_page(sg_page(sg)); - kfree(fod->data_sg); + sgl_free(fod->data_sg); fod->data_sg = NULL; fod->data_sg_cnt = 0; } @@ -2522,14 +2490,8 @@ nvmet_fc_add_port(struct nvmet_port *port) list_for_each_entry(tgtport, &nvmet_fc_target_list, tgt_list) { if ((tgtport->fc_target_port.node_name == traddr.nn) && (tgtport->fc_target_port.port_name == traddr.pn)) { - /* a FC port can only be 1 nvmet port id */ - if (!tgtport->port) { - tgtport->port = port; - port->priv = tgtport; - nvmet_fc_tgtport_get(tgtport); - ret = 0; - } else - ret = -EALREADY; + tgtport->port = port; + ret = 0; break; } } @@ -2540,19 +2502,7 @@ nvmet_fc_add_port(struct nvmet_port *port) static void nvmet_fc_remove_port(struct nvmet_port *port) { - struct nvmet_fc_tgtport *tgtport = port->priv; - unsigned long flags; - bool matched = false; - - spin_lock_irqsave(&nvmet_fc_tgtlock, flags); - if (tgtport->port == port) { - matched = true; - tgtport->port = NULL; - } - spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); - - if (matched) - nvmet_fc_tgtport_put(tgtport); + /* nothing to do */ } static struct nvmet_fabrics_ops nvmet_fc_tgt_fcp_ops = { diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 6a018a0..34712de 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -204,6 +204,10 @@ struct fcloop_lport { struct completion unreg_done; }; +struct fcloop_lport_priv { + struct fcloop_lport *lport; +}; + struct fcloop_rport { struct nvme_fc_remote_port *remoteport; struct nvmet_fc_target_port *targetport; @@ -238,21 +242,32 @@ struct fcloop_lsreq { int status; }; +enum { + INI_IO_START = 0, + INI_IO_ACTIVE = 1, + INI_IO_ABORTED = 2, + INI_IO_COMPLETED = 3, +}; + struct fcloop_fcpreq { struct fcloop_tport *tport; struct nvmefc_fcp_req *fcpreq; spinlock_t reqlock; u16 status; + u32 inistate; bool active; bool aborted; - struct work_struct work; + struct kref ref; + struct work_struct fcp_rcv_work; + struct work_struct abort_rcv_work; + struct work_struct tio_done_work; struct nvmefc_tgt_fcp_req tgt_fcp_req; }; struct fcloop_ini_fcpreq { struct nvmefc_fcp_req *fcpreq; struct fcloop_fcpreq *tfcp_req; - struct work_struct iniwork; + spinlock_t inilock; }; static inline struct fcloop_lsreq * @@ -343,17 +358,122 @@ fcloop_xmt_ls_rsp(struct nvmet_fc_target_port *tport, return 0; } -/* - * FCP IO operation done by initiator abort. - * call back up initiator "done" flows. - */ static void -fcloop_tgt_fcprqst_ini_done_work(struct work_struct *work) +fcloop_tfcp_req_free(struct kref *ref) { - struct fcloop_ini_fcpreq *inireq = - container_of(work, struct fcloop_ini_fcpreq, iniwork); + struct fcloop_fcpreq *tfcp_req = + container_of(ref, struct fcloop_fcpreq, ref); + + kfree(tfcp_req); +} + +static void +fcloop_tfcp_req_put(struct fcloop_fcpreq *tfcp_req) +{ + kref_put(&tfcp_req->ref, fcloop_tfcp_req_free); +} + +static int +fcloop_tfcp_req_get(struct fcloop_fcpreq *tfcp_req) +{ + return kref_get_unless_zero(&tfcp_req->ref); +} + +static void +fcloop_call_host_done(struct nvmefc_fcp_req *fcpreq, + struct fcloop_fcpreq *tfcp_req, int status) +{ + struct fcloop_ini_fcpreq *inireq = NULL; + + if (fcpreq) { + inireq = fcpreq->private; + spin_lock(&inireq->inilock); + inireq->tfcp_req = NULL; + spin_unlock(&inireq->inilock); + + fcpreq->status = status; + fcpreq->done(fcpreq); + } + + /* release original io reference on tgt struct */ + fcloop_tfcp_req_put(tfcp_req); +} + +static void +fcloop_fcp_recv_work(struct work_struct *work) +{ + struct fcloop_fcpreq *tfcp_req = + container_of(work, struct fcloop_fcpreq, fcp_rcv_work); + struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq; + int ret = 0; + bool aborted = false; + + spin_lock(&tfcp_req->reqlock); + switch (tfcp_req->inistate) { + case INI_IO_START: + tfcp_req->inistate = INI_IO_ACTIVE; + break; + case INI_IO_ABORTED: + aborted = true; + break; + default: + spin_unlock(&tfcp_req->reqlock); + WARN_ON(1); + return; + } + spin_unlock(&tfcp_req->reqlock); + + if (unlikely(aborted)) + ret = -ECANCELED; + else + ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport, + &tfcp_req->tgt_fcp_req, + fcpreq->cmdaddr, fcpreq->cmdlen); + if (ret) + fcloop_call_host_done(fcpreq, tfcp_req, ret); + + return; +} + +static void +fcloop_fcp_abort_recv_work(struct work_struct *work) +{ + struct fcloop_fcpreq *tfcp_req = + container_of(work, struct fcloop_fcpreq, abort_rcv_work); + struct nvmefc_fcp_req *fcpreq; + bool completed = false; + + spin_lock(&tfcp_req->reqlock); + fcpreq = tfcp_req->fcpreq; + switch (tfcp_req->inistate) { + case INI_IO_ABORTED: + break; + case INI_IO_COMPLETED: + completed = true; + break; + default: + spin_unlock(&tfcp_req->reqlock); + WARN_ON(1); + return; + } + spin_unlock(&tfcp_req->reqlock); + + if (unlikely(completed)) { + /* remove reference taken in original abort downcall */ + fcloop_tfcp_req_put(tfcp_req); + return; + } - inireq->fcpreq->done(inireq->fcpreq); + if (tfcp_req->tport->targetport) + nvmet_fc_rcv_fcp_abort(tfcp_req->tport->targetport, + &tfcp_req->tgt_fcp_req); + + spin_lock(&tfcp_req->reqlock); + tfcp_req->fcpreq = NULL; + spin_unlock(&tfcp_req->reqlock); + + fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED); + /* call_host_done releases reference for abort downcall */ } /* @@ -364,20 +484,15 @@ static void fcloop_tgt_fcprqst_done_work(struct work_struct *work) { struct fcloop_fcpreq *tfcp_req = - container_of(work, struct fcloop_fcpreq, work); - struct fcloop_tport *tport = tfcp_req->tport; + container_of(work, struct fcloop_fcpreq, tio_done_work); struct nvmefc_fcp_req *fcpreq; spin_lock(&tfcp_req->reqlock); fcpreq = tfcp_req->fcpreq; + tfcp_req->inistate = INI_IO_COMPLETED; spin_unlock(&tfcp_req->reqlock); - if (tport->remoteport && fcpreq) { - fcpreq->status = tfcp_req->status; - fcpreq->done(fcpreq); - } - - kfree(tfcp_req); + fcloop_call_host_done(fcpreq, tfcp_req, tfcp_req->status); } @@ -390,7 +505,6 @@ fcloop_fcp_req(struct nvme_fc_local_port *localport, struct fcloop_rport *rport = remoteport->private; struct fcloop_ini_fcpreq *inireq = fcpreq->private; struct fcloop_fcpreq *tfcp_req; - int ret = 0; if (!rport->targetport) return -ECONNREFUSED; @@ -401,16 +515,20 @@ fcloop_fcp_req(struct nvme_fc_local_port *localport, inireq->fcpreq = fcpreq; inireq->tfcp_req = tfcp_req; - INIT_WORK(&inireq->iniwork, fcloop_tgt_fcprqst_ini_done_work); + spin_lock_init(&inireq->inilock); + tfcp_req->fcpreq = fcpreq; tfcp_req->tport = rport->targetport->private; + tfcp_req->inistate = INI_IO_START; spin_lock_init(&tfcp_req->reqlock); - INIT_WORK(&tfcp_req->work, fcloop_tgt_fcprqst_done_work); + INIT_WORK(&tfcp_req->fcp_rcv_work, fcloop_fcp_recv_work); + INIT_WORK(&tfcp_req->abort_rcv_work, fcloop_fcp_abort_recv_work); + INIT_WORK(&tfcp_req->tio_done_work, fcloop_tgt_fcprqst_done_work); + kref_init(&tfcp_req->ref); - ret = nvmet_fc_rcv_fcp_req(rport->targetport, &tfcp_req->tgt_fcp_req, - fcpreq->cmdaddr, fcpreq->cmdlen); + schedule_work(&tfcp_req->fcp_rcv_work); - return ret; + return 0; } static void @@ -589,7 +707,7 @@ fcloop_fcp_req_release(struct nvmet_fc_target_port *tgtport, { struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq); - schedule_work(&tfcp_req->work); + schedule_work(&tfcp_req->tio_done_work); } static void @@ -605,27 +723,47 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport, void *hw_queue_handle, struct nvmefc_fcp_req *fcpreq) { - struct fcloop_rport *rport = remoteport->private; struct fcloop_ini_fcpreq *inireq = fcpreq->private; - struct fcloop_fcpreq *tfcp_req = inireq->tfcp_req; + struct fcloop_fcpreq *tfcp_req; + bool abortio = true; + + spin_lock(&inireq->inilock); + tfcp_req = inireq->tfcp_req; + if (tfcp_req) + fcloop_tfcp_req_get(tfcp_req); + spin_unlock(&inireq->inilock); if (!tfcp_req) /* abort has already been called */ return; - if (rport->targetport) - nvmet_fc_rcv_fcp_abort(rport->targetport, - &tfcp_req->tgt_fcp_req); - /* break initiator/target relationship for io */ spin_lock(&tfcp_req->reqlock); - inireq->tfcp_req = NULL; - tfcp_req->fcpreq = NULL; + switch (tfcp_req->inistate) { + case INI_IO_START: + case INI_IO_ACTIVE: + tfcp_req->inistate = INI_IO_ABORTED; + break; + case INI_IO_COMPLETED: + abortio = false; + break; + default: + spin_unlock(&tfcp_req->reqlock); + WARN_ON(1); + return; + } spin_unlock(&tfcp_req->reqlock); - /* post the aborted io completion */ - fcpreq->status = -ECANCELED; - schedule_work(&inireq->iniwork); + if (abortio) + /* leave the reference while the work item is scheduled */ + WARN_ON(!schedule_work(&tfcp_req->abort_rcv_work)); + else { + /* + * as the io has already had the done callback made, + * nothing more to do. So release the reference taken above + */ + fcloop_tfcp_req_put(tfcp_req); + } } static void @@ -657,7 +795,8 @@ fcloop_nport_get(struct fcloop_nport *nport) static void fcloop_localport_delete(struct nvme_fc_local_port *localport) { - struct fcloop_lport *lport = localport->private; + struct fcloop_lport_priv *lport_priv = localport->private; + struct fcloop_lport *lport = lport_priv->lport; /* release any threads waiting for the unreg to complete */ complete(&lport->unreg_done); @@ -697,7 +836,7 @@ static struct nvme_fc_port_template fctemplate = { .max_dif_sgl_segments = FCLOOP_SGL_SEGS, .dma_boundary = FCLOOP_DMABOUND_4G, /* sizes of additional private data for data structures */ - .local_priv_sz = sizeof(struct fcloop_lport), + .local_priv_sz = sizeof(struct fcloop_lport_priv), .remote_priv_sz = sizeof(struct fcloop_rport), .lsrqst_priv_sz = sizeof(struct fcloop_lsreq), .fcprqst_priv_sz = sizeof(struct fcloop_ini_fcpreq), @@ -714,8 +853,7 @@ static struct nvmet_fc_target_template tgttemplate = { .max_dif_sgl_segments = FCLOOP_SGL_SEGS, .dma_boundary = FCLOOP_DMABOUND_4G, /* optional features */ - .target_features = NVMET_FCTGTFEAT_CMD_IN_ISR | - NVMET_FCTGTFEAT_OPDONE_IN_ISR, + .target_features = 0, /* sizes of additional private data for data structures */ .target_priv_sz = sizeof(struct fcloop_tport), }; @@ -728,11 +866,17 @@ fcloop_create_local_port(struct device *dev, struct device_attribute *attr, struct fcloop_ctrl_options *opts; struct nvme_fc_local_port *localport; struct fcloop_lport *lport; - int ret; + struct fcloop_lport_priv *lport_priv; + unsigned long flags; + int ret = -ENOMEM; + + lport = kzalloc(sizeof(*lport), GFP_KERNEL); + if (!lport) + return -ENOMEM; opts = kzalloc(sizeof(*opts), GFP_KERNEL); if (!opts) - return -ENOMEM; + goto out_free_lport; ret = fcloop_parse_options(opts, buf); if (ret) @@ -752,23 +896,25 @@ fcloop_create_local_port(struct device *dev, struct device_attribute *attr, ret = nvme_fc_register_localport(&pinfo, &fctemplate, NULL, &localport); if (!ret) { - unsigned long flags; - /* success */ - lport = localport->private; + lport_priv = localport->private; + lport_priv->lport = lport; + lport->localport = localport; INIT_LIST_HEAD(&lport->lport_list); spin_lock_irqsave(&fcloop_lock, flags); list_add_tail(&lport->lport_list, &fcloop_lports); spin_unlock_irqrestore(&fcloop_lock, flags); - - /* mark all of the input buffer consumed */ - ret = count; } out_free_opts: kfree(opts); +out_free_lport: + /* free only if we're going to fail */ + if (ret) + kfree(lport); + return ret ? ret : count; } @@ -790,6 +936,8 @@ __wait_localport_unreg(struct fcloop_lport *lport) wait_for_completion(&lport->unreg_done); + kfree(lport); + return ret; } diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 1e21b28..7991ec3 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -686,6 +686,7 @@ static struct nvmet_fabrics_ops nvme_loop_ops = { static struct nvmf_transport_ops nvme_loop_transport = { .name = "loop", + .module = THIS_MODULE, .create_ctrl = nvme_loop_create_ctrl, }; @@ -716,7 +717,7 @@ static void __exit nvme_loop_cleanup_module(void) nvme_delete_ctrl(&ctrl->ctrl); mutex_unlock(&nvme_loop_ctrl_mutex); - flush_workqueue(nvme_wq); + flush_workqueue(nvme_delete_wq); } module_init(nvme_loop_init_module); diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 4991290..978e169 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -185,59 +185,6 @@ nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp) spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags); } -static void nvmet_rdma_free_sgl(struct scatterlist *sgl, unsigned int nents) -{ - struct scatterlist *sg; - int count; - - if (!sgl || !nents) - return; - - for_each_sg(sgl, sg, nents, count) - __free_page(sg_page(sg)); - kfree(sgl); -} - -static int nvmet_rdma_alloc_sgl(struct scatterlist **sgl, unsigned int *nents, - u32 length) -{ - struct scatterlist *sg; - struct page *page; - unsigned int nent; - int i = 0; - - nent = DIV_ROUND_UP(length, PAGE_SIZE); - sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL); - if (!sg) - goto out; - - sg_init_table(sg, nent); - - while (length) { - u32 page_len = min_t(u32, length, PAGE_SIZE); - - page = alloc_page(GFP_KERNEL); - if (!page) - goto out_free_pages; - - sg_set_page(&sg[i], page, page_len, 0); - length -= page_len; - i++; - } - *sgl = sg; - *nents = nent; - return 0; - -out_free_pages: - while (i > 0) { - i--; - __free_page(sg_page(&sg[i])); - } - kfree(sg); -out: - return NVME_SC_INTERNAL; -} - static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev, struct nvmet_rdma_cmd *c, bool admin) { @@ -484,7 +431,7 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp) } if (rsp->req.sg != &rsp->cmd->inline_sg) - nvmet_rdma_free_sgl(rsp->req.sg, rsp->req.sg_cnt); + sgl_free(rsp->req.sg); if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list))) nvmet_rdma_process_wr_wait_list(queue); @@ -621,16 +568,14 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp, u32 len = get_unaligned_le24(sgl->length); u32 key = get_unaligned_le32(sgl->key); int ret; - u16 status; /* no data command? */ if (!len) return 0; - status = nvmet_rdma_alloc_sgl(&rsp->req.sg, &rsp->req.sg_cnt, - len); - if (status) - return status; + rsp->req.sg = sgl_alloc(len, GFP_KERNEL, &rsp->req.sg_cnt); + if (!rsp->req.sg) + return NVME_SC_INTERNAL; ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num, rsp->req.sg, rsp->req.sg_cnt, 0, addr, key, @@ -976,7 +921,7 @@ static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue) static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue) { - pr_info("freeing queue %d\n", queue->idx); + pr_debug("freeing queue %d\n", queue->idx); nvmet_sq_destroy(&queue->nvme_sq); @@ -1558,25 +1503,9 @@ err_ib_client: static void __exit nvmet_rdma_exit(void) { - struct nvmet_rdma_queue *queue; - nvmet_unregister_transport(&nvmet_rdma_ops); - - flush_scheduled_work(); - - mutex_lock(&nvmet_rdma_queue_mutex); - while ((queue = list_first_entry_or_null(&nvmet_rdma_queue_list, - struct nvmet_rdma_queue, queue_list))) { - list_del_init(&queue->queue_list); - - mutex_unlock(&nvmet_rdma_queue_mutex); - __nvmet_rdma_queue_disconnect(queue); - mutex_lock(&nvmet_rdma_queue_mutex); - } - mutex_unlock(&nvmet_rdma_queue_mutex); - - flush_scheduled_work(); ib_unregister_client(&nvmet_rdma_ib_client); + WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list)); ida_destroy(&nvmet_rdma_queue_ida); } diff --git a/drivers/target/Kconfig b/drivers/target/Kconfig index e2bc999..4c44d7b 100644 --- a/drivers/target/Kconfig +++ b/drivers/target/Kconfig @@ -5,6 +5,7 @@ menuconfig TARGET_CORE select CONFIGFS_FS select CRC_T10DIF select BLK_SCSI_REQUEST # only for scsi_command_size_tbl.. + select SGL_ALLOC default n help Say Y or M here to enable the TCM Storage Engine and ConfigFS enabled diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 58caacd..c03a78e 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -2300,13 +2300,7 @@ queue_full: void target_free_sgl(struct scatterlist *sgl, int nents) { - struct scatterlist *sg; - int count; - - for_each_sg(sgl, sg, nents, count) - __free_page(sg_page(sg)); - - kfree(sgl); + sgl_free_n_order(sgl, nents, 0); } EXPORT_SYMBOL(target_free_sgl); @@ -2414,42 +2408,10 @@ int target_alloc_sgl(struct scatterlist **sgl, unsigned int *nents, u32 length, bool zero_page, bool chainable) { - struct scatterlist *sg; - struct page *page; - gfp_t zero_flag = (zero_page) ? __GFP_ZERO : 0; - unsigned int nalloc, nent; - int i = 0; - - nalloc = nent = DIV_ROUND_UP(length, PAGE_SIZE); - if (chainable) - nalloc++; - sg = kmalloc_array(nalloc, sizeof(struct scatterlist), GFP_KERNEL); - if (!sg) - return -ENOMEM; + gfp_t gfp = GFP_KERNEL | (zero_page ? __GFP_ZERO : 0); - sg_init_table(sg, nalloc); - - while (length) { - u32 page_len = min_t(u32, length, PAGE_SIZE); - page = alloc_page(GFP_KERNEL | zero_flag); - if (!page) - goto out; - - sg_set_page(&sg[i], page, page_len, 0); - length -= page_len; - i++; - } - *sgl = sg; - *nents = nent; - return 0; - -out: - while (i > 0) { - i--; - __free_page(sg_page(&sg[i])); - } - kfree(sg); - return -ENOMEM; + *sgl = sgl_alloc_order(length, 0, chainable, gfp, nents); + return *sgl ? 0 : -ENOMEM; } EXPORT_SYMBOL(target_alloc_sgl); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 5982c8a..75610d2 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -411,7 +411,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, static u64 bio_end_offset(struct bio *bio) { - struct bio_vec *last = &bio->bi_io_vec[bio->bi_vcnt - 1]; + struct bio_vec *last = bio_last_bvec_all(bio); return page_offset(last->bv_page) + last->bv_len + last->bv_offset; } @@ -563,7 +563,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, /* we need the actual starting offset of this extent in the file */ read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, - page_offset(bio->bi_io_vec->bv_page), + page_offset(bio_first_page_all(bio)), PAGE_SIZE); read_unlock(&em_tree->lock); if (!em) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 012d638..d43360b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2257,7 +2257,7 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, return 0; } -bool btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, +bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages, struct io_failure_record *failrec, int failed_mirror) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2281,7 +2281,7 @@ bool btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, * a) deliver good data to the caller * b) correct the bad sectors on disk */ - if (failed_bio->bi_vcnt > 1) { + if (failed_bio_pages > 1) { /* * to fulfill b), we need to know the exact failing sectors, as * we don't want to rewrite any more than the failed ones. thus, @@ -2374,6 +2374,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, int read_mode = 0; blk_status_t status; int ret; + unsigned failed_bio_pages = bio_pages_all(failed_bio); BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -2381,13 +2382,13 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, if (ret) return ret; - if (!btrfs_check_repairable(inode, failed_bio, failrec, + if (!btrfs_check_repairable(inode, failed_bio_pages, failrec, failed_mirror)) { free_io_failure(failure_tree, tree, failrec); return -EIO; } - if (failed_bio->bi_vcnt > 1) + if (failed_bio_pages > 1) read_mode |= REQ_FAILFAST_DEV; phy_offset >>= inode->i_sb->s_blocksize_bits; @@ -2724,7 +2725,7 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num, unsigned long bio_flags) { blk_status_t ret = 0; - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec = bio_last_bvec_all(bio); struct page *page = bvec->bv_page; struct extent_io_tree *tree = bio->bi_private; u64 start; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 93dcae0..20854d6 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -540,7 +540,7 @@ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end); int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, struct io_failure_record **failrec_ret); -bool btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, +bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages, struct io_failure_record *failrec, int fail_mirror); struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, struct io_failure_record *failrec, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e1a7f3c..cb1e2d2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8015,6 +8015,7 @@ static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio, int segs; int ret; blk_status_t status; + struct bio_vec bvec; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -8030,8 +8031,9 @@ static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio, } segs = bio_segments(failed_bio); + bio_get_first_bvec(failed_bio, &bvec); if (segs > 1 || - (failed_bio->bi_io_vec->bv_len > btrfs_inode_sectorsize(inode))) + (bvec.bv_len > btrfs_inode_sectorsize(inode))) read_mode |= REQ_FAILFAST_DEV; isector = start - btrfs_io_bio(failed_bio)->logical; @@ -8074,7 +8076,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio) ASSERT(bio->bi_vcnt == 1); io_tree = &BTRFS_I(inode)->io_tree; failure_tree = &BTRFS_I(inode)->io_failure_tree; - ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(inode)); + ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(inode)); done->uptodate = 1; ASSERT(!bio_flagged(bio, BIO_CLONED)); @@ -8164,7 +8166,7 @@ static void btrfs_retry_endio(struct bio *bio) uptodate = 1; ASSERT(bio->bi_vcnt == 1); - ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode)); + ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(done->inode)); io_tree = &BTRFS_I(inode)->io_tree; failure_tree = &BTRFS_I(inode)->io_failure_tree; diff --git a/fs/buffer.c b/fs/buffer.c index 0736a6a..8b26295 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3014,7 +3014,7 @@ static void end_bio_bh_io_sync(struct bio *bio) void guard_bio_eod(int op, struct bio *bio) { sector_t maxsector; - struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; + struct bio_vec *bvec = bio_last_bvec_all(bio); unsigned truncated_bytes; struct hd_struct *part; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 516fa0d..455f086 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -56,7 +56,7 @@ static void f2fs_read_end_io(struct bio *bio) int i; #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) { + if (time_to_inject(F2FS_P_SB(bio_first_page_all(bio)), FAULT_IO)) { f2fs_show_injection_info(FAULT_IO); bio->bi_status = BLK_STS_IOERR; } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index cea4836..d4d04fe 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -126,7 +126,7 @@ static void wb_io_lists_depopulated(struct bdi_writeback *wb) * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list * @inode: inode to be moved * @wb: target bdi_writeback - * @head: one of @wb->b_{dirty|io|more_io} + * @head: one of @wb->b_{dirty|io|more_io|dirty_time} * * Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io. * Returns %true if @inode is the first occupant of the !dirty_time IO diff --git a/include/linux/bio.h b/include/linux/bio.h index 23d29b3..d0eb659 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -300,6 +300,29 @@ static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv) bv->bv_len = iter.bi_bvec_done; } +static inline unsigned bio_pages_all(struct bio *bio) +{ + WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); + return bio->bi_vcnt; +} + +static inline struct bio_vec *bio_first_bvec_all(struct bio *bio) +{ + WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); + return bio->bi_io_vec; +} + +static inline struct page *bio_first_page_all(struct bio *bio) +{ + return bio_first_bvec_all(bio)->bv_page; +} + +static inline struct bio_vec *bio_last_bvec_all(struct bio *bio) +{ + WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); + return &bio->bi_io_vec[bio->bi_vcnt - 1]; +} + enum bip_flags { BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ @@ -477,7 +500,6 @@ static inline void bio_flush_dcache_pages(struct bio *bi) #endif extern void bio_copy_data(struct bio *dst, struct bio *src); -extern int bio_alloc_pages(struct bio *bio, gfp_t gfp); extern void bio_free_pages(struct bio *bio); extern struct bio *bio_copy_user_iov(struct request_queue *, diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index e9825ff..69bea82 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -660,12 +660,14 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, struct blkg_rwstat *from) { - struct blkg_rwstat v = blkg_rwstat_read(from); + u64 sum[BLKG_RWSTAT_NR]; int i; for (i = 0; i < BLKG_RWSTAT_NR; i++) - atomic64_add(atomic64_read(&v.aux_cnt[i]) + - atomic64_read(&from->aux_cnt[i]), + sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]); + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]), &to->aux_cnt[i]); } diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 95c9a5c..8efcf49 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -51,6 +51,7 @@ struct blk_mq_hw_ctx { unsigned int queue_num; atomic_t nr_active; + unsigned int nr_expired; struct hlist_node cpuhp_dead; struct kobject kobj; @@ -65,7 +66,7 @@ struct blk_mq_hw_ctx { #endif /* Must be the last member - see also blk_mq_hw_ctx_size(). */ - struct srcu_struct queue_rq_srcu[0]; + struct srcu_struct srcu[0]; }; struct blk_mq_tag_set { diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 9e7d8bd..c5d3db0 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -39,6 +39,34 @@ typedef u8 __bitwise blk_status_t; #define BLK_STS_AGAIN ((__force blk_status_t)12) +/** + * blk_path_error - returns true if error may be path related + * @error: status the request was completed with + * + * Description: + * This classifies block error status into non-retryable errors and ones + * that may be successful if retried on a failover path. + * + * Return: + * %false - retrying failover path will not help + * %true - may succeed if retried + */ +static inline bool blk_path_error(blk_status_t error) +{ + switch (error) { + case BLK_STS_NOTSUPP: + case BLK_STS_NOSPC: + case BLK_STS_TARGET: + case BLK_STS_NEXUS: + case BLK_STS_MEDIUM: + case BLK_STS_PROTECTION: + return false; + } + + /* Anything else could be a path failure, so should be retried */ + return true; +} + struct blk_issue_stat { u64 stat; }; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0ce8a37..4f3df80 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -27,6 +27,8 @@ #include <linux/percpu-refcount.h> #include <linux/scatterlist.h> #include <linux/blkzoned.h> +#include <linux/seqlock.h> +#include <linux/u64_stats_sync.h> struct module; struct scsi_ioctl_command; @@ -121,6 +123,12 @@ typedef __u32 __bitwise req_flags_t; /* Look at ->special_vec for the actual data payload instead of the bio chain. */ #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) +/* The per-zone write lock is held for this request */ +#define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) +/* timeout is expired */ +#define RQF_MQ_TIMEOUT_EXPIRED ((__force req_flags_t)(1 << 20)) +/* already slept for hybrid poll */ +#define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 21)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ @@ -133,12 +141,6 @@ typedef __u32 __bitwise req_flags_t; * especially blk_mq_rq_ctx_init() to take care of the added fields. */ struct request { - struct list_head queuelist; - union { - struct __call_single_data csd; - u64 fifo_time; - }; - struct request_queue *q; struct blk_mq_ctx *mq_ctx; @@ -148,8 +150,6 @@ struct request { int internal_tag; - unsigned long atomic_flags; - /* the following two fields are internal, NEVER access directly */ unsigned int __data_len; /* total data len */ int tag; @@ -158,6 +158,8 @@ struct request { struct bio *bio; struct bio *biotail; + struct list_head queuelist; + /* * The hash is used inside the scheduler, and killed once the * request reaches the dispatch list. The ipi_list is only used @@ -205,19 +207,16 @@ struct request { struct hd_struct *part; unsigned long start_time; struct blk_issue_stat issue_stat; -#ifdef CONFIG_BLK_CGROUP - struct request_list *rl; /* rl this rq is alloced from */ - unsigned long long start_time_ns; - unsigned long long io_start_time_ns; /* when passed to hardware */ -#endif /* Number of scatter-gather DMA addr+len pairs after * physical address coalescing is performed. */ unsigned short nr_phys_segments; + #if defined(CONFIG_BLK_DEV_INTEGRITY) unsigned short nr_integrity_segments; #endif + unsigned short write_hint; unsigned short ioprio; unsigned int timeout; @@ -226,11 +225,37 @@ struct request { unsigned int extra_len; /* length of alignment and padding */ - unsigned short write_hint; + /* + * On blk-mq, the lower bits of ->gstate (generation number and + * state) carry the MQ_RQ_* state value and the upper bits the + * generation number which is monotonically incremented and used to + * distinguish the reuse instances. + * + * ->gstate_seq allows updates to ->gstate and other fields + * (currently ->deadline) during request start to be read + * atomically from the timeout path, so that it can operate on a + * coherent set of information. + */ + seqcount_t gstate_seq; + u64 gstate; + + /* + * ->aborted_gstate is used by the timeout to claim a specific + * recycle instance of this request. See blk_mq_timeout_work(). + */ + struct u64_stats_sync aborted_gstate_sync; + u64 aborted_gstate; + + /* access through blk_rq_set_deadline, blk_rq_deadline */ + unsigned long __deadline; - unsigned long deadline; struct list_head timeout_list; + union { + struct __call_single_data csd; + u64 fifo_time; + }; + /* * completion callback. */ @@ -239,6 +264,12 @@ struct request { /* for bidi */ struct request *next_rq; + +#ifdef CONFIG_BLK_CGROUP + struct request_list *rl; /* rl this rq is alloced from */ + unsigned long long start_time_ns; + unsigned long long io_start_time_ns; /* when passed to hardware */ +#endif }; static inline bool blk_op_is_scsi(unsigned int op) @@ -564,6 +595,22 @@ struct request_queue { struct queue_limits limits; /* + * Zoned block device information for request dispatch control. + * nr_zones is the total number of zones of the device. This is always + * 0 for regular block devices. seq_zones_bitmap is a bitmap of nr_zones + * bits which indicates if a zone is conventional (bit clear) or + * sequential (bit set). seq_zones_wlock is a bitmap of nr_zones + * bits which indicates if a zone is write locked, that is, if a write + * request targeting the zone was dispatched. All three fields are + * initialized by the low level device driver (e.g. scsi/sd.c). + * Stacking drivers (device mappers) may or may not initialize + * these fields. + */ + unsigned int nr_zones; + unsigned long *seq_zones_bitmap; + unsigned long *seq_zones_wlock; + + /* * sg stuff */ unsigned int sg_timeout; @@ -807,6 +854,27 @@ static inline unsigned int blk_queue_zone_sectors(struct request_queue *q) return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0; } +static inline unsigned int blk_queue_nr_zones(struct request_queue *q) +{ + return q->nr_zones; +} + +static inline unsigned int blk_queue_zone_no(struct request_queue *q, + sector_t sector) +{ + if (!blk_queue_is_zoned(q)) + return 0; + return sector >> ilog2(q->limits.chunk_sectors); +} + +static inline bool blk_queue_zone_is_seq(struct request_queue *q, + sector_t sector) +{ + if (!blk_queue_is_zoned(q) || !q->seq_zones_bitmap) + return false; + return test_bit(blk_queue_zone_no(q, sector), q->seq_zones_bitmap); +} + static inline bool rq_is_sync(struct request *rq) { return op_is_sync(rq->cmd_flags); @@ -1046,6 +1114,16 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq) return blk_rq_cur_bytes(rq) >> 9; } +static inline unsigned int blk_rq_zone_no(struct request *rq) +{ + return blk_queue_zone_no(rq->q, blk_rq_pos(rq)); +} + +static inline unsigned int blk_rq_zone_is_seq(struct request *rq) +{ + return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq)); +} + /* * Some commands like WRITE SAME have a payload or data transfer size which * is different from the size of the request. Any driver that supports such @@ -1595,7 +1673,15 @@ static inline unsigned int bdev_zone_sectors(struct block_device *bdev) if (q) return blk_queue_zone_sectors(q); + return 0; +} + +static inline unsigned int bdev_nr_zones(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + if (q) + return blk_queue_nr_zones(q); return 0; } @@ -1731,8 +1817,6 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio) int kblockd_schedule_work(struct work_struct *work); int kblockd_schedule_work_on(int cpu, struct work_struct *work); -int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay); -int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); #ifdef CONFIG_BLK_CGROUP @@ -1971,6 +2055,60 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, extern int bdev_read_page(struct block_device *, sector_t, struct page *); extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); + +#ifdef CONFIG_BLK_DEV_ZONED +bool blk_req_needs_zone_write_lock(struct request *rq); +void __blk_req_zone_write_lock(struct request *rq); +void __blk_req_zone_write_unlock(struct request *rq); + +static inline void blk_req_zone_write_lock(struct request *rq) +{ + if (blk_req_needs_zone_write_lock(rq)) + __blk_req_zone_write_lock(rq); +} + +static inline void blk_req_zone_write_unlock(struct request *rq) +{ + if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED) + __blk_req_zone_write_unlock(rq); +} + +static inline bool blk_req_zone_is_write_locked(struct request *rq) +{ + return rq->q->seq_zones_wlock && + test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock); +} + +static inline bool blk_req_can_dispatch_to_zone(struct request *rq) +{ + if (!blk_req_needs_zone_write_lock(rq)) + return true; + return !blk_req_zone_is_write_locked(rq); +} +#else +static inline bool blk_req_needs_zone_write_lock(struct request *rq) +{ + return false; +} + +static inline void blk_req_zone_write_lock(struct request *rq) +{ +} + +static inline void blk_req_zone_write_unlock(struct request *rq) +{ +} +static inline bool blk_req_zone_is_write_locked(struct request *rq) +{ + return false; +} + +static inline bool blk_req_can_dispatch_to_zone(struct request *rq) +{ + return true; +} +#endif /* CONFIG_BLK_DEV_ZONED */ + #else /* CONFIG_BLOCK */ struct block_device; diff --git a/include/linux/bvec.h b/include/linux/bvec.h index ec8a4d7..fe7a22d 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -125,4 +125,13 @@ static inline bool bvec_iter_rewind(const struct bio_vec *bv, ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1); \ bvec_iter_advance((bio_vec), &(iter), (bvl).bv_len)) +/* for iterating one bio from start to end */ +#define BVEC_ITER_ALL_INIT (struct bvec_iter) \ +{ \ + .bi_sector = 0, \ + .bi_size = UINT_MAX, \ + .bi_idx = 0, \ + .bi_bvec_done = 0, \ +} + #endif /* __LINUX_BVEC_ITER_H */ diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 3d794b3..6d9e230 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -198,8 +198,6 @@ extern bool elv_attempt_insert_merge(struct request_queue *, struct request *); extern void elv_requeue_request(struct request_queue *, struct request *); extern struct request *elv_former_request(struct request_queue *, struct request *); extern struct request *elv_latter_request(struct request_queue *, struct request *); -extern int elv_register_queue(struct request_queue *q); -extern void elv_unregister_queue(struct request_queue *q); extern int elv_may_queue(struct request_queue *, unsigned int); extern void elv_completed_request(struct request_queue *, struct request *); extern int elv_set_request(struct request_queue *q, struct request *rq, diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 5144ebe..5e35310 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -395,6 +395,11 @@ static inline void add_disk(struct gendisk *disk) { device_add_disk(NULL, disk); } +extern void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk); +static inline void add_disk_no_queue_reg(struct gendisk *disk) +{ + device_add_disk_no_queue_reg(NULL, disk); +} extern void del_gendisk(struct gendisk *gp); extern struct gendisk *get_gendisk(dev_t dev, int *partno); diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 2d1d9de..7f4b60a 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -50,10 +50,7 @@ struct nvm_id; struct nvm_dev; struct nvm_tgt_dev; -typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *); typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *); -typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32, - nvm_l2p_update_fn *, void *); typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *); typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int); typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); @@ -66,7 +63,6 @@ typedef void (nvm_dev_dma_free_fn)(void *, void*, dma_addr_t); struct nvm_dev_ops { nvm_id_fn *identity; - nvm_get_l2p_tbl_fn *get_l2p_tbl; nvm_op_bb_tbl_fn *get_bb_tbl; nvm_op_set_bb_fn *set_bb_tbl; @@ -112,8 +108,6 @@ enum { NVM_RSP_WARN_HIGHECC = 0x4700, /* Device opcodes */ - NVM_OP_HBREAD = 0x02, - NVM_OP_HBWRITE = 0x81, NVM_OP_PWRITE = 0x91, NVM_OP_PREAD = 0x92, NVM_OP_ERASE = 0x90, @@ -165,12 +159,16 @@ struct nvm_id_group { u8 fmtype; u8 num_ch; u8 num_lun; - u8 num_pln; - u16 num_blk; - u16 num_pg; - u16 fpg_sz; + u16 num_chk; + u16 clba; u16 csecs; u16 sos; + + u16 ws_min; + u16 ws_opt; + u16 ws_seq; + u16 ws_per_chk; + u32 trdt; u32 trdm; u32 tprt; @@ -181,7 +179,10 @@ struct nvm_id_group { u32 mccap; u16 cpar; - struct nvm_id_lp_tbl lptbl; + /* 1.2 compatibility */ + u8 num_pln; + u16 num_pg; + u16 fpg_sz; }; struct nvm_addr_format { @@ -217,6 +218,10 @@ struct nvm_target { #define ADDR_EMPTY (~0ULL) +#define NVM_TARGET_DEFAULT_OP (101) +#define NVM_TARGET_MIN_OP (3) +#define NVM_TARGET_MAX_OP (80) + #define NVM_VERSION_MAJOR 1 #define NVM_VERSION_MINOR 0 #define NVM_VERSION_PATCH 0 @@ -239,7 +244,6 @@ struct nvm_rq { void *meta_list; dma_addr_t dma_meta_list; - struct completion *wait; nvm_end_io_fn *end_io; uint8_t opcode; @@ -268,31 +272,38 @@ enum { NVM_BLK_ST_BAD = 0x8, /* Bad block */ }; + /* Device generic information */ struct nvm_geo { + /* generic geometry */ int nr_chnls; - int nr_luns; - int luns_per_chnl; /* -1 if channels are not symmetric */ - int nr_planes; - int sec_per_pg; /* only sectors for a single page */ - int pgs_per_blk; - int blks_per_lun; - int fpg_size; - int pfpg_size; /* size of buffer if all pages are to be read */ + int all_luns; /* across channels */ + int nr_luns; /* per channel */ + int nr_chks; /* per lun */ + int sec_size; int oob_size; int mccap; - struct nvm_addr_format ppaf; - /* Calculated/Cached values. These do not reflect the actual usable - * blocks at run-time. - */ + int sec_per_chk; + int sec_per_lun; + + int ws_min; + int ws_opt; + int ws_seq; + int ws_per_chk; + int max_rq_size; - int plane_mode; /* drive device in single, double or quad mode */ + int op; + + struct nvm_addr_format ppaf; + + /* Legacy 1.2 specific geometry */ + int plane_mode; /* drive device in single, double or quad mode */ + int nr_planes; + int sec_per_pg; /* only sectors for a single page */ int sec_per_pl; /* all sectors across planes */ - int sec_per_blk; - int sec_per_lun; }; /* sub-device structure */ @@ -320,10 +331,6 @@ struct nvm_dev { /* Device information */ struct nvm_geo geo; - /* lower page table */ - int lps_per_blk; - int *lptbl; - unsigned long total_secs; unsigned long *lun_map; @@ -346,36 +353,6 @@ struct nvm_dev { struct list_head targets; }; -static inline struct ppa_addr linear_to_generic_addr(struct nvm_geo *geo, - u64 pba) -{ - struct ppa_addr l; - int secs, pgs, blks, luns; - sector_t ppa = pba; - - l.ppa = 0; - - div_u64_rem(ppa, geo->sec_per_pg, &secs); - l.g.sec = secs; - - sector_div(ppa, geo->sec_per_pg); - div_u64_rem(ppa, geo->pgs_per_blk, &pgs); - l.g.pg = pgs; - - sector_div(ppa, geo->pgs_per_blk); - div_u64_rem(ppa, geo->blks_per_lun, &blks); - l.g.blk = blks; - - sector_div(ppa, geo->blks_per_lun); - div_u64_rem(ppa, geo->luns_per_chnl, &luns); - l.g.lun = luns; - - sector_div(ppa, geo->luns_per_chnl); - l.g.ch = ppa; - - return l; -} - static inline struct ppa_addr generic_to_dev_addr(struct nvm_tgt_dev *tgt_dev, struct ppa_addr r) { @@ -418,25 +395,6 @@ static inline struct ppa_addr dev_to_generic_addr(struct nvm_tgt_dev *tgt_dev, return l; } -static inline int ppa_empty(struct ppa_addr ppa_addr) -{ - return (ppa_addr.ppa == ADDR_EMPTY); -} - -static inline void ppa_set_empty(struct ppa_addr *ppa_addr) -{ - ppa_addr->ppa = ADDR_EMPTY; -} - -static inline int ppa_cmp_blk(struct ppa_addr ppa1, struct ppa_addr ppa2) -{ - if (ppa_empty(ppa1) || ppa_empty(ppa2)) - return 0; - - return ((ppa1.g.ch == ppa2.g.ch) && (ppa1.g.lun == ppa2.g.lun) && - (ppa1.g.blk == ppa2.g.blk)); -} - typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); typedef sector_t (nvm_tgt_capacity_fn)(void *); typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *, @@ -481,17 +439,10 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *, extern int nvm_max_phys_sects(struct nvm_tgt_dev *); extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *); -extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int); -extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, - void *); -extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t); -extern void nvm_put_area(struct nvm_tgt_dev *, sector_t); extern void nvm_end_io(struct nvm_rq *); extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); -extern void nvm_part_to_tgt(struct nvm_dev *, sector_t *, int); - #else /* CONFIG_NVM */ struct nvm_dev_ops; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index aea87f0d..4112e2b 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -124,14 +124,20 @@ enum { #define NVME_CMB_BIR(cmbloc) ((cmbloc) & 0x7) #define NVME_CMB_OFST(cmbloc) (((cmbloc) >> 12) & 0xfffff) -#define NVME_CMB_SZ(cmbsz) (((cmbsz) >> 12) & 0xfffff) -#define NVME_CMB_SZU(cmbsz) (((cmbsz) >> 8) & 0xf) - -#define NVME_CMB_WDS(cmbsz) ((cmbsz) & 0x10) -#define NVME_CMB_RDS(cmbsz) ((cmbsz) & 0x8) -#define NVME_CMB_LISTS(cmbsz) ((cmbsz) & 0x4) -#define NVME_CMB_CQS(cmbsz) ((cmbsz) & 0x2) -#define NVME_CMB_SQS(cmbsz) ((cmbsz) & 0x1) + +enum { + NVME_CMBSZ_SQS = 1 << 0, + NVME_CMBSZ_CQS = 1 << 1, + NVME_CMBSZ_LISTS = 1 << 2, + NVME_CMBSZ_RDS = 1 << 3, + NVME_CMBSZ_WDS = 1 << 4, + + NVME_CMBSZ_SZ_SHIFT = 12, + NVME_CMBSZ_SZ_MASK = 0xfffff, + + NVME_CMBSZ_SZU_SHIFT = 8, + NVME_CMBSZ_SZU_MASK = 0xf, +}; /* * Submission and Completion Queue Entry Sizes for the NVM command set. diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index b7c8325..22b2131 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -276,6 +276,17 @@ int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, gfp_t gfp_mask); +#ifdef CONFIG_SGL_ALLOC +struct scatterlist *sgl_alloc_order(unsigned long long length, + unsigned int order, bool chainable, + gfp_t gfp, unsigned int *nent_p); +struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp, + unsigned int *nent_p); +void sgl_free_n_order(struct scatterlist *sgl, int nents, int order); +void sgl_free_order(struct scatterlist *sgl, int order); +void sgl_free(struct scatterlist *sgl); +#endif /* CONFIG_SGL_ALLOC */ + size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen, off_t skip, bool to_buffer); diff --git a/include/uapi/linux/lightnvm.h b/include/uapi/linux/lightnvm.h index 42d1a43..f9a1be7 100644 --- a/include/uapi/linux/lightnvm.h +++ b/include/uapi/linux/lightnvm.h @@ -75,14 +75,23 @@ struct nvm_ioctl_create_simple { __u32 lun_end; }; +struct nvm_ioctl_create_extended { + __u16 lun_begin; + __u16 lun_end; + __u16 op; + __u16 rsv; +}; + enum { NVM_CONFIG_TYPE_SIMPLE = 0, + NVM_CONFIG_TYPE_EXTENDED = 1, }; struct nvm_ioctl_create_conf { __u32 type; union { struct nvm_ioctl_create_simple s; + struct nvm_ioctl_create_extended e; }; }; diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index e12d351..a37a3b4 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -39,7 +39,7 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, } } -static cpumask_var_t *alloc_node_to_present_cpumask(void) +static cpumask_var_t *alloc_node_to_possible_cpumask(void) { cpumask_var_t *masks; int node; @@ -62,7 +62,7 @@ out_unwind: return NULL; } -static void free_node_to_present_cpumask(cpumask_var_t *masks) +static void free_node_to_possible_cpumask(cpumask_var_t *masks) { int node; @@ -71,22 +71,22 @@ static void free_node_to_present_cpumask(cpumask_var_t *masks) kfree(masks); } -static void build_node_to_present_cpumask(cpumask_var_t *masks) +static void build_node_to_possible_cpumask(cpumask_var_t *masks) { int cpu; - for_each_present_cpu(cpu) + for_each_possible_cpu(cpu) cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); } -static int get_nodes_in_cpumask(cpumask_var_t *node_to_present_cpumask, +static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask, const struct cpumask *mask, nodemask_t *nodemsk) { int n, nodes = 0; /* Calculate the number of nodes in the supplied affinity mask */ for_each_node(n) { - if (cpumask_intersects(mask, node_to_present_cpumask[n])) { + if (cpumask_intersects(mask, node_to_possible_cpumask[n])) { node_set(n, *nodemsk); nodes++; } @@ -109,7 +109,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) int last_affv = affv + affd->pre_vectors; nodemask_t nodemsk = NODE_MASK_NONE; struct cpumask *masks; - cpumask_var_t nmsk, *node_to_present_cpumask; + cpumask_var_t nmsk, *node_to_possible_cpumask; /* * If there aren't any vectors left after applying the pre/post @@ -125,8 +125,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (!masks) goto out; - node_to_present_cpumask = alloc_node_to_present_cpumask(); - if (!node_to_present_cpumask) + node_to_possible_cpumask = alloc_node_to_possible_cpumask(); + if (!node_to_possible_cpumask) goto out; /* Fill out vectors at the beginning that don't need affinity */ @@ -135,8 +135,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) /* Stabilize the cpumasks */ get_online_cpus(); - build_node_to_present_cpumask(node_to_present_cpumask); - nodes = get_nodes_in_cpumask(node_to_present_cpumask, cpu_present_mask, + build_node_to_possible_cpumask(node_to_possible_cpumask); + nodes = get_nodes_in_cpumask(node_to_possible_cpumask, cpu_possible_mask, &nodemsk); /* @@ -146,7 +146,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (affv <= nodes) { for_each_node_mask(n, nodemsk) { cpumask_copy(masks + curvec, - node_to_present_cpumask[n]); + node_to_possible_cpumask[n]); if (++curvec == last_affv) break; } @@ -160,7 +160,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; /* Get the cpus on this node which are in the mask */ - cpumask_and(nmsk, cpu_present_mask, node_to_present_cpumask[n]); + cpumask_and(nmsk, cpu_possible_mask, node_to_possible_cpumask[n]); /* Calculate the number of cpus per vector */ ncpus = cpumask_weight(nmsk); @@ -192,7 +192,7 @@ done: /* Fill out vectors at the end that don't need affinity */ for (; curvec < nvecs; curvec++) cpumask_copy(masks + curvec, irq_default_affinity); - free_node_to_present_cpumask(node_to_present_cpumask); + free_node_to_possible_cpumask(node_to_possible_cpumask); out: free_cpumask_var(nmsk); return masks; @@ -214,7 +214,7 @@ int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity return 0; get_online_cpus(); - ret = min_t(int, cpumask_weight(cpu_present_mask), vecs) + resv; + ret = min_t(int, cpumask_weight(cpu_possible_mask), vecs) + resv; put_online_cpus(); return ret; } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index a46be12..11b4282 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -240,7 +240,7 @@ static void hib_init_batch(struct hib_bio_batch *hb) static void hib_end_io(struct bio *bio) { struct hib_bio_batch *hb = bio->bi_private; - struct page *page = bio->bi_io_vec[0].bv_page; + struct page *page = bio_first_page_all(bio); if (bio->bi_status) { pr_alert("Read-error on swap-device (%u:%u:%Lu)\n", diff --git a/lib/Kconfig b/lib/Kconfig index c5e84fb..4dd5c11 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -409,6 +409,10 @@ config HAS_DMA depends on !NO_DMA default y +config SGL_ALLOC + bool + default n + config DMA_NOOP_OPS bool depends on HAS_DMA && (!64BIT || ARCH_DMA_ADDR_T_64BIT) diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 80aa8d5..42b5ca0 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -462,7 +462,7 @@ static void sbq_wake_up(struct sbitmap_queue *sbq) */ atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wait_cnt + wake_batch); sbq_index_atomic_inc(&sbq->wake_index); - wake_up(&ws->wait); + wake_up_nr(&ws->wait, wake_batch); } } diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 7c1c55f..53728d3 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -474,6 +474,133 @@ int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages, } EXPORT_SYMBOL(sg_alloc_table_from_pages); +#ifdef CONFIG_SGL_ALLOC + +/** + * sgl_alloc_order - allocate a scatterlist and its pages + * @length: Length in bytes of the scatterlist. Must be at least one + * @order: Second argument for alloc_pages() + * @chainable: Whether or not to allocate an extra element in the scatterlist + * for scatterlist chaining purposes + * @gfp: Memory allocation flags + * @nent_p: [out] Number of entries in the scatterlist that have pages + * + * Returns: A pointer to an initialized scatterlist or %NULL upon failure. + */ +struct scatterlist *sgl_alloc_order(unsigned long long length, + unsigned int order, bool chainable, + gfp_t gfp, unsigned int *nent_p) +{ + struct scatterlist *sgl, *sg; + struct page *page; + unsigned int nent, nalloc; + u32 elem_len; + + nent = round_up(length, PAGE_SIZE << order) >> (PAGE_SHIFT + order); + /* Check for integer overflow */ + if (length > (nent << (PAGE_SHIFT + order))) + return NULL; + nalloc = nent; + if (chainable) { + /* Check for integer overflow */ + if (nalloc + 1 < nalloc) + return NULL; + nalloc++; + } + sgl = kmalloc_array(nalloc, sizeof(struct scatterlist), + (gfp & ~GFP_DMA) | __GFP_ZERO); + if (!sgl) + return NULL; + + sg_init_table(sgl, nalloc); + sg = sgl; + while (length) { + elem_len = min_t(u64, length, PAGE_SIZE << order); + page = alloc_pages(gfp, order); + if (!page) { + sgl_free(sgl); + return NULL; + } + + sg_set_page(sg, page, elem_len, 0); + length -= elem_len; + sg = sg_next(sg); + } + WARN_ONCE(length, "length = %lld\n", length); + if (nent_p) + *nent_p = nent; + return sgl; +} +EXPORT_SYMBOL(sgl_alloc_order); + +/** + * sgl_alloc - allocate a scatterlist and its pages + * @length: Length in bytes of the scatterlist + * @gfp: Memory allocation flags + * @nent_p: [out] Number of entries in the scatterlist + * + * Returns: A pointer to an initialized scatterlist or %NULL upon failure. + */ +struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp, + unsigned int *nent_p) +{ + return sgl_alloc_order(length, 0, false, gfp, nent_p); +} +EXPORT_SYMBOL(sgl_alloc); + +/** + * sgl_free_n_order - free a scatterlist and its pages + * @sgl: Scatterlist with one or more elements + * @nents: Maximum number of elements to free + * @order: Second argument for __free_pages() + * + * Notes: + * - If several scatterlists have been chained and each chain element is + * freed separately then it's essential to set nents correctly to avoid that a + * page would get freed twice. + * - All pages in a chained scatterlist can be freed at once by setting @nents + * to a high number. + */ +void sgl_free_n_order(struct scatterlist *sgl, int nents, int order) +{ + struct scatterlist *sg; + struct page *page; + int i; + + for_each_sg(sgl, sg, nents, i) { + if (!sg) + break; + page = sg_page(sg); + if (page) + __free_pages(page, order); + } + kfree(sgl); +} +EXPORT_SYMBOL(sgl_free_n_order); + +/** + * sgl_free_order - free a scatterlist and its pages + * @sgl: Scatterlist with one or more elements + * @order: Second argument for __free_pages() + */ +void sgl_free_order(struct scatterlist *sgl, int order) +{ + sgl_free_n_order(sgl, INT_MAX, order); +} +EXPORT_SYMBOL(sgl_free_order); + +/** + * sgl_free - free a scatterlist and its pages + * @sgl: Scatterlist with one or more elements + */ +void sgl_free(struct scatterlist *sgl) +{ + sgl_free_order(sgl, 0); +} +EXPORT_SYMBOL(sgl_free); + +#endif /* CONFIG_SGL_ALLOC */ + void __sg_page_iter_start(struct sg_page_iter *piter, struct scatterlist *sglist, unsigned int nents, unsigned long pgoffset) diff --git a/mm/page_io.c b/mm/page_io.c index e93f1a4..b41cf96 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -50,7 +50,7 @@ static struct bio *get_swap_bio(gfp_t gfp_flags, void end_swap_bio_write(struct bio *bio) { - struct page *page = bio->bi_io_vec[0].bv_page; + struct page *page = bio_first_page_all(bio); if (bio->bi_status) { SetPageError(page); @@ -122,7 +122,7 @@ static void swap_slot_free_notify(struct page *page) static void end_swap_bio_read(struct bio *bio) { - struct page *page = bio->bi_io_vec[0].bv_page; + struct page *page = bio_first_page_all(bio); struct task_struct *waiter = bio->bi_private; if (bio->bi_status) { |