/*
 * The Kyber I/O scheduler. Controls latency by throttling queue depths using
 * scalable techniques.
 *
 * Copyright (C) 2017 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/elevator.h>
#include <linux/module.h>
#include <linux/sbitmap.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-stat.h"

/* Scheduling domains. */
enum {
	KYBER_READ,
	KYBER_SYNC_WRITE,
	KYBER_OTHER, /* Async writes, discard, etc. */
	KYBER_NUM_DOMAINS,
};

enum {
	KYBER_MIN_DEPTH = 256,

	/*
	 * In order to prevent starvation of synchronous requests by a flood of
	 * asynchronous requests, we reserve 25% of requests for synchronous
	 * operations.
	 */
	KYBER_ASYNC_PERCENT = 75,
};

/*
 * Initial device-wide depths for each scheduling domain.
 *
 * Even for fast devices with lots of tags like NVMe, you can saturate
 * the device with only a fraction of the maximum possible queue depth.
 * So, we cap these to a reasonable value.
 */
static const unsigned int kyber_depth[] = {
	[KYBER_READ] = 256,
	[KYBER_SYNC_WRITE] = 128,
	[KYBER_OTHER] = 64,
};

/*
 * Scheduling domain batch sizes. We favor reads.
 */
static const unsigned int kyber_batch_size[] = {
	[KYBER_READ] = 16,
	[KYBER_SYNC_WRITE] = 8,
	[KYBER_OTHER] = 8,
};

/*
 * There is a one-to-one mapping between ctx & hctx and between kcq & khd;
 * we use request->mq_ctx->index_hw to index the kcq in khd.
 */
struct kyber_ctx_queue {
	/*
	 * Used to ensure that operations on rq_list and kcq_map are atomic.
	 * Also protects the requests on rq_list during merging.
	 */
	spinlock_t lock;
	struct list_head rq_list[KYBER_NUM_DOMAINS];
} ____cacheline_aligned_in_smp;

struct kyber_queue_data {
	struct request_queue *q;

	struct blk_stat_callback *cb;

	/*
	 * The device is divided into multiple scheduling domains based on the
	 * request type. Each domain has a fixed number of in-flight requests
	 * of that type device-wide, limited by these tokens.
	 */
	struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];

	/*
	 * Async request percentage, converted to per-word depth for
	 * sbitmap_get_shallow().
	 */
	unsigned int async_depth;

	/* Target latencies in nanoseconds. */
	u64 read_lat_nsec, write_lat_nsec;
};
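
/*
 * Worked example of the async_depth conversion (numbers assumed for
 * illustration): with a sched_tags sbitmap word shift of 6 (64 bits per
 * word), async_depth = 64 * KYBER_ASYNC_PERCENT / 100 = 48, so
 * sbitmap_get_shallow() will only hand out 48 of the 64 tags in each word
 * to asynchronous requests, keeping the remaining 25% free for synchronous
 * ones.
 */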

struct kyber_hctx_data {
	spinlock_t lock;
	struct list_head rqs[KYBER_NUM_DOMAINS];
	unsigned int cur_domain;
	unsigned int batching;
	struct kyber_ctx_queue *kcqs;
	struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
	wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
	struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
	atomic_t wait_index[KYBER_NUM_DOMAINS];
};

static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
			     void *key);

static unsigned int kyber_sched_domain(unsigned int op)
{
	if ((op & REQ_OP_MASK) == REQ_OP_READ)
		return KYBER_READ;
	else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
		return KYBER_SYNC_WRITE;
	else
		return KYBER_OTHER;
}

enum {
	NONE = 0,
	GOOD = 1,
	GREAT = 2,
	BAD = -1,
	AWFUL = -2,
};

#define IS_GOOD(status) ((status) > 0)
#define IS_BAD(status) ((status) < 0)

static int kyber_lat_status(struct blk_stat_callback *cb,
			    unsigned int sched_domain, u64 target)
{
	u64 latency;

	if (!cb->stat[sched_domain].nr_samples)
		return NONE;

	latency = cb->stat[sched_domain].mean;
	if (latency >= 2 * target)
		return AWFUL;
	else if (latency > target)
		return BAD;
	else if (latency <= target / 2)
		return GREAT;
	else /* (latency <= target) */
		return GOOD;
}

/*
 * Adjust the read or synchronous write depth given the status of reads and
 * writes. The goal is that the latencies of the two domains are fair (i.e., if
 * one is good, then the other is good).
 */
static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
				  unsigned int sched_domain, int this_status,
				  int other_status)
{
	unsigned int orig_depth, depth;

	/*
	 * If this domain had no samples, or reads and writes are both good or
	 * both bad, don't adjust the depth.
	 */
	if (this_status == NONE ||
	    (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
	    (IS_BAD(this_status) && IS_BAD(other_status)))
		return;

	orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;

	if (other_status == NONE) {
		depth++;
	} else {
		switch (this_status) {
		case GOOD:
			if (other_status == AWFUL)
				depth -= max(depth / 4, 1U);
			else
				depth -= max(depth / 8, 1U);
			break;
		case GREAT:
			if (other_status == AWFUL)
				depth /= 2;
			else
				depth -= max(depth / 4, 1U);
			break;
		case BAD:
			depth++;
			break;
		case AWFUL:
			if (other_status == GREAT)
				depth += 2;
			else
				depth++;
			break;
		}
	}

	depth = clamp(depth, 1U, kyber_depth[sched_domain]);
	if (depth != orig_depth)
		sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
}

/*
 * Adjust the depth of other requests given the status of reads and synchronous
 * writes. As long as either domain is doing fine, we don't throttle, but if
 * both domains are doing badly, we throttle heavily.
 */
static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
				     int read_status, int write_status,
				     bool have_samples)
{
	unsigned int orig_depth, depth;
	int status;

	orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth;

	if (read_status == NONE && write_status == NONE) {
		depth += 2;
	} else if (have_samples) {
		if (read_status == NONE)
			status = write_status;
		else if (write_status == NONE)
			status = read_status;
		else
			status = max(read_status, write_status);
		switch (status) {
		case GREAT:
			depth += 2;
			break;
		case GOOD:
			depth++;
			break;
		case BAD:
			depth -= max(depth / 4, 1U);
			break;
		case AWFUL:
			depth /= 2;
			break;
		}
	}

	depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]);
	if (depth != orig_depth)
		sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth);
}
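
/*
 * For example (statuses assumed for illustration): if reads are GREAT while
 * sync writes are AWFUL, kyber_adjust_rw_depth() halves the read token depth
 * (e.g. 256 -> 128) and nudges the sync write depth up by 2, subject to the
 * clamp above, trading read throughput for write latency until the two
 * domains look fair again.
 */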

/*
 * Apply heuristics for limiting queue depths based on gathered latency
 * statistics.
 */
static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
{
	struct kyber_queue_data *kqd = cb->data;
	int read_status, write_status;

	read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
	write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec);

	kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status);
	kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status);
	kyber_adjust_other_depth(kqd, read_status, write_status,
				 cb->stat[KYBER_OTHER].nr_samples != 0);

	/*
	 * Continue monitoring latencies if we aren't hitting the targets or
	 * we're still throttling other requests.
	 */
	if (!blk_stat_is_active(kqd->cb) &&
	    ((IS_BAD(read_status) || IS_BAD(write_status) ||
	      kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])))
		blk_stat_activate_msecs(kqd->cb, 100);
}

static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
{
	/*
	 * All of the hardware queues have the same depth, so we can just grab
	 * the shift of the first one.
	 */
	return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
}

static int kyber_bucket_fn(const struct request *rq)
{
	return kyber_sched_domain(rq->cmd_flags);
}

static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
{
	struct kyber_queue_data *kqd;
	unsigned int max_tokens;
	unsigned int shift;
	int ret = -ENOMEM;
	int i;

	kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
	if (!kqd)
		goto err;
	kqd->q = q;

	kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn,
					  KYBER_NUM_DOMAINS, kqd);
	if (!kqd->cb)
		goto err_kqd;

	/*
	 * The maximum number of tokens for any scheduling domain is at least
	 * the queue depth of a single hardware queue. If the hardware doesn't
	 * have many tags, still provide a reasonable number.
	 */
	max_tokens = max_t(unsigned int, q->tag_set->queue_depth,
			   KYBER_MIN_DEPTH);
	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		WARN_ON(!kyber_depth[i]);
		WARN_ON(!kyber_batch_size[i]);
		ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
					      max_tokens, -1, false,
					      GFP_KERNEL, q->node);
		if (ret) {
			while (--i >= 0)
				sbitmap_queue_free(&kqd->domain_tokens[i]);
			goto err_cb;
		}
		sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]);
	}

	shift = kyber_sched_tags_shift(kqd);
	kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;

	kqd->read_lat_nsec = 2000000ULL;
	kqd->write_lat_nsec = 10000000ULL;

	return kqd;

err_cb:
	blk_stat_free_callback(kqd->cb);
err_kqd:
	kfree(kqd);
err:
	return ERR_PTR(ret);
}

static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
{
	struct kyber_queue_data *kqd;
	struct elevator_queue *eq;

	eq = elevator_alloc(q, e);
	if (!eq)
		return -ENOMEM;

	kqd = kyber_queue_data_alloc(q);
	if (IS_ERR(kqd)) {
		kobject_put(&eq->kobj);
		return PTR_ERR(kqd);
	}

	eq->elevator_data = kqd;
	q->elevator = eq;

	blk_stat_add_callback(q, kqd->cb);

	return 0;
}

static void kyber_exit_sched(struct elevator_queue *e)
{
	struct kyber_queue_data *kqd = e->elevator_data;
	struct request_queue *q = kqd->q;
	int i;

	blk_stat_remove_callback(q, kqd->cb);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
		sbitmap_queue_free(&kqd->domain_tokens[i]);
	blk_stat_free_callback(kqd->cb);
	kfree(kqd);
}

static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq)
{
	unsigned int i;

	spin_lock_init(&kcq->lock);
	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
		INIT_LIST_HEAD(&kcq->rq_list[i]);
}

static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
	struct kyber_hctx_data *khd;
	int i;

	khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node);
	if (!khd)
		return -ENOMEM;

	khd->kcqs = kmalloc_array_node(hctx->nr_ctx,
				       sizeof(struct kyber_ctx_queue),
				       GFP_KERNEL, hctx->numa_node);
	if (!khd->kcqs)
		goto err_khd;

	for (i = 0; i < hctx->nr_ctx; i++)
		kyber_ctx_queue_init(&khd->kcqs[i]);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx,
				      ilog2(8), GFP_KERNEL, hctx->numa_node)) {
			while (--i >= 0)
				sbitmap_free(&khd->kcq_map[i]);
			goto err_kcqs;
		}
	}

	spin_lock_init(&khd->lock);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		INIT_LIST_HEAD(&khd->rqs[i]);
		init_waitqueue_func_entry(&khd->domain_wait[i],
					  kyber_domain_wake);
		khd->domain_wait[i].private = hctx;
		INIT_LIST_HEAD(&khd->domain_wait[i].entry);
		atomic_set(&khd->wait_index[i], 0);
	}

	khd->cur_domain = 0;
	khd->batching = 0;

	hctx->sched_data = khd;
	sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags,
					kqd->async_depth);

	return 0;

err_kcqs:
	kfree(khd->kcqs);
err_khd:
	kfree(khd);
	return -ENOMEM;
}

static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	struct kyber_hctx_data *khd = hctx->sched_data;
	int i;

	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
		sbitmap_free(&khd->kcq_map[i]);
	kfree(khd->kcqs);
	kfree(hctx->sched_data);
}

static int rq_get_domain_token(struct request *rq)
{
	return (long)rq->elv.priv[0];
}

static void rq_set_domain_token(struct request *rq, int token)
{
	rq->elv.priv[0] = (void *)(long)token;
}

static void rq_clear_domain_token(struct kyber_queue_data *kqd,
				  struct request *rq)
{
	unsigned int sched_domain;
	int nr;

	nr = rq_get_domain_token(rq);
	if (nr != -1) {
		sched_domain = kyber_sched_domain(rq->cmd_flags);
		sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
				    rq->mq_ctx->cpu);
	}
}

static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
{
	/*
	 * We use the scheduler tags as per-hardware queue queueing tokens.
	 * Async requests can be limited at this stage.
	 */
	if (!op_is_sync(op)) {
		struct kyber_queue_data *kqd = data->q->elevator->elevator_data;

		data->shallow_depth = kqd->async_depth;
	}
}
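
/*
 * For instance, a background writeback WRITE without REQ_SYNC is capped to
 * async_depth scheduler tags per sbitmap word, while reads and synchronous
 * writes (which op_is_sync() reports as sync) are never limited here.
 */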

static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
{
	struct kyber_hctx_data *khd = hctx->sched_data;
	struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue);
	struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw];
	unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
	struct list_head *rq_list = &kcq->rq_list[sched_domain];
	bool merged;

	spin_lock(&kcq->lock);
	merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio);
	spin_unlock(&kcq->lock);
	blk_mq_put_ctx(ctx);

	return merged;
}

static void kyber_prepare_request(struct request *rq, struct bio *bio)
{
	rq_set_domain_token(rq, -1);
}

static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
				  struct list_head *rq_list, bool at_head)
{
	struct kyber_hctx_data *khd = hctx->sched_data;
	struct request *rq, *next;

	list_for_each_entry_safe(rq, next, rq_list, queuelist) {
		unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
		struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw];
		struct list_head *head = &kcq->rq_list[sched_domain];

		spin_lock(&kcq->lock);
		if (at_head)
			list_move(&rq->queuelist, head);
		else
			list_move_tail(&rq->queuelist, head);
		sbitmap_set_bit(&khd->kcq_map[sched_domain],
				rq->mq_ctx->index_hw);
		blk_mq_sched_request_inserted(rq);
		spin_unlock(&kcq->lock);
	}
}

static void kyber_finish_request(struct request *rq)
{
	struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;

	rq_clear_domain_token(kqd, rq);
}

static void kyber_completed_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct kyber_queue_data *kqd = q->elevator->elevator_data;
	unsigned int sched_domain;
	u64 now, latency, target;

	/*
	 * Check if this request met our latency goal. If not, quickly gather
	 * some statistics and start throttling.
	 */
	sched_domain = kyber_sched_domain(rq->cmd_flags);
	switch (sched_domain) {
	case KYBER_READ:
		target = kqd->read_lat_nsec;
		break;
	case KYBER_SYNC_WRITE:
		target = kqd->write_lat_nsec;
		break;
	default:
		return;
	}

	/* If we are already monitoring latencies, don't check again. */
	if (blk_stat_is_active(kqd->cb))
		return;

	now = ktime_get_ns();
	if (now < rq->io_start_time_ns)
		return;

	latency = now - rq->io_start_time_ns;

	if (latency > target)
		blk_stat_activate_msecs(kqd->cb, 10);
}
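
/*
 * Example of the flow above (numbers assumed for illustration): with the
 * default 2 ms read target, a read that completes 3 ms after it was issued
 * arms a 10 ms statistics window; once that window fires,
 * kyber_stat_timer_fn() looks at the gathered per-domain latencies and
 * resizes the domain token depths.
 */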

struct flush_kcq_data {
	struct kyber_hctx_data *khd;
	unsigned int sched_domain;
	struct list_head *list;
};

static bool flush_busy_kcq(struct sbitmap *sb, unsigned int bitnr, void *data)
{
	struct flush_kcq_data *flush_data = data;
	struct kyber_ctx_queue *kcq = &flush_data->khd->kcqs[bitnr];

	spin_lock(&kcq->lock);
	list_splice_tail_init(&kcq->rq_list[flush_data->sched_domain],
			      flush_data->list);
	sbitmap_clear_bit(sb, bitnr);
	spin_unlock(&kcq->lock);

	return true;
}

static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd,
				  unsigned int sched_domain,
				  struct list_head *list)
{
	struct flush_kcq_data data = {
		.khd = khd,
		.sched_domain = sched_domain,
		.list = list,
	};

	sbitmap_for_each_set(&khd->kcq_map[sched_domain],
			     flush_busy_kcq, &data);
}

static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
			     void *key)
{
	struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);

	list_del_init(&wait->entry);
	blk_mq_run_hw_queue(hctx, true);
	return 1;
}

static int kyber_get_domain_token(struct kyber_queue_data *kqd,
				  struct kyber_hctx_data *khd,
				  struct blk_mq_hw_ctx *hctx)
{
	unsigned int sched_domain = khd->cur_domain;
	struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
	wait_queue_entry_t *wait = &khd->domain_wait[sched_domain];
	struct sbq_wait_state *ws;
	int nr;

	nr = __sbitmap_queue_get(domain_tokens);

	/*
	 * If we failed to get a domain token, make sure the hardware queue is
	 * run when one becomes available. Note that this is serialized on
	 * khd->lock, but we still need to be careful about the waker.
	 */
	if (nr < 0 && list_empty_careful(&wait->entry)) {
		ws = sbq_wait_ptr(domain_tokens,
				  &khd->wait_index[sched_domain]);
		khd->domain_ws[sched_domain] = ws;
		add_wait_queue(&ws->wait, wait);

		/*
		 * Try again in case a token was freed before we got on the
		 * wait queue.
		 */
		nr = __sbitmap_queue_get(domain_tokens);
	}

	/*
	 * If we got a token while we were on the wait queue, remove ourselves
	 * from the wait queue to ensure that all wake ups make forward
	 * progress. It's possible that the waker already deleted the entry
	 * between the !list_empty_careful() check and us grabbing the lock,
	 * but list_del_init() is okay with that.
	 */
	if (nr >= 0 && !list_empty_careful(&wait->entry)) {
		ws = khd->domain_ws[sched_domain];
		spin_lock_irq(&ws->wait.lock);
		list_del_init(&wait->entry);
		spin_unlock_irq(&ws->wait.lock);
	}

	return nr;
}
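
/*
 * Dispatch below is batched per domain. As an illustration (assuming tokens
 * and pending requests are available), a hardware queue dispatches up to 16
 * reads before rotating to sync writes (up to 8) and then to other requests
 * (up to 8), per the kyber_batch_size table.
 */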

static struct request *
kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
			  struct kyber_hctx_data *khd,
			  struct blk_mq_hw_ctx *hctx)
{
	struct list_head *rqs;
	struct request *rq;
	int nr;

	rqs = &khd->rqs[khd->cur_domain];

	/*
	 * If we already have a flushed request, then we just need to get a
	 * token for it. Otherwise, if there are pending requests in the kcqs,
	 * flush the kcqs, but only if we can get a token. If not, we should
	 * leave the requests in the kcqs so that they can be merged. Note that
	 * khd->lock serializes the flushes, so if we observed any bit set in
	 * the kcq_map, we will always get a request.
	 */
	rq = list_first_entry_or_null(rqs, struct request, queuelist);
	if (rq) {
		nr = kyber_get_domain_token(kqd, khd, hctx);
		if (nr >= 0) {
			khd->batching++;
			rq_set_domain_token(rq, nr);
			list_del_init(&rq->queuelist);
			return rq;
		}
	} else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
		nr = kyber_get_domain_token(kqd, khd, hctx);
		if (nr >= 0) {
			kyber_flush_busy_kcqs(khd, khd->cur_domain, rqs);
			rq = list_first_entry(rqs, struct request, queuelist);
			khd->batching++;
			rq_set_domain_token(rq, nr);
			list_del_init(&rq->queuelist);
			return rq;
		}
	}

	/* There were either no pending requests or no tokens. */
	return NULL;
}

static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
	struct kyber_hctx_data *khd = hctx->sched_data;
	struct request *rq;
	int i;

	spin_lock(&khd->lock);

	/*
	 * First, if we are still entitled to batch, try to dispatch a request
	 * from the batch.
	 */
	if (khd->batching < kyber_batch_size[khd->cur_domain]) {
		rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
		if (rq)
			goto out;
	}

	/*
	 * Either,
	 * 1. We were no longer entitled to a batch.
	 * 2. The domain we were batching didn't have any requests.
	 * 3. The domain we were batching was out of tokens.
	 *
	 * Start another batch. Note that this wraps back around to the original
	 * domain if no other domains have requests or tokens.
	 */
	khd->batching = 0;
	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
			khd->cur_domain = 0;
		else
			khd->cur_domain++;

		rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
		if (rq)
			goto out;
	}

	rq = NULL;
out:
	spin_unlock(&khd->lock);
	return rq;
}

static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
{
	struct kyber_hctx_data *khd = hctx->sched_data;
	int i;

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		if (!list_empty_careful(&khd->rqs[i]) ||
		    sbitmap_any_bit_set(&khd->kcq_map[i]))
			return true;
	}

	return false;
}

#define KYBER_LAT_SHOW_STORE(op)					\
static ssize_t kyber_##op##_lat_show(struct elevator_queue *e,		\
				     char *page)			\
{									\
	struct kyber_queue_data *kqd = e->elevator_data;		\
									\
	return sprintf(page, "%llu\n", kqd->op##_lat_nsec);		\
}									\
									\
static ssize_t kyber_##op##_lat_store(struct elevator_queue *e,	\
				      const char *page, size_t count)	\
{									\
	struct kyber_queue_data *kqd = e->elevator_data;		\
	unsigned long long nsec;					\
	int ret;							\
									\
	ret = kstrtoull(page, 10, &nsec);				\
	if (ret)							\
		return ret;						\
									\
	kqd->op##_lat_nsec = nsec;					\
									\
	return count;							\
}
KYBER_LAT_SHOW_STORE(read);
KYBER_LAT_SHOW_STORE(write);
#undef KYBER_LAT_SHOW_STORE

#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
static struct elv_fs_entry kyber_sched_attrs[] = {
	KYBER_LAT_ATTR(read),
	KYBER_LAT_ATTR(write),
	__ATTR_NULL
};
#undef KYBER_LAT_ATTR

#ifdef CONFIG_BLK_DEBUG_FS
#define KYBER_DEBUGFS_DOMAIN_ATTRS(domain, name)			\
static int kyber_##name##_tokens_show(void *data, struct seq_file *m)	\
{									\
	struct request_queue *q = data;					\
	struct kyber_queue_data *kqd = q->elevator->elevator_data;	\
									\
	sbitmap_queue_show(&kqd->domain_tokens[domain], m);		\
	return 0;							\
}									\
									\
static void *kyber_##name##_rqs_start(struct seq_file *m, loff_t *pos)	\
	__acquires(&khd->lock)						\
{									\
	struct blk_mq_hw_ctx *hctx = m->private;			\
	struct kyber_hctx_data *khd = hctx->sched_data;			\
									\
	spin_lock(&khd->lock);						\
	return seq_list_start(&khd->rqs[domain], *pos);			\
}									\
									\
static void *kyber_##name##_rqs_next(struct seq_file *m, void *v,	\
				     loff_t *pos)			\
{									\
	struct blk_mq_hw_ctx *hctx = m->private;			\
	struct kyber_hctx_data *khd = hctx->sched_data;			\
									\
	return seq_list_next(v, &khd->rqs[domain], pos);		\
}									\
									\
static void kyber_##name##_rqs_stop(struct seq_file *m, void *v)	\
	__releases(&khd->lock)						\
{									\
	struct blk_mq_hw_ctx *hctx = m->private;			\
	struct kyber_hctx_data *khd = hctx->sched_data;			\
									\
	spin_unlock(&khd->lock);					\
}									\
									\
static const struct seq_operations kyber_##name##_rqs_seq_ops = {	\
	.start	= kyber_##name##_rqs_start,				\
	.next	= kyber_##name##_rqs_next,				\
	.stop	= kyber_##name##_rqs_stop,				\
	.show	= blk_mq_debugfs_rq_show,				\
};									\
									\
static int kyber_##name##_waiting_show(void *data, struct seq_file *m)	\
{									\
	struct blk_mq_hw_ctx *hctx = data;				\
	struct kyber_hctx_data *khd = hctx->sched_data;			\
	wait_queue_entry_t *wait = &khd->domain_wait[domain];		\
									\
	seq_printf(m, "%d\n", !list_empty_careful(&wait->entry));	\
	return 0;							\
}
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_SYNC_WRITE, sync_write)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
#undef KYBER_DEBUGFS_DOMAIN_ATTRS

static int kyber_async_depth_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct kyber_queue_data *kqd = q->elevator->elevator_data;

	seq_printf(m, "%u\n", kqd->async_depth);
	return 0;
}

static int kyber_cur_domain_show(void *data, struct seq_file *m)
{
	struct blk_mq_hw_ctx *hctx = data;
	struct kyber_hctx_data *khd = hctx->sched_data;

	switch (khd->cur_domain) {
	case KYBER_READ:
		seq_puts(m, "READ\n");
		break;
	case KYBER_SYNC_WRITE:
		seq_puts(m, "SYNC_WRITE\n");
		break;
	case KYBER_OTHER:
		seq_puts(m, "OTHER\n");
		break;
	default:
		seq_printf(m, "%u\n", khd->cur_domain);
		break;
	}
	return 0;
}

static int kyber_batching_show(void *data, struct seq_file *m)
{
	struct blk_mq_hw_ctx *hctx = data;
	struct kyber_hctx_data *khd = hctx->sched_data;

	seq_printf(m, "%u\n", khd->batching);
	return 0;
}

#define KYBER_QUEUE_DOMAIN_ATTRS(name)					\
	{#name "_tokens", 0400, kyber_##name##_tokens_show}
static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
	KYBER_QUEUE_DOMAIN_ATTRS(read),
	KYBER_QUEUE_DOMAIN_ATTRS(sync_write),
	KYBER_QUEUE_DOMAIN_ATTRS(other),
	{"async_depth", 0400, kyber_async_depth_show},
	{},
};
#undef KYBER_QUEUE_DOMAIN_ATTRS

#define KYBER_HCTX_DOMAIN_ATTRS(name)					\
	{#name "_rqs", 0400, .seq_ops = &kyber_##name##_rqs_seq_ops},	\
	{#name "_waiting", 0400, kyber_##name##_waiting_show}
static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
	KYBER_HCTX_DOMAIN_ATTRS(read),
	KYBER_HCTX_DOMAIN_ATTRS(sync_write),
	KYBER_HCTX_DOMAIN_ATTRS(other),
	{"cur_domain", 0400, kyber_cur_domain_show},
	{"batching", 0400, kyber_batching_show},
	{},
};
#undef KYBER_HCTX_DOMAIN_ATTRS
#endif

static struct elevator_type kyber_sched = {
	.ops.mq = {
		.init_sched = kyber_init_sched,
		.exit_sched = kyber_exit_sched,
		.init_hctx = kyber_init_hctx,
		.exit_hctx = kyber_exit_hctx,
		.limit_depth = kyber_limit_depth,
		.bio_merge = kyber_bio_merge,
		.prepare_request = kyber_prepare_request,
		.insert_requests = kyber_insert_requests,
		.finish_request = kyber_finish_request,
		.requeue_request = kyber_finish_request,
		.completed_request = kyber_completed_request,
		.dispatch_request = kyber_dispatch_request,
		.has_work = kyber_has_work,
	},
	.uses_mq = true,
#ifdef CONFIG_BLK_DEBUG_FS
	.queue_debugfs_attrs = kyber_queue_debugfs_attrs,
	.hctx_debugfs_attrs = kyber_hctx_debugfs_attrs,
#endif
	.elevator_attrs = kyber_sched_attrs,
	.elevator_name = "kyber",
	.elevator_owner = THIS_MODULE,
};

static int __init kyber_init(void)
{
	return elv_register(&kyber_sched);
}

static void __exit kyber_exit(void)
{
	elv_unregister(&kyber_sched);
}

module_init(kyber_init);
module_exit(kyber_exit);

MODULE_AUTHOR("Omar Sandoval");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Kyber I/O scheduler");
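
/*
 * Example usage (device name "sda" assumed): the scheduler registers as
 * "kyber", and the targets default to 2 ms for reads and 10 ms for
 * synchronous writes. Both can be tuned at runtime via the iosched sysfs
 * attributes defined above:
 *
 *   echo kyber > /sys/block/sda/queue/scheduler
 *   cat /sys/block/sda/queue/iosched/read_lat_nsec      # 2000000
 *   echo 1000000 > /sys/block/sda/queue/iosched/read_lat_nsec
 *   echo 5000000 > /sys/block/sda/queue/iosched/write_lat_nsec
 */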