Diffstat (limited to 'sys/geom/sched/gs_rr.c')
-rw-r--r--	sys/geom/sched/gs_rr.c | 685
1 file changed, 685 insertions, 0 deletions
diff --git a/sys/geom/sched/gs_rr.c b/sys/geom/sched/gs_rr.c
new file mode 100644
index 0000000..185dc71
--- /dev/null
+++ b/sys/geom/sched/gs_rr.c
@@ -0,0 +1,685 @@
+/*-
+ * Copyright (c) 2009-2010 Fabio Checconi, Luigi Rizzo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $Id$
+ * $FreeBSD$
+ *
+ * A round-robin (RR) anticipatory scheduler, with per-client queues.
+ *
+ * The goal of this implementation is to improve throughput compared
+ * to the pure elevator algorithm, and ensure some fairness among
+ * clients.
+ *
+ * Requests coming from the same client are put in the same queue.
+ * We use anticipation to help reduce seeks, and each queue
+ * is never served continuously for more than a given amount of
+ * time or data. Queues are then served in a round-robin fashion.
+ *
+ * Each queue can be in any of the following states:
+ *	READY	immediately serve the first pending request;
+ *	BUSY	one request is under service, wait for completion;
+ *	IDLING	do not serve incoming requests immediately, unless
+ *		they are "eligible" as defined later.
+ *
+ * Scheduling is done by looking at the status of all queues;
+ * the first one in round-robin order is privileged.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/bio.h>
+#include <sys/callout.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/sysctl.h>
+#include "gs_scheduler.h"
+
+/* possible states of a queue */
+enum g_rr_state {
+	G_QUEUE_READY = 0,	/* Ready to dispatch. */
+	G_QUEUE_BUSY,		/* Waiting for a completion. */
+	G_QUEUE_IDLING		/* Waiting for a new request. */
+};
+
+/* possible queue flags */
+enum g_rr_flags {
+	G_FLAG_COMPLETED = 1,	/* Completed a req. in the current budget. */
+};
+
+struct g_rr_softc;
+
+/*
+ * Queue descriptor, containing a reference count, scheduling
+ * state, a queue of pending requests, and configuration parameters.
+ * Queues with pending request(s) and not under service are also
+ * stored in a Round Robin (RR) list.
+ */
+struct g_rr_queue {
+	struct g_rr_softc *q_sc;	/* link to the parent */
+
+	enum g_rr_state	q_status;
+	unsigned int	q_service;	/* service received so far */
+	int		q_slice_end;	/* actual slice end in ticks */
+	enum g_rr_flags	q_flags;	/* queue flags */
+	struct bio_queue_head q_bioq;
+
+	/* Scheduling parameters */
+	unsigned int	q_budget;	/* slice size in bytes */
+	unsigned int	q_slice_duration; /* slice size in ticks */
+	unsigned int	q_wait_ticks;	/* wait time for anticipation */
+
+	/* Stats to drive the various heuristics. */
+	struct g_savg	q_thinktime;	/* Thinktime average. */
+	struct g_savg	q_seekdist;	/* Seek distance average. */
+
+	int		q_bionum;	/* Number of requests. */
+
+	off_t		q_lastoff;	/* Last submitted req. offset. */
+	int		q_lastsub;	/* Last submitted req. time. */
+
+	/* Expiration deadline for an empty queue. */
+	int		q_expire;
+
+	TAILQ_ENTRY(g_rr_queue) q_tailq; /* RR list link field */
+};
+
+/* List types. */
+TAILQ_HEAD(g_rr_tailq, g_rr_queue);
+
+/* list of scheduler instances */
+LIST_HEAD(g_scheds, g_rr_softc);
+
+/* Default quantum for RR between queues. */
+#define	G_RR_DEFAULT_BUDGET	0x00800000
+
+/*
+ * Per-device descriptor, holding the Round Robin list of queues
+ * accessing the disk, a reference to the geom, and the timer.
+ */
+struct g_rr_softc {
+	struct g_geom	*sc_geom;
+
+	/*
+	 * sc_active is the queue we are anticipating for.
+	 * It is set only in g_rr_next(), and possibly cleared
+	 * only in g_rr_next() or on a timeout.
+	 * The active queue is never in the Round Robin list
+	 * even if it has requests queued.
+	 */
+	struct g_rr_queue *sc_active;
+	struct callout	sc_wait;	/* timer for sc_active */
+
+	struct g_rr_tailq sc_rr_tailq;	/* the round-robin list */
+	int		sc_nqueues;	/* number of queues */
+
+	/* Statistics */
+	int		sc_in_flight;	/* requests in the driver */
+
+	LIST_ENTRY(g_rr_softc) sc_next;
+};
+
+/* Descriptor for bounded values, min and max are constant. */
+struct x_bound {
+	const int	x_min;
+	int		x_cur;
+	const int	x_max;
+};
+
+/*
+ * Parameters, config and stats.
+ */
+struct g_rr_params {
+	int	queues;			/* total number of queues */
+	int	w_anticipate;		/* anticipate writes */
+	int	bypass;			/* bypass the scheduler */
+
+	int	units;			/* how many instances */
+	/* sc_head is used for debugging */
+	struct g_scheds	sc_head;	/* first scheduler instance */
+
+	struct x_bound queue_depth;	/* max parallel requests */
+	struct x_bound wait_ms;		/* wait time, milliseconds */
+	struct x_bound quantum_ms;	/* quantum size, milliseconds */
+	struct x_bound quantum_kb;	/* quantum size, KB (1024 bytes) */
+
+	/* statistics */
+	int	wait_hit;		/* success in anticipation */
+	int	wait_miss;		/* failure in anticipation */
+};
+
+/*
+ * Default parameters for the scheduler. The quantum sizes target
+ * an 80 MB/s disk; if the hardware is faster or slower, the smaller
+ * of the two quanta takes effect: the clients will still be isolated,
+ * but fairness may be limited. A complete solution would involve
+ * the on-line measurement of the actual disk throughput to derive
+ * these parameters. Or we may just choose to ignore service-domain
+ * fairness and accept what can be achieved with time-only budgets.
+ */
+static struct g_rr_params me = {
+	.sc_head = LIST_HEAD_INITIALIZER(&me.sc_head),
+	.w_anticipate = 1,
+	.queue_depth =	{ 1,	1,	50 },
+	.wait_ms =	{ 1,	10,	30 },
+	.quantum_ms =	{ 1,	100,	500 },
+	.quantum_kb =	{ 16,	8192,	65536 },
+};
+
+struct g_rr_params *gs_rr_me = &me;
+
+SYSCTL_DECL(_kern_geom_sched);
+SYSCTL_NODE(_kern_geom_sched, OID_AUTO, rr, CTLFLAG_RW, 0,
+    "GEOM_SCHED ROUND ROBIN stuff");
+SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, units, CTLFLAG_RD,
+    &me.units, 0, "Scheduler instances");
+SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, queues, CTLFLAG_RD,
+    &me.queues, 0, "Total rr queues");
+SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, wait_ms, CTLFLAG_RW,
+    &me.wait_ms.x_cur, 0, "Wait time milliseconds");
+SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, quantum_ms, CTLFLAG_RW,
+    &me.quantum_ms.x_cur, 0, "Quantum size milliseconds");
+SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, bypass, CTLFLAG_RW,
+    &me.bypass, 0, "Bypass scheduler");
+SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, w_anticipate, CTLFLAG_RW,
+    &me.w_anticipate, 0, "Do anticipation on writes");
+SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, quantum_kb, CTLFLAG_RW,
+    &me.quantum_kb.x_cur, 0, "Quantum size Kbytes");
+SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, queue_depth, CTLFLAG_RW,
+    &me.queue_depth.x_cur, 0, "Maximum simultaneous requests");
+SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, wait_hit, CTLFLAG_RW,
+    &me.wait_hit, 0, "Hits in anticipation");
+SYSCTL_UINT(_kern_geom_sched_rr, OID_AUTO, wait_miss, CTLFLAG_RW,
+    &me.wait_miss, 0, "Misses in anticipation");
+
+#ifdef DEBUG_QUEUES
+/* print the status of a queue */
+static void
+gs_rr_dump_q(struct g_rr_queue *qp, int index)
+{
+	int l = 0;
+	struct bio *bp;
+
+	TAILQ_FOREACH(bp, &(qp->q_bioq.queue), bio_queue) {
+		l++;
+	}
+	printf("--- rr queue %d %p status %d len %d ---\n",
+	    index, qp, qp->q_status, l);
+}
+
+/*
+ * Dump the scheduler status when writing to this sysctl variable.
+ * XXX this walks every instance; acceptable because the code is
+ * only used for debugging.
+ */
+static int
+gs_rr_sysctl_status(SYSCTL_HANDLER_ARGS)
+{
+	int error, val = 0;
+	struct g_rr_softc *sc;
+
+	error = sysctl_handle_int(oidp, &val, 0, req);
+	if (error || !req->newptr)
+		return (error);
+
+	printf("called %s\n", __func__);
+
+	LIST_FOREACH(sc, &me.sc_head, sc_next) {
+		struct g_rr_queue *qp;
+		int tot = 0;
+
+		printf("--- sc %p active %p nqueues %d "
+		    "callout %d in_flight %d ---\n",
+		    sc, sc->sc_active, sc->sc_nqueues,
+		    callout_active(&sc->sc_wait),
+		    sc->sc_in_flight);
+		/*
+		 * The active queue is never on the RR list, so
+		 * dump it separately, then walk the RR list.
+		 */
+		if (sc->sc_active != NULL)
+			gs_rr_dump_q(sc->sc_active, tot++);
+		TAILQ_FOREACH(qp, &sc->sc_rr_tailq, q_tailq)
+			gs_rr_dump_q(qp, tot++);
+	}
+	return (0);
+}
+
+SYSCTL_PROC(_kern_geom_sched_rr, OID_AUTO, status,
+    CTLTYPE_UINT | CTLFLAG_RW,
+    0, sizeof(int), gs_rr_sysctl_status, "I", "status");
+
+#endif	/* DEBUG_QUEUES */
+
+/*
+ * Get a bounded value, optionally converted to ticks with a
+ * minimum of t_min.
+ */
+static int
+get_bounded(struct x_bound *v, int t_min)
+{
+	int x;
+
+	x = v->x_cur;
+	if (x < v->x_min)
+		x = v->x_min;
+	else if (x > v->x_max)
+		x = v->x_max;
+	if (t_min) {
+		x = x * hz / 1000;	/* convert milliseconds to ticks */
+		if (x < t_min)
+			x = t_min;
+	}
+	return (x);
+}
+
+/*
+ * Get a reference to the queue for bp, using the generic
+ * classification mechanism.
+ */
+static struct g_rr_queue *
+g_rr_queue_get(struct g_rr_softc *sc, struct bio *bp)
+{
+
+	return (g_sched_get_class(sc->sc_geom, bp));
+}
+
+static int
+g_rr_init_class(void *data, void *priv)
+{
+	struct g_rr_softc *sc = data;
+	struct g_rr_queue *qp = priv;
+
+	gs_bioq_init(&qp->q_bioq);
+
+	/*
+	 * Set the initial parameters for the client:
+	 * slice size in bytes and ticks, and wait ticks.
+	 * Right now these are constant, but we could have
+	 * autoconfiguration code to adjust the values based on
+	 * the actual workload.
+	 */
+	qp->q_budget = 1024 * get_bounded(&me.quantum_kb, 0);
+	qp->q_slice_duration = get_bounded(&me.quantum_ms, 2);
+	qp->q_wait_ticks = get_bounded(&me.wait_ms, 2);
+
+	qp->q_sc = sc;		/* link to the parent */
+	qp->q_sc->sc_nqueues++;
+	me.queues++;
+
+	return (0);
+}
+
+/*
+ * Release a reference to the queue.
+ */
+static void
+g_rr_queue_put(struct g_rr_queue *qp)
+{
+
+	g_sched_put_class(qp->q_sc->sc_geom, qp);
+}
+
+static void
+g_rr_fini_class(void *data, void *priv)
+{
+	struct g_rr_queue *qp = priv;
+
+	KASSERT(gs_bioq_first(&qp->q_bioq) == NULL,
+	    ("released nonempty queue"));
+	qp->q_sc->sc_nqueues--;
+	me.queues--;
+}
+
+static inline int
+g_rr_queue_expired(struct g_rr_queue *qp)
+{
+
+	if (qp->q_service >= qp->q_budget)
+		return (1);
+
+	if ((qp->q_flags & G_FLAG_COMPLETED) &&
+	    ticks - qp->q_slice_end >= 0)
+		return (1);
+
+	return (0);
+}
+
+static inline int
+g_rr_should_anticipate(struct g_rr_queue *qp, struct bio *bp)
+{
+	int wait = get_bounded(&me.wait_ms, 2);
+
+	if (!me.w_anticipate && (bp->bio_cmd & BIO_WRITE))
+		return (0);
+
+	if (g_savg_valid(&qp->q_thinktime) &&
+	    g_savg_read(&qp->q_thinktime) > wait)
+		return (0);
+
+	if (g_savg_valid(&qp->q_seekdist) &&
+	    g_savg_read(&qp->q_seekdist) > 8192)
+		return (0);
+
+	return (1);
+}
+
+/*
+ * Called on a request arrival, timeout or completion.
+ * Try to serve a request among those queued.
+ */
+static struct bio *
+g_rr_next(void *data, int force)
+{
+	struct g_rr_softc *sc = data;
+	struct g_rr_queue *qp;
+	struct bio *bp, *next;
+	int expired;
+
+	qp = sc->sc_active;
+	if (me.bypass == 0 && !force) {
+		if (sc->sc_in_flight >= get_bounded(&me.queue_depth, 0))
+			return (NULL);
+
+		/* Try with the queue under service first. */
+		if (qp != NULL && qp->q_status != G_QUEUE_READY) {
+			/*
+			 * Queue is anticipating, ignore request.
+			 * We should check that we are not past
+			 * the timeout, but in that case the timeout
+			 * will fire immediately afterwards so we
+			 * don't bother.
+			 */
+			return (NULL);
+		}
+	} else if (qp != NULL && qp->q_status != G_QUEUE_READY) {
+		g_rr_queue_put(qp);
+		sc->sc_active = qp = NULL;
+	}
+
+	/*
+	 * No queue under service, look for the first in RR order.
+	 * If we find it, select it as sc_active, and clear its
+	 * service count and flags for the new slice.
+	 */
+	if (qp == NULL) {
+		qp = TAILQ_FIRST(&sc->sc_rr_tailq);
+		if (qp == NULL)
+			return (NULL);	/* no queues at all, return */
+		/* otherwise select the new queue for service. */
+		TAILQ_REMOVE(&sc->sc_rr_tailq, qp, q_tailq);
+		sc->sc_active = qp;
+		qp->q_service = 0;
+		qp->q_flags &= ~G_FLAG_COMPLETED;
+	}
+
+	bp = gs_bioq_takefirst(&qp->q_bioq);	/* surely not NULL */
+	qp->q_service += bp->bio_length;	/* charge the service */
+
+	/*
+	 * The request at the head of the active queue is always
+	 * dispatched, and g_rr_next() will be called again
+	 * immediately.
+	 * We need to prepare for what to do next:
+	 *
+	 * 1. have we reached the end of the (time or service) slice?
+	 *    If so, clear sc_active and possibly requeue the previous
+	 *    active queue if it has more requests pending;
+	 * 2. do we have more requests in sc_active?
+	 *    If yes, do not anticipate, as g_rr_next() will run again;
+	 *    if no, decide whether or not to anticipate depending
+	 *    on reads or writes (e.g., anticipate only on reads).
+	 */
+	expired = g_rr_queue_expired(qp);	/* has the slice expired? */
+	next = gs_bioq_first(&qp->q_bioq);	/* do we have one more? */
+	if (expired) {
+		sc->sc_active = NULL;
+		/* Either requeue or release reference. */
+		if (next != NULL)
+			TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq);
+		else
+			g_rr_queue_put(qp);
+	} else if (next != NULL) {
+		qp->q_status = G_QUEUE_READY;
+	} else {
+		if (!force && g_rr_should_anticipate(qp, bp)) {
+			/* anticipate */
+			qp->q_status = G_QUEUE_BUSY;
+		} else {
+			/* do not anticipate, release reference */
+			g_rr_queue_put(qp);
+			sc->sc_active = NULL;
+		}
+	}
+	/* If sc_active != NULL, its q_status is always correct. */
+
+	sc->sc_in_flight++;
+
+	return (bp);
+}
+
+static inline void
+g_rr_update_thinktime(struct g_rr_queue *qp)
+{
+	int delta = ticks - qp->q_lastsub, wait = get_bounded(&me.wait_ms, 2);
+
+	if (qp->q_sc->sc_active != qp)
+		return;
+
+	qp->q_lastsub = ticks;
+	delta = (delta > 2 * wait) ? 2 * wait : delta;
+	if (qp->q_bionum > 7)
+		g_savg_add_sample(&qp->q_thinktime, delta);
+}
+
+static inline void
+g_rr_update_seekdist(struct g_rr_queue *qp, struct bio *bp)
+{
+	off_t dist;
+
+	if (qp->q_lastoff > bp->bio_offset)
+		dist = qp->q_lastoff - bp->bio_offset;
+	else
+		dist = bp->bio_offset - qp->q_lastoff;
+
+	if (dist > (8192 * 8))
+		dist = 8192 * 8;
+
+	qp->q_lastoff = bp->bio_offset + bp->bio_length;
+
+	if (qp->q_bionum > 7)
+		g_savg_add_sample(&qp->q_seekdist, dist);
+}
+
+/*
+ * Called when a real request for disk I/O arrives.
+ * Locate the queue associated with the client.
+ * If the queue is the one we are anticipating for, reset its timeout;
+ * if the queue is not in the round robin list, insert it in the list.
+ * On any error, do not queue the request and return -1; the caller
+ * will take care of the request.
+ */
+static int
+g_rr_start(void *data, struct bio *bp)
+{
+	struct g_rr_softc *sc = data;
+	struct g_rr_queue *qp;
+
+	if (me.bypass)
+		return (-1);	/* bypass the scheduler */
+
+	/* Get the queue for the request. */
+	qp = g_rr_queue_get(sc, bp);
+	if (qp == NULL)
+		return (-1);	/* allocation failed, tell upstream */
+
+	if (gs_bioq_first(&qp->q_bioq) == NULL) {
+		/*
+		 * We are inserting into an empty queue.
+		 * Reset its state if it is sc_active,
+		 * otherwise insert it in the RR list.
+		 */
+		if (qp == sc->sc_active) {
+			qp->q_status = G_QUEUE_READY;
+			callout_stop(&sc->sc_wait);
+		} else {
+			g_sched_priv_ref(qp);
+			TAILQ_INSERT_TAIL(&sc->sc_rr_tailq, qp, q_tailq);
+		}
+	}
+
+	qp->q_bionum = 1 + qp->q_bionum - (qp->q_bionum >> 3);
+
+	g_rr_update_thinktime(qp);
+	g_rr_update_seekdist(qp, bp);
+
+	/* Inherit the reference returned by g_rr_queue_get(). */
+	bp->bio_caller1 = qp;
+	gs_bioq_disksort(&qp->q_bioq, bp);
+
+	return (0);
+}
+
+/*
+ * Callout executed when a queue times out anticipating a new request.
+ */
+static void
+g_rr_wait_timeout(void *data)
+{
+	struct g_rr_softc *sc = data;
+	struct g_geom *geom = sc->sc_geom;
+
+	g_sched_lock(geom);
+	/*
+	 * We can race with other events, so check if
+	 * sc_active is still valid.
+	 */
+	if (sc->sc_active != NULL) {
+		/* Release the reference to the queue. */
+		g_rr_queue_put(sc->sc_active);
+		sc->sc_active = NULL;
+		me.wait_hit--;
+		me.wait_miss++;	/* record the miss */
+	}
+	g_sched_dispatch(geom);
+	g_sched_unlock(geom);
+}
+
+/*
+ * Module glue: allocate descriptor, initialize its fields.
+ */
+static void *
+g_rr_init(struct g_geom *geom)
+{
+	struct g_rr_softc *sc;
+
+	/* XXX check whether we can sleep */
+	sc = malloc(sizeof *sc, M_GEOM_SCHED, M_NOWAIT | M_ZERO);
+	if (sc == NULL)
+		return (NULL);	/* M_NOWAIT allocations can fail */
+	sc->sc_geom = geom;
+	TAILQ_INIT(&sc->sc_rr_tailq);
+	callout_init(&sc->sc_wait, CALLOUT_MPSAFE);
+	LIST_INSERT_HEAD(&me.sc_head, sc, sc_next);
+	me.units++;
+
+	return (sc);
+}
+
+/*
+ * Module glue -- drain the callout structure, check that no
+ * queues are left over, and free the descriptor.
+ */
+static void
+g_rr_fini(void *data)
+{
+	struct g_rr_softc *sc = data;
+
+	callout_drain(&sc->sc_wait);
+	KASSERT(sc->sc_active == NULL, ("still a queue under service"));
+	KASSERT(TAILQ_EMPTY(&sc->sc_rr_tailq), ("still scheduled queues"));
+
+	LIST_REMOVE(sc, sc_next);
+	me.units--;
+	free(sc, M_GEOM_SCHED);
+}
+
+/*
+ * Called when the request under service terminates.
+ * Start the anticipation timer if needed.
+ */
+static void
+g_rr_done(void *data, struct bio *bp)
+{
+	struct g_rr_softc *sc = data;
+	struct g_rr_queue *qp;
+
+	sc->sc_in_flight--;
+
+	qp = bp->bio_caller1;
+	if (qp == sc->sc_active && qp->q_status == G_QUEUE_BUSY) {
+		if (!(qp->q_flags & G_FLAG_COMPLETED)) {
+			qp->q_flags |= G_FLAG_COMPLETED;
+			/* in case we want to make the slice adaptive */
+			qp->q_slice_duration = get_bounded(&me.quantum_ms, 2);
+			qp->q_slice_end = ticks + qp->q_slice_duration;
+		}
+
+		/* The queue is trying anticipation, start the timer. */
+		qp->q_status = G_QUEUE_IDLING;
+		/* may make this adaptive */
+		qp->q_wait_ticks = get_bounded(&me.wait_ms, 2);
+		me.wait_hit++;
+		callout_reset(&sc->sc_wait, qp->q_wait_ticks,
+		    g_rr_wait_timeout, sc);
+	} else
+		g_sched_dispatch(sc->sc_geom);
+
+	/* Release a reference to the queue. */
+	g_rr_queue_put(qp);
+}
+
+static void
+g_rr_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
+    struct g_consumer *cp, struct g_provider *pp)
+{
+	if (indent == NULL) {	/* plaintext */
+		sbuf_printf(sb, " units %d queues %d",
+		    me.units, me.queues);
+	}
+}
+
+static struct g_gsched g_rr = {
+	.gs_name = "rr",
+	.gs_priv_size = sizeof(struct g_rr_queue),
+	.gs_init = g_rr_init,
+	.gs_fini = g_rr_fini,
+	.gs_start = g_rr_start,
+	.gs_done = g_rr_done,
+	.gs_next = g_rr_next,
+	.gs_dumpconf = g_rr_dumpconf,
+	.gs_init_class = g_rr_init_class,
+	.gs_fini_class = g_rr_fini_class,
+};
+
+DECLARE_GSCHED_MODULE(rr, &g_rr);
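
The core policy decision in the listing is when to idle the disk for a client rather than move on, as tested by g_rr_should_anticipate(). The following standalone userspace sketch mirrors those three tests; the struct, the workload figures and the hz-derived wait are hypothetical values chosen only to illustrate the decision for a sequential versus a random reader, and are not taken from the commit.

/*
 * Minimal sketch of the anticipation decision, assuming hz=1000 so
 * that the default wait_ms of 10 corresponds to 10 ticks, and an
 * 8192-byte mean-seek threshold as in the listing above.
 */
#include <stdio.h>

struct queue_stats {
	int	valid;		/* enough samples collected? */
	double	thinktime;	/* mean ticks between requests */
	double	seekdist;	/* mean request distance, bytes */
};

/* Mirrors the three tests in g_rr_should_anticipate(). */
static int
should_anticipate(const struct queue_stats *q, int is_write,
    int w_anticipate, int wait_ticks)
{
	if (!w_anticipate && is_write)
		return (0);	/* writes: don't wait unless configured */
	if (q->valid && q->thinktime > wait_ticks)
		return (0);	/* client thinks longer than we would wait */
	if (q->valid && q->seekdist > 8192.0)
		return (0);	/* seeky client: idling buys nothing */
	return (1);
}

int
main(void)
{
	/* Hypothetical clients: a sequential and a random reader. */
	struct queue_stats seq = { 1, 2.0, 512.0 };
	struct queue_stats rnd = { 1, 2.0, 500000.0 };
	int wait_ticks = 10;

	printf("sequential reader: anticipate=%d\n",
	    should_anticipate(&seq, 0, 1, wait_ticks));	/* 1: idle briefly */
	printf("random reader:     anticipate=%d\n",
	    should_anticipate(&rnd, 0, 1, wait_ticks));	/* 0: move on */
	return (0);
}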
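The "Default parameters" comment says the quantum sizes target an 80 MB/s disk, and a queue's slice ends when either the service budget (quantum_kb) or the time slice (quantum_ms) runs out, whichever comes first. A quick check, with the 80 MB/s figure as the stated assumption, shows the two defaults are roughly consistent at that speed:

/*
 * Back-of-the-envelope check that the default 8192 KiB service
 * budget and the default 100 ms time slice expire at about the
 * same point on the assumed 80 MB/s disk.
 */
#include <stdio.h>

int
main(void)
{
	double budget_bytes = 8192.0 * 1024.0;	/* quantum_kb default */
	double disk_bps = 80e6;			/* assumed throughput */
	double slice_ms = 100.0;		/* quantum_ms default */

	/* time needed to consume the service budget */
	double budget_ms = budget_bytes / disk_bps * 1000.0;

	printf("service budget lasts %.1f ms vs. %.0f ms time slice\n",
	    budget_ms, slice_ms);	/* ~104.9 ms vs. 100 ms */
	return (0);
}

On a faster disk the time slice expires first; on a slower one the byte budget does, which is why the comment notes that the smaller of the two quanta takes effect.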
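get_bounded() clamps a tunable to its [x_min, x_max] range and, when t_min is nonzero, converts milliseconds to ticks as x * hz / 1000 with a floor of t_min; the callers pass t_min = 2. A small illustration of why that floor matters (the hz values are assumptions for the example; in the kernel, hz is the global tick rate):

#include <stdio.h>

/* Same ms-to-ticks conversion and floor as in get_bounded(). */
static int
ms_to_ticks(int ms, int hz, int t_min)
{
	int x = ms * hz / 1000;

	return (x < t_min ? t_min : x);
}

int
main(void)
{
	/* At low hz the result truncates to 0 or 1 tick, where a
	 * timeout could fire almost immediately; the floor of 2
	 * keeps the anticipation window meaningful. */
	printf("10 ms at hz=1000 -> %d ticks\n", ms_to_ticks(10, 1000, 2));
	printf("10 ms at hz=100  -> %d ticks\n", ms_to_ticks(10, 100, 2));
	printf("10 ms at hz=20   -> %d ticks\n", ms_to_ticks(10, 20, 2));
	return (0);
}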
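Finally, the heuristics only trust the thinktime and seekdist averages once a queue has seen enough traffic: g_rr_start() updates a soft counter as q_bionum = 1 + q_bionum - (q_bionum >> 3), and both update functions require q_bionum > 7. This hypothetical standalone check shows the counter climbing to 8 and saturating there, so the guard simply means "at least 8 recent requests":

#include <stdio.h>

int
main(void)
{
	int q_bionum = 0, i;

	for (i = 1; i <= 12; i++) {
		/* same recurrence as in g_rr_start() */
		q_bionum = 1 + q_bionum - (q_bionum >> 3);
		printf("after %2d requests: q_bionum = %d\n", i, q_bionum);
	}
	return (0);
}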