/*- * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * * This source file contains the state-engine which makes things happen in the * right order. * * Outline: * 1) g_bde_start1() * Break the struct bio into multiple work packets one per zone. * 2) g_bde_start2() * Setup the necessary sector buffers and start those read operations * which we can start at this time and put the item on the work-list. * 3) g_bde_worker() * Scan the work-list for items which are ready for crypto processing * and call the matching crypto function in g_bde_crypt.c and schedule * any writes needed. Read operations finish here by releasing the * sector buffers and delivering the original bio request. * 4) g_bde_write_done() * Release sector buffers and deliver the original bio request. * * Because of the C-scope rules, the functions are almost perfectly in the * opposite order in this source file. * * XXX: A switch to the hardware assisted crypto in src/sys/opencrypto will add * XXX: additional states to this state-engine. Since no hardware available * XXX: at this time has AES support, implementing this has been postponed * XXX: until such time as it would result in a benefit. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void g_bde_delete_sector(struct g_bde_softc *wp, struct g_bde_sector *sp); static struct g_bde_sector * g_bde_new_sector(struct g_bde_work *wp, u_int len); static void g_bde_release_sector(struct g_bde_work *wp, struct g_bde_sector *sp); static struct g_bde_sector *g_bde_get_sector(struct g_bde_work *wp, off_t offset); static int g_bde_start_read(struct g_bde_sector *sp); static void g_bde_purge_sector(struct g_bde_softc *sc, int fraction); /* * Work item allocation. * * C++ would call these constructors and destructors. */ static u_int g_bde_nwork; SYSCTL_UINT(_debug, OID_AUTO, gbde_nwork, CTLFLAG_RD, &g_bde_nwork, 0, ""); static struct g_bde_work * g_bde_new_work(struct g_bde_softc *sc) { struct g_bde_work *wp; wp = g_malloc(sizeof *wp, M_NOWAIT | M_ZERO); if (wp == NULL) return (wp); wp->state = SETUP; wp->softc = sc; g_bde_nwork++; sc->nwork++; TAILQ_INSERT_TAIL(&sc->worklist, wp, list); return (wp); } static void g_bde_delete_work(struct g_bde_work *wp) { struct g_bde_softc *sc; sc = wp->softc; g_bde_nwork--; sc->nwork--; TAILQ_REMOVE(&sc->worklist, wp, list); g_free(wp); } /* * Sector buffer allocation * * These two functions allocate and free back variable sized sector buffers */ static u_int g_bde_nsect; SYSCTL_UINT(_debug, OID_AUTO, gbde_nsect, CTLFLAG_RD, &g_bde_nsect, 0, ""); static void g_bde_delete_sector(struct g_bde_softc *sc, struct g_bde_sector *sp) { g_bde_nsect--; sc->nsect--; if (sp->malloc) g_free(sp->data); g_free(sp); } static struct g_bde_sector * g_bde_new_sector(struct g_bde_work *wp, u_int len) { struct g_bde_sector *sp; sp = g_malloc(sizeof *sp, M_NOWAIT | M_ZERO); if (sp == NULL) return (sp); if (len > 0) { sp->data = g_malloc(len, M_NOWAIT | M_ZERO); if (sp->data == NULL) { g_free(sp); return (NULL); } sp->malloc = 1; } g_bde_nsect++; wp->softc->nsect++; sp->size = len; sp->softc = wp->softc; sp->ref = 1; sp->owner = wp; sp->offset = wp->so; sp->state = JUNK; return (sp); } /* * Skey sector cache. * * Nothing prevents two separate I/O requests from addressing the same zone * and thereby needing the same skey sector. We therefore need to sequence * I/O operations to the skey sectors. A certain amount of caching is also * desirable, although the extent of benefit from this is not at this point * determined. * * XXX: GEOM may be able to grow a generic caching facility at some point * XXX: to support such needs. */ static u_int g_bde_ncache; SYSCTL_UINT(_debug, OID_AUTO, gbde_ncache, CTLFLAG_RD, &g_bde_ncache, 0, ""); static void g_bde_purge_one_sector(struct g_bde_softc *sc, struct g_bde_sector *sp) { g_trace(G_T_TOPOLOGY, "g_bde_purge_one_sector(%p, %p)", sc, sp); if (sp->ref != 0) return; TAILQ_REMOVE(&sc->freelist, sp, list); g_bde_ncache--; sc->ncache--; bzero(sp->data, sp->size); g_bde_delete_sector(sc, sp); } static struct g_bde_sector * g_bde_get_sector(struct g_bde_work *wp, off_t offset) { struct g_bde_sector *sp; struct g_bde_softc *sc; g_trace(G_T_TOPOLOGY, "g_bde_get_sector(%p, %jd)", wp, (intmax_t)offset); sc = wp->softc; if (malloc_last_fail() < g_bde_ncache) g_bde_purge_sector(sc, -1); sp = TAILQ_FIRST(&sc->freelist); if (sp != NULL && sp->ref == 0 && sp->used + 300 < time_uptime) g_bde_purge_one_sector(sc, sp); TAILQ_FOREACH(sp, &sc->freelist, list) { if (sp->offset == offset) break; } if (sp != NULL) { sp->ref++; KASSERT(sp->offset == offset, ("wrong offset")); KASSERT(sp->softc == wp->softc, ("wrong softc")); if (sp->ref == 1) sp->owner = wp; } else { if (malloc_last_fail() < g_bde_ncache) { TAILQ_FOREACH(sp, &sc->freelist, list) if (sp->ref == 0) break; } if (sp == NULL && !TAILQ_EMPTY(&sc->freelist)) sp = TAILQ_FIRST(&sc->freelist); if (sp != NULL && sp->ref > 0) sp = NULL; if (sp == NULL) { g_bde_ncache++; sc->ncache++; sp = g_bde_new_sector(wp, sc->sectorsize); if (sp != NULL) { TAILQ_INSERT_TAIL(&sc->freelist, sp, list); sp->malloc = 2; } } if (sp != NULL) { sp->offset = offset; sp->softc = wp->softc; sp->ref = 1; sp->owner = wp; sp->state = JUNK; sp->error = 0; } } if (sp != NULL) { TAILQ_REMOVE(&sc->freelist, sp, list); TAILQ_INSERT_TAIL(&sc->freelist, sp, list); } wp->ksp = sp; if (sp == NULL) { g_bde_purge_sector(sc, -1); sp = g_bde_get_sector(wp, offset); } if (sp != NULL) sp->used = time_uptime; KASSERT(sp != NULL, ("get_sector failed")); return(sp); } static void g_bde_release_sector(struct g_bde_work *wp, struct g_bde_sector *sp) { struct g_bde_softc *sc; struct g_bde_work *wp2; g_trace(G_T_TOPOLOGY, "g_bde_release_sector(%p)", sp); KASSERT(sp->malloc == 2, ("Wrong sector released")); sc = sp->softc; KASSERT(sc != NULL, ("NULL sp->softc")); KASSERT(wp == sp->owner, ("Releasing, not owner")); sp->owner = NULL; wp->ksp = NULL; sp->ref--; if (sp->ref > 0) { TAILQ_REMOVE(&sc->freelist, sp, list); TAILQ_INSERT_TAIL(&sc->freelist, sp, list); TAILQ_FOREACH(wp2, &sc->worklist, list) { if (wp2->ksp == sp) { KASSERT(wp2 != wp, ("Self-reowning")); sp->owner = wp2; wakeup(sp->softc); break; } } KASSERT(wp2 != NULL, ("Failed to pick up owner for %p\n", sp)); } else if (sp->error != 0) { sp->offset = ~0; sp->error = 0; sp->state = JUNK; } TAILQ_REMOVE(&sc->freelist, sp, list); TAILQ_INSERT_HEAD(&sc->freelist, sp, list); } static void g_bde_purge_sector(struct g_bde_softc *sc, int fraction) { struct g_bde_sector *sp; int n; g_trace(G_T_TOPOLOGY, "g_bde_purge_sector(%p)", sc); if (fraction > 0) n = sc->ncache / fraction + 1; else n = g_bde_ncache - malloc_last_fail(); if (n < 0) return; if (n > sc->ncache) n = sc->ncache; while(n--) { TAILQ_FOREACH(sp, &sc->freelist, list) { if (sp->ref != 0) continue; TAILQ_REMOVE(&sc->freelist, sp, list); g_bde_ncache--; sc->ncache--; bzero(sp->data, sp->size); g_bde_delete_sector(sc, sp); break; } } } static struct g_bde_sector * g_bde_read_sector(struct g_bde_softc *sc, struct g_bde_work *wp, off_t offset) { struct g_bde_sector *sp; g_trace(G_T_TOPOLOGY, "g_bde_read_sector(%p)", wp); sp = g_bde_get_sector(wp, offset); if (sp == NULL) return (sp); if (sp->owner != wp) return (sp); if (sp->state == VALID) return (sp); if (g_bde_start_read(sp) == 0) return (sp); g_bde_release_sector(wp, sp); return (NULL); } /* * Contribute to the completion of the original bio request. * * We have no simple way to tell how many bits the original bio request has * been segmented into, so the easiest way to determine when we can deliver * it is to keep track of the number of bytes we have completed. We keep * track of any errors underway and latch onto the first one. * * We always report "nothing done" in case of error, because random bits here * and there may be completed and returning a number of completed bytes does * not convey any useful information about which bytes they were. If some * piece of broken code somewhere interprets this to mean that nothing has * changed on the underlying media they deserve the lossage headed for them. * * A single mutex per g_bde instance is used to prevent contention. */ static void g_bde_contribute(struct bio *bp, off_t bytes, int error) { struct g_bde_softc *sc; g_trace(G_T_TOPOLOGY, "g_bde_contribute bp %p bytes %jd error %d", bp, (intmax_t)bytes, error); sc = bp->bio_driver1; if (bp->bio_error == 0) bp->bio_error = error; bp->bio_completed += bytes; KASSERT(bp->bio_completed <= bp->bio_length, ("Too large contribution")); if (bp->bio_completed == bp->bio_length) { if (bp->bio_error != 0) bp->bio_completed = 0; g_io_deliver(bp, bp->bio_error); } } /* * A write operation has finished. When we have all expected cows in the * barn close the door and call it a day. */ static void g_bde_write_done(struct bio *bp) { struct g_bde_sector *sp; struct g_bde_work *wp; struct g_bde_softc *sc; sp = bp->bio_caller1; sc = bp->bio_caller2; mtx_lock(&sc->worklist_mutex); KASSERT(sp != NULL, ("NULL sp")); KASSERT(sc != NULL, ("NULL sc")); KASSERT(sp->owner != NULL, ("NULL sp->owner")); g_trace(G_T_TOPOLOGY, "g_bde_write_done(%p)", sp); sp->error = bp->bio_error; g_destroy_bio(bp); wp = sp->owner; if (wp->error == 0) wp->error = sp->error; if (wp->bp->bio_cmd == BIO_DELETE) { KASSERT(sp == wp->sp, ("trashed delete op")); g_bde_contribute(wp->bp, wp->length, wp->error); g_bde_delete_sector(sc, sp); g_bde_delete_work(wp); mtx_unlock(&sc->worklist_mutex); return; } KASSERT(wp->bp->bio_cmd == BIO_WRITE, ("Confused in g_bde_write_done()")); KASSERT(sp == wp->sp || sp == wp->ksp, ("trashed write op")); if (wp->sp == sp) { g_bde_delete_sector(sc, wp->sp); wp->sp = NULL; } else { sp->state = VALID; } if (wp->sp == NULL && wp->ksp != NULL && wp->ksp->state == VALID) { g_bde_contribute(wp->bp, wp->length, wp->error); g_bde_release_sector(wp, wp->ksp); g_bde_delete_work(wp); } mtx_unlock(&sc->worklist_mutex); return; } /* * Send a write request for the given sector down the pipeline. */ static int g_bde_start_write(struct g_bde_sector *sp) { struct bio *bp; struct g_bde_softc *sc; g_trace(G_T_TOPOLOGY, "g_bde_start_write(%p)", sp); sc = sp->softc; KASSERT(sc != NULL, ("NULL sc in g_bde_start_write")); KASSERT(sp->owner != NULL, ("NULL sp->owner in g_bde_start_write")); bp = g_new_bio(); if (bp == NULL) return (ENOMEM); bp->bio_cmd = BIO_WRITE; bp->bio_offset = sp->offset; bp->bio_data = sp->data; bp->bio_length = sp->size; bp->bio_done = g_bde_write_done; bp->bio_caller1 = sp; bp->bio_caller2 = sc; sp->state = IO; g_io_request(bp, sc->consumer); return(0); } /* * A read operation has finished. Mark the sector no longer iobusy and * wake up the worker thread and let it do its thing. */ static void g_bde_read_done(struct bio *bp) { struct g_bde_sector *sp; struct g_bde_softc *sc; sp = bp->bio_caller1; g_trace(G_T_TOPOLOGY, "g_bde_read_done(%p)", sp); sc = bp->bio_caller2; mtx_lock(&sc->worklist_mutex); sp->error = bp->bio_error; sp->state = VALID; wakeup(sc); g_destroy_bio(bp); mtx_unlock(&sc->worklist_mutex); } /* * Send a read request for the given sector down the pipeline. */ static int g_bde_start_read(struct g_bde_sector *sp) { struct bio *bp; struct g_bde_softc *sc; g_trace(G_T_TOPOLOGY, "g_bde_start_read(%p)", sp); sc = sp->softc; KASSERT(sc != NULL, ("Null softc in sp %p", sp)); bp = g_new_bio(); if (bp == NULL) return (ENOMEM); bp->bio_cmd = BIO_READ; bp->bio_offset = sp->offset; bp->bio_data = sp->data; bp->bio_length = sp->size; bp->bio_done = g_bde_read_done; bp->bio_caller1 = sp; bp->bio_caller2 = sc; sp->state = IO; g_io_request(bp, sc->consumer); return(0); } /* * The worker thread. * * The up/down path of GEOM is not allowed to sleep or do any major work * so we use this thread to do the actual crypto operations and to push * the state engine onwards. * * XXX: if we switch to the src/sys/opencrypt hardware assisted encryption * XXX: using a thread here is probably not needed. */ void g_bde_worker(void *arg) { struct g_bde_softc *sc; struct g_bde_work *wp; struct g_geom *gp; int busy, error; gp = arg; sc = gp->softc; mtx_lock(&sc->worklist_mutex); for (;;) { busy = 0; g_trace(G_T_TOPOLOGY, "g_bde_worker scan"); TAILQ_FOREACH(wp, &sc->worklist, list) { KASSERT(wp != NULL, ("NULL wp")); KASSERT(wp->softc != NULL, ("NULL wp->softc")); if (wp->state != WAIT) continue; /* Not interesting here */ KASSERT(wp->bp != NULL, ("NULL wp->bp")); KASSERT(wp->sp != NULL, ("NULL wp->sp")); if (wp->ksp != NULL) { if (wp->ksp->owner != wp) continue; if (wp->ksp->state == IO) continue; KASSERT(wp->ksp->state == VALID, ("Illegal sector state (JUNK ?)")); } if (wp->bp->bio_cmd == BIO_READ && wp->sp->state != VALID) continue; if (wp->ksp != NULL && wp->ksp->error != 0) { g_bde_contribute(wp->bp, wp->length, wp->ksp->error); g_bde_delete_sector(sc, wp->sp); g_bde_release_sector(wp, wp->ksp); g_bde_delete_work(wp); busy++; break; } switch(wp->bp->bio_cmd) { case BIO_READ: if (wp->ksp != NULL && wp->sp->error == 0) { mtx_unlock(&sc->worklist_mutex); g_bde_crypt_read(wp); mtx_lock(&sc->worklist_mutex); } g_bde_contribute(wp->bp, wp->length, wp->sp->error); g_bde_delete_sector(sc, wp->sp); if (wp->ksp != NULL) g_bde_release_sector(wp, wp->ksp); g_bde_delete_work(wp); break; case BIO_WRITE: wp->state = FINISH; KASSERT(wp->sp->owner == wp, ("Write not owner sp")); KASSERT(wp->ksp->owner == wp, ("Write not owner ksp")); mtx_unlock(&sc->worklist_mutex); g_bde_crypt_write(wp); mtx_lock(&sc->worklist_mutex); g_bde_start_write(wp->sp); g_bde_start_write(wp->ksp); break; case BIO_DELETE: wp->state = FINISH; mtx_unlock(&sc->worklist_mutex); g_bde_crypt_delete(wp); mtx_lock(&sc->worklist_mutex); g_bde_start_write(wp->sp); break; } busy++; break; } if (!busy) { /* * We don't look for our death-warrant until we are * idle. Shouldn't make a difference in practice. */ if (sc->dead) break; g_trace(G_T_TOPOLOGY, "g_bde_worker sleep"); error = msleep(sc, &sc->worklist_mutex, PRIBIO, "g_bde", hz); if (error == EWOULDBLOCK) { /* * Loose our skey cache in an orderly fashion. * The exact rate can be tuned to be less * aggressive if this is desirable. 10% per * second means that the cache is gone in a * few minutes. */ g_bde_purge_sector(sc, 10); } } } g_trace(G_T_TOPOLOGY, "g_bde_worker die"); g_bde_purge_sector(sc, 1); KASSERT(sc->nwork == 0, ("Dead but %d work remaining", sc->nwork)); KASSERT(sc->ncache == 0, ("Dead but %d cache remaining", sc->ncache)); KASSERT(sc->nsect == 0, ("Dead but %d sect remaining", sc->nsect)); mtx_unlock(&sc->worklist_mutex); sc->dead = 2; wakeup(sc); mtx_lock(&Giant); kthread_exit(0); } /* * g_bde_start1 has chopped the incoming request up so all the requests * we see here are inside a single zone. Map the data and key locations * grab the buffers we need and fire off the first volley of read requests. */ static void g_bde_start2(struct g_bde_work *wp) { struct g_bde_softc *sc; KASSERT(wp != NULL, ("NULL wp in g_bde_start2")); g_trace(G_T_TOPOLOGY, "g_bde_start2(%p)", wp); sc = wp->softc; KASSERT(wp->softc != NULL, ("NULL wp->softc")); g_bde_map_sector(&sc->key, wp->offset, &wp->so, &wp->kso, &wp->ko); if (wp->bp->bio_cmd == BIO_READ) { wp->sp = g_bde_new_sector(wp, 0); if (wp->sp == NULL) { g_bde_contribute(wp->bp, wp->length, ENOMEM); g_bde_delete_work(wp); return; } wp->sp->size = wp->length; wp->sp->data = wp->data; if (g_bde_start_read(wp->sp) != 0) { g_bde_contribute(wp->bp, wp->length, ENOMEM); g_bde_delete_sector(sc, wp->sp); g_bde_delete_work(wp); return; } g_bde_read_sector(sc, wp, wp->kso); if (wp->ksp == NULL) wp->error = ENOMEM; } else if (wp->bp->bio_cmd == BIO_DELETE) { wp->sp = g_bde_new_sector(wp, wp->length); if (wp->sp == NULL) { g_bde_contribute(wp->bp, wp->length, ENOMEM); g_bde_delete_work(wp); return; } } else if (wp->bp->bio_cmd == BIO_WRITE) { wp->sp = g_bde_new_sector(wp, wp->length); if (wp->sp == NULL) { g_bde_contribute(wp->bp, wp->length, ENOMEM); g_bde_delete_work(wp); return; } g_bde_read_sector(sc, wp, wp->kso); if (wp->ksp == NULL) { g_bde_contribute(wp->bp, wp->length, ENOMEM); g_bde_delete_sector(sc, wp->sp); g_bde_delete_work(wp); return; } } else { KASSERT(0 == 1, ("Wrong bio_cmd %d in g_bde_start2", wp->bp->bio_cmd)); } wp->state = WAIT; wakeup(sc); } /* * Split the incoming bio on zone boundaries and submit the resulting * work structures to g_bde_start2(). */ void g_bde_start1(struct bio *bp) { struct g_bde_softc *sc; struct g_bde_work *wp; off_t left; sc = bp->bio_to->geom->softc; bp->bio_driver1 = sc; mtx_lock(&sc->worklist_mutex); for(left = 0;left < bp->bio_length; left += sc->sectorsize) { wp = g_bde_new_work(sc); if (wp == NULL) { g_io_deliver(bp, ENOMEM); mtx_unlock(&sc->worklist_mutex); return; } wp->bp = bp; wp->offset = bp->bio_offset + left; wp->data = bp->bio_data + left; wp->length = sc->sectorsize; g_bde_start2(wp); } mtx_unlock(&sc->worklist_mutex); return; }