diff options
Diffstat (limited to 'sys/geom')
35 files changed, 11735 insertions, 0 deletions
diff --git a/sys/geom/bde/g_bde.c b/sys/geom/bde/g_bde.c new file mode 100644 index 0000000..e3e06ec --- /dev/null +++ b/sys/geom/bde/g_bde.c @@ -0,0 +1,286 @@ +/*- + * Copyright (c) 2002 Poul-Henning Kamp + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Poul-Henning Kamp + * and NAI Labs, the Security Research Division of Network Associates, Inc. + * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + * + */ + +#include <sys/param.h> +#include <sys/bio.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/malloc.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/kthread.h> + +#include <crypto/rijndael/rijndael.h> +#include <crypto/sha2/sha2.h> +#include <geom/geom.h> +#include <geom/bde/g_bde.h> +#define BDE_CLASS_NAME "BDE" + +static void +g_bde_start(struct bio *bp) +{ + + switch (bp->bio_cmd) { + case BIO_DELETE: + case BIO_READ: + case BIO_WRITE: + g_bde_start1(bp); + break; + case BIO_GETATTR: + g_io_deliver(bp, EOPNOTSUPP); + break; + default: + g_io_deliver(bp, EOPNOTSUPP); + return; + } + return; +} + +static void +g_bde_orphan(struct g_consumer *cp) +{ + struct g_geom *gp; + struct g_provider *pp; + struct g_bde_softc *sc; + int error; + + g_trace(G_T_TOPOLOGY, "g_bde_orphan(%p/%s)", cp, cp->provider->name); + g_topology_assert(); + KASSERT(cp->provider->error != 0, + ("g_bde_orphan with error == 0")); + + gp = cp->geom; + sc = gp->softc; + gp->flags |= G_GEOM_WITHER; + error = cp->provider->error; + LIST_FOREACH(pp, &gp->provider, provider) + g_orphan_provider(pp, error); + bzero(sc, sizeof(struct g_bde_softc)); /* destroy evidence */ + return; +} + +static int +g_bde_access(struct g_provider *pp, int dr, int dw, int de) +{ + struct g_geom *gp; + struct g_consumer *cp; + + gp = pp->geom; + cp = LIST_FIRST(&gp->consumer); + if (cp->acr == 0 && cp->acw == 0 && cp->ace == 0) { + de++; + dr++; + } + /* ... 
and let go of it on last close */ + if ((cp->acr + dr) == 0 && (cp->acw + dw) == 0 && (cp->ace + de) == 1) { + de--; + dr--; + } + return (g_access_rel(cp, dr, dw, de)); +} + +static void +g_bde_create_geom(struct gctl_req *req, struct g_class *mp, struct g_provider *pp) +{ + struct g_geom *gp; + struct g_consumer *cp; + struct g_bde_key *kp; + int error, i; + u_int sectorsize; + off_t mediasize; + struct g_bde_softc *sc; + void *pass; + void *key; + + g_trace(G_T_TOPOLOGY, "g_bde_create_geom(%s, %s)", mp->name, pp->name); + g_topology_assert(); + gp = NULL; + + + gp = g_new_geomf(mp, "%s.bde", pp->name); + gp->start = g_bde_start; + gp->orphan = g_bde_orphan; + gp->access = g_bde_access; + gp->spoiled = g_std_spoiled; + cp = g_new_consumer(gp); + g_attach(cp, pp); + error = g_access_rel(cp, 1, 1, 1); + if (error) { + g_detach(cp); + g_destroy_consumer(cp); + g_destroy_geom(gp); + gctl_error(req, "could not access consumer"); + } + pass = NULL; + key = NULL; + do { + pass = gctl_get_param(req, "pass", &i); + if (pass == NULL || i != SHA512_DIGEST_LENGTH) { + gctl_error(req, "No usable key presented"); + break; + } + key = gctl_get_param(req, "key", &i); + if (key != NULL && i != 16) { + gctl_error(req, "Invalid key presented"); + break; + } + sectorsize = cp->provider->sectorsize; + mediasize = cp->provider->mediasize; + sc = g_malloc(sizeof(struct g_bde_softc), M_WAITOK | M_ZERO); + gp->softc = sc; + sc->geom = gp; + sc->consumer = cp; + + error = g_bde_decrypt_lock(sc, pass, key, + mediasize, sectorsize, NULL); + bzero(sc->sha2, sizeof sc->sha2); + if (error) + break; + kp = &sc->key; + + /* Initialize helper-fields */ + kp->keys_per_sector = kp->sectorsize / G_BDE_SKEYLEN; + kp->zone_cont = kp->keys_per_sector * kp->sectorsize; + kp->zone_width = kp->zone_cont + kp->sectorsize; + kp->media_width = kp->sectorN - kp->sector0 - + G_BDE_MAXKEYS * kp->sectorsize; + + /* Our external parameters */ + sc->zone_cont = kp->zone_cont; + sc->mediasize = 
g_bde_max_sector(kp); + sc->sectorsize = kp->sectorsize; + + TAILQ_INIT(&sc->freelist); + TAILQ_INIT(&sc->worklist); + mtx_init(&sc->worklist_mutex, "g_bde_worklist", NULL, MTX_DEF); + mtx_lock(&Giant); + /* XXX: error check */ + kthread_create(g_bde_worker, gp, &sc->thread, 0, 0, + "g_bde %s", gp->name); + mtx_unlock(&Giant); + pp = g_new_providerf(gp, gp->name); +#if 0 + /* + * XXX: Disable this for now. Appearantly UFS no longer + * XXX: issues BIO_DELETE requests correctly, with the obvious + * XXX: outcome that userdata is trashed. + */ + pp->flags |= G_PF_CANDELETE; +#endif + pp->stripesize = kp->zone_cont; + pp->stripeoffset = 0; + pp->mediasize = sc->mediasize; + pp->sectorsize = sc->sectorsize; + g_error_provider(pp, 0); + break; + } while (0); + if (pass != NULL) + bzero(pass, SHA512_DIGEST_LENGTH); + if (key != NULL) + bzero(key, 16); + if (error == 0) + return; + g_access_rel(cp, -1, -1, -1); + g_detach(cp); + g_destroy_consumer(cp); + if (gp->softc != NULL) + g_free(gp->softc); + g_destroy_geom(gp); + return; +} + + +static int +g_bde_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) +{ + struct g_consumer *cp; + struct g_provider *pp; + int error; + struct g_bde_softc *sc; + + g_trace(G_T_TOPOLOGY, "g_bde_destroy_geom(%s, %s)", mp->name, gp->name); + g_topology_assert(); + /* + * Orderly detachment. 
+ */ + KASSERT(gp != NULL, ("NULL geom")); + pp = LIST_FIRST(&gp->provider); + KASSERT(pp != NULL, ("NULL provider")); + if (pp->acr > 0 || pp->acw > 0 || pp->ace > 0) + return (EBUSY); + sc = gp->softc; + cp = LIST_FIRST(&gp->consumer); + KASSERT(cp != NULL, ("NULL consumer")); + sc->dead = 1; + wakeup(sc); + error = g_access_rel(cp, -1, -1, -1); + KASSERT(error == 0, ("error on close")); + g_detach(cp); + g_destroy_consumer(cp); + while (sc->dead != 2 && !LIST_EMPTY(&pp->consumers)) + tsleep(sc, PRIBIO, "g_bdedie", hz); + mtx_destroy(&sc->worklist_mutex); + bzero(&sc->key, sizeof sc->key); + g_free(sc); + g_wither_geom(gp, ENXIO); + return (0); +} + +static void +g_bde_ctlreq(struct gctl_req *req, struct g_class *mp, char const *verb) +{ + struct g_geom *gp; + struct g_provider *pp; + + if (!strcmp(verb, "create geom")) { + pp = gctl_get_provider(req, "provider"); + if (pp != NULL) + g_bde_create_geom(req, mp, pp); + } else if (!strcmp(verb, "destroy geom")) { + gp = gctl_get_geom(req, mp, "geom"); + if (gp != NULL) + g_bde_destroy_geom(req, mp, gp); + } else { + gctl_error(req, "unknown verb"); + } +} + +static struct g_class g_bde_class = { + .name = BDE_CLASS_NAME, + .destroy_geom = g_bde_destroy_geom, + .ctlreq = g_bde_ctlreq, +}; + +DECLARE_GEOM_CLASS(g_bde_class, g_bde); diff --git a/sys/geom/bde/g_bde.h b/sys/geom/bde/g_bde.h new file mode 100644 index 0000000..b162e96 --- /dev/null +++ b/sys/geom/bde/g_bde.h @@ -0,0 +1,211 @@ +/*- + * Copyright (c) 2002 Poul-Henning Kamp + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Poul-Henning Kamp + * and NAI Labs, the Security Research Division of Network Associates, Inc. + * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_GEOM_BDE_G_BDE_H_ +#define _SYS_GEOM_BDE_G_BDE_H_ 1 + +/* + * These are quite, but not entirely unlike constants. + * + * They are not commented in details here, to prevent unadvisable + * experimentation. Please consult the code where they are used before you + * even think about modifying these. 
+ */ + +#define G_BDE_MKEYLEN (2048/8) +#define G_BDE_SKEYBITS 128 +#define G_BDE_SKEYLEN (G_BDE_SKEYBITS/8) +#define G_BDE_KKEYBITS 128 +#define G_BDE_KKEYLEN (G_BDE_KKEYBITS/8) +#define G_BDE_MAXKEYS 4 +#define G_BDE_LOCKSIZE 384 +#define NLOCK_FIELDS 13 + + +/* This just needs to be "large enough" */ +#define G_BDE_KEYBYTES 304 + +struct g_bde_work; +struct g_bde_softc; + +struct g_bde_sector { + struct g_bde_work *owner; + struct g_bde_softc *softc; + off_t offset; + u_int size; + u_int ref; + void *data; + TAILQ_ENTRY(g_bde_sector) list; + u_char valid; + u_char malloc; + enum {JUNK, IO, VALID} state; + int error; + time_t used; +}; + +struct g_bde_work { + struct mtx mutex; + off_t offset; + off_t length; + void *data; + struct bio *bp; + struct g_bde_softc *softc; + off_t so; + off_t kso; + u_int ko; + struct g_bde_sector *sp; + struct g_bde_sector *ksp; + TAILQ_ENTRY(g_bde_work) list; + enum {SETUP, WAIT, FINISH} state; + int error; +}; + +/* + * The decrypted contents of the lock sectors. Notice that this is not + * the same as the on-disk layout. The on-disk layout is dynamic and + * dependent on the pass-phrase. + */ +struct g_bde_key { + uint64_t sector0; + /* Physical byte offset of 1st byte used */ + uint64_t sectorN; + /* Physical byte offset of 1st byte not used */ + uint64_t keyoffset; + /* Number of bytes the disk image is skewed. */ + uint64_t lsector[G_BDE_MAXKEYS]; + /* Physical byte offsets of lock sectors */ + uint32_t sectorsize; + /* Our "logical" sector size */ + uint32_t flags; + /* 1 = lockfile in sector 0 */ + uint8_t salt[16]; + /* Used to frustate the kkey generation */ + uint8_t spare[32]; + /* For future use, random contents */ + uint8_t mkey[G_BDE_MKEYLEN]; + /* Our masterkey. 
*/ + + /* Non-stored help-fields */ + uint64_t zone_width; /* On-disk width of zone */ + uint64_t zone_cont; /* Payload width of zone */ + uint64_t media_width; /* Non-magic width of zone */ + u_int keys_per_sector; +}; + +struct g_bde_softc { + off_t mediasize; + u_int sectorsize; + uint64_t zone_cont; + struct g_geom *geom; + struct g_consumer *consumer; + TAILQ_HEAD(, g_bde_sector) freelist; + TAILQ_HEAD(, g_bde_work) worklist; + struct mtx worklist_mutex; + struct proc *thread; + struct g_bde_key key; + int dead; + u_int nwork; + u_int nsect; + u_int ncache; + u_char sha2[SHA512_DIGEST_LENGTH]; +}; + +/* g_bde_crypt.c */ +void g_bde_crypt_delete(struct g_bde_work *wp); +void g_bde_crypt_read(struct g_bde_work *wp); +void g_bde_crypt_write(struct g_bde_work *wp); + +/* g_bde_key.c */ +void g_bde_zap_key(struct g_bde_softc *sc); +int g_bde_get_key(struct g_bde_softc *sc, void *ptr, int len); +int g_bde_init_keybytes(struct g_bde_softc *sc, char *passp, int len); + +/* g_bde_lock .c */ +int g_bde_encode_lock(struct g_bde_softc *sc, struct g_bde_key *gl, u_char *ptr); +int g_bde_decode_lock(struct g_bde_softc *sc, struct g_bde_key *gl, u_char *ptr); +int g_bde_keyloc_encrypt(struct g_bde_softc *sc, uint64_t *input, void *output); +int g_bde_keyloc_decrypt(struct g_bde_softc *sc, void *input, uint64_t *output); +int g_bde_decrypt_lock(struct g_bde_softc *sc, u_char *keymat, u_char *meta, off_t mediasize, u_int sectorsize, u_int *nkey); +void g_bde_hash_pass(struct g_bde_softc *sc, const void *input, u_int len); + +/* g_bde_math .c */ +uint64_t g_bde_max_sector(struct g_bde_key *lp); +void g_bde_map_sector(struct g_bde_work *wp); + +/* g_bde_work.c */ +void g_bde_start1(struct bio *bp); +void g_bde_worker(void *arg); + +/* + * These four functions wrap the raw Rijndael functions and make sure we + * explode if something fails which shouldn't. 
+ */ + +static __inline void +AES_init(cipherInstance *ci) +{ + int error; + + error = rijndael_cipherInit(ci, MODE_CBC, NULL); + KASSERT(error > 0, ("rijndael_cipherInit %d", error)); +} + +static __inline void +AES_makekey(keyInstance *ki, int dir, u_int len, void *key) +{ + int error; + + error = rijndael_makeKey(ki, dir, len, key); + KASSERT(error > 0, ("rijndael_makeKey %d", error)); +} + +static __inline void +AES_encrypt(cipherInstance *ci, keyInstance *ki, void *in, void *out, u_int len) +{ + int error; + + error = rijndael_blockEncrypt(ci, ki, in, len * 8, out); + KASSERT(error > 0, ("rijndael_blockEncrypt %d", error)); +} + +static __inline void +AES_decrypt(cipherInstance *ci, keyInstance *ki, void *in, void *out, u_int len) +{ + int error; + + error = rijndael_blockDecrypt(ci, ki, in, len * 8, out); + KASSERT(error > 0, ("rijndael_blockDecrypt %d", error)); +} + +#endif /* _SYS_GEOM_BDE_G_BDE_H_ */ diff --git a/sys/geom/bde/g_bde_crypt.c b/sys/geom/bde/g_bde_crypt.c new file mode 100644 index 0000000..97fe8d2 --- /dev/null +++ b/sys/geom/bde/g_bde_crypt.c @@ -0,0 +1,393 @@ +/*- + * Copyright (c) 2002 Poul-Henning Kamp + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Poul-Henning Kamp + * and NAI Labs, the Security Research Division of Network Associates, Inc. + * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + * + * This source file contains the functions responsible for the crypto, keying + * and mapping operations on the I/O requests. + * + */ + +#include <sys/param.h> +#include <sys/bio.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/queue.h> +#include <sys/malloc.h> +#include <sys/libkern.h> +#include <sys/endian.h> +#include <sys/md5.h> + +#include <crypto/rijndael/rijndael.h> +#include <crypto/sha2/sha2.h> + +#include <geom/geom.h> +#include <geom/bde/g_bde.h> + +/* + * XXX: Debugging DO NOT ENABLE + */ +#undef MD5_KEY + +/* + * Derive kkey from mkey + sector offset. + * + * Security objective: Derive a potentially very large number of distinct skeys + * from the comparatively small key material in our mkey, in such a way that + * if one, more or even many of the kkeys are compromised, this does not + * significantly help an attack on other kkeys and in particular does not + * weaken or compromised the mkey. 
 *
 * First we MD5 hash the sectornumber with the salt from the lock sector.
 * The salt prevents the precalculation and statistical analysis of the MD5
 * output which would be possible if we only gave it the sectornumber.
 *
 * The MD5 hash is used to pick out 16 bytes from the masterkey, which
 * are then hashed with MD5 together with the sector number.
 *
 * The resulting MD5 hash is the kkey.
 */

static void
g_bde_kkey(struct g_bde_softc *sc, keyInstance *ki, int dir, off_t sector)
{
	u_int t;
	MD5_CTX ct;
	u_char buf[16];	/* intermediate hash, then the raw kkey */
	u_char buf2[8];	/* little-endian encoding of the sector number */

	/* We have to be architecture neutral */
	le64enc(buf2, sector);

	/* First round: MD5(salt[0..7] | sector | salt[8..15]) */
	MD5Init(&ct);
	MD5Update(&ct, sc->key.salt, 8);
	MD5Update(&ct, buf2, sizeof buf2);
	MD5Update(&ct, sc->key.salt + 8, 8);
	MD5Final(buf, &ct);

	/*
	 * Second round: hash 16 masterkey bytes selected by the first
	 * hash (each buf[t] indexes into the 256 byte mkey), mixing the
	 * sector number in again halfway through.
	 */
	MD5Init(&ct);
	for (t = 0; t < 16; t++) {
		MD5Update(&ct, &sc->key.mkey[buf[t]], 1);
		if (t == 8)
			MD5Update(&ct, buf2, sizeof buf2);
	}
	bzero(buf2, sizeof buf2);
	MD5Final(buf, &ct);
	/* Scrub intermediates; buf briefly held the raw kkey. */
	bzero(&ct, sizeof ct);
	AES_makekey(ki, dir, G_BDE_KKEYBITS, buf);
	bzero(buf, sizeof buf);
}

/*
 * Encryption work for read operation.
 *
 * Security objective: Find the kkey, find the skey, decrypt the sector data.
+ */ + +void +g_bde_crypt_read(struct g_bde_work *wp) +{ + struct g_bde_softc *sc; + u_char *d; + u_int n; + off_t o; + u_char skey[G_BDE_SKEYLEN]; + keyInstance ki; + cipherInstance ci; + + + AES_init(&ci); + sc = wp->softc; + o = 0; + for (n = 0; o < wp->length; n++, o += sc->sectorsize) { + d = (u_char *)wp->ksp->data + wp->ko + n * G_BDE_SKEYLEN; + g_bde_kkey(sc, &ki, DIR_DECRYPT, wp->offset + o); + AES_decrypt(&ci, &ki, d, skey, sizeof skey); + d = (u_char *)wp->data + o; +#ifdef MD5_KEY + { + MD5_CTX ct; + u_char rkey[16]; + int i; + + MD5Init(&ct); + MD5Update(&ct, d, sc->sectorsize); + MD5Final(rkey, &ct); + if (bcmp(rkey, skey, 16) != 0) { +#if 0 + printf("MD5_KEY failed at %jd (t=%d)\n", + (intmax_t)(wp->offset + o), time_second); +#endif + for (i = 0; i < sc->sectorsize; i++) + d[i] = 'A' + i % 26; + sprintf(d, "MD5_KEY failed at %jd (t=%d)", + (intmax_t)(wp->offset + o), time_second); + } + } +#else + AES_makekey(&ki, DIR_DECRYPT, G_BDE_SKEYBITS, skey); + AES_decrypt(&ci, &ki, d, d, sc->sectorsize); +#endif + } + bzero(skey, sizeof skey); + bzero(&ci, sizeof ci); + bzero(&ki, sizeof ci); +} + +/* + * Encryption work for write operation. + * + * Security objective: Create random skey, encrypt sector data, + * encrypt skey with the kkey. 
+ */ + +void +g_bde_crypt_write(struct g_bde_work *wp) +{ + u_char *s, *d; + struct g_bde_softc *sc; + u_int n; + off_t o; + u_char skey[G_BDE_SKEYLEN]; + keyInstance ki; + cipherInstance ci; + + sc = wp->softc; + AES_init(&ci); + o = 0; + for (n = 0; o < wp->length; n++, o += sc->sectorsize) { + + s = (u_char *)wp->data + o; + d = (u_char *)wp->sp->data + o; +#ifdef MD5_KEY + { + MD5_CTX ct; + + MD5Init(&ct); + MD5Update(&ct, s, sc->sectorsize); + MD5Final(skey, &ct); + bcopy(s, d, sc->sectorsize); + } +#else + arc4rand(skey, sizeof skey, 0); + AES_makekey(&ki, DIR_ENCRYPT, G_BDE_SKEYBITS, skey); + AES_encrypt(&ci, &ki, s, d, sc->sectorsize); +#endif + + d = (u_char *)wp->ksp->data + wp->ko + n * G_BDE_SKEYLEN; + g_bde_kkey(sc, &ki, DIR_ENCRYPT, wp->offset + o); + AES_encrypt(&ci, &ki, skey, d, sizeof skey); + bzero(skey, sizeof skey); + } + bzero(skey, sizeof skey); + bzero(&ci, sizeof ci); + bzero(&ki, sizeof ci); +} + +/* + * Encryption work for delete operation. + * + * Security objective: Write random data to the sectors. + * + * XXX: At a hit in performance we would trash the encrypted skey as well. + * XXX: This would add frustration to the cleaning lady attack by making + * XXX: deletes look like writes. + */ + +void +g_bde_crypt_delete(struct g_bde_work *wp) +{ + struct g_bde_softc *sc; + u_char *d; + off_t o; + u_char skey[G_BDE_SKEYLEN]; + keyInstance ki; + cipherInstance ci; + + sc = wp->softc; + d = wp->sp->data; + AES_init(&ci); + /* + * Do not unroll this loop! + * Our zone may be significantly wider than the amount of random + * bytes arc4rand likes to give in one reseeding, whereas our + * sectorsize is far more likely to be in the same range. 
+ */ + for (o = 0; o < wp->length; o += sc->sectorsize) { + arc4rand(d, sc->sectorsize, 0); + arc4rand(skey, sizeof skey, 0); + AES_makekey(&ki, DIR_ENCRYPT, G_BDE_SKEYBITS, skey); + AES_encrypt(&ci, &ki, d, d, sc->sectorsize); + d += sc->sectorsize; + } + /* + * Having written a long random sequence to disk here, we want to + * force a reseed, to avoid weakening the next time we use random + * data for something important. + */ + arc4rand(&o, sizeof o, 1); +} + +/* + * Calculate the total payload size of the encrypted device. + * + * Security objectives: none. + * + * This function needs to agree with g_bde_map_sector() about things. + */ + +uint64_t +g_bde_max_sector(struct g_bde_key *kp) +{ + uint64_t maxsect; + + maxsect = kp->media_width; + maxsect /= kp->zone_width; + maxsect *= kp->zone_cont; + return (maxsect); +} + +/* + * Convert an unencrypted side offset to offsets on the encrypted side. + * + * Security objective: Make it harder to identify what sectors contain what + * on a "cold" disk image. + * + * We do this by adding the "keyoffset" from the lock to the physical sector + * number modulus the available number of sectors. Since all physical sectors + * presumably look the same cold, this will do. + * + * As part of the mapping we have to skip the lock sectors which we know + * the physical address off. We also truncate the work packet, respecting + * zone boundaries and lock sectors, so that we end up with a sequence of + * sectors which are physically contiguous. + * + * Shuffling things further is an option, but the incremental frustration is + * not currently deemed worth the run-time performance hit resulting from the + * increased number of disk arm movements it would incur. + * + * This function offers nothing but a trivial diversion for an attacker able + * to do "the cleaning lady attack" in its current static mapping form. 
+ */ + +void +g_bde_map_sector(struct g_bde_work *wp) +{ + + u_int zone, zoff, u, len; + uint64_t ko; + struct g_bde_softc *sc; + struct g_bde_key *kp; + + sc = wp->softc; + kp = &sc->key; + + /* find which zone and the offset in it */ + zone = wp->offset / kp->zone_cont; + zoff = wp->offset % kp->zone_cont; + + /* Calculate the offset of the key in the key sector */ + wp->ko = (zoff / kp->sectorsize) * G_BDE_SKEYLEN; + + /* restrict length to that zone */ + len = kp->zone_cont - zoff; + + /* ... and in general */ + if (len > DFLTPHYS) + len = DFLTPHYS; + + if (len < wp->length) + wp->length = len; + + /* Find physical sector address */ + wp->so = zone * kp->zone_width + zoff; + wp->so += kp->keyoffset; + wp->so %= kp->media_width; + if (wp->so + wp->length > kp->media_width) + wp->length = kp->media_width - wp->so; + wp->so += kp->sector0; + + /* The key sector is the last in this zone. */ + wp->kso = zone * kp->zone_width + kp->zone_cont; + wp->kso += kp->keyoffset; + wp->kso %= kp->media_width; + wp->kso += kp->sector0; + + /* Compensate for lock sectors */ + for (u = 0; u < G_BDE_MAXKEYS; u++) { + /* Find the start of this lock sector */ + ko = kp->lsector[u] & ~(kp->sectorsize - 1); + + if (wp->kso >= ko) + wp->kso += kp->sectorsize; + + if (wp->so >= ko) { + /* lock sector before work packet */ + wp->so += kp->sectorsize; + } else if ((wp->so + wp->length) > ko) { + /* lock sector in work packet, truncate */ + wp->length = ko - wp->so; + } + } + +#if 0 + printf("off %jd len %jd so %jd ko %jd kso %u\n", + (intmax_t)wp->offset, + (intmax_t)wp->length, + (intmax_t)wp->so, + (intmax_t)wp->kso, + wp->ko); +#endif + KASSERT(wp->so + wp->length <= kp->sectorN, + ("wp->so (%jd) + wp->length (%jd) > EOM (%jd), offset = %jd", + (intmax_t)wp->so, + (intmax_t)wp->length, + (intmax_t)kp->sectorN, + (intmax_t)wp->offset)); + + KASSERT(wp->kso + kp->sectorsize <= kp->sectorN, + ("wp->kso (%jd) + kp->sectorsize > EOM (%jd), offset = %jd", + (intmax_t)wp->kso, + 
(intmax_t)kp->sectorN, + (intmax_t)wp->offset)); + + KASSERT(wp->so >= kp->sector0, + ("wp->so (%jd) < BOM (%jd), offset = %jd", + (intmax_t)wp->so, + (intmax_t)kp->sector0, + (intmax_t)wp->offset)); + + KASSERT(wp->kso >= kp->sector0, + ("wp->kso (%jd) <BOM (%jd), offset = %jd", + (intmax_t)wp->kso, + (intmax_t)kp->sector0, + (intmax_t)wp->offset)); +} diff --git a/sys/geom/bde/g_bde_lock.c b/sys/geom/bde/g_bde_lock.c new file mode 100644 index 0000000..b06f279 --- /dev/null +++ b/sys/geom/bde/g_bde_lock.c @@ -0,0 +1,482 @@ +/*- + * Copyright (c) 2002 Poul-Henning Kamp + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Poul-Henning Kamp + * and NAI Labs, the Security Research Division of Network Associates, Inc. + * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + * + * This souce file contains routines which operates on the lock sectors, both + * for the kernel and the userland program gbde(1). + * + */ + +#include <sys/param.h> +#include <sys/queue.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/endian.h> +#include <sys/md5.h> + +#ifdef _KERNEL +#include <sys/malloc.h> +#include <sys/systm.h> +#else +#include <err.h> +#define CTASSERT(foo) +#define KASSERT(foo, bar) do { if(!(foo)) { warn bar ; exit (1); } } while (0) +#include <errno.h> +#include <string.h> +#include <stdlib.h> +#include <stdio.h> +#define g_free(foo) free(foo) +#endif + +#include <crypto/rijndael/rijndael.h> +#include <crypto/sha2/sha2.h> + +#include <geom/geom.h> +#include <geom/bde/g_bde.h> + +/* + * Hash the raw pass-phrase. + * + * Security objectives: produce from the pass-phrase a fixed length + * bytesequence with PRN like properties in a reproducible way retaining + * as much entropy from the pass-phrase as possible. + * + * SHA2-512 makes this easy. + */ + +void +g_bde_hash_pass(struct g_bde_softc *sc, const void *input, u_int len) +{ + SHA512_CTX cx; + + SHA512_Init(&cx); + SHA512_Update(&cx, input, len); + SHA512_Final(sc->sha2, &cx); +} + +/* + * Encode/Decode the lock structure in byte-sequence format. + * + * Security objectives: Store in pass-phrase dependent variant format. + * + * C-structure packing and byte-endianess depends on architecture, compiler + * and compiler options. 
Writing raw structures to disk is therefore a bad + * idea in these enlightend days. + * + * We spend a fraction of the key-material on shuffling the fields around + * so they will be stored in an unpredictable sequence. + * + * For each byte of the key-material we derive two field indexes, and swap + * the position of those two fields. + * + * I have not worked out the statistical properties of this shuffle, but + * given that the key-material has PRN properties, the primary objective + * of making it hard to figure out which bits are where in the lock sector + * is sufficiently fulfilled. + * + * We include (and shuffle) an extra hash field in the stored version for + * identification and versioning purposes. This field contains the MD5 hash + * of a version identifier (currently "0000") followed by the stored lock + * sector byte-sequence substituting zero bytes for the hash field. + * + * The stored keysequence is protected by AES/256/CBC elsewhere in the code + * so the fact that the generated byte sequence has a much higher than + * average density of zero bits (from the numeric fields) is not currently + * a concern. + * + * Should this later become a concern, a simple software update and + * pass-phrase change can remedy the situation. One possible solution + * could be to XOR the numeric fields with a key-material derived PRN. + * + * The chosen shuffle algorithm only works as long as we have no more than 16 + * fields in the stored part of the lock structure (hence the CTASSERT below). 
+ */ + +CTASSERT(NLOCK_FIELDS <= 16); + +static void +g_bde_shuffle_lock(struct g_bde_softc *sc, int *buf) +{ + int j, k, l; + u_int u; + + /* Assign the fields sequential positions */ + for(u = 0; u < NLOCK_FIELDS; u++) + buf[u] = u; + + /* Then mix it all up */ + for(u = 48; u < sizeof(sc->sha2); u++) { + j = sc->sha2[u] % NLOCK_FIELDS; + k = (sc->sha2[u] / NLOCK_FIELDS) % NLOCK_FIELDS; + l = buf[j]; + buf[j] = buf[k]; + buf[k] = l; + } +} + +int +g_bde_encode_lock(struct g_bde_softc *sc, struct g_bde_key *gl, u_char *ptr) +{ + int shuffle[NLOCK_FIELDS]; + u_char *hash, *p; + int i; + MD5_CTX c; + + p = ptr; + hash = NULL; + g_bde_shuffle_lock(sc, shuffle); + for (i = 0; i < NLOCK_FIELDS; i++) { + switch(shuffle[i]) { + case 0: + le64enc(p, gl->sector0); + p += 8; + break; + case 1: + le64enc(p, gl->sectorN); + p += 8; + break; + case 2: + le64enc(p, gl->keyoffset); + p += 8; + break; + case 3: + le32enc(p, gl->sectorsize); + p += 4; + break; + case 4: + le32enc(p, gl->flags); + p += 4; + break; + case 5: + case 6: + case 7: + case 8: + le64enc(p, gl->lsector[shuffle[i] - 5]); + p += 8; + break; + case 9: + bcopy(gl->spare, p, sizeof gl->spare); + p += sizeof gl->spare; + break; + case 10: + bcopy(gl->salt, p, sizeof gl->salt); + p += sizeof gl->salt; + break; + case 11: + bcopy(gl->mkey, p, sizeof gl->mkey); + p += sizeof gl->mkey; + break; + case 12: + bzero(p, 16); + hash = p; + p += 16; + break; + } + } + if(ptr + G_BDE_LOCKSIZE != p) + return(-1); + if (hash == NULL) + return(-1); + MD5Init(&c); + MD5Update(&c, "0000", 4); /* Versioning */ + MD5Update(&c, ptr, G_BDE_LOCKSIZE); + MD5Final(hash, &c); + return(0); +} + +int +g_bde_decode_lock(struct g_bde_softc *sc, struct g_bde_key *gl, u_char *ptr) +{ + int shuffle[NLOCK_FIELDS]; + u_char *p; + u_char hash[16], hash2[16]; + MD5_CTX c; + int i; + + p = ptr; + g_bde_shuffle_lock(sc, shuffle); + for (i = 0; i < NLOCK_FIELDS; i++) { + switch(shuffle[i]) { + case 0: + gl->sector0 = le64dec(p); + p += 8; + break; + 
case 1: + gl->sectorN = le64dec(p); + p += 8; + break; + case 2: + gl->keyoffset = le64dec(p); + p += 8; + break; + case 3: + gl->sectorsize = le32dec(p); + p += 4; + break; + case 4: + gl->flags = le32dec(p); + p += 4; + break; + case 5: + case 6: + case 7: + case 8: + gl->lsector[shuffle[i] - 5] = le64dec(p); + p += 8; + break; + case 9: + bcopy(p, gl->spare, sizeof gl->spare); + p += sizeof gl->spare; + break; + case 10: + bcopy(p, gl->salt, sizeof gl->salt); + p += sizeof gl->salt; + break; + case 11: + bcopy(p, gl->mkey, sizeof gl->mkey); + p += sizeof gl->mkey; + break; + case 12: + bcopy(p, hash2, sizeof hash2); + bzero(p, sizeof hash2); + p += sizeof hash2; + break; + } + } + if(ptr + G_BDE_LOCKSIZE != p) + return(-1); + MD5Init(&c); + MD5Update(&c, "0000", 4); /* Versioning */ + MD5Update(&c, ptr, G_BDE_LOCKSIZE); + MD5Final(hash, &c); + if (bcmp(hash, hash2, sizeof hash2)) + return (1); + return (0); +} + +/* + * Encode/Decode the locksector address ("metadata") with key-material. + * + * Security objectives: Encode/Decode the metadata encrypted by key-material. + * + * A simple AES/128/CBC will do. We take care to always store the metadata + * in the same endianess to make it MI. + * + * In the typical case the metadata is stored in encrypted format in sector + * zero on the media, but at the users discretion or if the piece of the + * device used (sector0...sectorN) does not contain sector zero, it can + * be stored in a filesystem or on a PostIt. + * + * The inability to easily locate the lock sectors makes an attack on a + * cold disk much less attractive, without unduly inconveniencing the + * legitimate user who can feasibly do a brute-force scan if the metadata + * was lost. 
+ */ + +int +g_bde_keyloc_encrypt(struct g_bde_softc *sc, uint64_t *input, void *output) +{ + u_char buf[16]; + keyInstance ki; + cipherInstance ci; + + le64enc(buf, input[0]); + le64enc(buf + 8, input[1]); + AES_init(&ci); + AES_makekey(&ki, DIR_ENCRYPT, G_BDE_KKEYBITS, sc->sha2 + 0); + AES_encrypt(&ci, &ki, buf, output, sizeof buf); + bzero(buf, sizeof buf); + bzero(&ci, sizeof ci); + bzero(&ki, sizeof ki); + return (0); +} + +int +g_bde_keyloc_decrypt(struct g_bde_softc *sc, void *input, uint64_t *output) +{ + keyInstance ki; + cipherInstance ci; + u_char buf[16]; + + AES_init(&ci); + AES_makekey(&ki, DIR_DECRYPT, G_BDE_KKEYBITS, sc->sha2 + 0); + AES_decrypt(&ci, &ki, input, buf, sizeof buf); + output[0] = le64dec(buf); + output[1] = le64dec(buf + 8); + bzero(buf, sizeof buf); + bzero(&ci, sizeof ci); + bzero(&ki, sizeof ki); + return (0); +} + +/* + * Find and Encode/Decode lock sectors. + * + * Security objective: given the pass-phrase, find, decrypt, decode and + * validate the lock sector contents. + * + * For ondisk metadata we cannot know beforehand which of the lock sectors + * a given pass-phrase opens so we must try each of the metadata copies in + * sector zero in turn. If metadata was passed as an argument, we don't + * have this problem. + * + */ + +static int +g_bde_decrypt_lockx(struct g_bde_softc *sc, u_char *meta, off_t mediasize, u_int sectorsize, u_int *nkey) +{ + u_char *buf, *q; + struct g_bde_key *gl; + uint64_t off[2]; + int error, m, i; + keyInstance ki; + cipherInstance ci; + + gl = &sc->key; + + /* Try to decrypt the metadata */ + error = g_bde_keyloc_decrypt(sc, meta, off); + if (error) + return(error); + + /* loose the random part */ + off[1] = 0; + + /* If it points ito thin blue air, forget it */ + if (off[0] + G_BDE_LOCKSIZE > (uint64_t)mediasize) { + off[0] = 0; + return (EINVAL); + } + + /* The lock data may span two physical sectors. 
*/ + + m = 1; + if (off[0] % sectorsize > sectorsize - G_BDE_LOCKSIZE) + m++; + + /* Read the suspected sector(s) */ + buf = g_read_data(sc->consumer, + off[0] - (off[0] % sectorsize), + m * sectorsize, &error); + if (buf == NULL) { + off[0] = 0; + return(error); + } + + /* Find the byte-offset of the stored byte sequence */ + q = buf + off[0] % sectorsize; + + /* If it is all zero, somebody nuked our lock sector */ + for (i = 0; i < G_BDE_LOCKSIZE; i++) + off[1] += q[i]; + if (off[1] == 0) { + off[0] = 0; + g_free(buf); + return (ESRCH); + } + + /* Decrypt the byte-sequence in place */ + AES_init(&ci); + AES_makekey(&ki, DIR_DECRYPT, 256, sc->sha2 + 16); + AES_decrypt(&ci, &ki, q, q, G_BDE_LOCKSIZE); + + /* Decode the byte-sequence */ + i = g_bde_decode_lock(sc, gl, q); + q = NULL; + if (i < 0) { + off[0] = 0; + return (EDOOFUS); /* Programming error */ + } else if (i > 0) { + off[0] = 0; + return (ENOTDIR); /* Hash didn't match */ + } + + bzero(buf, sectorsize * m); + g_free(buf); + + /* If the masterkey is all zeros, user destroyed it */ + off[1] = 0; + for (i = 0; i < (int)sizeof(gl->mkey); i++) + off[1] += gl->mkey[i]; + if (off[1] == 0) + return (ENOENT); + + /* If we have an unsorted lock-sequence, refuse */ + if (gl->lsector[0] > gl->lsector[1] || + gl->lsector[1] > gl->lsector[2] || + gl->lsector[2] > gl->lsector[3]) + return (EINVAL); + + /* Finally, find out which key was used by matching the byte offset */ + for (i = 0; i < G_BDE_MAXKEYS; i++) + if (nkey != NULL && off[0] == gl->lsector[i]) + *nkey = i; + off[0] = 0; + return (0); +} + +int +g_bde_decrypt_lock(struct g_bde_softc *sc, u_char *keymat, u_char *meta, off_t mediasize, u_int sectorsize, u_int *nkey) +{ + u_char *buf, buf1[16]; + int error, e, i; + + /* set up the key-material */ + bcopy(keymat, sc->sha2, SHA512_DIGEST_LENGTH); + + /* If passed-in metadata is non-zero, use it */ + bzero(buf1, sizeof buf1); + if (meta != NULL && bcmp(buf1, meta, sizeof buf1)) + return (g_bde_decrypt_lockx(sc, 
meta, mediasize, + sectorsize, nkey)); + + /* Read sector zero */ + buf = g_read_data(sc->consumer, 0, sectorsize, &error); + if (buf == NULL) + return(error); + + /* Try each index in turn, save indicative errors for final result */ + error = EINVAL; + for (i = 0; i < G_BDE_MAXKEYS; i++) { + e = g_bde_decrypt_lockx(sc, buf + i * 16, mediasize, + sectorsize, nkey); + /* Success or destroyed master key terminates */ + if (e == 0 || e == ENOENT) { + error = e; + break; + } + if (e != 0 && error == EINVAL) + error = e; + } + g_free(buf); + return (error); +} diff --git a/sys/geom/bde/g_bde_work.c b/sys/geom/bde/g_bde_work.c new file mode 100644 index 0000000..b2f5aa9 --- /dev/null +++ b/sys/geom/bde/g_bde_work.c @@ -0,0 +1,763 @@ +/*- + * Copyright (c) 2002 Poul-Henning Kamp + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Poul-Henning Kamp + * and NAI Labs, the Security Research Division of Network Associates, Inc. + * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + * + * This source file contains the state-engine which makes things happen in the + * right order. + * + * Outline: + * 1) g_bde_start1() + * Break the struct bio into multiple work packets one per zone. + * 2) g_bde_start2() + * Setup the necessary sector buffers and start those read operations + * which we can start at this time and put the item on the work-list. + * 3) g_bde_worker() + * Scan the work-list for items which are ready for crypto processing + * and call the matching crypto function in g_bde_crypt.c and schedule + * any writes needed. Read operations finish here by releasing the + * sector buffers and delivering the original bio request. + * 4) g_bde_write_done() + * Release sector buffers and deliver the original bio request. + * + * Because of the C-scope rules, the functions are almost perfectly in the + * opposite order in this source file. + * + * XXX: A switch to the hardware assisted crypto in src/sys/opencrypto will add + * XXX: additional states to this state-engine. Since no hardware available + * XXX: at this time has AES support, implementing this has been postponed + * XXX: until such time as it would result in a benefit. 
+ */ + +#include <sys/param.h> +#include <sys/bio.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/queue.h> +#include <sys/malloc.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/proc.h> +#include <sys/kthread.h> + +#include <crypto/rijndael/rijndael.h> +#include <crypto/sha2/sha2.h> +#include <geom/geom.h> +#include <geom/bde/g_bde.h> + +static void g_bde_delete_sector(struct g_bde_softc *wp, struct g_bde_sector *sp); +static struct g_bde_sector * g_bde_new_sector(struct g_bde_work *wp, u_int len); +static void g_bde_release_keysector(struct g_bde_work *wp); +static struct g_bde_sector *g_bde_get_keysector(struct g_bde_work *wp); +static int g_bde_start_read(struct g_bde_sector *sp); +static void g_bde_purge_sector(struct g_bde_softc *sc, int fraction); + +/* + * Work item allocation. + * + * C++ would call these constructors and destructors. + */ +static u_int g_bde_nwork; +SYSCTL_UINT(_debug, OID_AUTO, gbde_nwork, CTLFLAG_RD, &g_bde_nwork, 0, ""); + +static MALLOC_DEFINE(M_GBDE, "GBDE", "GBDE data structures"); + +static struct g_bde_work * +g_bde_new_work(struct g_bde_softc *sc) +{ + struct g_bde_work *wp; + + wp = malloc(sizeof *wp, M_GBDE, M_NOWAIT | M_ZERO); + if (wp == NULL) + return (wp); + wp->state = SETUP; + wp->softc = sc; + g_bde_nwork++; + sc->nwork++; + TAILQ_INSERT_TAIL(&sc->worklist, wp, list); + return (wp); +} + +static void +g_bde_delete_work(struct g_bde_work *wp) +{ + struct g_bde_softc *sc; + + sc = wp->softc; + g_bde_nwork--; + sc->nwork--; + TAILQ_REMOVE(&sc->worklist, wp, list); + free(wp, M_GBDE); +} + +/* + * Sector buffer allocation + * + * These two functions allocate and free back variable sized sector buffers + */ + +static u_int g_bde_nsect; +SYSCTL_UINT(_debug, OID_AUTO, gbde_nsect, CTLFLAG_RD, &g_bde_nsect, 0, ""); + +static void +g_bde_delete_sector(struct g_bde_softc *sc, struct g_bde_sector *sp) +{ + + g_bde_nsect--; + sc->nsect--; + if (sp->malloc) + free(sp->data, 
M_GBDE); + free(sp, M_GBDE); +} + +static struct g_bde_sector * +g_bde_new_sector(struct g_bde_work *wp, u_int len) +{ + struct g_bde_sector *sp; + + sp = malloc(sizeof *sp, M_GBDE, M_NOWAIT | M_ZERO); + if (sp == NULL) + return (sp); + if (len > 0) { + sp->data = malloc(len, M_GBDE, M_NOWAIT | M_ZERO); + if (sp->data == NULL) { + free(sp, M_GBDE); + return (NULL); + } + sp->malloc = 1; + } + g_bde_nsect++; + wp->softc->nsect++; + sp->size = len; + sp->softc = wp->softc; + sp->ref = 1; + sp->owner = wp; + sp->offset = wp->so; + sp->state = JUNK; + return (sp); +} + +/* + * Skey sector cache. + * + * Nothing prevents two separate I/O requests from addressing the same zone + * and thereby needing the same skey sector. We therefore need to sequence + * I/O operations to the skey sectors. A certain amount of caching is also + * desirable, although the extent of benefit from this is not at this point + * determined. + * + * XXX: GEOM may be able to grow a generic caching facility at some point + * XXX: to support such needs. 
+ */ + +static u_int g_bde_ncache; +SYSCTL_UINT(_debug, OID_AUTO, gbde_ncache, CTLFLAG_RD, &g_bde_ncache, 0, ""); + +static void +g_bde_purge_one_sector(struct g_bde_softc *sc, struct g_bde_sector *sp) +{ + + g_trace(G_T_TOPOLOGY, "g_bde_purge_one_sector(%p, %p)", sc, sp); + if (sp->ref != 0) + return; + TAILQ_REMOVE(&sc->freelist, sp, list); + g_bde_ncache--; + sc->ncache--; + bzero(sp->data, sp->size); + g_bde_delete_sector(sc, sp); +} + +static struct g_bde_sector * +g_bde_get_keysector(struct g_bde_work *wp) +{ + struct g_bde_sector *sp; + struct g_bde_softc *sc; + off_t offset; + + offset = wp->kso; + g_trace(G_T_TOPOLOGY, "g_bde_get_keysector(%p, %jd)", wp, (intmax_t)offset); + sc = wp->softc; + + if (malloc_last_fail() < g_bde_ncache) + g_bde_purge_sector(sc, -1); + + sp = TAILQ_FIRST(&sc->freelist); + if (sp != NULL && sp->ref == 0 && sp->used + 300 < time_uptime) + g_bde_purge_one_sector(sc, sp); + + TAILQ_FOREACH(sp, &sc->freelist, list) { + if (sp->offset == offset) + break; + } + if (sp != NULL) { + sp->ref++; + KASSERT(sp->offset == offset, ("wrong offset")); + KASSERT(sp->softc == wp->softc, ("wrong softc")); + if (sp->ref == 1) + sp->owner = wp; + } else { + if (malloc_last_fail() < g_bde_ncache) { + TAILQ_FOREACH(sp, &sc->freelist, list) + if (sp->ref == 0) + break; + } + if (sp == NULL && !TAILQ_EMPTY(&sc->freelist)) + sp = TAILQ_FIRST(&sc->freelist); + if (sp != NULL && sp->ref > 0) + sp = NULL; + if (sp == NULL) { + sp = g_bde_new_sector(wp, sc->sectorsize); + if (sp != NULL) { + g_bde_ncache++; + sc->ncache++; + TAILQ_INSERT_TAIL(&sc->freelist, sp, list); + sp->malloc = 2; + } + } + if (sp != NULL) { + sp->offset = offset; + sp->softc = wp->softc; + sp->ref = 1; + sp->owner = wp; + sp->state = JUNK; + sp->error = 0; + } + } + if (sp != NULL) { + TAILQ_REMOVE(&sc->freelist, sp, list); + TAILQ_INSERT_TAIL(&sc->freelist, sp, list); + sp->used = time_uptime; + } + wp->ksp = sp; + return(sp); +} + +static void +g_bde_release_keysector(struct 
g_bde_work *wp) +{ + struct g_bde_softc *sc; + struct g_bde_work *wp2; + struct g_bde_sector *sp; + + sp = wp->ksp; + g_trace(G_T_TOPOLOGY, "g_bde_release_keysector(%p)", sp); + KASSERT(sp->malloc == 2, ("Wrong sector released")); + sc = sp->softc; + KASSERT(sc != NULL, ("NULL sp->softc")); + KASSERT(wp == sp->owner, ("Releasing, not owner")); + sp->owner = NULL; + wp->ksp = NULL; + sp->ref--; + if (sp->ref > 0) { + TAILQ_REMOVE(&sc->freelist, sp, list); + TAILQ_INSERT_TAIL(&sc->freelist, sp, list); + TAILQ_FOREACH(wp2, &sc->worklist, list) { + if (wp2->ksp == sp) { + KASSERT(wp2 != wp, ("Self-reowning")); + sp->owner = wp2; + wakeup(sp->softc); + break; + } + } + KASSERT(wp2 != NULL, ("Failed to pick up owner for %p\n", sp)); + } else if (sp->error != 0) { + sp->offset = ~0; + sp->error = 0; + sp->state = JUNK; + } + TAILQ_REMOVE(&sc->freelist, sp, list); + TAILQ_INSERT_HEAD(&sc->freelist, sp, list); +} + +static void +g_bde_purge_sector(struct g_bde_softc *sc, int fraction) +{ + struct g_bde_sector *sp; + int n; + + g_trace(G_T_TOPOLOGY, "g_bde_purge_sector(%p)", sc); + if (fraction > 0) + n = sc->ncache / fraction + 1; + else + n = g_bde_ncache - malloc_last_fail(); + if (n < 0) + return; + if (n > sc->ncache) + n = sc->ncache; + while(n--) { + TAILQ_FOREACH(sp, &sc->freelist, list) { + if (sp->ref != 0) + continue; + TAILQ_REMOVE(&sc->freelist, sp, list); + g_bde_ncache--; + sc->ncache--; + bzero(sp->data, sp->size); + g_bde_delete_sector(sc, sp); + break; + } + } +} + +static struct g_bde_sector * +g_bde_read_keysector(struct g_bde_softc *sc, struct g_bde_work *wp) +{ + struct g_bde_sector *sp; + + g_trace(G_T_TOPOLOGY, "g_bde_read_keysector(%p)", wp); + sp = g_bde_get_keysector(wp); + if (sp == NULL) { + g_bde_purge_sector(sc, -1); + sp = g_bde_get_keysector(wp); + } + if (sp == NULL) + return (sp); + if (sp->owner != wp) + return (sp); + if (sp->state == VALID) + return (sp); + if (g_bde_start_read(sp) == 0) + return (sp); + g_bde_release_keysector(wp); + 
return (NULL); +} + +/* + * Contribute to the completion of the original bio request. + * + * We have no simple way to tell how many bits the original bio request has + * been segmented into, so the easiest way to determine when we can deliver + * it is to keep track of the number of bytes we have completed. We keep + * track of any errors underway and latch onto the first one. + * + * We always report "nothing done" in case of error, because random bits here + * and there may be completed and returning a number of completed bytes does + * not convey any useful information about which bytes they were. If some + * piece of broken code somewhere interprets this to mean that nothing has + * changed on the underlying media they deserve the lossage headed for them. + * + * A single mutex per g_bde instance is used to prevent contention. + */ + +static void +g_bde_contribute(struct bio *bp, off_t bytes, int error) +{ + + g_trace(G_T_TOPOLOGY, "g_bde_contribute bp %p bytes %jd error %d", + bp, (intmax_t)bytes, error); + if (bp->bio_error == 0) + bp->bio_error = error; + bp->bio_completed += bytes; + KASSERT(bp->bio_completed <= bp->bio_length, ("Too large contribution")); + if (bp->bio_completed == bp->bio_length) { + if (bp->bio_error != 0) + bp->bio_completed = 0; + g_io_deliver(bp, bp->bio_error); + } +} + +/* + * A write operation has finished. When we have all expected cows in the + * barn close the door and call it a day. 
+ */ + +static void +g_bde_write_done(struct bio *bp) +{ + struct g_bde_sector *sp; + struct g_bde_work *wp; + struct g_bde_softc *sc; + + sp = bp->bio_caller1; + sc = bp->bio_caller2; + mtx_lock(&sc->worklist_mutex); + KASSERT(sp != NULL, ("NULL sp")); + KASSERT(sc != NULL, ("NULL sc")); + KASSERT(sp->owner != NULL, ("NULL sp->owner")); + g_trace(G_T_TOPOLOGY, "g_bde_write_done(%p)", sp); + if (bp->bio_error == 0 && bp->bio_completed != sp->size) + bp->bio_error = EIO; + sp->error = bp->bio_error; + g_destroy_bio(bp); + wp = sp->owner; + if (wp->error == 0) + wp->error = sp->error; + + if (wp->bp->bio_cmd == BIO_DELETE) { + KASSERT(sp == wp->sp, ("trashed delete op")); + g_bde_contribute(wp->bp, wp->length, wp->error); + g_bde_delete_sector(sc, sp); + g_bde_delete_work(wp); + mtx_unlock(&sc->worklist_mutex); + return; + } + + KASSERT(wp->bp->bio_cmd == BIO_WRITE, ("Confused in g_bde_write_done()")); + KASSERT(sp == wp->sp || sp == wp->ksp, ("trashed write op")); + if (wp->sp == sp) { + g_bde_delete_sector(sc, wp->sp); + wp->sp = NULL; + } else { + sp->state = VALID; + } + if (wp->sp == NULL && wp->ksp != NULL && wp->ksp->state == VALID) { + g_bde_contribute(wp->bp, wp->length, wp->error); + g_bde_release_keysector(wp); + g_bde_delete_work(wp); + } + mtx_unlock(&sc->worklist_mutex); + return; +} + +/* + * Send a write request for the given sector down the pipeline. 
+ */ + +static int +g_bde_start_write(struct g_bde_sector *sp) +{ + struct bio *bp; + struct g_bde_softc *sc; + + g_trace(G_T_TOPOLOGY, "g_bde_start_write(%p)", sp); + sc = sp->softc; + KASSERT(sc != NULL, ("NULL sc in g_bde_start_write")); + KASSERT(sp->owner != NULL, ("NULL sp->owner in g_bde_start_write")); + bp = g_new_bio(); + if (bp == NULL) + return (ENOMEM); + bp->bio_cmd = BIO_WRITE; + bp->bio_offset = sp->offset; + bp->bio_data = sp->data; + bp->bio_length = sp->size; + bp->bio_done = g_bde_write_done; + bp->bio_caller1 = sp; + bp->bio_caller2 = sc; + sp->state = IO; + g_io_request(bp, sc->consumer); + return(0); +} + +/* + * A read operation has finished. Mark the sector no longer iobusy and + * wake up the worker thread and let it do its thing. + */ + +static void +g_bde_read_done(struct bio *bp) +{ + struct g_bde_sector *sp; + struct g_bde_softc *sc; + + sp = bp->bio_caller1; + g_trace(G_T_TOPOLOGY, "g_bde_read_done(%p)", sp); + sc = bp->bio_caller2; + mtx_lock(&sc->worklist_mutex); + if (bp->bio_error == 0 && bp->bio_completed != sp->size) + bp->bio_error = EIO; + sp->error = bp->bio_error; + if (sp->error == 0) + sp->state = VALID; + else + sp->state = JUNK; + wakeup(sc); + g_destroy_bio(bp); + mtx_unlock(&sc->worklist_mutex); +} + +/* + * Send a read request for the given sector down the pipeline. + */ + +static int +g_bde_start_read(struct g_bde_sector *sp) +{ + struct bio *bp; + struct g_bde_softc *sc; + + g_trace(G_T_TOPOLOGY, "g_bde_start_read(%p)", sp); + sc = sp->softc; + KASSERT(sc != NULL, ("Null softc in sp %p", sp)); + bp = g_new_bio(); + if (bp == NULL) + return (ENOMEM); + bp->bio_cmd = BIO_READ; + bp->bio_offset = sp->offset; + bp->bio_data = sp->data; + bp->bio_length = sp->size; + bp->bio_done = g_bde_read_done; + bp->bio_caller1 = sp; + bp->bio_caller2 = sc; + sp->state = IO; + g_io_request(bp, sc->consumer); + return(0); +} + +/* + * The worker thread. 
 *
 * The up/down path of GEOM is not allowed to sleep or do any major work
 * so we use this thread to do the actual crypto operations and to push
 * the state engine onwards.
 *
 * XXX: if we switch to the src/sys/opencrypt hardware assisted encryption
 * XXX: using a thread here is probably not needed.
 */

/*
 * Per-instance worker: repeatedly scan the worklist for WAIT items whose
 * sector I/O has finished, do the crypto for them (with worklist_mutex
 * dropped around the crypto calls), start any follow-up writes, and
 * complete/teardown finished items.  Sleeps on sc when idle; a timeout
 * wakeup is used to decay the skey cache.  Exits when sc->dead is set,
 * after asserting all work/cache/sector counts have drained.
 */
void
g_bde_worker(void *arg)
{
	struct g_bde_softc *sc;
	struct g_bde_work *wp;
	struct g_geom *gp;
	int busy, error;

	gp = arg;
	sc = gp->softc;

	mtx_lock(&sc->worklist_mutex);
	for (;;) {
		busy = 0;
		g_trace(G_T_TOPOLOGY, "g_bde_worker scan");
		TAILQ_FOREACH(wp, &sc->worklist, list) {
			KASSERT(wp != NULL, ("NULL wp"));
			KASSERT(wp->softc != NULL, ("NULL wp->softc"));
			if (wp->state != WAIT)
				continue;	/* Not interesting here */

			KASSERT(wp->bp != NULL, ("NULL wp->bp"));
			KASSERT(wp->sp != NULL, ("NULL wp->sp"));

			/* Keysector must be ours and out of I/O */
			if (wp->ksp != NULL) {
				if (wp->ksp->owner != wp)
					continue;
				if (wp->ksp->state == IO)
					continue;
				KASSERT(wp->ksp->state == VALID,
				    ("Illegal sector state (JUNK ?)"));
			}

			/* Data sector of a read must be in */
			if (wp->bp->bio_cmd == BIO_READ &&
			     wp->sp->state == IO)
				continue;

			/* Keysector read failed: fail the whole item */
			if (wp->ksp != NULL && wp->ksp->error != 0) {
				g_bde_contribute(wp->bp, wp->length,
				    wp->ksp->error);
				g_bde_delete_sector(sc, wp->sp);
				g_bde_release_keysector(wp);
				g_bde_delete_work(wp);
				busy++;
				break;
			}
			switch(wp->bp->bio_cmd) {
			case BIO_READ:
				if (wp->ksp == NULL) {
					KASSERT(wp->error != 0,
					    ("BIO_READ, no ksp and no error"));
					g_bde_contribute(wp->bp, wp->length,
					    wp->error);
				} else {
					if (wp->sp->error == 0) {
						mtx_unlock(&sc->worklist_mutex);
						g_bde_crypt_read(wp);
						mtx_lock(&sc->worklist_mutex);
					}
					g_bde_contribute(wp->bp, wp->length,
					    wp->sp->error);
				}
				g_bde_delete_sector(sc, wp->sp);
				if (wp->ksp != NULL)
					g_bde_release_keysector(wp);
				g_bde_delete_work(wp);
				break;
			case BIO_WRITE:
				wp->state = FINISH;
				KASSERT(wp->sp->owner == wp, ("Write not owner sp"));
				KASSERT(wp->ksp->owner == wp, ("Write not owner ksp"));
				mtx_unlock(&sc->worklist_mutex);
				g_bde_crypt_write(wp);
				mtx_lock(&sc->worklist_mutex);
				/* Data sector first, then keysector; both
				 * must land before g_bde_write_done()
				 * completes the item */
				error = g_bde_start_write(wp->sp);
				if (error) {
					g_bde_contribute(wp->bp, wp->length, error);
					g_bde_release_keysector(wp);
					g_bde_delete_sector(sc, wp->sp);
					g_bde_delete_work(wp);
					break;
				}
				error = g_bde_start_write(wp->ksp);
				if (wp->error == 0)
					wp->error = error;
				break;
			case BIO_DELETE:
				wp->state = FINISH;
				mtx_unlock(&sc->worklist_mutex);
				g_bde_crypt_delete(wp);
				mtx_lock(&sc->worklist_mutex);
				g_bde_start_write(wp->sp);
				break;
			}
			busy++;
			break;
		}
		if (!busy) {
			/*
			 * We don't look for our death-warrant until we are
			 * idle. Shouldn't make a difference in practice.
			 */
			if (sc->dead)
				break;
			g_trace(G_T_TOPOLOGY, "g_bde_worker sleep");
			error = msleep(sc, &sc->worklist_mutex,
			    PRIBIO, "g_bde", hz);
			if (error == EWOULDBLOCK) {
				/*
				 * Lose our skey cache in an orderly fashion.
				 * The exact rate can be tuned to be less
				 * aggressive if this is desirable. 10% per
				 * second means that the cache is gone in a
				 * few minutes.
				 */
				g_bde_purge_sector(sc, 10);
			}
		}
	}
	g_trace(G_T_TOPOLOGY, "g_bde_worker die");
	g_bde_purge_sector(sc, 1);
	KASSERT(sc->nwork == 0, ("Dead but %d work remaining", sc->nwork));
	KASSERT(sc->ncache == 0, ("Dead but %d cache remaining", sc->ncache));
	KASSERT(sc->nsect == 0, ("Dead but %d sect remaining", sc->nsect));
	mtx_unlock(&sc->worklist_mutex);
	/* Signal the tearing-down thread that we are gone */
	sc->dead = 2;
	wakeup(sc);
	mtx_lock(&Giant);
	kthread_exit(0);
}

/*
 * g_bde_start1 has chopped the incoming request up so all the requests
 * we see here are inside a single zone. Map the data and key locations
 * grab the buffers we need and fire off the first volley of read requests.
+ */ + +static void +g_bde_start2(struct g_bde_work *wp) +{ + struct g_bde_softc *sc; + + KASSERT(wp != NULL, ("NULL wp in g_bde_start2")); + KASSERT(wp->softc != NULL, ("NULL wp->softc")); + g_trace(G_T_TOPOLOGY, "g_bde_start2(%p)", wp); + sc = wp->softc; + if (wp->bp->bio_cmd == BIO_READ) { + wp->sp = g_bde_new_sector(wp, 0); + if (wp->sp == NULL) { + g_bde_contribute(wp->bp, wp->length, ENOMEM); + g_bde_delete_work(wp); + return; + } + wp->sp->size = wp->length; + wp->sp->data = wp->data; + if (g_bde_start_read(wp->sp) != 0) { + g_bde_contribute(wp->bp, wp->length, ENOMEM); + g_bde_delete_sector(sc, wp->sp); + g_bde_delete_work(wp); + return; + } + g_bde_read_keysector(sc, wp); + if (wp->ksp == NULL) + wp->error = ENOMEM; + } else if (wp->bp->bio_cmd == BIO_DELETE) { + wp->sp = g_bde_new_sector(wp, wp->length); + if (wp->sp == NULL) { + g_bde_contribute(wp->bp, wp->length, ENOMEM); + g_bde_delete_work(wp); + return; + } + } else if (wp->bp->bio_cmd == BIO_WRITE) { + wp->sp = g_bde_new_sector(wp, wp->length); + if (wp->sp == NULL) { + g_bde_contribute(wp->bp, wp->length, ENOMEM); + g_bde_delete_work(wp); + return; + } + g_bde_read_keysector(sc, wp); + if (wp->ksp == NULL) { + g_bde_contribute(wp->bp, wp->length, ENOMEM); + g_bde_delete_sector(sc, wp->sp); + g_bde_delete_work(wp); + return; + } + } else { + KASSERT(0 == 1, + ("Wrong bio_cmd %d in g_bde_start2", wp->bp->bio_cmd)); + } + + wp->state = WAIT; + wakeup(sc); +} + +/* + * Create a sequence of work structures, and have g_bde_map_sector() determine + * how long they each can be. Feed them to g_bde_start2(). 
+ */ + +void +g_bde_start1(struct bio *bp) +{ + struct g_bde_softc *sc; + struct g_bde_work *wp; + off_t done; + + sc = bp->bio_to->geom->softc; + bp->bio_driver1 = sc; + + mtx_lock(&sc->worklist_mutex); + for(done = 0; done < bp->bio_length; ) { + wp = g_bde_new_work(sc); + if (wp != NULL) { + wp->bp = bp; + wp->offset = bp->bio_offset + done; + wp->data = bp->bio_data + done; + wp->length = bp->bio_length - done; + g_bde_map_sector(wp); + done += wp->length; + g_bde_start2(wp); + } + if (wp == NULL || bp->bio_error != 0) { + g_bde_contribute(bp, bp->bio_length - done, ENOMEM); + break; + } + } + mtx_unlock(&sc->worklist_mutex); + return; +} diff --git a/sys/geom/geom.h b/sys/geom/geom.h new file mode 100644 index 0000000..53f7356 --- /dev/null +++ b/sys/geom/geom.h @@ -0,0 +1,313 @@ +/*- + * Copyright (c) 2002 Poul-Henning Kamp + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Poul-Henning Kamp + * and NAI Labs, the Security Research Division of Network Associates, Inc. + * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. 
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#ifndef _GEOM_GEOM_H_
#define _GEOM_GEOM_H_

#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sx.h>
#include <sys/queue.h>
#include <sys/ioccom.h>
#include <sys/sbuf.h>
#include <sys/module.h>

struct g_class;
struct g_geom;
struct g_consumer;
struct g_provider;
struct g_stat;
struct thread;
struct bio;
struct sbuf;
struct gctl_req;
struct g_configargs;

/* Method-function signatures implemented by GEOM classes */
typedef int g_config_t (struct g_configargs *ca);
typedef void g_ctl_req_t (struct gctl_req *, struct g_class *cp, char const *verb);
typedef int g_ctl_create_geom_t (struct gctl_req *, struct g_class *cp, struct g_provider *pp);
typedef int g_ctl_destroy_geom_t (struct gctl_req *, struct g_class *cp, struct g_geom *gp);
typedef int g_ctl_config_geom_t (struct gctl_req *, struct g_geom *gp, const char *verb);
typedef void g_init_t (struct g_class *mp);
typedef void g_fini_t (struct g_class *mp);
typedef struct g_geom * g_taste_t (struct g_class *, struct g_provider *,
    int flags);
/* Taste flags */
#define G_TF_NORMAL		0
#define G_TF_INSIST		1
#define G_TF_TRANSPARENT	2
typedef int g_access_t (struct g_provider *, int, int, int);
/* XXX: not sure about the thread arg */
typedef void g_orphan_t (struct g_consumer *);

typedef void g_start_t (struct bio *);
typedef void g_spoiled_t (struct g_consumer *);
typedef void g_dumpconf_t (struct sbuf *, const char *indent, struct g_geom *,
    struct g_consumer *, struct g_provider *);

/*
 * The g_class structure describes a transformation class. In other words
 * all BSD disklabel handlers share one g_class, all MBR handlers share
 * one common g_class and so on.
 * Certain operations are instantiated on the class, most notably the
 * taste and config_geom functions.
 */
struct g_class {
	const char		*name;
	g_taste_t		*taste;
	g_config_t		*config;
	g_ctl_req_t		*ctlreq;
	g_init_t		*init;
	g_fini_t		*fini;
	g_ctl_destroy_geom_t	*destroy_geom;
	/*
	 * The remaining elements are private
	 */
	LIST_ENTRY(g_class)	class;
	LIST_HEAD(,g_geom)	geom;
};

/*
 * The g_geom is an instance of a g_class.
 */
struct g_geom {
	char			*name;
	struct g_class		*class;
	LIST_ENTRY(g_geom)	geom;
	LIST_HEAD(,g_consumer)	consumer;
	LIST_HEAD(,g_provider)	provider;
	TAILQ_ENTRY(g_geom)	geoms;	/* XXX: better name */
	int			rank;
	g_start_t		*start;
	g_spoiled_t		*spoiled;
	g_dumpconf_t		*dumpconf;
	g_access_t		*access;
	g_orphan_t		*orphan;
	void			*softc;	/* class-private instance state */
	unsigned		flags;
#define	G_GEOM_WITHER		1
};

/*
 * The g_bioq is a queue of struct bio's.
 * XXX: possibly collection point for statistics.
 * XXX: should (possibly) be collapsed with sys/bio.h::bio_queue_head.
 */
struct g_bioq {
	TAILQ_HEAD(, bio)	bio_queue;
	struct mtx		bio_queue_lock;
	int			bio_queue_length;
};

/*
 * A g_consumer is an attachment point for a g_provider. One g_consumer
 * can only be attached to one g_provider, but multiple g_consumers
 * can be attached to one g_provider.
 */

struct g_consumer {
	struct g_geom		*geom;
	LIST_ENTRY(g_consumer)	consumer;
	struct g_provider	*provider;
	LIST_ENTRY(g_consumer)	consumers;	/* XXX: better name */
	int			acr, acw, ace;	/* read/write/excl access counts */
	int			spoiled;
	struct devstat		*stat;
	u_int			nstart, nend;
};

/*
 * A g_provider is a "logical disk".
 */
struct g_provider {
	char			*name;
	LIST_ENTRY(g_provider)	provider;
	struct g_geom		*geom;
	LIST_HEAD(,g_consumer)	consumers;
	int			acr, acw, ace;
	int			error;
	TAILQ_ENTRY(g_provider)	orphan;
	u_int			index;
	off_t			mediasize;
	u_int			sectorsize;
	u_int			stripesize;
	u_int			stripeoffset;
	struct devstat		*stat;
	u_int			nstart, nend;
	u_int			flags;
#define G_PF_CANDELETE		0x1
};

/* geom_dev.c */
void g_dev_print(void);

/* geom_dump.c */
void g_hexdump(void *ptr, int length);
void g_trace(int level, const char *, ...);
#	define G_T_TOPOLOGY	1
#	define G_T_BIO		2
#	define G_T_ACCESS	4


/* geom_event.c */
typedef void g_event_t(void *, int flag);
#define EV_CANCEL	1
int g_post_event(g_event_t *func, void *arg, int flag, ...);
int g_waitfor_event(g_event_t *func, void *arg, int flag, ...);
void g_cancel_event(void *ref);
void g_orphan_provider(struct g_provider *pp, int error);
void g_waitidle(void);

/* geom_subr.c */
int g_access_abs(struct g_consumer *cp, int nread, int nwrite, int nexcl);
int g_access_rel(struct g_consumer *cp, int nread, int nwrite, int nexcl);
int g_attach(struct g_consumer *cp, struct g_provider *pp);
void g_destroy_consumer(struct g_consumer *cp);
void g_destroy_geom(struct g_geom *pp);
void g_destroy_provider(struct g_provider *pp);
void g_detach(struct g_consumer *cp);
void g_error_provider(struct g_provider *pp, int error);
struct g_provider *g_provider_by_name(char const *arg);
int g_getattr__(const char *attr, struct g_consumer *cp, void *var, int len);
#define g_getattr(a, c, v) g_getattr__((a), (c), (v), sizeof *(v))
int g_handleattr(struct bio *bp, const char *attribute, void *val, int len);
+int g_handleattr_int(struct bio *bp, const char *attribute, int val); +int g_handleattr_off_t(struct bio *bp, const char *attribute, off_t val); +struct g_consumer * g_new_consumer(struct g_geom *gp); +struct g_geom * g_new_geomf(struct g_class *mp, const char *fmt, ...); +struct g_provider * g_new_providerf(struct g_geom *gp, const char *fmt, ...); +void g_sanity(void const *ptr); +void g_spoil(struct g_provider *pp, struct g_consumer *cp); +int g_std_access(struct g_provider *pp, int dr, int dw, int de); +void g_std_done(struct bio *bp); +void g_std_spoiled(struct g_consumer *cp); +void g_wither_geom(struct g_geom *gp, int error); + +int g_modevent(module_t, int, void *); + +/* geom_io.c */ +struct bio * g_clone_bio(struct bio *); +void g_destroy_bio(struct bio *); +void g_io_deliver(struct bio *bp, int error); +int g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr); +void g_io_request(struct bio *bp, struct g_consumer *cp); +struct bio *g_new_bio(void); +void * g_read_data(struct g_consumer *cp, off_t offset, off_t length, int *error); +int g_write_data(struct g_consumer *cp, off_t offset, void *ptr, off_t length); + +/* geom_kern.c / geom_kernsim.c */ + +#ifndef _SYS_CONF_H_ +typedef int d_ioctl_t(dev_t dev, u_long cmd, caddr_t data, + int fflag, struct thread *td); +#endif + +struct g_ioctl { + u_long cmd; + void *data; + int fflag; + struct thread *td; + d_ioctl_t *func; + void *dev; +}; + +#ifdef _KERNEL + +struct g_kerneldump { + off_t offset; + off_t length; +}; + +MALLOC_DECLARE(M_GEOM); + +static __inline void * +g_malloc(int size, int flags) +{ + void *p; + + p = malloc(size, M_GEOM, flags); + g_sanity(p); + /* printf("malloc(%d, %x) -> %p\n", size, flags, p); */ + return (p); +} + +static __inline void +g_free(void *ptr) +{ + g_sanity(ptr); + /* printf("free(%p)\n", ptr); */ + free(ptr, M_GEOM); +} + +extern struct sx topology_lock; + +#define g_topology_lock() \ + do { \ + mtx_assert(&Giant, MA_NOTOWNED); \ + 
sx_xlock(&topology_lock); \ + } while (0) + +#define g_topology_unlock() \ + do { \ + g_sanity(NULL); \ + sx_xunlock(&topology_lock); \ + } while (0) + +#define g_topology_assert() \ + do { \ + g_sanity(NULL); \ + sx_assert(&topology_lock, SX_XLOCKED); \ + } while (0) + +#define DECLARE_GEOM_CLASS(class, name) \ + static moduledata_t name##_mod = { \ + #name, g_modevent, &class \ + }; \ + DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST); + +#endif /* _KERNEL */ + +/* geom_ctl.c */ +void gctl_set_param(struct gctl_req *req, const char *param, void const *ptr, int len); +void *gctl_get_param(struct gctl_req *req, const char *param, int *len); +char const *gctl_get_asciiparam(struct gctl_req *req, const char *param); +void *gctl_get_paraml(struct gctl_req *req, const char *param, int len); +int gctl_error(struct gctl_req *req, const char *fmt, ...); +struct g_class *gctl_get_class(struct gctl_req *req, char const *arg); +struct g_geom *gctl_get_geom(struct gctl_req *req, struct g_class *mpr, char const *arg); +struct g_provider *gctl_get_provider(struct gctl_req *req, char const *arg); + +#endif /* _GEOM_GEOM_H_ */ diff --git a/sys/geom/geom_aes.c b/sys/geom/geom_aes.c new file mode 100644 index 0000000..867efd9 --- /dev/null +++ b/sys/geom/geom_aes.c @@ -0,0 +1,374 @@ +/*- + * Copyright (c) 2002 Poul-Henning Kamp + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Poul-Henning Kamp + * and NAI Labs, the Security Research Division of Network Associates, Inc. + * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This method provides AES encryption with a compiled in key (default + * all zeroes). + * + * XXX: This could probably save a lot of code by pretending to be a slicer. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/conf.h> +#include <sys/bio.h> +#include <sys/malloc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/libkern.h> +#include <sys/endian.h> +#include <sys/md5.h> +#include <sys/errno.h> +#include <geom/geom.h> + +#include <crypto/rijndael/rijndael.h> + +#include <crypto/rijndael/rijndael.h> + +#define AES_CLASS_NAME "AES" + +#define MASTER_KEY_LENGTH (1024/8) + +static const u_char *aes_magic = "<<FreeBSD-GEOM-AES>>"; +static const u_char *aes_magic_random = "<<FreeBSD-GEOM-AES-RANDOM>>"; +static const u_char *aes_magic_test = "<<FreeBSD-GEOM-AES-TEST>>"; + + +struct g_aes_softc { + enum { + KEY_ZERO, + KEY_RANDOM, + KEY_TEST + } keying; + u_int sectorsize; + off_t mediasize; + cipherInstance ci; + u_char master_key[MASTER_KEY_LENGTH]; +}; + +/* + * Generate a sectorkey from the masterkey and the offset position. + * + * For KEY_ZERO we just return a key of all zeros. + * + * We feed the sector byte offset, 16 bytes of the master-key and + * the sector byte offset once more to MD5. + * The sector byte offset is converted to little-endian format first + * to support multi-architecture operation. + * We use 16 bytes from the master-key starting at the logical sector + * number modulus he length of the master-key. If need be we wrap + * around to the start of the master-key. 
+ */ + +static void +g_aes_makekey(struct g_aes_softc *sc, off_t off, keyInstance *ki, int dir) +{ + MD5_CTX cx; + u_int64_t u64; + u_int u, u1; + u_char *p, buf[16]; + + if (sc->keying == KEY_ZERO) { + rijndael_makeKey(ki, dir, 128, sc->master_key); + return; + } + MD5Init(&cx); + u64 = htole64(off); + MD5Update(&cx, (u_char *)&u64, sizeof(u64)); + u = off / sc->sectorsize; + u %= sizeof sc->master_key; + p = sc->master_key + u; + if (u + 16 <= sizeof(sc->master_key)) { + MD5Update(&cx, p, 16); + } else { + u1 = sizeof sc->master_key - u; + MD5Update(&cx, p, u1); + MD5Update(&cx, sc->master_key, 16 - u1); + u1 = 0; /* destroy evidence */ + } + u = 0; /* destroy evidence */ + MD5Update(&cx, (u_char *)&u64, sizeof(u64)); + u64 = 0; /* destroy evidence */ + MD5Final(buf, &cx); + bzero(&cx, sizeof cx); /* destroy evidence */ + rijndael_makeKey(ki, dir, 128, buf); + bzero(buf, sizeof buf); /* destroy evidence */ + +} + +static void +g_aes_read_done(struct bio *bp) +{ + struct g_geom *gp; + struct g_aes_softc *sc; + u_char *p, *b, *e, *sb; + keyInstance dkey; + off_t o; + + gp = bp->bio_from->geom; + sc = gp->softc; + sb = g_malloc(sc->sectorsize, M_WAITOK); + b = bp->bio_data; + e = bp->bio_data; + e += bp->bio_length; + o = bp->bio_offset - sc->sectorsize; + for (p = b; p < e; p += sc->sectorsize) { + g_aes_makekey(sc, o, &dkey, DIR_DECRYPT); + rijndael_blockDecrypt(&sc->ci, &dkey, p, sc->sectorsize * 8, sb); + bcopy(sb, p, sc->sectorsize); + o += sc->sectorsize; + } + bzero(&dkey, sizeof dkey); /* destroy evidence */ + bzero(sb, sc->sectorsize); /* destroy evidence */ + g_free(sb); + g_std_done(bp); +} + +static void +g_aes_write_done(struct bio *bp) +{ + + bzero(bp->bio_data, bp->bio_length); /* destroy evidence */ + g_free(bp->bio_data); + g_std_done(bp); +} + +static void +g_aes_start(struct bio *bp) +{ + struct g_geom *gp; + struct g_consumer *cp; + struct g_aes_softc *sc; + struct bio *bp2; + u_char *p1, *p2, *b, *e; + keyInstance ekey; + off_t o; + + gp = 
bp->bio_to->geom; + cp = LIST_FIRST(&gp->consumer); + sc = gp->softc; + switch (bp->bio_cmd) { + case BIO_READ: + bp2 = g_clone_bio(bp); + if (bp2 == NULL) { + g_io_deliver(bp, ENOMEM); + return; + } + bp2->bio_done = g_aes_read_done; + bp2->bio_offset += sc->sectorsize; + g_io_request(bp2, cp); + break; + case BIO_WRITE: + bp2 = g_clone_bio(bp); + if (bp2 == NULL) { + g_io_deliver(bp, ENOMEM); + return; + } + bp2->bio_done = g_aes_write_done; + bp2->bio_offset += sc->sectorsize; + bp2->bio_data = g_malloc(bp->bio_length, M_WAITOK); + b = bp->bio_data; + e = bp->bio_data; + e += bp->bio_length; + p2 = bp2->bio_data; + o = bp->bio_offset; + for (p1 = b; p1 < e; p1 += sc->sectorsize) { + g_aes_makekey(sc, o, &ekey, DIR_ENCRYPT); + rijndael_blockEncrypt(&sc->ci, &ekey, + p1, sc->sectorsize * 8, p2); + p2 += sc->sectorsize; + o += sc->sectorsize; + } + bzero(&ekey, sizeof ekey); /* destroy evidence */ + g_io_request(bp2, cp); + break; + case BIO_GETATTR: + bp2 = g_clone_bio(bp); + if (bp2 == NULL) { + g_io_deliver(bp, ENOMEM); + return; + } + bp2->bio_done = g_std_done; + bp2->bio_offset += sc->sectorsize; + g_io_request(bp2, cp); + break; + default: + g_io_deliver(bp, EOPNOTSUPP); + return; + } + return; +} + +static void +g_aes_orphan(struct g_consumer *cp) +{ + struct g_geom *gp; + struct g_aes_softc *sc; + + g_trace(G_T_TOPOLOGY, "g_aes_orphan(%p/%s)", cp, cp->provider->name); + g_topology_assert(); + KASSERT(cp->provider->error != 0, + ("g_aes_orphan with error == 0")); + + gp = cp->geom; + sc = gp->softc; + g_wither_geom(gp, cp->provider->error); + bzero(sc, sizeof(struct g_aes_softc)); /* destroy evidence */ + g_free(sc); + return; +} + +static int +g_aes_access(struct g_provider *pp, int dr, int dw, int de) +{ + struct g_geom *gp; + struct g_consumer *cp; + + gp = pp->geom; + cp = LIST_FIRST(&gp->consumer); + /* On first open, grab an extra "exclusive" bit */ + if (cp->acr == 0 && cp->acw == 0 && cp->ace == 0) + de++; + /* ... 
and let go of it on last close */ + if ((cp->acr + dr) == 0 && (cp->acw + dw) == 0 && (cp->ace + de) == 1) + de--; + return (g_access_rel(cp, dr, dw, de)); +} + +static struct g_geom * +g_aes_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) +{ + struct g_geom *gp; + struct g_consumer *cp; + struct g_aes_softc *sc; + int error; + u_int sectorsize; + off_t mediasize; + u_char *buf; + + g_trace(G_T_TOPOLOGY, "aes_taste(%s,%s)", mp->name, pp->name); + g_topology_assert(); + gp = g_new_geomf(mp, "%s.aes", pp->name); + gp->start = g_aes_start; + gp->orphan = g_aes_orphan; + gp->spoiled = g_std_spoiled; + cp = g_new_consumer(gp); + g_attach(cp, pp); + error = g_access_rel(cp, 1, 0, 0); + if (error) { + g_detach(cp); + g_destroy_consumer(cp); + g_destroy_geom(gp); + return (NULL); + } + buf = NULL; + g_topology_unlock(); + do { + if (gp->rank != 2) + break; + sectorsize = cp->provider->sectorsize; + mediasize = cp->provider->mediasize; + buf = g_read_data(cp, 0, sectorsize, &error); + if (buf == NULL || error != 0) { + break; + } + sc = g_malloc(sizeof(struct g_aes_softc), M_WAITOK | M_ZERO); + if (!memcmp(buf, aes_magic, strlen(aes_magic))) { + sc->keying = KEY_ZERO; + } else if (!memcmp(buf, aes_magic_random, + strlen(aes_magic_random))) { + sc->keying = KEY_RANDOM; + } else if (!memcmp(buf, aes_magic_test, + strlen(aes_magic_test))) { + sc->keying = KEY_TEST; + } else { + g_free(sc); + break; + } + g_free(buf); + gp->softc = sc; + gp->access = g_aes_access; + sc->sectorsize = sectorsize; + sc->mediasize = mediasize - sectorsize; + rijndael_cipherInit(&sc->ci, MODE_CBC, NULL); + if (sc->keying == KEY_TEST) { + int i; + u_char *p; + + p = sc->master_key; + for (i = 0; i < (int)sizeof sc->master_key; i ++) + *p++ = i; + } + if (sc->keying == KEY_RANDOM) { + int i; + u_int32_t u; + u_char *p; + + p = sc->master_key; + for (i = 0; i < (int)sizeof sc->master_key; i += sizeof u) { + u = arc4random(); + *p++ = u; + *p++ = u >> 8; + *p++ = u >> 16; + *p++ = u 
>> 24; + } + } + g_topology_lock(); + pp = g_new_providerf(gp, gp->name); + pp->mediasize = mediasize - sectorsize; + pp->sectorsize = sectorsize; + g_error_provider(pp, 0); + g_topology_unlock(); + } while(0); + g_topology_lock(); + if (buf) + g_free(buf); + g_access_rel(cp, -1, 0, 0); + if (gp->softc != NULL) + return (gp); + g_detach(cp); + g_destroy_consumer(cp); + g_destroy_geom(gp); + return (NULL); +} + +static struct g_class g_aes_class = { + .name = AES_CLASS_NAME, + .taste = g_aes_taste, +}; + +DECLARE_GEOM_CLASS(g_aes_class, g_aes); diff --git a/sys/geom/geom_apple.c b/sys/geom/geom_apple.c new file mode 100644 index 0000000..328b835 --- /dev/null +++ b/sys/geom/geom_apple.c @@ -0,0 +1,260 @@ +/*- + * Copyright (c) 2002 Peter Grehan. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * GEOM module for Apple Partition Maps + * As described in 'Inside Macintosh Vol 3: About the SCSI Manager - + * The Structure of Block Devices" + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/endian.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/bio.h> +#include <sys/lock.h> +#include <sys/mutex.h> + +#include <sys/sbuf.h> +#include <geom/geom.h> +#include <geom/geom_slice.h> + +#define APPLE_CLASS_NAME "APPLE" + +#define NAPMPART 16 /* Max partitions */ + +struct apm_partition { + char am_sig[2]; + u_int32_t am_mapcnt; + u_int32_t am_start; + u_int32_t am_partcnt; + char am_name[32]; + char am_type[32]; +}; + +struct g_apple_softc { + u_int16_t dd_bsiz; + u_int32_t dd_blkcnt; + u_int16_t dd_drvrcnt; + u_int32_t am_mapcnt0; + struct apm_partition apmpart[NAPMPART]; +}; + +static void +g_dec_drvrdesc(u_char *ptr, struct g_apple_softc *sc) +{ + sc->dd_bsiz = be16dec(ptr + 2); + sc->dd_blkcnt = be32dec(ptr + 4); + sc->dd_drvrcnt = be32dec(ptr + 16); +} + +static void +g_dec_apple_partition(u_char *ptr, struct apm_partition *d) +{ + d->am_sig[0] = ptr[0]; + d->am_sig[1] = ptr[1]; + d->am_mapcnt = be32dec(ptr + 4); + d->am_start = be32dec(ptr + 8); + d->am_partcnt = be32dec(ptr + 12); + memcpy(d->am_name, ptr + 16, 32); + memcpy(d->am_type, ptr + 48, 32); +} + +static int +g_apple_start(struct bio *bp) +{ + struct g_provider *pp; + struct 
g_geom *gp; + struct g_slicer *gsp; + + pp = bp->bio_to; + gp = pp->geom; + gsp = gp->softc; + if (bp->bio_cmd == BIO_GETATTR) { + if (g_handleattr_off_t(bp, "APM::offset", + gsp->slices[pp->index].offset)) + return (1); + } + return (0); +} + +static void +g_apple_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, + struct g_consumer *cp __unused, struct g_provider *pp) +{ + struct g_apple_softc *mp; + struct g_slicer *gsp; + + gsp = gp->softc; + mp = gsp->softc; + g_slice_dumpconf(sb, indent, gp, cp, pp); + if (pp != NULL) { + if (indent == NULL) + sbuf_printf(sb, " n %s ty %s", + mp->apmpart[pp->index].am_name, + mp->apmpart[pp->index].am_type); + else { + sbuf_printf(sb, "%s<name>%s</name>\n", indent, + mp->apmpart[pp->index].am_name); + sbuf_printf(sb, "%s<type>%s</type>\n", indent, + mp->apmpart[pp->index].am_type); + } + } +} + +#if 0 +static void +g_apple_print() +{ + + /* XXX */ +} +#endif + +static struct g_geom * +g_apple_taste(struct g_class *mp, struct g_provider *pp, int insist) +{ + struct g_geom *gp; + struct g_consumer *cp; + int error, i; + struct g_apple_softc *ms; + struct apm_partition *apm; + u_int sectorsize; + u_char *buf; + + g_trace(G_T_TOPOLOGY, "apple_taste(%s,%s)", mp->name, pp->name); + g_topology_assert(); + gp = g_slice_new(mp, NAPMPART, pp, &cp, &ms, sizeof *ms, g_apple_start); + if (gp == NULL) + return (NULL); + g_topology_unlock(); + gp->dumpconf = g_apple_dumpconf; + do { + if (gp->rank != 2 && insist == 0) + break; + + sectorsize = cp->provider->sectorsize; + if (sectorsize != 512) + break; + + buf = g_read_data(cp, 0, sectorsize, &error); + if (buf == NULL || error != 0) + break; + + /* + * Test for the sector 0 driver record signature, and + * validate sector and disk size + */ + if (buf[0] != 'E' && buf[1] != 'R') { + g_free(buf); + break; + } + g_dec_drvrdesc(buf, ms); + g_free(buf); + + if (ms->dd_bsiz != 512) { + break; + } + + /* + * Read in the first partition map + */ + buf = g_read_data(cp, sectorsize, 
sectorsize, &error); + if (buf == NULL || error != 0) + break; + + /* + * Decode the first partition: it's another indication of + * validity, as well as giving the size of the partition + * map + */ + apm = &ms->apmpart[0]; + g_dec_apple_partition(buf, apm); + g_free(buf); + + if (apm->am_sig[0] != 'P' || apm->am_sig[1] != 'M') + break; + ms->am_mapcnt0 = apm->am_mapcnt; + + buf = g_read_data(cp, 2 * sectorsize, + (NAPMPART - 1) * sectorsize, &error); + if (buf == NULL || error != 0) + break; + + for (i = 1; i < NAPMPART; i++) { + g_dec_apple_partition(buf + ((i - 1) * sectorsize), + &ms->apmpart[i]); + } + + for (i = 0; i < NAPMPART; i++) { + apm = &ms->apmpart[i]; + + /* + * Validate partition sig and global mapcount + */ + if (apm->am_sig[0] != 'P' || + apm->am_sig[1] != 'M') + continue; + if (apm->am_mapcnt != ms->am_mapcnt0) + continue; + + if (bootverbose) { + printf("APM Slice %d (%s/%s) on %s:\n", + i + 1, apm->am_name, apm->am_type, + gp->name); + /* g_apple_print(i, dp + i); */ + } + g_topology_lock(); + g_slice_config(gp, i, G_SLICE_CONFIG_SET, + (off_t)apm->am_start << 9ULL, + (off_t)apm->am_partcnt << 9ULL, + sectorsize, + "%ss%d", gp->name, i + 1); + g_topology_unlock(); + } + g_free(buf); + break; + } while(0); + g_topology_lock(); + g_access_rel(cp, -1, 0, 0); + if (LIST_EMPTY(&gp->provider)) { + g_slice_spoiled(cp); + return (NULL); + } + return (gp); +} + + +static struct g_class g_apple_class = { + .name = APPLE_CLASS_NAME, + .taste = g_apple_taste, +}; + +DECLARE_GEOM_CLASS(g_apple_class, g_apple); diff --git a/sys/geom/geom_bsd.c b/sys/geom/geom_bsd.c new file mode 100644 index 0000000..4f4d565 --- /dev/null +++ b/sys/geom/geom_bsd.c @@ -0,0 +1,739 @@ +/*- + * Copyright (c) 2002 Poul-Henning Kamp + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Poul-Henning Kamp + * and NAI Labs, the Security Research Division of Network Associates, Inc. 
 * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
 * DARPA CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The names of the authors may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * This is the method for dealing with BSD disklabels.  It has been
 * extensively (by my standards at least) commented, in the vain hope that
 * it will serve as the source in future copy&paste operations.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/endian.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/bio.h>
#include <sys/malloc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/md5.h>
#include <sys/errno.h>
#include <sys/disklabel.h>
#include <geom/geom.h>
#include <geom/geom_slice.h>

#define BSD_CLASS_NAME "BSD"

/* Byte offset of the disklabel on alpha/SRM disks. */
#define ALPHA_LABEL_OFFSET	64

/* Size of an encoded disklabel with MAXPARTITIONS partitions. */
#define LABELSIZE (148 + 16 * MAXPARTITIONS)

static void g_bsd_hotwrite(void *arg, int flag);
/*
 * Our private data about one instance.  All the rest is handled by the
 * slice code and stored in its softc, so this is just the stuff
 * specific to BSD disklabels.
 */
struct g_bsd_softc {
	off_t	labeloffset;		/* Byte offset of label on disk. */
	off_t	mbroffset;		/* Byte offset of our MBR slice. */
	off_t	rawoffset;		/* Offset partitions are relative to. */
	struct disklabel ondisk;	/* Decoded native-format label. */
	u_char	label[LABELSIZE];	/* Raw little-endian label bytes. */
	u_char	labelsum[16];		/* MD5 of label, for recursion check. */
};

/*
 * Modify our slicer to match proposed disklabel, if possible.
 * This is where we make sure we don't do something stupid.
 */
static int
g_bsd_modify(struct g_geom *gp, u_char *label)
{
	int i, error;
	struct partition *ppp;
	struct g_slicer *gsp;
	struct g_consumer *cp;
	struct g_bsd_softc *ms;
	u_int secsize, u;
	off_t rawoffset, o;
	struct disklabel dl;
	MD5_CTX md5sum;

	g_topology_assert();
	gsp = gp->softc;
	ms = gsp->softc;

	/* Decode the candidate label; reject it if it is not parseable. */
	error = bsd_disklabel_le_dec(label, &dl, MAXPARTITIONS);
	if (error) {
		return (error);
	}

	/* Get dimensions of our device. */
	cp = LIST_FIRST(&gp->consumer);
	secsize = cp->provider->sectorsize;

	/* ... or a smaller sector size. */
	if (dl.d_secsize < secsize) {
		return (EINVAL);
	}

	/* ... or a non-multiple sector size. */
	if (dl.d_secsize % secsize != 0) {
		return (EINVAL);
	}

	/* Historical braindamage... */
	rawoffset = (off_t)dl.d_partitions[RAW_PART].p_offset * dl.d_secsize;

	for (i = 0; i < dl.d_npartitions; i++) {
		ppp = &dl.d_partitions[i];
		if (ppp->p_size == 0)
			continue;
		o = (off_t)ppp->p_offset * dl.d_secsize;

		/*
		 * If any partition starts before the raw partition,
		 * the offsets must be absolute already.
		 */
		if (o < rawoffset)
			rawoffset = 0;
	}

	if (rawoffset != 0 && (off_t)rawoffset != ms->mbroffset)
		printf("WARNING: Expected rawoffset %jd, found %jd\n",
		    (intmax_t)ms->mbroffset/dl.d_secsize,
		    (intmax_t)rawoffset/dl.d_secsize);

	/* Don't munge open partitions. */
	for (i = 0; i < dl.d_npartitions; i++) {
		ppp = &dl.d_partitions[i];

		o = (off_t)ppp->p_offset * dl.d_secsize;
		if (o == 0)
			o = rawoffset;
		/* Dry-run pass: CHECK fails if a change would hit an open slice. */
		error = g_slice_config(gp, i, G_SLICE_CONFIG_CHECK,
		    o - rawoffset,
		    (off_t)ppp->p_size * dl.d_secsize,
		    dl.d_secsize,
		    "%s%c", gp->name, 'a' + i);
		if (error)
			return (error);
	}

	/* Look good, go for it... */
	for (u = 0; u < gsp->nslice; u++) {
		ppp = &dl.d_partitions[u];
		o = (off_t)ppp->p_offset * dl.d_secsize;
		if (o == 0)
			o = rawoffset;
		g_slice_config(gp, u, G_SLICE_CONFIG_SET,
		    o - rawoffset,
		    (off_t)ppp->p_size * dl.d_secsize,
		    dl.d_secsize,
		    "%s%c", gp->name, 'a' + u);
	}

	/* Update our softc */
	ms->ondisk = dl;
	if (label != ms->label)
		bcopy(label, ms->label, LABELSIZE);
	ms->rawoffset = rawoffset;

	/*
	 * In order to avoid recursively attaching to the same
	 * on-disk label (it's usually visible through the 'c'
	 * partition) we calculate an MD5 and ask if other BSD's
	 * below us love that label.  If they do, we don't.
	 */
	MD5Init(&md5sum);
	MD5Update(&md5sum, ms->label, sizeof(ms->label));
	MD5Final(ms->labelsum, &md5sum);

	return (0);
}

/*
 * This is an internal helper function, called multiple times from the taste
 * function to try to locate a disklabel on the disk.  More civilized formats
 * will not need this, as there is only one possible place on disk to look
 * for the magic spot.
 */

static int
g_bsd_try(struct g_geom *gp, struct g_slicer *gsp, struct g_consumer *cp, int secsize, struct g_bsd_softc *ms, off_t offset)
{
	int error;
	u_char *buf;
	struct disklabel *dl;
	off_t secoff;

	/*
	 * We need to read entire aligned sectors, and we assume that the
	 * disklabel does not span sectors, so one sector is enough.
	 */
	error = 0;
	secoff = offset % secsize;
	buf = g_read_data(cp, offset - secoff, secsize, &error);
	if (buf == NULL || error != 0)
		return (ENOENT);

	/* Decode into our native format. */
	dl = &ms->ondisk;
	error = bsd_disklabel_le_dec(buf + secoff, dl, MAXPARTITIONS);
	if (!error)
		bcopy(buf + secoff, ms->label, LABELSIZE);

	/* Remember to free the buffer g_read_data() gave us. */
	g_free(buf);

	/* Remember where we found (or tried) the label. */
	ms->labeloffset = offset;
	return (error);
}

/*
 * This function writes the current label to disk, possibly updating
 * the alpha SRM checksum.
 */

static int
g_bsd_writelabel(struct g_geom *gp, u_char *bootcode)
{
	off_t secoff;
	u_int secsize;
	struct g_consumer *cp;
	struct g_slicer *gsp;
	struct g_bsd_softc *ms;
	u_char *buf;
	uint64_t sum;
	int error, i;

	gsp = gp->softc;
	ms = gsp->softc;
	cp = LIST_FIRST(&gp->consumer);
	/* Get sector size, we need it to read data.
*/ + secsize = cp->provider->sectorsize; + secoff = ms->labeloffset % secsize; + if (bootcode == NULL) { + buf = g_read_data(cp, ms->labeloffset - secoff, secsize, &error); + if (buf == NULL || error != 0) + return (error); + bcopy(ms->label, buf + secoff, sizeof(ms->label)); + } else { + buf = bootcode; + bcopy(ms->label, buf + ms->labeloffset, sizeof(ms->label)); + } + if (ms->labeloffset == ALPHA_LABEL_OFFSET) { + sum = 0; + for (i = 0; i < 63; i++) + sum += le64dec(buf + i * 8); + le64enc(buf + 504, sum); + } + if (bootcode == NULL) { + error = g_write_data(cp, ms->labeloffset - secoff, buf, secsize); + g_free(buf); + } else { + error = g_write_data(cp, 0, bootcode, BBSIZE); + } + return(error); +} + + +/* + * Implement certain ioctls to modify disklabels with. This function + * is called by the event handler thread with topology locked as result + * of the g_post_event() in g_bsd_start(). It is not necessary to keep + * topology locked all the time but make sure to return with topology + * locked as well. + */ + +static void +g_bsd_ioctl(void *arg, int flag) +{ + struct bio *bp; + struct g_geom *gp; + struct g_ioctl *gio; + u_char *label; + int error; + + g_topology_assert(); + bp = arg; + if (flag == EV_CANCEL) { + g_io_deliver(bp, ENXIO); + return; + } + + gp = bp->bio_to->geom; + gio = (struct g_ioctl *)bp->bio_data; + + label = g_malloc(LABELSIZE, M_WAITOK); + + /* The disklabel to set is the ioctl argument. */ + bsd_disklabel_le_enc(label, gio->data); + + /* Validate and modify our slice instance to match. */ + error = g_bsd_modify(gp, label); /* Picks up topology lock on success. */ + g_free(label); + if (error || gio->cmd == DIOCSDINFO) { + g_io_deliver(bp, error); + return; + } + + KASSERT(gio->cmd == DIOCWDINFO, ("Unknown ioctl in g_bsd_ioctl")); + g_io_deliver(bp, g_bsd_writelabel(gp, NULL)); +} + +/* + * Rewrite the bootblock, which is BBSIZE bytes from the start of the disk. + * We punch down the disklabel where we expect it to be before writing. 
 */
static int
g_bsd_diocbsdbb(dev_t dev, u_long cmd __unused, caddr_t data, int fflag __unused, struct thread *td __unused)
{
	struct g_geom *gp;
	struct g_slicer *gsp;
	struct g_bsd_softc *ms;
	struct g_consumer *cp;
	u_char *buf;
	void *p;
	int error, i;
	uint64_t sum;

	/*
	 * Get hold of the interesting bits from the bio.
	 * The dev_t argument actually carries our g_geom pointer (it was
	 * stored there by g_bsd_start() for the DIOCBSDBB case).
	 */
	gp = (void *)dev;
	gsp = gp->softc;
	ms = gsp->softc;

	/* The disklabel to set is the ioctl argument. */
	buf = g_malloc(BBSIZE, M_WAITOK);
	p = *(void **)data;
	error = copyin(p, buf, BBSIZE);
	if (!error) {
		DROP_GIANT();
		g_topology_lock();
		/* Validate and modify our slice instance to match. */
		error = g_bsd_modify(gp, buf + ms->labeloffset);
		if (!error) {
			cp = LIST_FIRST(&gp->consumer);
			if (ms->labeloffset == ALPHA_LABEL_OFFSET) {
				/*
				 * Alpha bootblocks carry a 64bit checksum:
				 * the sum of the first 63 quadwords is
				 * stored in quadword 63 (offset 504).
				 */
				sum = 0;
				for (i = 0; i < 63; i++)
					sum += le64dec(buf + i * 8);
				le64enc(buf + 504, sum);
			}
			error = g_write_data(cp, 0, buf, BBSIZE);
		}
		g_topology_unlock();
		PICKUP_GIANT();
	}
	g_free(buf);
	return (error);
}

/*
 * If the user tries to overwrite our disklabel through an open partition
 * or via a magicwrite config call, we end up here and try to prevent
 * footshooting as best we can.
 */
static void
g_bsd_hotwrite(void *arg, int flag)
{
	struct bio *bp;
	struct g_geom *gp;
	struct g_slicer *gsp;
	struct g_slice *gsl;
	struct g_bsd_softc *ms;
	u_char *p;
	int error;

	g_topology_assert();
	/*
	 * We should never get canceled, because that would amount to a removal
	 * of the geom while there was outstanding I/O requests.
	 */
	KASSERT(flag != EV_CANCEL, ("g_bsd_hotwrite cancelled"));
	bp = arg;
	gp = bp->bio_to->geom;
	gsp = gp->softc;
	ms = gsp->softc;
	gsl = &gsp->slices[bp->bio_to->index];
	/* Locate the disklabel inside the data being written. */
	p = (u_char*)bp->bio_data + ms->labeloffset
	    - (bp->bio_offset + gsl->offset);
	error = g_bsd_modify(gp, p);
	if (error) {
		/* Refuse writes which would install an invalid label. */
		g_io_deliver(bp, EPERM);
		return;
	}
	g_slice_finish_hot(bp);
}

/*-
 * This start routine is only called for non-trivial requests, all the
 * trivial ones are handled autonomously by the slice code.
 * For requests we handle here, we must call the g_io_deliver() on the
 * bio, and return non-zero to indicate to the slice code that we did so.
 * This code executes in the "DOWN" I/O path, this means:
 *    * No sleeping.
 *    * Don't grab the topology lock.
 *    * Don't call biowait, g_getattr(), g_setattr() or g_read_data()
 */

static int
g_bsd_start(struct bio *bp)
{
	struct g_geom *gp;
	struct g_bsd_softc *ms;
	struct g_slicer *gsp;
	struct g_ioctl *gio;
	int error;

	gp = bp->bio_to->geom;
	gsp = gp->softc;
	ms = gsp->softc;
	switch(bp->bio_cmd) {
	case BIO_GETATTR:
		if (g_handleattr(bp, "BSD::labelsum", ms->labelsum,
		    sizeof(ms->labelsum)))
			return (1);
		break;
	default:
		KASSERT(0 == 1, ("Unknown bio_cmd in g_bsd_start (%d)",
		    bp->bio_cmd));
	}

	/* We only handle ioctl(2) requests of the right format. */
	if (strcmp(bp->bio_attribute, "GEOM::ioctl"))
		return (0);
	else if (bp->bio_length != sizeof(*gio))
		return (0);

	/* Get hold of the ioctl parameters. */
	gio = (struct g_ioctl *)bp->bio_data;

	switch (gio->cmd) {
	case DIOCGDINFO:
		/* Return a copy of the disklabel to userland. */
		bsd_disklabel_le_dec(ms->label, gio->data, MAXPARTITIONS);
		g_io_deliver(bp, 0);
		return (1);
	case DIOCBSDBB:
		/*
		 * Hand userland a direct-ioctl trampoline; the geom pointer
		 * rides in the dev field (see g_bsd_diocbsdbb()).
		 */
		gio->func = g_bsd_diocbsdbb;
		gio->dev = (void *)gp;
		g_io_deliver(bp, EDIRIOCTL);
		return (1);
	case DIOCSDINFO:
	case DIOCWDINFO:
		/*
		 * These we cannot do without the topology lock and some
		 * I/O requests.  Ask the event-handler to schedule
		 * us in a less restricted environment.
		 */
		error = g_post_event(g_bsd_ioctl, bp, M_NOWAIT, gp, NULL);
		if (error)
			g_io_deliver(bp, error);
		/*
		 * We must return non-zero to indicate that we will deal
		 * with this bio, even though we have not done so yet.
		 */
		return (1);
	default:
		return (0);
	}
}

/*
 * Dump configuration information in XML format.
 * Notice that the function is called once for the geom and once for each
 * consumer and provider.  We let g_slice_dumpconf() do most of the work.
 */
static void
g_bsd_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp)
{
	struct g_bsd_softc *ms;
	struct g_slicer *gsp;

	gsp = gp->softc;
	ms = gsp->softc;
	g_slice_dumpconf(sb, indent, gp, cp, pp);
	if (indent != NULL && pp == NULL && cp == NULL) {
		/* Geom-level call: report our various offsets. */
		sbuf_printf(sb, "%s<labeloffset>%jd</labeloffset>\n",
		    indent, (intmax_t)ms->labeloffset);
		sbuf_printf(sb, "%s<rawoffset>%jd</rawoffset>\n",
		    indent, (intmax_t)ms->rawoffset);
		sbuf_printf(sb, "%s<mbroffset>%jd</mbroffset>\n",
		    indent, (intmax_t)ms->mbroffset);
	} else if (pp != NULL) {
		/* Provider-level call: report the partition fstype. */
		if (indent == NULL)
			sbuf_printf(sb, " ty %d",
			    ms->ondisk.d_partitions[pp->index].p_fstype);
		else
			sbuf_printf(sb, "%s<type>%d</type>\n", indent,
			    ms->ondisk.d_partitions[pp->index].p_fstype);
	}
}

/*
 * The taste function is called from the event-handler, with the topology
 * lock already held and a provider to examine.  The flags are unused.
 *
 * If flags == G_TF_NORMAL, the idea is to take a bite of the provider and,
 * if we find valid, consistent magic on it, build a geom on it; that is,
 * any magic bits which indicate that we should automatically put a BSD
 * geom on it.
 *
 * There may be cases where the operator would like to put a BSD-geom on
 * providers which do not meet all of the requirements.  This can be done
 * by instead passing the G_TF_INSIST flag, which will override these
 * checks.
 *
 * The final flags value is G_TF_TRANSPARENT, which instructs the method
 * to put a geom on top of the provider and configure it to be as transparent
 * as possible.  This is not really relevant to the BSD method and therefore
 * not implemented here.
 */

static struct g_geom *
g_bsd_taste(struct g_class *mp, struct g_provider *pp, int flags)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	int error, i;
	struct g_bsd_softc *ms;
	u_int secsize;
	struct g_slicer *gsp;
	u_char hash[16];
	MD5_CTX md5sum;

	g_trace(G_T_TOPOLOGY, "bsd_taste(%s,%s)", mp->name, pp->name);
	g_topology_assert();

	/* We don't implement transparent inserts. */
	if (flags == G_TF_TRANSPARENT)
		return (NULL);

	/*
	 * BSD labels are a subclass of the general "slicing" topology so
	 * a lot of the work can be done by the common "slice" code.
	 * Create a geom with space for MAXPARTITIONS providers, one consumer
	 * and a softc structure for us.  Specify the provider to attach
	 * the consumer to and our "start" routine for special requests.
	 * The provider is opened with mode (1,0,0) so we can do reads
	 * from it.
	 */
	gp = g_slice_new(mp, MAXPARTITIONS, pp, &cp, &ms,
	    sizeof(*ms), g_bsd_start);
	if (gp == NULL)
		return (NULL);

	/*
	 * Fill in the optional details, in our case we have a dumpconf
	 * routine which the "slice" code should call at the right time
	 */
	gp->dumpconf = g_bsd_dumpconf;

	/* Get the geom_slicer softc from the geom. */
	gsp = gp->softc;

	/*
	 * The do...while loop here allows us to have multiple escapes
	 * using a simple "break".  This improves code clarity without
	 * ending up in deep nesting and without using goto or come from.
	 */
	do {
		/*
		 * If the provider is an MBR we will only auto attach
		 * to type 165 slices in the G_TF_NORMAL case.  We will
		 * attach to any other type.
		 */
		error = g_getattr("MBR::type", cp, &i);
		if (!error) {
			if (i != 165 && flags == G_TF_NORMAL)
				break;
			error = g_getattr("MBR::offset", cp, &ms->mbroffset);
			if (error)
				break;
		}

		/* Same thing if we are inside a PC98 */
		error = g_getattr("PC98::type", cp, &i);
		if (!error) {
			if (i != 0xc494 && flags == G_TF_NORMAL)
				break;
			error = g_getattr("PC98::offset", cp, &ms->mbroffset);
			if (error)
				break;
		}

		/* Get sector size, we need it to read data. */
		secsize = cp->provider->sectorsize;
		if (secsize < 512)
			break;

		/* First look for a label at the start of the second sector. */
		error = g_bsd_try(gp, gsp, cp, secsize, ms, secsize);

		/* Next, look for alpha labels */
		if (error)
			error = g_bsd_try(gp, gsp, cp, secsize, ms,
			    ALPHA_LABEL_OFFSET);

		/* If we didn't find a label, punt. */
		if (error)
			break;

		/*
		 * In order to avoid recursively attaching to the same
		 * on-disk label (it's usually visible through the 'c'
		 * partition) we calculate an MD5 and ask if other BSD's
		 * below us love that label.  If they do, we don't.
		 */
		MD5Init(&md5sum);
		MD5Update(&md5sum, ms->label, sizeof(ms->label));
		MD5Final(ms->labelsum, &md5sum);

		error = g_getattr("BSD::labelsum", cp, &hash);
		if (!error && !bcmp(ms->labelsum, hash, sizeof(hash)))
			break;

		/*
		 * Process the found disklabel, and modify our "slice"
		 * instance to match it, if possible.
		 */
		error = g_bsd_modify(gp, ms->label);
	} while (0);

	/* Success or failure, we can close our provider now. */
	error = g_access_rel(cp, -1, 0, 0);

	/* If we have configured any providers, return the new geom. */
	if (gsp->nprovider > 0) {
		/* Arm the hot-spot protecting the on-disk label area. */
		g_slice_conf_hot(gp, 0, ms->labeloffset, LABELSIZE,
		    G_SLICE_HOT_ALLOW, G_SLICE_HOT_DENY, G_SLICE_HOT_CALL);
		gsp->hot = g_bsd_hotwrite;
		return (gp);
	}
	/*
	 * ...else push the "self-destruct" button, by spoiling our own
	 * consumer.  This triggers a call to g_slice_spoiled which will
	 * dismantle what was setup.
	 */
	g_slice_spoiled(cp);
	return (NULL);
}

/* Argument bundle handed to g_bsd_callconfig(). */
struct h0h0 {
	struct g_geom *gp;
	struct g_bsd_softc *ms;
	u_char *label;
	int error;
};

static void
g_bsd_callconfig(void *arg, int flag)
{
	struct h0h0 *hp;

	hp = arg;
	/* Validate/apply the new label, then write it back to the disk. */
	hp->error = g_bsd_modify(hp->gp, hp->label);
	if (!hp->error)
		hp->error = g_bsd_writelabel(hp->gp, NULL);
}

/*
 * NB! curthread is user process which GCTL'ed.
 */
static void
g_bsd_config(struct gctl_req *req, struct g_class *mp, char const *verb)
{
	u_char *label;
	int error;
	struct h0h0 h0h0;
	struct g_geom *gp;
	struct g_slicer *gsp;
	struct g_consumer *cp;
	struct g_bsd_softc *ms;

	g_topology_assert();
	gp = gctl_get_geom(req, mp, "geom");
	if (gp == NULL)
		return;
	cp = LIST_FIRST(&gp->consumer);
	gsp = gp->softc;
	ms = gsp->softc;
	if (!strcmp(verb, "read mbroffset")) {
		gctl_set_param(req, "mbroffset",
		    &ms->mbroffset, sizeof(ms->mbroffset));
		return;
	} else if (!strcmp(verb, "write label")) {
		label = gctl_get_paraml(req, "label", LABELSIZE);
		if (label == NULL)
			return;
		h0h0.gp = gp;
		h0h0.ms = gsp->softc;
		h0h0.label = label;
		h0h0.error = -1;
		/* XXX: Does this reference register with our selfdestruct code ?
		 */
		error = g_access_rel(cp, 1, 1, 1);
		if (error) {
			gctl_error(req, "could not access consumer");
			return;
		}
		/* Apply and write the label while we hold the access. */
		g_bsd_callconfig(&h0h0, 0);
		error = h0h0.error;
		g_access_rel(cp, -1, -1, -1);
	} else if (!strcmp(verb, "write bootcode")) {
		label = gctl_get_paraml(req, "bootcode", BBSIZE);
		if (label == NULL)
			return;
		/* XXX: Does this reference register with our selfdestruct code ? */
		error = g_access_rel(cp, 1, 1, 1);
		if (error) {
			gctl_error(req, "could not access consumer");
			return;
		}
		error = g_bsd_writelabel(gp, label);
		g_access_rel(cp, -1, -1, -1);
	} else {
		gctl_error(req, "Unknown verb parameter");
	}

	return;
}

/* Finally, register with GEOM infrastructure. */
static struct g_class g_bsd_class = {
	.name = BSD_CLASS_NAME,
	.taste = g_bsd_taste,
	.ctlreq = g_bsd_config,
};

DECLARE_GEOM_CLASS(g_bsd_class, g_bsd);
diff --git a/sys/geom/geom_bsd_enc.c b/sys/geom/geom_bsd_enc.c
new file mode 100644
index 0000000..dfdeb85
--- /dev/null
+++ b/sys/geom/geom_bsd_enc.c
@@ -0,0 +1,194 @@
/*-
 * Copyright (c) 2002 Poul-Henning Kamp
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Poul-Henning Kamp
 * and NAI Labs, the Security Research Division of Network Associates, Inc.
 * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
 * DARPA CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3.
The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Functions to encode and decode struct disklabel and struct partition into + * a bytestream of little endianess and correct packing. + * + * NB! This file must be usable both in kernel and userland. 
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/endian.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#ifdef _KERNEL
#include <sys/systm.h>
#else
#include <string.h>
#endif

/* Decode one little-endian on-disk partition entry into host format. */
void
bsd_partition_le_dec(u_char *ptr, struct partition *d)
{
	d->p_size = le32dec(ptr + 0);
	d->p_offset = le32dec(ptr + 4);
	d->p_fsize = le32dec(ptr + 8);
	d->p_fstype = ptr[12];
	d->p_frag = ptr[13];
	d->p_cpg = le16dec(ptr + 14);
}

/*
 * Decode a little-endian on-disk disklabel into host format.
 * Returns EINVAL if either magic number is wrong, if the label claims
 * more than maxpart partitions, or if the 16bit XOR checksum over the
 * fixed fields plus the used partition slots is non-zero.
 */
int
bsd_disklabel_le_dec(u_char *ptr, struct disklabel *d, int maxpart)
{
	int i;
	u_char *p, *pe;
	uint16_t sum;

	d->d_magic = le32dec(ptr + 0);
	if (d->d_magic != DISKMAGIC)
		return(EINVAL);

	d->d_magic2 = le32dec(ptr + 132);
	if (d->d_magic2 != DISKMAGIC) {
		return(EINVAL);
	}

	d->d_npartitions = le16dec(ptr + 138);
	if (d->d_npartitions > maxpart) {
		return(EINVAL);
	}

	/* XOR of all 16bit words up to the end of the used partitions. */
	pe = ptr + 148 + 16 * d->d_npartitions;
	sum = 0;
	for (p = ptr; p < pe; p += 2)
		sum ^= le16dec(p);
	if (sum != 0) {
		return(EINVAL);
	}

	d->d_type = le16dec(ptr + 4);
	d->d_subtype = le16dec(ptr + 6);
	bcopy(ptr + 8, d->d_typename, 16);
	bcopy(ptr + 24, d->d_packname, 16);
	d->d_secsize = le32dec(ptr + 40);
	d->d_nsectors = le32dec(ptr + 44);
	d->d_ntracks = le32dec(ptr + 48);
	d->d_ncylinders = le32dec(ptr + 52);
	d->d_secpercyl = le32dec(ptr + 56);
	d->d_secperunit = le32dec(ptr + 60);
	d->d_sparespertrack = le16dec(ptr + 64);
	d->d_sparespercyl = le16dec(ptr + 66);
	d->d_acylinders = le32dec(ptr + 68);
	d->d_rpm = le16dec(ptr + 72);
	d->d_interleave = le16dec(ptr + 74);
	d->d_trackskew = le16dec(ptr + 76);
	d->d_cylskew = le16dec(ptr + 78);
	d->d_headswitch = le32dec(ptr + 80);
	d->d_trkseek = le32dec(ptr + 84);
	d->d_flags = le32dec(ptr + 88);
	d->d_drivedata[0] = le32dec(ptr + 92);
	d->d_drivedata[1] = le32dec(ptr + 96);
	d->d_drivedata[2] = le32dec(ptr + 100);
	d->d_drivedata[3] = le32dec(ptr + 104);
	d->d_drivedata[4] = le32dec(ptr + 108);
	d->d_spare[0] = le32dec(ptr + 112);
	d->d_spare[1] = le32dec(ptr + 116);
	d->d_spare[2] = le32dec(ptr + 120);
	d->d_spare[3] = le32dec(ptr + 124);
	d->d_spare[4] = le32dec(ptr + 128);
	d->d_checksum = le16dec(ptr + 136);
	d->d_npartitions = le16dec(ptr + 138);
	d->d_bbsize = le32dec(ptr + 140);
	d->d_sbsize = le32dec(ptr + 144);
	for (i = 0; i < MAXPARTITIONS; i++)
		bsd_partition_le_dec(ptr + 148 + 16 * i, &d->d_partitions[i]);
	return(0);
}

/* Encode one host-format partition entry into little-endian on-disk form. */
void
bsd_partition_le_enc(u_char *ptr, struct partition *d)
{
	le32enc(ptr + 0, d->p_size);
	le32enc(ptr + 4, d->p_offset);
	le32enc(ptr + 8, d->p_fsize);
	ptr[12] = d->p_fstype;
	ptr[13] = d->p_frag;
	le16enc(ptr + 14, d->p_cpg);
}

/*
 * Encode a host-format disklabel into little-endian on-disk form.
 * The checksum field (offset 136) is written as zero first, then the
 * XOR checksum over the encoded bytes is computed and stored there.
 */
void
bsd_disklabel_le_enc(u_char *ptr, struct disklabel *d)
{
	int i;
	u_char *p, *pe;
	uint16_t sum;

	le32enc(ptr + 0, d->d_magic);
	le16enc(ptr + 4, d->d_type);
	le16enc(ptr + 6, d->d_subtype);
	bcopy(d->d_typename, ptr + 8, 16);
	bcopy(d->d_packname, ptr + 24, 16);
	le32enc(ptr + 40, d->d_secsize);
	le32enc(ptr + 44, d->d_nsectors);
	le32enc(ptr + 48, d->d_ntracks);
	le32enc(ptr + 52, d->d_ncylinders);
	le32enc(ptr + 56, d->d_secpercyl);
	le32enc(ptr + 60, d->d_secperunit);
	le16enc(ptr + 64, d->d_sparespertrack);
	le16enc(ptr + 66, d->d_sparespercyl);
	le32enc(ptr + 68, d->d_acylinders);
	le16enc(ptr + 72, d->d_rpm);
	le16enc(ptr + 74, d->d_interleave);
	le16enc(ptr + 76, d->d_trackskew);
	le16enc(ptr + 78, d->d_cylskew);
	le32enc(ptr + 80, d->d_headswitch);
	le32enc(ptr + 84, d->d_trkseek);
	le32enc(ptr + 88, d->d_flags);
	le32enc(ptr + 92, d->d_drivedata[0]);
	le32enc(ptr + 96, d->d_drivedata[1]);
	le32enc(ptr + 100, d->d_drivedata[2]);
	le32enc(ptr + 104, d->d_drivedata[3]);
	le32enc(ptr + 108, d->d_drivedata[4]);
	le32enc(ptr + 112, d->d_spare[0]);
	le32enc(ptr + 116, d->d_spare[1]);
	le32enc(ptr + 120, d->d_spare[2]);
	le32enc(ptr + 124, d->d_spare[3]);
	le32enc(ptr + 128, d->d_spare[4]);
	le32enc(ptr + 132, d->d_magic2);
	le16enc(ptr + 136, 0);
	le16enc(ptr + 138, d->d_npartitions);
	le32enc(ptr + 140, d->d_bbsize);
	le32enc(ptr + 144, d->d_sbsize);
	for (i = 0; i < d->d_npartitions; i++)
		bsd_partition_le_enc(ptr + 148 + 16 * i, &d->d_partitions[i]);
	pe = ptr + 148 + 16 * d->d_npartitions;
	sum = 0;
	for (p = ptr; p < pe; p += 2)
		sum ^= le16dec(p);
	le16enc(ptr + 136, sum);
}
diff --git a/sys/geom/geom_ccd.c b/sys/geom/geom_ccd.c
new file mode 100644
index 0000000..51f70c3
--- /dev/null
+++ b/sys/geom/geom_ccd.c
@@ -0,0 +1,855 @@
/*
 * Copyright (c) 2003 Poul-Henning Kamp.
 * Copyright (c) 1995 Jason R. Thorpe.
 * Copyright (c) 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 * All rights reserved.
 * Copyright (c) 1988 University of Utah.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed for the NetBSD Project
 *	by Jason R. Thorpe.
 * 4. The names of the authors may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Dynamic configuration and disklabel support by:
 *	Jason R. Thorpe <thorpej@nas.nasa.gov>
 *	Numerical Aerodynamic Simulation Facility
 *	Mail Stop 258-6
 *	NASA Ames Research Center
 *	Moffett Field, CA 94035
 *
 * from: Utah $Hdr: cd.c 1.6 90/11/28$
 *	@(#)cd.c	8.2 (Berkeley) 11/16/93
 *	$NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/bio.h>
#include <sys/malloc.h>
#include <geom/geom.h>

/*
 * Number of blocks to leave untouched in front of a component partition.
 * This is to avoid violating its disklabel area when it starts at the
 * beginning of the slice.
 */
#if !defined(CCD_OFFSET)
#define CCD_OFFSET 16
#endif

/* sc_flags */
#define CCDF_UNIFORM	0x02	/* use LCCD of sizes for uniform interleave */
#define CCDF_MIRROR	0x04	/* use mirroring */

/* Mask of user-settable ccd flags. */
#define CCDF_USERMASK	(CCDF_UNIFORM|CCDF_MIRROR)

/*
 * Interleave description table.
 * Computed at boot time to speed irregular-interleave lookups.
 * The idea is that we interleave in "groups".  First we interleave
 * evenly over all component disks up to the size of the smallest
 * component (the first group), then we interleave evenly over all
 * remaining disks up to the size of the next-smallest (second group),
 * and so on.
 *
 * Each table entry describes the interleave characteristics of one
 * of these groups.  For example if a concatenated disk consisted of
 * three components of 5, 3, and 7 DEV_BSIZE blocks interleaved at
 * DEV_BSIZE (1), the table would have three entries:
 *
 *	ndisk	startblk	startoff	dev
 *	3	0		0		0, 1, 2
 *	2	9		3		0, 2
 *	1	13		5		2
 *	0	-		-		-
 *
 * which says that the first nine blocks (0-8) are interleaved over
 * 3 disks (0, 1, 2) starting at block offset 0 on any component disk,
 * the next 4 blocks (9-12) are interleaved over 2 disks (0, 2) starting
 * at component block 3, and the remaining blocks (13-14) are on disk
 * 2 starting at offset 5.
 */
struct ccdiinfo {
	int	ii_ndisk;	/* # of disks range is interleaved over */
	daddr_t	ii_startblk;	/* starting scaled block # for range */
	daddr_t	ii_startoff;	/* starting component offset (block #) */
	int	*ii_index;	/* ordered list of components in range */
};

/*
 * Component info table.
 * Describes a single component of a concatenated disk.
 */
struct ccdcinfo {
	size_t		ci_size;	/* size */
	struct g_provider *ci_provider;	/* provider */
	struct g_consumer *ci_consumer;	/* consumer */
};

/*
 * A concatenated disk is described by this structure.
+ */ + +struct ccd_s { + LIST_ENTRY(ccd_s) list; + + int sc_unit; /* logical unit number */ + int sc_flags; /* flags */ + size_t sc_size; /* size of ccd */ + int sc_ileave; /* interleave */ + u_int sc_ndisks; /* number of components */ + struct ccdcinfo *sc_cinfo; /* component info */ + struct ccdiinfo *sc_itable; /* interleave table */ + u_int32_t sc_secsize; /* # bytes per sector */ + int sc_pick; /* side of mirror picked */ + daddr_t sc_blk[2]; /* mirror localization */ +}; + +static g_start_t g_ccd_start; +static void ccdiodone(struct bio *bp); +static void ccdinterleave(struct ccd_s *); +static int ccdinit(struct gctl_req *req, struct ccd_s *); +static int ccdbuffer(struct bio **ret, struct ccd_s *, + struct bio *, daddr_t, caddr_t, long); + +static void +g_ccd_orphan(struct g_consumer *cp) +{ + /* + * XXX: We don't do anything here. It is not obvious + * XXX: what DTRT would be, so we do what the previous + * XXX: code did: ignore it and let the user cope. + */ +} + +static int +g_ccd_access(struct g_provider *pp, int dr, int dw, int de) +{ + struct g_geom *gp; + struct g_consumer *cp1, *cp2; + int error; + + de += dr; + de += dw; + + gp = pp->geom; + error = ENXIO; + LIST_FOREACH(cp1, &gp->consumer, consumer) { + error = g_access_rel(cp1, dr, dw, de); + if (error) { + LIST_FOREACH(cp2, &gp->consumer, consumer) { + if (cp1 == cp2) + break; + g_access_rel(cp1, -dr, -dw, -de); + } + break; + } + } + return (error); +} + +/* + * Free the softc and its substructures. 
+ */ +static void +g_ccd_freesc(struct ccd_s *sc) +{ + struct ccdiinfo *ii; + + g_free(sc->sc_cinfo); + if (sc->sc_itable != NULL) { + for (ii = sc->sc_itable; ii->ii_ndisk > 0; ii++) + if (ii->ii_index != NULL) + g_free(ii->ii_index); + g_free(sc->sc_itable); + } + g_free(sc); +} + + +static int +ccdinit(struct gctl_req *req, struct ccd_s *cs) +{ + struct ccdcinfo *ci; + size_t size; + int ix; + size_t minsize; + int maxsecsize; + off_t mediasize; + u_int sectorsize; + + cs->sc_size = 0; + + maxsecsize = 0; + minsize = 0; + for (ix = 0; ix < cs->sc_ndisks; ix++) { + ci = &cs->sc_cinfo[ix]; + + mediasize = ci->ci_provider->mediasize; + sectorsize = ci->ci_provider->sectorsize; + if (sectorsize > maxsecsize) + maxsecsize = sectorsize; + size = mediasize / DEV_BSIZE - CCD_OFFSET; + + /* Truncate to interleave boundary */ + + if (cs->sc_ileave > 1) + size -= size % cs->sc_ileave; + + if (size == 0) { + gctl_error(req, "Component %s has effective size zero", + ci->ci_provider->name); + return(ENODEV); + } + + if (minsize == 0 || size < minsize) + minsize = size; + ci->ci_size = size; + cs->sc_size += size; + } + + /* + * Don't allow the interleave to be smaller than + * the biggest component sector. + */ + if ((cs->sc_ileave > 0) && + (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) { + gctl_error(req, "Interleave to small for sector size"); + return(EINVAL); + } + + /* + * If uniform interleave is desired set all sizes to that of + * the smallest component. This will guarentee that a single + * interleave table is generated. + * + * Lost space must be taken into account when calculating the + * overall size. Half the space is lost when CCDF_MIRROR is + * specified. + */ + if (cs->sc_flags & CCDF_UNIFORM) { + for (ix = 0; ix < cs->sc_ndisks; ix++) { + ci = &cs->sc_cinfo[ix]; + ci->ci_size = minsize; + } + cs->sc_size = cs->sc_ndisks * minsize; + } + + if (cs->sc_flags & CCDF_MIRROR) { + /* + * Check to see if an even number of components + * have been specified. 
The interleave must also + * be non-zero in order for us to be able to + * guarentee the topology. + */ + if (cs->sc_ndisks % 2) { + gctl_error(req, + "Mirroring requires an even number of disks"); + return(EINVAL); + } + if (cs->sc_ileave == 0) { + gctl_error(req, + "An interleave must be specified when mirroring"); + return(EINVAL); + } + cs->sc_size = (cs->sc_ndisks/2) * minsize; + } + + /* + * Construct the interleave table. + */ + ccdinterleave(cs); + + /* + * Create pseudo-geometry based on 1MB cylinders. It's + * pretty close. + */ + cs->sc_secsize = maxsecsize; + + return (0); +} + +static void +ccdinterleave(struct ccd_s *cs) +{ + struct ccdcinfo *ci, *smallci; + struct ccdiinfo *ii; + daddr_t bn, lbn; + int ix; + u_long size; + + + /* + * Allocate an interleave table. The worst case occurs when each + * of N disks is of a different size, resulting in N interleave + * tables. + * + * Chances are this is too big, but we don't care. + */ + size = (cs->sc_ndisks + 1) * sizeof(struct ccdiinfo); + cs->sc_itable = g_malloc(size, M_WAITOK | M_ZERO); + + /* + * Trivial case: no interleave (actually interleave of disk size). + * Each table entry represents a single component in its entirety. + * + * An interleave of 0 may not be used with a mirror setup. + */ + if (cs->sc_ileave == 0) { + bn = 0; + ii = cs->sc_itable; + + for (ix = 0; ix < cs->sc_ndisks; ix++) { + /* Allocate space for ii_index. */ + ii->ii_index = g_malloc(sizeof(int), M_WAITOK); + ii->ii_ndisk = 1; + ii->ii_startblk = bn; + ii->ii_startoff = 0; + ii->ii_index[0] = ix; + bn += cs->sc_cinfo[ix].ci_size; + ii++; + } + ii->ii_ndisk = 0; + return; + } + + /* + * The following isn't fast or pretty; it doesn't have to be. + */ + size = 0; + bn = lbn = 0; + for (ii = cs->sc_itable; ; ii++) { + /* + * Allocate space for ii_index. We might allocate more then + * we use. 
+ */ + ii->ii_index = g_malloc((sizeof(int) * cs->sc_ndisks), + M_WAITOK); + + /* + * Locate the smallest of the remaining components + */ + smallci = NULL; + for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_ndisks]; + ci++) { + if (ci->ci_size > size && + (smallci == NULL || + ci->ci_size < smallci->ci_size)) { + smallci = ci; + } + } + + /* + * Nobody left, all done + */ + if (smallci == NULL) { + ii->ii_ndisk = 0; + g_free(ii->ii_index); + ii->ii_index = NULL; + break; + } + + /* + * Record starting logical block using an sc_ileave blocksize. + */ + ii->ii_startblk = bn / cs->sc_ileave; + + /* + * Record starting component block using an sc_ileave + * blocksize. This value is relative to the beginning of + * a component disk. + */ + ii->ii_startoff = lbn; + + /* + * Determine how many disks take part in this interleave + * and record their indices. + */ + ix = 0; + for (ci = cs->sc_cinfo; + ci < &cs->sc_cinfo[cs->sc_ndisks]; ci++) { + if (ci->ci_size >= smallci->ci_size) { + ii->ii_index[ix++] = ci - cs->sc_cinfo; + } + } + ii->ii_ndisk = ix; + bn += ix * (smallci->ci_size - size); + lbn = smallci->ci_size / cs->sc_ileave; + size = smallci->ci_size; + } +} + +static void +g_ccd_start(struct bio *bp) +{ + long bcount, rcount; + struct bio *cbp[2]; + caddr_t addr; + daddr_t bn; + int err; + struct ccd_s *cs; + + cs = bp->bio_to->geom->softc; + + /* + * Translate the partition-relative block number to an absolute. + */ + bn = bp->bio_offset / cs->sc_secsize; + + /* + * Allocate component buffers and fire off the requests + */ + addr = bp->bio_data; + for (bcount = bp->bio_length; bcount > 0; bcount -= rcount) { + err = ccdbuffer(cbp, cs, bp, bn, addr, bcount); + if (err) { + bp->bio_completed += bcount; + if (bp->bio_error != 0) + bp->bio_error = err; + if (bp->bio_completed == bp->bio_length) + g_io_deliver(bp, bp->bio_error); + return; + } + rcount = cbp[0]->bio_length; + + if (cs->sc_flags & CCDF_MIRROR) { + /* + * Mirroring. 
Writes go to both disks, reads are + * taken from whichever disk seems most appropriate. + * + * We attempt to localize reads to the disk whos arm + * is nearest the read request. We ignore seeks due + * to writes when making this determination and we + * also try to avoid hogging. + */ + if (cbp[0]->bio_cmd != BIO_READ) { + g_io_request(cbp[0], cbp[0]->bio_from); + g_io_request(cbp[1], cbp[1]->bio_from); + } else { + int pick = cs->sc_pick; + daddr_t range = cs->sc_size / 16; + + if (bn < cs->sc_blk[pick] - range || + bn > cs->sc_blk[pick] + range + ) { + cs->sc_pick = pick = 1 - pick; + } + cs->sc_blk[pick] = bn + btodb(rcount); + g_io_request(cbp[pick], cbp[pick]->bio_from); + } + } else { + /* + * Not mirroring + */ + g_io_request(cbp[0], cbp[0]->bio_from); + } + bn += btodb(rcount); + addr += rcount; + } +} + +/* + * Build a component buffer header. + */ +static int +ccdbuffer(struct bio **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount) +{ + struct ccdcinfo *ci, *ci2 = NULL; + struct bio *cbp; + daddr_t cbn, cboff; + off_t cbc; + + /* + * Determine which component bn falls in. + */ + cbn = bn; + cboff = 0; + + if (cs->sc_ileave == 0) { + /* + * Serially concatenated and neither a mirror nor a parity + * config. This is a special case. + */ + daddr_t sblk; + + sblk = 0; + for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++) + sblk += ci->ci_size; + cbn -= sblk; + } else { + struct ccdiinfo *ii; + int ccdisk, off; + + /* + * Calculate cbn, the logical superblock (sc_ileave chunks), + * and cboff, a normal block offset (DEV_BSIZE chunks) relative + * to cbn. + */ + cboff = cbn % cs->sc_ileave; /* DEV_BSIZE gran */ + cbn = cbn / cs->sc_ileave; /* DEV_BSIZE * ileave gran */ + + /* + * Figure out which interleave table to use. + */ + for (ii = cs->sc_itable; ii->ii_ndisk; ii++) { + if (ii->ii_startblk > cbn) + break; + } + ii--; + + /* + * off is the logical superblock relative to the beginning + * of this interleave block. 
		 */
		off = cbn - ii->ii_startblk;

		/*
		 * We must calculate which disk component to use (ccdisk),
		 * and recalculate cbn to be the superblock relative to
		 * the beginning of the component.  This is typically done by
		 * adding 'off' and ii->ii_startoff together.  However, 'off'
		 * must typically be divided by the number of components in
		 * this interleave array to be properly convert it from a
		 * CCD-relative logical superblock number to a
		 * component-relative superblock number.
		 */
		if (ii->ii_ndisk == 1) {
			/*
			 * When we have just one disk, it can't be a mirror
			 * or a parity config.
			 */
			ccdisk = ii->ii_index[0];
			cbn = ii->ii_startoff + off;
		} else {
			if (cs->sc_flags & CCDF_MIRROR) {
				/*
				 * We have forced a uniform mapping, resulting
				 * in a single interleave array.  We double
				 * up on the first half of the available
				 * components and our mirror is in the second
				 * half.  This only works with a single
				 * interleave array because doubling up
				 * doubles the number of sectors, so there
				 * cannot be another interleave array because
				 * the next interleave array's calculations
				 * would be off.
				 */
				int ndisk2 = ii->ii_ndisk / 2;
				ccdisk = ii->ii_index[off % ndisk2];
				cbn = ii->ii_startoff + off / ndisk2;
				ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
			} else {
				ccdisk = ii->ii_index[off % ii->ii_ndisk];
				cbn = ii->ii_startoff + off / ii->ii_ndisk;
			}
		}

		ci = &cs->sc_cinfo[ccdisk];

		/*
		 * Convert cbn from a superblock to a normal block so it
		 * can be used to calculate (along with cboff) the normal
		 * block index into this particular disk.
		 */
		cbn *= cs->sc_ileave;
	}

	/*
	 * Fill in the component buf structure.
	 */
	cbp = g_clone_bio(bp);
	if (cbp == NULL)
		return (ENOMEM);
	cbp->bio_done = g_std_done;
	cbp->bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
	cbp->bio_data = addr;
	if (cs->sc_ileave == 0)
		cbc = dbtob((off_t)(ci->ci_size - cbn));
	else
		cbc = dbtob((off_t)(cs->sc_ileave - cboff));
	cbp->bio_length = (cbc < bcount) ? cbc : bcount;

	cbp->bio_from = ci->ci_consumer;
	cb[0] = cbp;

	if (cs->sc_flags & CCDF_MIRROR) {
		/*
		 * Clone a second bio for the mirror partner; the siblings
		 * point at each other through bio_caller1.
		 */
		cbp = g_clone_bio(bp);
		if (cbp == NULL)
			return (ENOMEM);
		cbp->bio_done = cb[0]->bio_done = ccdiodone;
		cbp->bio_offset = cb[0]->bio_offset;
		cbp->bio_data = cb[0]->bio_data;
		cbp->bio_length = cb[0]->bio_length;
		cbp->bio_from = ci2->ci_consumer;
		cbp->bio_caller1 = cb[0];
		cb[0]->bio_caller1 = cbp;
		cb[1] = cbp;
	}
	return (0);
}

/*
 * Called only for mirrored operations.
 * A NULL bio_caller1 means the mirror partner has already been consumed
 * (destroyed, reissued, or finished).
 */
static void
ccdiodone(struct bio *cbp)
{
	struct bio *mbp, *pbp;

	mbp = cbp->bio_caller1;	/* mirror partner, if still outstanding */
	pbp = cbp->bio_parent;	/* the original request */

	if (pbp->bio_cmd == BIO_READ) {
		if (cbp->bio_error == 0) {
			/* We will not be needing the partner bio */
			if (mbp != NULL) {
				pbp->bio_inbed++;
				g_destroy_bio(mbp);
			}
			g_std_done(cbp);
			return;
		}
		if (mbp != NULL) {
			/* Try partner the bio instead */
			mbp->bio_caller1 = NULL;
			pbp->bio_inbed++;
			g_destroy_bio(cbp);
			g_io_request(mbp, mbp->bio_from);
			/*
			 * XXX: If this comes back OK, we should actually
			 * try to write the good data on the failed mirror
			 */
			return;
		}
		g_std_done(cbp);
		/*
		 * NOTE(review): control falls through here with mbp == NULL,
		 * so the trailing g_std_done() below completes cbp a second
		 * time -- verify this path is unreachable (reads issue only
		 * one child, so its partner pointer should be non-NULL).
		 */
	}
	if (mbp != NULL) {
		/* First of a write pair: record error, let partner finish. */
		mbp->bio_caller1 = NULL;
		pbp->bio_inbed++;
		if (cbp->bio_error != 0 && pbp->bio_error == 0)
			pbp->bio_error = cbp->bio_error;
		return;
	}
	g_std_done(cbp);
}

/*
 * GCTL "create" verb: validate the arguments, build the geom and softc,
 * attach consumers to all component providers and initialize the ccd.
 */
static void
g_ccd_create(struct gctl_req *req, struct g_class *mp)
{
	int *unit, *ileave, *nprovider;
	struct g_geom *gp;
	struct g_consumer *cp;
	struct g_provider *pp;
	struct ccd_s *sc;
	struct sbuf *sb;
	char buf[20];
	int i, error;

	g_topology_assert();
	unit = gctl_get_paraml(req, "unit", sizeof
(*unit)); + ileave = gctl_get_paraml(req, "ileave", sizeof (*ileave)); + nprovider = gctl_get_paraml(req, "nprovider", sizeof (*nprovider)); + + /* Check for duplicate unit */ + LIST_FOREACH(gp, &mp->geom, geom) { + sc = gp->softc; + if (sc->sc_unit == *unit) { + gctl_error(req, "Unit %d already configured", *unit); + return; + } + } + + if (*nprovider <= 0) { + gctl_error(req, "Bogus nprovider argument (= %d)", *nprovider); + return; + } + + /* Check all providers are valid */ + for (i = 0; i < *nprovider; i++) { + sprintf(buf, "provider%d", i); + pp = gctl_get_provider(req, buf); + if (pp == NULL) + return; + } + + gp = g_new_geomf(mp, "ccd%d", *unit); + gp->start = g_ccd_start; + gp->orphan = g_ccd_orphan; + gp->access = g_ccd_access; + sc = g_malloc(sizeof *sc, M_WAITOK | M_ZERO); + gp->softc = sc; + sc->sc_ndisks = *nprovider; + + /* Allocate space for the component info. */ + sc->sc_cinfo = g_malloc(sc->sc_ndisks * sizeof(struct ccdcinfo), + M_WAITOK | M_ZERO); + + /* Create consumers and attach to all providers */ + for (i = 0; i < *nprovider; i++) { + sprintf(buf, "provider%d", i); + pp = gctl_get_provider(req, buf); + cp = g_new_consumer(gp); + error = g_attach(cp, pp); + KASSERT(error == 0, ("attach to %s failed", pp->name)); + sc->sc_cinfo[i].ci_consumer = cp; + sc->sc_cinfo[i].ci_provider = pp; + } + + sc->sc_unit = *unit; + sc->sc_ileave = *ileave; + + if (gctl_get_param(req, "uniform", NULL)) + sc->sc_flags |= CCDF_UNIFORM; + if (gctl_get_param(req, "mirror", NULL)) + sc->sc_flags |= CCDF_MIRROR; + + if (sc->sc_ileave == 0 && (sc->sc_flags & CCDF_MIRROR)) { + printf("%s: disabling mirror, interleave is 0\n", gp->name); + sc->sc_flags &= ~(CCDF_MIRROR); + } + + if ((sc->sc_flags & CCDF_MIRROR) && !(sc->sc_flags & CCDF_UNIFORM)) { + printf("%s: mirror/parity forces uniform flag\n", gp->name); + sc->sc_flags |= CCDF_UNIFORM; + } + + error = ccdinit(req, sc); + if (error != 0) { + g_ccd_freesc(sc); + gp->softc = NULL; + g_wither_geom(gp, ENXIO); + return; 
+ } + + pp = g_new_providerf(gp, "%s", gp->name); + pp->mediasize = sc->sc_size * (off_t)sc->sc_secsize; + pp->sectorsize = sc->sc_secsize; + g_error_provider(pp, 0); + + sb = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND); + sbuf_clear(sb); + sbuf_printf(sb, "ccd%d: %d components ", sc->sc_unit, *nprovider); + for (i = 0; i < *nprovider; i++) { + sbuf_printf(sb, "%s%s", + i == 0 ? "(" : ", ", + sc->sc_cinfo[i].ci_provider->name); + } + sbuf_printf(sb, "), %jd blocks ", (off_t)pp->mediasize / DEV_BSIZE); + if (sc->sc_ileave != 0) + sbuf_printf(sb, "interleaved at %d blocks\n", + sc->sc_ileave); + else + sbuf_printf(sb, "concatenated\n"); + sbuf_finish(sb); + gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); + sbuf_delete(sb); +} + +static void +g_ccd_destroy(struct gctl_req *req, struct g_class *mp) +{ + struct g_geom *gp; + struct g_provider *pp; + struct ccd_s *sc; + + g_topology_assert(); + gp = gctl_get_geom(req, mp, "geom"); + if (gp == NULL) + return; + sc = gp->softc; + pp = LIST_FIRST(&gp->provider); + if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) { + gctl_error(req, "%s is open(r%dw%de%d)", gp->name, + pp->acr, pp->acw, pp->ace); + return; + } + g_ccd_freesc(sc); + gp->softc = NULL; + g_wither_geom(gp, ENXIO); +} + +static void +g_ccd_list(struct gctl_req *req, struct g_class *mp) +{ + struct sbuf *sb; + struct ccd_s *cs; + struct g_geom *gp; + int i, unit, *up; + + up = gctl_get_paraml(req, "unit", sizeof (int)); + unit = *up; + sb = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND); + sbuf_clear(sb); + LIST_FOREACH(gp, &mp->geom, geom) { + cs = gp->softc; + if (unit >= 0 && unit != cs->sc_unit) + continue; + sbuf_printf(sb, "ccd%d\t\t%d\t%d\t", + cs->sc_unit, cs->sc_ileave, cs->sc_flags & CCDF_USERMASK); + + for (i = 0; i < cs->sc_ndisks; ++i) { + sbuf_printf(sb, "%s/dev/%s", i == 0 ? 
"" : " ", + cs->sc_cinfo[i].ci_provider->name); + } + sbuf_printf(sb, "\n"); + } + sbuf_finish(sb); + gctl_set_param(req, "output", sbuf_data(sb), sbuf_len(sb) + 1); + sbuf_delete(sb); +} + +static void +g_ccd_config(struct gctl_req *req, struct g_class *mp, char const *verb) +{ + + g_topology_assert(); + if (!strcmp(verb, "create geom")) { + g_ccd_create(req, mp); + } else if (!strcmp(verb, "destroy geom")) { + g_ccd_destroy(req, mp); + } else if (!strcmp(verb, "list")) { + g_ccd_list(req, mp); + } else { + gctl_error(req, "unknown verb"); + } +} + +static struct g_class g_ccd_class = { + .name = "CCD", + .ctlreq = g_ccd_config, +}; + +DECLARE_GEOM_CLASS(g_ccd_class, g_ccd); diff --git a/sys/geom/geom_ctl.c b/sys/geom/geom_ctl.c new file mode 100644 index 0000000..d543129 --- /dev/null +++ b/sys/geom/geom_ctl.c @@ -0,0 +1,495 @@ +/*- + * Copyright (c) 2002 Poul-Henning Kamp + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Poul-Henning Kamp + * and NAI Labs, the Security Research Division of Network Associates, Inc. + * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_geom.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/bio.h> +#include <sys/conf.h> +#include <sys/disk.h> +#include <sys/malloc.h> +#include <sys/sysctl.h> +#include <sys/sbuf.h> + +#include <sys/lock.h> +#include <sys/mutex.h> + +#include <vm/vm.h> +#include <vm/vm_extern.h> + +#include <geom/geom.h> +#include <geom/geom_int.h> +#define GCTL_TABLE 1 +#include <geom/geom_ctl.h> + +#include <machine/stdarg.h> + +static d_ioctl_t g_ctl_ioctl; + +static struct cdevsw g_ctl_cdevsw = { + .d_open = nullopen, + .d_close = nullclose, + .d_ioctl = g_ctl_ioctl, + .d_name = "g_ctl", +}; + +void +g_ctl_init(void) +{ + + make_dev(&g_ctl_cdevsw, 0, + UID_ROOT, GID_OPERATOR, 0640, PATH_GEOM_CTL); + KASSERT(GCTL_PARAM_RD == VM_PROT_READ, + ("GCTL_PARAM_RD != VM_PROT_READ")); + KASSERT(GCTL_PARAM_WR == VM_PROT_WRITE, + ("GCTL_PARAM_WR != VM_PROT_WRITE")); +} + +/* + * Report an error back to the user in ascii format. Return whatever copyout + * returned, or EINVAL if it succeeded. + * XXX: should not be static. + * XXX: should take printf like args. 
+ */ +int +gctl_error(struct gctl_req *req, const char *fmt, ...) +{ + va_list ap; + + if (req == NULL) + return (EINVAL); + + /* We only record the first error */ + if (req->nerror) + return (req->nerror); + + va_start(ap, fmt); + sbuf_vprintf(req->serror, fmt, ap); + va_end(ap); + sbuf_finish(req->serror); + if (g_debugflags & G_F_CTLDUMP) + printf("gctl %p error \"%s\"\n", req, sbuf_data(req->serror)); + req->nerror = copyout(sbuf_data(req->serror), req->error, + imin(req->lerror, sbuf_len(req->serror) + 1)); + if (!req->nerror) + req->nerror = EINVAL; + return (req->nerror); +} + +/* + * Allocate space and copyin() something. + * XXX: this should really be a standard function in the kernel. + */ +static void * +geom_alloc_copyin(struct gctl_req *req, void *uaddr, size_t len) +{ + void *ptr; + + ptr = g_malloc(len, M_WAITOK); + if (ptr == NULL) + req->nerror = ENOMEM; + else + req->nerror = copyin(uaddr, ptr, len); + if (!req->nerror) + return (ptr); + if (ptr != NULL) + g_free(ptr); + return (NULL); +} + +static void +gctl_copyin(struct gctl_req *req) +{ + int error, i; + struct gctl_req_arg *ap; + char *p; + + ap = geom_alloc_copyin(req, req->arg, req->narg * sizeof(*ap)); + if (ap == NULL) { + req->nerror = ENOMEM; + req->arg = NULL; + return; + } + + /* Nothing have been copyin()'ed yet */ + for (i = 0; i < req->narg; i++) { + ap[i].flag &= ~(GCTL_PARAM_NAMEKERNEL|GCTL_PARAM_VALUEKERNEL); + ap[i].flag &= ~GCTL_PARAM_CHANGED; + ap[i].kvalue = NULL; + } + + error = 0; + for (i = 0; i < req->narg; i++) { + if (ap[i].nlen < 1 || ap[i].nlen > SPECNAMELEN) { + error = gctl_error(req, + "wrong param name length %d: %d", i, ap[i].nlen); + break; + } + p = geom_alloc_copyin(req, ap[i].name, ap[i].nlen); + if (p == NULL) + break; + if (p[ap[i].nlen - 1] != '\0') { + error = gctl_error(req, "unterminated param name"); + g_free(p); + break; + } + ap[i].name = p; + ap[i].flag |= GCTL_PARAM_NAMEKERNEL; + if (ap[i].len < 0) { + error = gctl_error(req, "negative param 
length"); + break; + } + if (ap[i].len == 0) { + ap[i].kvalue = ap[i].value; + ap[i].flag |= GCTL_PARAM_VALUEKERNEL; + continue; + } + p = geom_alloc_copyin(req, ap[i].value, ap[i].len); + if (p == NULL) + break; + if ((ap[i].flag & GCTL_PARAM_ASCII) && + p[ap[i].len - 1] != '\0') { + error = gctl_error(req, "unterminated param value"); + g_free(p); + break; + } + ap[i].kvalue = p; + ap[i].flag |= GCTL_PARAM_VALUEKERNEL; + } + req->arg = ap; + return; +} + +static void +gctl_copyout(struct gctl_req *req) +{ + int error, i; + struct gctl_req_arg *ap; + + if (req->nerror) + return; + error = 0; + ap = req->arg; + for (i = 0; i < req->narg; i++, ap++) { + if (!(ap->flag & GCTL_PARAM_CHANGED)) + continue; + error = copyout(ap->kvalue, ap->value, ap->len); + if (!error) + continue; + req->nerror = error; + return; + } + return; +} + +static void +gctl_free(struct gctl_req *req) +{ + int i; + + if (req->arg == NULL) + return; + for (i = 0; i < req->narg; i++) { + if (req->arg[i].flag & GCTL_PARAM_NAMEKERNEL) + g_free(req->arg[i].name); + if ((req->arg[i].flag & GCTL_PARAM_VALUEKERNEL) && + req->arg[i].len > 0) + g_free(req->arg[i].kvalue); + } + g_free(req->arg); + sbuf_delete(req->serror); +} + +static void +gctl_dump(struct gctl_req *req) +{ + u_int i; + int j; + struct gctl_req_arg *ap; + + printf("Dump of gctl request at %p:\n", req); + if (req->nerror > 0) { + printf(" nerror:\t%d\n", req->nerror); + if (sbuf_len(req->serror) > 0) + printf(" error:\t\"%s\"\n", sbuf_data(req->serror)); + } + for (i = 0; i < req->narg; i++) { + ap = &req->arg[i]; + if (!(ap->flag & GCTL_PARAM_NAMEKERNEL)) + printf(" param:\t%d@%p", ap->nlen, ap->name); + else + printf(" param:\t\"%s\"", ap->name); + printf(" [%s%s%d] = ", + ap->flag & GCTL_PARAM_RD ? "R" : "", + ap->flag & GCTL_PARAM_WR ? 
"W" : "", + ap->len); + if (!(ap->flag & GCTL_PARAM_VALUEKERNEL)) { + printf(" =@ %p", ap->value); + } else if (ap->flag & GCTL_PARAM_ASCII) { + printf("\"%s\"", (char *)ap->kvalue); + } else if (ap->len > 0) { + for (j = 0; j < ap->len; j++) + printf(" %02x", ((u_char *)ap->kvalue)[j]); + } else { + printf(" = %p", ap->kvalue); + } + printf("\n"); + } +} + +void +gctl_set_param(struct gctl_req *req, const char *param, void const *ptr, int len) +{ + int i; + struct gctl_req_arg *ap; + + for (i = 0; i < req->narg; i++) { + ap = &req->arg[i]; + if (strcmp(param, ap->name)) + continue; + if (!(ap->flag & GCTL_PARAM_WR)) { + gctl_error(req, "No write access %s argument", param); + return; + } + if (ap->len < len) { + gctl_error(req, "Wrong length %s argument", param); + return; + } + bcopy(ptr, ap->kvalue, len); + ap->flag |= GCTL_PARAM_CHANGED; + return; + } + gctl_error(req, "Missing %s argument", param); + return; +} + +void * +gctl_get_param(struct gctl_req *req, const char *param, int *len) +{ + int i; + void *p; + struct gctl_req_arg *ap; + + for (i = 0; i < req->narg; i++) { + ap = &req->arg[i]; + if (strcmp(param, ap->name)) + continue; + if (!(ap->flag & GCTL_PARAM_RD)) + continue; + p = ap->kvalue; + if (len != NULL) + *len = ap->len; + return (p); + } + return (NULL); +} + +char const * +gctl_get_asciiparam(struct gctl_req *req, const char *param) +{ + int i; + char const *p; + struct gctl_req_arg *ap; + + for (i = 0; i < req->narg; i++) { + ap = &req->arg[i]; + if (strcmp(param, ap->name)) + continue; + if (!(ap->flag & GCTL_PARAM_RD)) + continue; + p = ap->kvalue; + if (ap->len < 1) { + gctl_error(req, "No length argument (%s)", param); + return (NULL); + } + if (p[ap->len - 1] != '\0') { + gctl_error(req, "Unterminated argument (%s)", param); + return (NULL); + } + return (p); + } + return (NULL); +} + +void * +gctl_get_paraml(struct gctl_req *req, const char *param, int len) +{ + int i; + void *p; + + p = gctl_get_param(req, param, &i); + if (p == NULL) 
+ gctl_error(req, "Missing %s argument", param); + else if (i != len) { + p = NULL; + gctl_error(req, "Wrong length %s argument", param); + } + return (p); +} + +struct g_class * +gctl_get_class(struct gctl_req *req, char const *arg) +{ + char const *p; + struct g_class *cp; + + p = gctl_get_asciiparam(req, arg); + if (p == NULL) + return (NULL); + LIST_FOREACH(cp, &g_classes, class) { + if (!strcmp(p, cp->name)) + return (cp); + } + gctl_error(req, "Class not found"); + return (NULL); +} + +struct g_geom * +gctl_get_geom(struct gctl_req *req, struct g_class *mpr, char const *arg) +{ + char const *p; + struct g_class *mp; + struct g_geom *gp; + + p = gctl_get_asciiparam(req, arg); + if (p != NULL) { + LIST_FOREACH(mp, &g_classes, class) { + if (mpr != NULL && mpr != mp) + continue; + LIST_FOREACH(gp, &mp->geom, geom) { + if (!strcmp(p, gp->name)) + return (gp); + } + } + } + gctl_error(req, "Geom not found"); + return (NULL); +} + +struct g_provider * +gctl_get_provider(struct gctl_req *req, char const *arg) +{ + char const *p; + struct g_provider *pp; + + p = gctl_get_asciiparam(req, arg); + if (p == NULL) + return (NULL); + pp = g_provider_by_name(p); + if (pp != NULL) + return (pp); + gctl_error(req, "Provider not found"); + return (NULL); +} + +static void +g_ctl_req(void *arg, int flag __unused) +{ + struct g_class *mp; + struct gctl_req *req; + char const *verb; + + g_topology_assert(); + req = arg; + mp = gctl_get_class(req, "class"); + if (mp == NULL) { + gctl_error(req, "Class not found"); + return; + } + verb = gctl_get_param(req, "verb", NULL); + if (mp->ctlreq == NULL) + gctl_error(req, "Class takes no requests"); + else + mp->ctlreq(req, mp, verb); + g_topology_assert(); +} + + +static int +g_ctl_ioctl_ctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct thread *td) +{ + struct gctl_req *req; + + req = (void *)data; + req->nerror = 0; + req->serror = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND); + /* It is an error if we cannot return an error text 
*/ + if (req->lerror < 2) + return (EINVAL); + if (!useracc(req->error, req->lerror, VM_PROT_WRITE)) + return (EINVAL); + + /* Check the version */ + if (req->version != GCTL_VERSION) + return (gctl_error(req, + "kernel and libgeom version mismatch.")); + + /* Get things on board */ + gctl_copyin(req); + + if (g_debugflags & G_F_CTLDUMP) + gctl_dump(req); + + if (!req->nerror) { + g_waitfor_event(g_ctl_req, req, M_WAITOK, NULL); + gctl_copyout(req); + } + + gctl_free(req); + return (req->nerror); +} + +static int +g_ctl_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct thread *td) +{ + int error; + + switch(cmd) { + case GEOM_CTL: + error = g_ctl_ioctl_ctl(dev, cmd, data, fflag, td); + break; + default: + error = ENOIOCTL; + break; + } + return (error); + +} diff --git a/sys/geom/geom_ctl.h b/sys/geom/geom_ctl.h new file mode 100644 index 0000000..fd68bda --- /dev/null +++ b/sys/geom/geom_ctl.h @@ -0,0 +1,82 @@ +/*- + * Copyright (c) 2003 Poul-Henning Kamp + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _GEOM_GEOM_CTL_H_ +#define _GEOM_GEOM_CTL_H_ + +#include <sys/ioccom.h> + +/* + * Version number. Used to check consistency between kernel and libgeom. + */ +#define GCTL_VERSION 2 + +struct gctl_req_arg { + u_int nlen; + char *name; + off_t offset; + int flag; + int len; + void *value; + /* kernel only fields */ + void *kvalue; +}; + +#define GCTL_PARAM_RD 1 /* Must match VM_PROT_READ */ +#define GCTL_PARAM_WR 2 /* Must match VM_PROT_WRITE */ +#define GCTL_PARAM_RW (GCTL_PARAM_RD | GCTL_PARAM_WR) +#define GCTL_PARAM_ASCII 4 + +/* These are used in the kernel only */ +#define GCTL_PARAM_NAMEKERNEL 8 +#define GCTL_PARAM_VALUEKERNEL 16 +#define GCTL_PARAM_CHANGED 32 + +struct gctl_req { + u_int version; + u_int serial; + u_int narg; + struct gctl_req_arg *arg; + u_int lerror; + char *error; + struct gctl_req_table *reqt; + + /* kernel only fields */ + int nerror; + struct sbuf *serror; +}; + +#define GEOM_CTL _IOW('G', GCTL_VERSION, struct gctl_req) + +#define PATH_GEOM_CTL "geom.ctl" + + +#endif /* _GEOM_GEOM_CTL_H_ */ diff --git a/sys/geom/geom_dev.c b/sys/geom/geom_dev.c new file mode 100644 index 0000000..2dc713a --- /dev/null +++ b/sys/geom/geom_dev.c @@ -0,0 +1,468 @@ +/*- + * Copyright (c) 2002 Poul-Henning Kamp + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. 
+ * + * This software was developed for the FreeBSD Project by Poul-Henning Kamp + * and NAI Labs, the Security Research Division of Network Associates, Inc. + * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/conf.h> +#include <sys/bio.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/errno.h> +#include <sys/time.h> +#include <sys/disk.h> +#include <sys/fcntl.h> +#include <sys/limits.h> +#include <geom/geom.h> +#include <geom/geom_int.h> + +static d_open_t g_dev_open; +static d_close_t g_dev_close; +static d_strategy_t g_dev_strategy; +static d_ioctl_t g_dev_ioctl; + +static struct cdevsw g_dev_cdevsw = { + .d_open = g_dev_open, + .d_close = g_dev_close, + .d_read = physread, + .d_write = physwrite, + .d_ioctl = g_dev_ioctl, + .d_strategy = g_dev_strategy, + .d_name = "g_dev", + .d_maj = GEOM_MAJOR, + .d_flags = D_DISK | D_TRACKCLOSE, +}; + +static g_taste_t g_dev_taste; +static g_orphan_t g_dev_orphan; + +static struct g_class g_dev_class = { + .name = "DEV", + .taste = g_dev_taste, +}; + +void +g_dev_print(void) +{ + struct g_geom *gp; + char const *p = ""; + + LIST_FOREACH(gp, &g_dev_class.geom, geom) { + printf("%s%s", p, gp->name); + p = " "; + } + printf("\n"); +} + +/* + * XXX: This is disgusting and wrong in every way imaginable: The only reason + * XXX: we have a clone function is because of the root-mount hack we currently + * XXX: employ. An improvment would be to unregister this cloner once we know + * XXX: we no longer need it. Ideally, root-fs would be mounted through DEVFS + * XXX: eliminating the need for this hack. 
+ */ +static void +g_dev_clone(void *arg __unused, char *name, int namelen __unused, dev_t *dev) +{ + struct g_geom *gp; + + if (*dev != NODEV) + return; + + g_waitidle(); + + /* g_topology_lock(); */ + LIST_FOREACH(gp, &g_dev_class.geom, geom) { + if (strcmp(gp->name, name)) + continue; + *dev = gp->softc; + g_trace(G_T_TOPOLOGY, "g_dev_clone(%s) = %p", name, *dev); + return; + } + /* g_topology_unlock(); */ + return; +} + +static void +g_dev_register_cloner(void *foo __unused) +{ + static int once; + + /* XXX: why would this happen more than once ?? */ + if (!once) { + EVENTHANDLER_REGISTER(dev_clone, g_dev_clone, 0, 1000); + once++; + } +} + +SYSINIT(geomdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE,g_dev_register_cloner,NULL); + +static struct g_geom * +g_dev_taste(struct g_class *mp, struct g_provider *pp, int insist __unused) +{ + struct g_geom *gp; + struct g_consumer *cp; + static int unit = GEOM_MINOR_PROVIDERS; + int error; + dev_t dev; + + g_trace(G_T_TOPOLOGY, "dev_taste(%s,%s)", mp->name, pp->name); + g_topology_assert(); + LIST_FOREACH(cp, &pp->consumers, consumers) + if (cp->geom->class == mp) + return (NULL); + gp = g_new_geomf(mp, pp->name); + gp->orphan = g_dev_orphan; + cp = g_new_consumer(gp); + error = g_attach(cp, pp); + KASSERT(error == 0, + ("g_dev_taste(%s) failed to g_attach, err=%d", pp->name, error)); + /* + * XXX: I'm not 100% sure we can call make_dev(9) without Giant + * yet. Once we can, we don't need to drop topology here either. 
+ */ + g_topology_unlock(); + mtx_lock(&Giant); + dev = make_dev(&g_dev_cdevsw, unit2minor(unit++), + UID_ROOT, GID_OPERATOR, 0640, gp->name); + if (pp->flags & G_PF_CANDELETE) + dev->si_flags |= SI_CANDELETE; + mtx_unlock(&Giant); + g_topology_lock(); + dev->si_iosize_max = MAXPHYS; + dev->si_stripesize = pp->stripesize; + dev->si_stripeoffset = pp->stripeoffset; + gp->softc = dev; + dev->si_drv1 = gp; + dev->si_drv2 = cp; + return (gp); +} + +static int +g_dev_open(dev_t dev, int flags, int fmt, struct thread *td) +{ + struct g_geom *gp; + struct g_consumer *cp; + int error, r, w, e; + + gp = dev->si_drv1; + cp = dev->si_drv2; + if (gp == NULL || cp == NULL || gp->softc != dev) + return(ENXIO); /* g_dev_taste() not done yet */ + + g_trace(G_T_ACCESS, "g_dev_open(%s, %d, %d, %p)", + gp->name, flags, fmt, td); + r = flags & FREAD ? 1 : 0; + w = flags & FWRITE ? 1 : 0; +#ifdef notyet + e = flags & O_EXCL ? 1 : 0; +#else + e = 0; +#endif + DROP_GIANT(); + g_topology_lock(); + if (dev->si_devsw == NULL) + error = ENXIO; /* We were orphaned */ + else + error = g_access_rel(cp, r, w, e); + g_topology_unlock(); + PICKUP_GIANT(); + g_waitidle(); + if (!error) + dev->si_bsize_phys = cp->provider->sectorsize; + return(error); +} + +static int +g_dev_close(dev_t dev, int flags, int fmt, struct thread *td) +{ + struct g_geom *gp; + struct g_consumer *cp; + int error, r, w, e, i; + + gp = dev->si_drv1; + cp = dev->si_drv2; + if (gp == NULL || cp == NULL) + return(ENXIO); + g_trace(G_T_ACCESS, "g_dev_close(%s, %d, %d, %p)", + gp->name, flags, fmt, td); + r = flags & FREAD ? -1 : 0; + w = flags & FWRITE ? -1 : 0; +#ifdef notyet + e = flags & O_EXCL ? 
-1 : 0; +#else + e = 0; +#endif + DROP_GIANT(); + g_topology_lock(); + if (dev->si_devsw == NULL) + error = ENXIO; /* We were orphaned */ + else + error = g_access_rel(cp, r, w, e); + for (i = 0; i < 10 * hz;) { + if (cp->acr != 0 || cp->acw != 0) + break; + if (cp->nstart == cp->nend) + break; + tsleep(&i, PRIBIO, "gdevwclose", hz / 10); + i += hz / 10; + } + if (cp->acr == 0 && cp->acw == 0 && cp->nstart != cp->nend) { + printf("WARNING: Final close of geom_dev(%s) %s %s", + gp->name, + "still has outstanding I/O after 10 seconds.", + "Completing close anyway, panic may happen later."); + } + g_topology_unlock(); + PICKUP_GIANT(); + g_waitidle(); + return (error); +} + +/* + * XXX: Until we have unmessed the ioctl situation, there is a race against + * XXX: a concurrent orphanization. We cannot close it by holding topology + * XXX: since that would prevent us from doing our job, and stalling events + * XXX: will break (actually: stall) the BSD disklabel hacks. + */ +static int +g_dev_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct thread *td) +{ + struct g_geom *gp; + struct g_consumer *cp; + struct g_kerneldump kd; + int i, error; + u_int u; + struct g_ioctl *gio; + + gp = dev->si_drv1; + cp = dev->si_drv2; + gio = NULL; + + error = 0; + KASSERT(cp->acr || cp->acw, + ("Consumer with zero access count in g_dev_ioctl")); + DROP_GIANT(); + + gio = NULL; + i = IOCPARM_LEN(cmd); + switch (cmd) { + case DIOCGSECTORSIZE: + *(u_int *)data = cp->provider->sectorsize; + if (*(u_int *)data == 0) + error = ENOENT; + break; + case DIOCGMEDIASIZE: + *(off_t *)data = cp->provider->mediasize; + if (*(off_t *)data == 0) + error = ENOENT; + break; + case DIOCGFWSECTORS: + error = g_io_getattr("GEOM::fwsectors", cp, &i, data); + if (error == 0 && *(u_int *)data == 0) + error = ENOENT; + break; + case DIOCGFWHEADS: + error = g_io_getattr("GEOM::fwheads", cp, &i, data); + if (error == 0 && *(u_int *)data == 0) + error = ENOENT; + break; + case DIOCGFRONTSTUFF: + error = 
g_io_getattr("GEOM::frontstuff", cp, &i, data); + break; + case DIOCSKERNELDUMP: + u = *((u_int *)data); + if (!u) { + set_dumper(NULL); + error = 0; + break; + } + kd.offset = 0; + kd.length = OFF_MAX; + i = sizeof kd; + error = g_io_getattr("GEOM::kerneldump", cp, &i, &kd); + if (!error) + dev->si_flags |= SI_DUMPDEV; + break; + + default: + gio = g_malloc(sizeof *gio, M_WAITOK | M_ZERO); + gio->cmd = cmd; + gio->data = data; + gio->fflag = fflag; + gio->td = td; + i = sizeof *gio; + /* + * We always issue ioctls as getattr since the direction of data + * movement in ioctl is no indication of the ioctl being a "set" + * or "get" type ioctl or if such simplistic terms even apply + */ + error = g_io_getattr("GEOM::ioctl", cp, &i, gio); + break; + } + + PICKUP_GIANT(); + if (error == EDIRIOCTL) { + KASSERT(gio != NULL, ("NULL gio but EDIRIOCTL")); + KASSERT(gio->func != NULL, ("NULL function but EDIRIOCTL")); + error = (gio->func)(gio->dev, cmd, data, fflag, td); + } + g_waitidle(); + if (gio != NULL && (error == EOPNOTSUPP || error == ENOIOCTL)) { + if (g_debugflags & G_T_TOPOLOGY) { + i = IOCGROUP(cmd); + printf("IOCTL(0x%lx) \"%s\"", cmd, gp->name); + if (i > ' ' && i <= '~') + printf(" '%c'", (int)IOCGROUP(cmd)); + else + printf(" 0x%lx", IOCGROUP(cmd)); + printf("/%ld ", cmd & 0xff); + if (cmd & IOC_IN) + printf("I"); + if (cmd & IOC_OUT) + printf("O"); + printf("(%ld) = ENOIOCTL\n", IOCPARM_LEN(cmd)); + } + error = ENOTTY; + } + if (gio != NULL) + g_free(gio); + return (error); +} + +static void +g_dev_done(struct bio *bp2) +{ + struct bio *bp; + + bp = bp2->bio_parent; + bp->bio_error = bp2->bio_error; + if (bp->bio_error != 0) { + g_trace(G_T_BIO, "g_dev_done(%p) had error %d", + bp2, bp->bio_error); + bp->bio_flags |= BIO_ERROR; + } else { + g_trace(G_T_BIO, "g_dev_done(%p/%p) resid %ld completed %jd", + bp2, bp, bp->bio_resid, (intmax_t)bp2->bio_completed); + } + bp->bio_resid = bp->bio_bcount - bp2->bio_completed; + g_destroy_bio(bp2); + mtx_lock(&Giant); 
+ biodone(bp); + mtx_unlock(&Giant); +} + +static void +g_dev_strategy(struct bio *bp) +{ + struct g_consumer *cp; + struct bio *bp2; + dev_t dev; + + KASSERT(bp->bio_cmd == BIO_READ || + bp->bio_cmd == BIO_WRITE || + bp->bio_cmd == BIO_DELETE, + ("Wrong bio_cmd bio=%p cmd=%d", bp, bp->bio_cmd)); + dev = bp->bio_dev; + cp = dev->si_drv2; + KASSERT(cp->acr || cp->acw, + ("Consumer with zero access count in g_dev_strategy")); + + bp2 = g_clone_bio(bp); + KASSERT(bp2 != NULL, ("XXX: ENOMEM in a bad place")); + bp2->bio_offset = (off_t)bp->bio_blkno << DEV_BSHIFT; + KASSERT(bp2->bio_offset >= 0, + ("Negative bio_offset (%jd) on bio %p", + (intmax_t)bp2->bio_offset, bp)); + bp2->bio_length = (off_t)bp->bio_bcount; + bp2->bio_done = g_dev_done; + g_trace(G_T_BIO, + "g_dev_strategy(%p/%p) offset %jd length %jd data %p cmd %d", + bp, bp2, (intmax_t)bp->bio_offset, (intmax_t)bp2->bio_length, + bp2->bio_data, bp2->bio_cmd); + g_io_request(bp2, cp); + KASSERT(cp->acr || cp->acw, + ("g_dev_strategy raced with g_dev_close and lost")); + +} + +/* + * g_dev_orphan() + * + * Called from below when the provider orphaned us. + * - Clear any dump settings. + * - Destroy the dev_t to prevent any more request from coming in. The + * provider is already marked with an error, so anything which comes in + * in the interrim will be returned immediately. + * - Wait for any outstanding I/O to finish. + * - Set our access counts to zero, whatever they were. + * - Detach and self-destruct. 
+ */ + +static void +g_dev_orphan(struct g_consumer *cp) +{ + struct g_geom *gp; + dev_t dev; + + g_topology_assert(); + gp = cp->geom; + dev = gp->softc; + g_trace(G_T_TOPOLOGY, "g_dev_orphan(%p(%s))", cp, gp->name); + + /* Reset any dump-area set on this device */ + if (dev->si_flags & SI_DUMPDEV) + set_dumper(NULL); + + /* Destroy the dev_t so we get no more requests */ + destroy_dev(dev); + + /* Wait for the cows to come home */ + while (cp->nstart != cp->nend) + msleep(&dev, NULL, PRIBIO, "gdevorphan", hz / 10); + + if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) + g_access_rel(cp, -cp->acr, -cp->acw, -cp->ace); + + g_detach(cp); + g_destroy_consumer(cp); + g_destroy_geom(gp); +} + +DECLARE_GEOM_CLASS(g_dev_class, g_dev); diff --git a/sys/geom/geom_disk.c b/sys/geom/geom_disk.c new file mode 100644 index 0000000..9b5f79e --- /dev/null +++ b/sys/geom/geom_disk.c @@ -0,0 +1,419 @@ +/*- + * Copyright (c) 2002 Poul-Henning Kamp + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Poul-Henning Kamp + * and NAI Labs, the Security Research Division of Network Associates, Inc. + * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_geom.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/bio.h> +#include <sys/conf.h> +#include <sys/fcntl.h> +#include <sys/malloc.h> +#include <sys/sysctl.h> +#include <sys/devicestat.h> +#include <machine/md_var.h> + +#include <sys/lock.h> +#include <sys/mutex.h> +#include <geom/geom.h> +#include <geom/geom_disk.h> +#include <geom/geom_int.h> + +static struct mtx g_disk_done_mtx; + +static g_access_t g_disk_access; +static g_init_t g_disk_init; +static g_fini_t g_disk_fini; + +struct g_class g_disk_class = { + .name = "DISK", + .init = g_disk_init, + .fini = g_disk_fini, +}; + +static void +g_disk_init(struct g_class *mp __unused) +{ + + mtx_init(&g_disk_done_mtx, "g_disk_done", MTX_DEF, 0); +} + +static void +g_disk_fini(struct g_class *mp __unused) +{ + + mtx_destroy(&g_disk_done_mtx); +} + +DECLARE_GEOM_CLASS(g_disk_class, g_disk); + +static void __inline +g_disk_lock_giant(struct disk *dp) +{ + if (dp->d_flags & DISKFLAG_NOGIANT) + return; + mtx_lock(&Giant); +} + +static void __inline +g_disk_unlock_giant(struct disk *dp) +{ + if (dp->d_flags & 
DISKFLAG_NOGIANT) + return; + mtx_unlock(&Giant); +} + +static int +g_disk_access(struct g_provider *pp, int r, int w, int e) +{ + struct disk *dp; + int error; + + g_trace(G_T_ACCESS, "g_disk_access(%s, %d, %d, %d)", + pp->name, r, w, e); + g_topology_assert(); + r += pp->acr; + w += pp->acw; + e += pp->ace; + dp = pp->geom->softc; + if (dp == NULL) + return (ENXIO); + error = 0; + if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) { + if (dp->d_open != NULL) { + g_disk_lock_giant(dp); + error = dp->d_open(dp); + if (error != 0) + printf("Opened disk %s -> %d\n", + pp->name, error); + g_disk_unlock_giant(dp); + } + pp->mediasize = dp->d_mediasize; + pp->sectorsize = dp->d_sectorsize; + dp->d_flags |= DISKFLAG_OPEN; + if (dp->d_maxsize == 0) { + printf("WARNING: Disk drive %s%d has no d_maxsize\n", + dp->d_name, dp->d_unit); + dp->d_maxsize = DFLTPHYS; + } + } else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) { + if (dp->d_close != NULL) { + g_disk_lock_giant(dp); + error = dp->d_close(dp); + if (error != 0) + printf("Closed disk %s -> %d\n", + pp->name, error); + g_disk_unlock_giant(dp); + } + dp->d_flags &= ~DISKFLAG_OPEN; + } + return (error); +} + +static void +g_disk_kerneldump(struct bio *bp, struct disk *dp) +{ + int error; + struct g_kerneldump *gkd; + struct dumperinfo di; + struct g_geom *gp; + + gkd = (struct g_kerneldump*)bp->bio_data; + gp = bp->bio_to->geom; + g_trace(G_T_TOPOLOGY, "g_disk_kernedump(%s, %jd, %jd)", + gp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length); + di.dumper = dp->d_dump; + di.priv = dp; + di.blocksize = dp->d_sectorsize; + di.mediaoffset = gkd->offset; + di.mediasize = gkd->length; + error = set_dumper(&di); + g_io_deliver(bp, error); +} + +static void +g_disk_done(struct bio *bp) +{ + struct bio *bp2; + struct disk *dp; + + /* See "notes" for why we need a mutex here */ + /* XXX: will witness accept a mix of Giant/unGiant drivers here ? 
*/ + mtx_lock(&g_disk_done_mtx); + bp->bio_completed = bp->bio_length - bp->bio_resid; + + bp2 = bp->bio_parent; + dp = bp2->bio_to->geom->softc; + if (bp2->bio_error == 0) + bp2->bio_error = bp->bio_error; + bp2->bio_completed += bp->bio_completed; + g_destroy_bio(bp); + bp2->bio_inbed++; + if (bp2->bio_children == bp2->bio_inbed) { + bp2->bio_resid = bp2->bio_bcount - bp2->bio_completed; + devstat_end_transaction_bio(dp->d_devstat, bp2); + g_io_deliver(bp2, bp2->bio_error); + } + mtx_unlock(&g_disk_done_mtx); +} + +static void +g_disk_start(struct bio *bp) +{ + struct bio *bp2, *bp3; + struct disk *dp; + struct g_ioctl *gio; + int error; + off_t off; + + dp = bp->bio_to->geom->softc; + if (dp == NULL) + g_io_deliver(bp, ENXIO); + error = EJUSTRETURN; + switch(bp->bio_cmd) { + case BIO_DELETE: + if (!(dp->d_flags & DISKFLAG_CANDELETE)) { + error = 0; + break; + } + /* fall-through */ + case BIO_READ: + case BIO_WRITE: + off = 0; + bp3 = NULL; + bp2 = g_clone_bio(bp); + if (bp2 == NULL) { + error = ENOMEM; + break; + } + devstat_start_transaction_bio(dp->d_devstat, bp); + do { + bp2->bio_offset += off; + bp2->bio_length -= off; + bp2->bio_data += off; + if (bp2->bio_length > dp->d_maxsize) { + /* + * XXX: If we have a stripesize we should really + * use it here. + */ + bp2->bio_length = dp->d_maxsize; + off += dp->d_maxsize; + /* + * To avoid a race, we need to grab the next bio + * before we schedule this one. See "notes". 
+ */ + bp3 = g_clone_bio(bp); + if (bp3 == NULL) + bp->bio_error = ENOMEM; + } + bp2->bio_done = g_disk_done; + bp2->bio_blkno = bp2->bio_offset >> DEV_BSHIFT; + bp2->bio_pblkno = bp2->bio_offset / dp->d_sectorsize; + bp2->bio_bcount = bp2->bio_length; + bp2->bio_disk = dp; + g_disk_lock_giant(dp); + dp->d_strategy(bp2); + g_disk_unlock_giant(dp); + bp2 = bp3; + bp3 = NULL; + } while (bp2 != NULL); + break; + case BIO_GETATTR: + if (g_handleattr_int(bp, "GEOM::fwsectors", dp->d_fwsectors)) + break; + else if (g_handleattr_int(bp, "GEOM::fwheads", dp->d_fwheads)) + break; + else if (g_handleattr_off_t(bp, "GEOM::frontstuff", 0)) + break; + else if (!strcmp(bp->bio_attribute, "GEOM::kerneldump")) + g_disk_kerneldump(bp, dp); + else if ((g_debugflags & G_F_DISKIOCTL) && + (dp->d_ioctl != NULL) && + !strcmp(bp->bio_attribute, "GEOM::ioctl") && + bp->bio_length == sizeof *gio) { + gio = (struct g_ioctl *)bp->bio_data; + gio->dev = dp; + gio->func = (d_ioctl_t *)(dp->d_ioctl); + error = EDIRIOCTL; + } else + error = ENOIOCTL; + break; + default: + error = EOPNOTSUPP; + break; + } + if (error != EJUSTRETURN) + g_io_deliver(bp, error); + return; +} + +static void +g_disk_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) +{ + struct disk *dp; + + dp = gp->softc; + if (indent == NULL) { + sbuf_printf(sb, " hd %u", dp->d_fwheads); + sbuf_printf(sb, " sc %u", dp->d_fwsectors); + return; + } + if (pp != NULL) { + sbuf_printf(sb, "%s<fwheads>%u</fwheads>\n", + indent, dp->d_fwheads); + sbuf_printf(sb, "%s<fwsectors>%u</fwsectors>\n", + indent, dp->d_fwsectors); + } +} + +static void +g_disk_create(void *arg, int flag) +{ + struct g_geom *gp; + struct g_provider *pp; + struct disk *dp; + + if (flag == EV_CANCEL) + return; + g_topology_assert(); + dp = arg; + gp = g_new_geomf(&g_disk_class, "%s%d", dp->d_name, dp->d_unit); + gp->start = g_disk_start; + gp->access = g_disk_access; + gp->softc = dp; + gp->dumpconf = 
g_disk_dumpconf;
+	pp = g_new_providerf(gp, "%s", gp->name);
+	pp->mediasize = dp->d_mediasize;
+	pp->sectorsize = dp->d_sectorsize;
+	if (dp->d_flags & DISKFLAG_CANDELETE)
+		pp->flags |= G_PF_CANDELETE;
+	pp->stripeoffset = dp->d_stripeoffset;
+	pp->stripesize = dp->d_stripesize;
+	if (bootverbose)
+		printf("GEOM: new disk %s\n", gp->name);
+	dp->d_geom = gp;
+	g_error_provider(pp, 0);
+}
+
+
+
+void
+disk_create(int unit, struct disk *dp, int flags, void *unused __unused, void * unused2 __unused)
+{
+
+	dp->d_unit = unit;
+	dp->d_flags = flags;
+	KASSERT(dp->d_strategy != NULL, ("disk_create need d_strategy"));
+	KASSERT(dp->d_name != NULL, ("disk_create need d_name"));
+	KASSERT(*dp->d_name != 0, ("disk_create need d_name"));
+	KASSERT(strlen(dp->d_name) < SPECNAMELEN - 4, ("disk name too long"));
+	dp->d_devstat = devstat_new_entry(dp->d_name, dp->d_unit,
+	    dp->d_sectorsize, DEVSTAT_ALL_SUPPORTED,
+	    DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX);
+	dp->d_geom = NULL;
+	g_post_event(g_disk_create, dp, M_WAITOK, dp, NULL);
+}
+
+/*
+ * XXX: There is a race if disk_destroy() is called while the g_disk_create()
+ * XXX: event is running.  I believe the current result is that disk_destroy()
+ * XXX: actually doesn't do anything.  Considering that the driver owns the
+ * XXX: struct disk and is likely to free it in a few moments, this can
+ * XXX: hardly be said to be optimal.  To what extent we can sleep in
+ * XXX: disk_create() and disk_destroy() is currently undefined (but generally
+ * XXX: undesirable) so any solution seems to involve an intrusive decision.
+ */ + +static void +disk_destroy_event(void *ptr, int flag) +{ + + g_topology_assert(); + g_wither_geom(ptr, ENXIO); +} + +void +disk_destroy(struct disk *dp) +{ + struct g_geom *gp; + + g_cancel_event(dp); + gp = dp->d_geom; + if (gp == NULL) + return; + gp->softc = NULL; + devstat_remove_entry(dp->d_devstat); + g_post_event(disk_destroy_event, gp, M_WAITOK, NULL, NULL); +} + +static void +g_kern_disks(void *p, int flag __unused) +{ + struct sbuf *sb; + struct g_geom *gp; + char *sp; + + sb = p; + sp = ""; + g_topology_assert(); + LIST_FOREACH(gp, &g_disk_class.geom, geom) { + sbuf_printf(sb, "%s%s", sp, gp->name); + sp = " "; + } + sbuf_finish(sb); +} + +static int +sysctl_disks(SYSCTL_HANDLER_ARGS) +{ + int error; + struct sbuf *sb; + + sb = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND); + sbuf_clear(sb); + g_waitfor_event(g_kern_disks, sb, M_WAITOK, NULL); + error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); + sbuf_delete(sb); + return error; +} + +SYSCTL_PROC(_kern, OID_AUTO, disks, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NOLOCK, 0, 0, + sysctl_disks, "A", "names of available disks"); + diff --git a/sys/geom/geom_disk.h b/sys/geom/geom_disk.h new file mode 100644 index 0000000..712e871 --- /dev/null +++ b/sys/geom/geom_disk.h @@ -0,0 +1,97 @@ +/*- + * Copyright (c) 2003 Poul-Henning Kamp + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Poul-Henning Kamp + * and NAI Labs, the Security Research Division of Network Associates, Inc. + * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _GEOM_GEOM_DISK_H_ +#define _GEOM_GEOM_DISK_H_ + +#ifdef _KERNEL + +#include <sys/queue.h> +#include <sys/_lock.h> +#include <sys/_mutex.h> + +typedef int disk_open_t(struct disk *); +typedef int disk_close_t(struct disk *); +typedef void disk_strategy_t(struct bio *bp); +typedef int disk_ioctl_t(struct disk *, u_long cmd, void *data, + int fflag, struct thread *td); + /* NB: disk_ioctl_t SHALL be cast'able to d_ioctl_t */ + +struct g_geom; +struct devstat; + +struct disk { + /* Fields which are private to geom_disk */ + struct g_geom *d_geom; + struct devstat *d_devstat; + + /* Shared fields */ + u_int d_flags; + const char *d_name; + u_int d_unit; + struct bio_queue_head *d_queue; + struct mtx *d_lock; + + /* Disk methods */ + disk_open_t *d_open; + disk_close_t *d_close; + disk_strategy_t *d_strategy; + disk_ioctl_t *d_ioctl; + dumper_t *d_dump; + + /* Info fields from driver to geom_disk.c. Valid when open */ + u_int d_sectorsize; + off_t d_mediasize; + u_int d_fwsectors; + u_int d_fwheads; + u_int d_maxsize; + u_int d_stripeoffset; + u_int d_stripesize; + + /* Fields private to the driver */ + void *d_drv1; +}; + +#define DISKFLAG_NOGIANT 0x1 +#define DISKFLAG_OPEN 0x2 +#define DISKFLAG_CANDELETE 0x4 + +void disk_create(int unit, struct disk *disk, int flags, void *unused, void *unused2); +void disk_destroy(struct disk *disk); + + +#endif /* _KERNEL */ +#endif /* _GEOM_GEOM_DISK_H_ */ diff --git a/sys/geom/geom_dump.c b/sys/geom/geom_dump.c new file mode 100644 index 0000000..869d7c4 --- /dev/null +++ b/sys/geom/geom_dump.c @@ -0,0 +1,306 @@ +/*- + * Copyright (c) 2002 Poul-Henning Kamp + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Poul-Henning Kamp + * and NAI Labs, the Security Research Division of Network Associates, Inc. 
+ * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/sbuf.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <machine/stdarg.h> + +#include <geom/geom.h> +#include <geom/geom_int.h> + + +static void +g_confdot_consumer(struct sbuf *sb, struct g_consumer *cp) +{ + + sbuf_printf(sb, "z%p [label=\"r%dw%de%d\"];\n", + cp, cp->acr, cp->acw, cp->ace); + if (cp->provider) + sbuf_printf(sb, "z%p -> z%p;\n", cp, cp->provider); +} + +static void +g_confdot_provider(struct sbuf *sb, struct g_provider *pp) +{ + + sbuf_printf(sb, "z%p [shape=hexagon,label=\"%s\\nr%dw%de%d\\nerr#%d\"];\n", + pp, pp->name, pp->acr, pp->acw, pp->ace, pp->error); +} + +static void +g_confdot_geom(struct sbuf *sb, struct g_geom *gp) +{ + struct g_consumer *cp; + struct g_provider *pp; + + sbuf_printf(sb, "z%p [shape=box,label=\"%s\\n%s\\nr#%d\"];\n", + gp, gp->class->name, gp->name, gp->rank); + LIST_FOREACH(cp, &gp->consumer, consumer) { + g_confdot_consumer(sb, cp); + sbuf_printf(sb, "z%p -> z%p;\n", gp, cp); + } + + LIST_FOREACH(pp, &gp->provider, provider) { + g_confdot_provider(sb, pp); + sbuf_printf(sb, "z%p -> z%p;\n", pp, gp); + } +} + +static void +g_confdot_class(struct sbuf *sb, struct g_class *mp) +{ + struct g_geom *gp; + + LIST_FOREACH(gp, &mp->geom, geom) + g_confdot_geom(sb, gp); +} + +void +g_confdot(void *p, int flag ) +{ + struct g_class *mp; + struct sbuf *sb; + + KASSERT(flag != EV_CANCEL, ("g_confdot was cancelled")); + sb = p; + g_topology_assert(); + sbuf_printf(sb, "digraph geom {\n"); + LIST_FOREACH(mp, &g_classes, class) + g_confdot_class(sb, mp); + sbuf_printf(sb, "};\n"); + sbuf_finish(sb); +} + +static void +g_conftxt_geom(struct sbuf *sb, struct g_geom *gp, int level) +{ + struct g_provider *pp; + struct g_consumer *cp; + + LIST_FOREACH(pp, &gp->provider, provider) { + sbuf_printf(sb, "%d %s %s %ju %u", level, gp->class->name, + pp->name, (uintmax_t)pp->mediasize, pp->sectorsize); + if (gp->dumpconf != NULL) + 
gp->dumpconf(sb, NULL, gp, NULL, pp); + sbuf_printf(sb, "\n"); + LIST_FOREACH(cp, &pp->consumers, consumers) + g_conftxt_geom(sb, cp->geom, level + 1); + } +} + +static void +g_conftxt_class(struct sbuf *sb, struct g_class *mp) +{ + struct g_geom *gp; + + LIST_FOREACH(gp, &mp->geom, geom) + g_conftxt_geom(sb, gp, 0); +} + +void +g_conftxt(void *p, int flag) +{ + struct g_class *mp; + struct sbuf *sb; + + KASSERT(flag != EV_CANCEL, ("g_conftxt was cancelled")); + sb = p; + g_topology_assert(); + LIST_FOREACH(mp, &g_classes, class) + if (!strcmp(mp->name, "DISK")) + break; + if (mp != NULL) + g_conftxt_class(sb, mp); + sbuf_finish(sb); +} + + +static void +g_conf_consumer(struct sbuf *sb, struct g_consumer *cp) +{ + + sbuf_printf(sb, "\t<consumer id=\"%p\">\n", cp); + sbuf_printf(sb, "\t <geom ref=\"%p\"/>\n", cp->geom); + if (cp->provider != NULL) + sbuf_printf(sb, "\t <provider ref=\"%p\"/>\n", cp->provider); + sbuf_printf(sb, "\t <mode>r%dw%de%d</mode>\n", + cp->acr, cp->acw, cp->ace); + if (cp->geom->dumpconf != NULL) { + sbuf_printf(sb, "\t <config>\n"); + cp->geom->dumpconf(sb, "\t ", cp->geom, cp, NULL); + sbuf_printf(sb, "\t </config>\n"); + } + sbuf_printf(sb, "\t</consumer>\n"); +} + +static void +g_conf_provider(struct sbuf *sb, struct g_provider *pp) +{ + + sbuf_printf(sb, "\t<provider id=\"%p\">\n", pp); + sbuf_printf(sb, "\t <geom ref=\"%p\"/>\n", pp->geom); + sbuf_printf(sb, "\t <mode>r%dw%de%d</mode>\n", + pp->acr, pp->acw, pp->ace); + sbuf_printf(sb, "\t <name>%s</name>\n", pp->name); + sbuf_printf(sb, "\t <mediasize>%jd</mediasize>\n", + (intmax_t)pp->mediasize); + sbuf_printf(sb, "\t <sectorsize>%u</sectorsize>\n", pp->sectorsize); + if (pp->geom->dumpconf != NULL) { + sbuf_printf(sb, "\t <config>\n"); + pp->geom->dumpconf(sb, "\t ", pp->geom, NULL, pp); + sbuf_printf(sb, "\t </config>\n"); + } + sbuf_printf(sb, "\t</provider>\n"); +} + + +static void +g_conf_geom(struct sbuf *sb, struct g_geom *gp, struct g_provider *pp, struct g_consumer *cp) +{ 
+ struct g_consumer *cp2; + struct g_provider *pp2; + + sbuf_printf(sb, " <geom id=\"%p\">\n", gp); + sbuf_printf(sb, " <class ref=\"%p\"/>\n", gp->class); + sbuf_printf(sb, " <name>%s</name>\n", gp->name); + sbuf_printf(sb, " <rank>%d</rank>\n", gp->rank); + if (gp->dumpconf != NULL) { + sbuf_printf(sb, " <config>\n"); + gp->dumpconf(sb, "\t", gp, NULL, NULL); + sbuf_printf(sb, " </config>\n"); + } + LIST_FOREACH(cp2, &gp->consumer, consumer) { + if (cp != NULL && cp != cp2) + continue; + g_conf_consumer(sb, cp2); + } + + LIST_FOREACH(pp2, &gp->provider, provider) { + if (pp != NULL && pp != pp2) + continue; + g_conf_provider(sb, pp2); + } + sbuf_printf(sb, " </geom>\n"); +} + +static void +g_conf_class(struct sbuf *sb, struct g_class *mp, struct g_geom *gp, struct g_provider *pp, struct g_consumer *cp) +{ + struct g_geom *gp2; + + sbuf_printf(sb, " <class id=\"%p\">\n", mp); + sbuf_printf(sb, " <name>%s</name>\n", mp->name); + LIST_FOREACH(gp2, &mp->geom, geom) { + if (gp != NULL && gp != gp2) + continue; + g_conf_geom(sb, gp2, pp, cp); + } + sbuf_printf(sb, " </class>\n"); +} + +void +g_conf_specific(struct sbuf *sb, struct g_class *mp, struct g_geom *gp, struct g_provider *pp, struct g_consumer *cp) +{ + struct g_class *mp2; + + g_topology_assert(); + sbuf_printf(sb, "<mesh>\n"); + LIST_FOREACH(mp2, &g_classes, class) { + if (mp != NULL && mp != mp2) + continue; + g_conf_class(sb, mp2, gp, pp, cp); + } + sbuf_printf(sb, "</mesh>\n"); + sbuf_finish(sb); +} + +void +g_confxml(void *p, int flag) +{ + + KASSERT(flag != EV_CANCEL, ("g_confxml was cancelled")); + g_topology_assert(); + g_conf_specific(p, NULL, NULL, NULL, NULL); +} + +void +g_trace(int level, const char *fmt, ...) 
+{ + va_list ap; + + g_sanity(NULL); + if (!(g_debugflags & level)) + return; + va_start(ap, fmt); + vprintf(fmt, ap); + va_end(ap); + printf("\n"); +} + +void +g_hexdump(void *ptr, int length) +{ + int i, j, k; + unsigned char *cp; + + cp = ptr; + for (i = 0; i < length; i+= 16) { + printf("%04x ", i); + for (j = 0; j < 16; j++) { + k = i + j; + if (k < length) + printf(" %02x", cp[k]); + else + printf(" "); + } + printf(" |"); + for (j = 0; j < 16; j++) { + k = i + j; + if (k >= length) + printf(" "); + else if (cp[k] >= ' ' && cp[k] <= '~') + printf("%c", cp[k]); + else + printf("."); + } + printf("|\n"); + } +} + diff --git a/sys/geom/geom_event.c b/sys/geom/geom_event.c new file mode 100644 index 0000000..f180c43 --- /dev/null +++ b/sys/geom/geom_event.c @@ -0,0 +1,324 @@ +/*- + * Copyright (c) 2002 Poul-Henning Kamp + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Poul-Henning Kamp + * and NAI Labs, the Security Research Division of Network Associates, Inc. + * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * XXX: How do we in general know that objects referenced in events + * have not been destroyed before we get around to handle the event ? + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/malloc.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <machine/stdarg.h> +#include <sys/errno.h> +#include <sys/time.h> +#include <geom/geom.h> +#include <geom/geom_int.h> + +TAILQ_HEAD(event_tailq_head, g_event); + +static struct event_tailq_head g_events = TAILQ_HEAD_INITIALIZER(g_events); +static u_int g_pending_events; +static TAILQ_HEAD(,g_provider) g_doorstep = TAILQ_HEAD_INITIALIZER(g_doorstep); +static struct mtx g_eventlock; +static struct sx g_eventstall; + +#define G_N_EVENTREFS 20 + +struct g_event { + TAILQ_ENTRY(g_event) events; + g_event_t *func; + void *arg; + int flag; + void *ref[G_N_EVENTREFS]; +}; + +#define EV_DONE 0x80000 +#define EV_WAKEUP 0x40000 +#define EV_CANCELED 0x20000 + +void +g_waitidle(void) +{ + + while (g_pending_events) + tsleep(&g_pending_events, PPAUSE, "g_waitidle", hz/5); +} + +void +g_stall_events(void) +{ + + sx_xlock(&g_eventstall); +} + +void 
+g_release_events(void) +{ + + sx_xunlock(&g_eventstall); +} + +void +g_orphan_provider(struct g_provider *pp, int error) +{ + + g_trace(G_T_TOPOLOGY, "g_orphan_provider(%p(%s), %d)", + pp, pp->name, error); + KASSERT(error != 0, + ("g_orphan_provider(%p(%s), 0) error must be non-zero\n", + pp, pp->name)); + pp->error = error; + mtx_lock(&g_eventlock); + TAILQ_INSERT_TAIL(&g_doorstep, pp, orphan); + mtx_unlock(&g_eventlock); + wakeup(&g_wait_event); +} + +/* + * This function is called once on each provider which the event handler + * finds on its g_doorstep. + */ + +static void +g_orphan_register(struct g_provider *pp) +{ + struct g_consumer *cp, *cp2; + + g_trace(G_T_TOPOLOGY, "g_orphan_register(%s)", pp->name); + g_topology_assert(); + + /* + * Tell all consumers the bad news. + * Don't be surprised if they self-destruct. + */ + cp = LIST_FIRST(&pp->consumers); + while (cp != NULL) { + cp2 = LIST_NEXT(cp, consumers); + KASSERT(cp->geom->orphan != NULL, + ("geom %s has no orphan, class %s", + cp->geom->name, cp->geom->class->name)); + cp->geom->orphan(cp); + cp = cp2; + } +#ifdef notyet + cp = LIST_FIRST(&pp->consumers); + if (cp != NULL) + return; + if (pp->geom->flags & G_GEOM_WITHER) + g_destroy_provider(pp); +#endif +} + +static int +one_event(void) +{ + struct g_event *ep; + struct g_provider *pp; + + sx_xlock(&g_eventstall); + g_topology_lock(); + for (;;) { + mtx_lock(&g_eventlock); + pp = TAILQ_FIRST(&g_doorstep); + if (pp != NULL) + TAILQ_REMOVE(&g_doorstep, pp, orphan); + mtx_unlock(&g_eventlock); + if (pp == NULL) + break; + g_orphan_register(pp); + } + mtx_lock(&g_eventlock); + ep = TAILQ_FIRST(&g_events); + if (ep == NULL) { + mtx_unlock(&g_eventlock); + g_topology_unlock(); + sx_xunlock(&g_eventstall); + return (0); + } + TAILQ_REMOVE(&g_events, ep, events); + mtx_unlock(&g_eventlock); + g_topology_assert(); + ep->func(ep->arg, 0); + g_topology_assert(); + if (ep->flag & EV_WAKEUP) { + ep->flag |= EV_DONE; + wakeup(ep); + } else { + g_free(ep); + } 
+ g_pending_events--; + if (g_pending_events == 0) + wakeup(&g_pending_events); + g_topology_unlock(); + sx_xunlock(&g_eventstall); + return (1); +} + +void +g_run_events() +{ + + while (one_event()) + ; +} + +void +g_cancel_event(void *ref) +{ + struct g_event *ep, *epn; + struct g_provider *pp; + u_int n; + + mtx_lock(&g_eventlock); + TAILQ_FOREACH(pp, &g_doorstep, orphan) { + if (pp != ref) + continue; + TAILQ_REMOVE(&g_doorstep, pp, orphan); + break; + } + for (ep = TAILQ_FIRST(&g_events); ep != NULL; ep = epn) { + epn = TAILQ_NEXT(ep, events); + for (n = 0; n < G_N_EVENTREFS; n++) { + if (ep->ref[n] == NULL) + break; + if (ep->ref[n] == ref) { + TAILQ_REMOVE(&g_events, ep, events); + ep->func(ep->arg, EV_CANCEL); + if (ep->flag & EV_WAKEUP) { + ep->flag |= EV_DONE; + ep->flag |= EV_CANCELED; + wakeup(ep); + } else { + g_free(ep); + } + break; + } + } + } + mtx_unlock(&g_eventlock); +} + +static int +g_post_event_x(g_event_t *func, void *arg, int flag, struct g_event **epp, va_list ap) +{ + struct g_event *ep; + void *p; + u_int n; + + g_trace(G_T_TOPOLOGY, "g_post_event_x(%p, %p, %d", func, arg, flag); + ep = g_malloc(sizeof *ep, flag | M_ZERO); + if (ep == NULL) + return (ENOMEM); + ep->flag = flag; + for (n = 0; n < G_N_EVENTREFS; n++) { + p = va_arg(ap, void *); + if (p == NULL) + break; + g_trace(G_T_TOPOLOGY, " ref %p", p); + ep->ref[n++] = p; + } + KASSERT(p == NULL, ("Too many references to event")); + ep->func = func; + ep->arg = arg; + mtx_lock(&g_eventlock); + g_pending_events++; + TAILQ_INSERT_TAIL(&g_events, ep, events); + mtx_unlock(&g_eventlock); + wakeup(&g_wait_event); + if (epp != NULL) + *epp = ep; + return (0); +} + +int +g_post_event(g_event_t *func, void *arg, int flag, ...) 
+{ + va_list ap; + int i; + + KASSERT(flag == M_WAITOK || flag == M_NOWAIT, + ("Wrong flag to g_post_event")); + va_start(ap, flag); + i = g_post_event_x(func, arg, flag, NULL, ap); + va_end(ap); + return (i); +} + + +/* + * XXX: It might actually be useful to call this function with topology held. + * XXX: This would ensure that the event gets created before anything else + * XXX: changes. At present all users have a handle on things in some other + * XXX: way, so this remains an XXX for now. + */ + +int +g_waitfor_event(g_event_t *func, void *arg, int flag, ...) +{ + va_list ap; + struct g_event *ep; + int error; + + /* g_topology_assert_not(); */ + KASSERT(flag == M_WAITOK || flag == M_NOWAIT, + ("Wrong flag to g_post_event")); + va_start(ap, flag); + error = g_post_event_x(func, arg, flag | EV_WAKEUP, &ep, ap); + va_end(ap); + if (error) + return (error); + do + tsleep(ep, PRIBIO, "g_waitfor_event", hz); + while (!(ep->flag & EV_DONE)); + if (ep->flag & EV_CANCELED) + error = EAGAIN; + g_free(ep); + return (error); +} + +void +g_event_init() +{ + + mtx_init(&g_eventlock, "GEOM orphanage", NULL, MTX_DEF); + sx_init(&g_eventstall, "GEOM event stalling"); +} diff --git a/sys/geom/geom_fox.c b/sys/geom/geom_fox.c new file mode 100644 index 0000000..295840f --- /dev/null +++ b/sys/geom/geom_fox.c @@ -0,0 +1,468 @@ +/*- + * Copyright (c) 2003 Poul-Henning Kamp + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + * + * This is a GEOM module for handling path selection for multi-path + * storage devices. It is named "fox" because it, like they, prefer + * to have multiple exits to choose from. + * + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/conf.h> +#include <sys/bio.h> +#include <sys/malloc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/libkern.h> +#include <sys/endian.h> +#include <sys/md5.h> +#include <sys/errno.h> +#include <geom/geom.h> + +#define FOX_CLASS_NAME "FOX" +#define FOX_MAGIC "GEOM::FOX" + +struct g_fox_softc { + off_t mediasize; + u_int sectorsize; + TAILQ_HEAD(, bio) queue; + struct mtx lock; + u_char magic[16]; + struct g_consumer *path; + struct g_consumer *opath; + int waiting; + int cr, cw, ce; +}; + +/* + * This function is called whenever we need to select a new path. 
+ */ +static void +g_fox_select_path(void *arg, int flag) +{ + struct g_geom *gp; + struct g_fox_softc *sc; + struct g_consumer *cp1; + struct bio *bp; + int error; + + g_topology_assert(); + if (flag == EV_CANCEL) + return; + gp = arg; + sc = gp->softc; + + if (sc->opath != NULL) { + /* + * First, close the old path entirely. + */ + printf("Closing old path (%s) on fox (%s)\n", + sc->opath->provider->name, gp->name); + + cp1 = LIST_NEXT(sc->opath, consumer); + + error = g_access_rel(sc->opath, -sc->cr, -sc->cw, -(sc->ce + 1)); + KASSERT(error == 0, ("Failed close of old path %d", error)); + + /* + * The attempt to reopen it with a exclusive count + */ + error = g_access_rel(sc->opath, 0, 0, 1); + if (error) { + /* + * Ok, ditch this consumer, we can't use it. + */ + printf("Drop old path (%s) on fox (%s)\n", + sc->opath->provider->name, gp->name); + g_detach(sc->opath); + g_destroy_consumer(sc->opath); + if (LIST_EMPTY(&gp->consumer)) { + /* No consumers left */ + g_wither_geom(gp, ENXIO); + for (;;) { + bp = TAILQ_FIRST(&sc->queue); + if (bp == NULL) + break; + TAILQ_REMOVE(&sc->queue, bp, bio_queue); + bp->bio_error = ENXIO; + g_std_done(bp); + } + return; + } + } else { + printf("Got e-bit on old path (%s) on fox (%s)\n", + sc->opath->provider->name, gp->name); + } + sc->opath = NULL; + } else { + cp1 = LIST_FIRST(&gp->consumer); + } + if (cp1 == NULL) + cp1 = LIST_FIRST(&gp->consumer); + printf("Open new path (%s) on fox (%s)\n", + cp1->provider->name, gp->name); + error = g_access_rel(cp1, sc->cr, sc->cw, sc->ce); + if (error) { + /* + * If we failed, we take another trip through here + */ + printf("Open new path (%s) on fox (%s) failed, reselect.\n", + cp1->provider->name, gp->name); + sc->opath = cp1; + g_post_event(g_fox_select_path, gp, M_WAITOK, gp, NULL); + } else { + printf("Open new path (%s) on fox (%s) succeeded\n", + cp1->provider->name, gp->name); + mtx_lock(&sc->lock); + sc->path = cp1; + sc->waiting = 0; + for (;;) { + bp = 
TAILQ_FIRST(&sc->queue);
+			if (bp == NULL)
+				break;
+			TAILQ_REMOVE(&sc->queue, bp, bio_queue);
+			g_io_request(bp, sc->path);
+		}
+		mtx_unlock(&sc->lock);
+	}
+}
+
+/*
+ * A path went away (orphan) or was spoiled.  Drop the consumer; if it
+ * was the active path, schedule reselection.  When the last path is
+ * gone, tear the whole fox down.
+ */
+static void
+g_fox_orphan(struct g_consumer *cp)
+{
+	struct g_geom *gp;
+	struct g_fox_softc *sc;
+	int error, mark;
+
+	g_topology_assert();
+	gp = cp->geom;
+	sc = gp->softc;
+	printf("Removing path (%s) from fox (%s)\n",
+	    cp->provider->name, gp->name);
+	mtx_lock(&sc->lock);
+	if (cp == sc->path) {
+		sc->opath = NULL;
+		sc->path = NULL;
+		sc->waiting = 1;
+		mark = 1;
+	} else {
+		mark = 0;
+	}
+	mtx_unlock(&sc->lock);
+
+	g_access_rel(cp, -cp->acr, -cp->acw, -cp->ace);
+	error = cp->provider->error;
+	g_detach(cp);
+	g_destroy_consumer(cp);
+	if (!LIST_EMPTY(&gp->consumer)) {
+		if (mark)
+			g_post_event(g_fox_select_path, gp, M_WAITOK, gp, NULL);
+		return;
+	}
+
+	mtx_destroy(&sc->lock);
+	/*
+	 * FIX: the original cleared gp->softc *before* freeing it, so
+	 * g_free() was handed NULL and the softc leaked.  Free the saved
+	 * pointer, then clear the field.
+	 */
+	g_free(sc);
+	gp->softc = NULL;
+	g_wither_geom(gp, ENXIO);
+}
+
+/*
+ * I/O completion.  On error from the active path, stash the bio on the
+ * queue and schedule a path reselection; errors from a stale path are
+ * simply retried on the current one.
+ */
+static void
+g_fox_done(struct bio *bp)
+{
+	struct g_geom *gp;
+	struct g_fox_softc *sc;
+	int error;
+
+	if (bp->bio_error == 0) {
+		g_std_done(bp);
+		return;
+	}
+	gp = bp->bio_from->geom;
+	sc = gp->softc;
+	if (bp->bio_from != sc->path) {
+		g_io_request(bp, sc->path);
+		return;
+	}
+	mtx_lock(&sc->lock);
+	sc->opath = sc->path;
+	sc->path = NULL;		/* Reselection in progress */
+	error = g_post_event(g_fox_select_path, gp, M_NOWAIT, gp, NULL);
+	if (error) {
+		bp->bio_error = ENOMEM;
+		g_std_done(bp);
+	} else {
+		sc->waiting = 1;
+		TAILQ_INSERT_TAIL(&sc->queue, bp, bio_queue);
+	}
+	mtx_unlock(&sc->lock);
+}
+
+static void
+g_fox_start(struct bio *bp)
+{
+	struct g_geom *gp;
+	struct bio *bp2;
+	struct g_fox_softc *sc;
+	int error;
+
+	gp = bp->bio_to->geom;
+	sc = gp->softc;
+	if (sc == NULL) {
+		g_io_deliver(bp, ENXIO);
+		return;
+	}
+	switch(bp->bio_cmd) {
+	case BIO_READ:
+	case BIO_WRITE:
+	case BIO_DELETE:
+		bp2 = g_clone_bio(bp);
+		if (bp2 == NULL) {
+			g_io_deliver(bp, ENOMEM);
+			break;
+		}
+		/* Skip the fox label sector at offset 0. */
+		bp2->bio_offset += sc->sectorsize;
+		bp2->bio_done = g_fox_done;
+
mtx_lock(&sc->lock); + if (sc->path == NULL || !TAILQ_EMPTY(&sc->queue)) { + if (sc->waiting == 0) { + error = g_post_event(g_fox_select_path, gp, + M_NOWAIT, gp, NULL); + if (error) { + g_destroy_bio(bp2); + bp2 = NULL; + g_io_deliver(bp, error); + } else { + sc->waiting = 1; + } + } + if (bp2 != NULL) + TAILQ_INSERT_TAIL(&sc->queue, bp2, + bio_queue); + } else { + g_io_request(bp2, sc->path); + } + mtx_unlock(&sc->lock); + break; + default: + g_io_deliver(bp, EOPNOTSUPP); + break; + } + return; +} + +static int +g_fox_access(struct g_provider *pp, int dr, int dw, int de) +{ + struct g_geom *gp; + struct g_fox_softc *sc; + struct g_consumer *cp1; + int error; + + g_topology_assert(); + gp = pp->geom; + sc = gp->softc; + if (sc == NULL) + return (ENXIO); + + if (sc->cr == 0 && sc->cw == 0 && sc->ce == 0) { + /* + * First open, open all consumers with an exclusive bit + */ + error = 0; + LIST_FOREACH(cp1, &gp->consumer, consumer) { + error = g_access_rel(cp1, 0, 0, 1); + if (error) { + printf("FOX: access(%s,0,0,1) = %d\n", + cp1->provider->name, error); + break; + } + } + if (error) { + LIST_FOREACH(cp1, &gp->consumer, consumer) { + if (cp1->ace) + g_access_rel(cp1, 0, 0, -1); + } + return (error); + } + } + if (sc->path == NULL) + g_fox_select_path(gp, 0); + if (sc->path == NULL) + error = ENXIO; + else + error = g_access_rel(sc->path, dr, dw, de); + if (error == 0) { + sc->cr += dr; + sc->cw += dw; + sc->ce += de; + if (sc->cr == 0 && sc->cw == 0 && sc->ce == 0) { + /* + * Last close, remove e-bit on all consumers + */ + LIST_FOREACH(cp1, &gp->consumer, consumer) + g_access_rel(cp1, 0, 0, -1); + } + } + return (error); +} + +static struct g_geom * +g_fox_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) +{ + struct g_geom *gp, *gp2; + struct g_provider *pp2; + struct g_consumer *cp, *cp2; + struct g_fox_softc *sc, *sc2; + int error; + u_int sectorsize; + u_char *buf; + + g_trace(G_T_TOPOLOGY, "fox_taste(%s, %s)", mp->name, pp->name); + 
g_topology_assert(); + if (!strcmp(pp->geom->class->name, mp->name)) + return (NULL); + gp = g_new_geomf(mp, "%s.fox", pp->name); + gp->softc = g_malloc(sizeof(struct g_fox_softc), M_WAITOK | M_ZERO); + sc = gp->softc; + + gp->start = g_fox_start; + gp->spoiled = g_fox_orphan; + gp->orphan = g_fox_orphan; + gp->access= g_fox_access; + cp = g_new_consumer(gp); + g_attach(cp, pp); + error = g_access_rel(cp, 1, 0, 0); + if (error) { + g_free(sc); + g_detach(cp); + g_destroy_consumer(cp); + g_destroy_geom(gp); + return(NULL); + } + do { + sectorsize = cp->provider->sectorsize; + g_topology_unlock(); + buf = g_read_data(cp, 0, sectorsize, &error); + g_topology_lock(); + if (buf == NULL || error != 0) + break; + if (memcmp(buf, FOX_MAGIC, strlen(FOX_MAGIC))) + break; + + /* + * First we need to see if this a new path for an existing fox. + */ + LIST_FOREACH(gp2, &mp->geom, geom) { + sc2 = gp2->softc; + if (sc == NULL) + continue; + if (memcmp(buf + 16, sc2->magic, sizeof sc2->magic)) + continue; + break; + } + if (gp2 != NULL) { + /* + * It was. Create a new consumer for that fox, + * attach it, and if the fox is open, open this + * path with an exclusive count of one. + */ + printf("Adding path (%s) to fox (%s)\n", + pp->name, gp2->name); + cp2 = g_new_consumer(gp2); + g_attach(cp2, pp); + pp2 = LIST_FIRST(&gp2->provider); + if (pp2->acr > 0 || pp2->acw > 0 || pp2->ace > 0) { + error = g_access_rel(cp2, 0, 0, 1); + if (error) { + /* + * This is bad, or more likely, + * the user is doing something stupid + */ + printf( + "WARNING: New path (%s) to fox(%s) not added: %s\n%s", + cp->provider->name, gp2->name, + "Could not get exclusive bit.", + "WARNING: This indicates a risk of data inconsistency." 
+ ); + g_detach(cp2); + g_destroy_consumer(cp2); + } + } + break; + } + printf("Creating new fox (%s)\n", pp->name); + sc->path = cp; + memcpy(sc->magic, buf + 16, sizeof sc->magic); + pp2 = g_new_providerf(gp, "%s", gp->name); + pp2->mediasize = sc->mediasize = pp->mediasize - pp->sectorsize; + pp2->sectorsize = sc->sectorsize = pp->sectorsize; +printf("fox %s lock %p\n", gp->name, &sc->lock); + + mtx_init(&sc->lock, "fox queue", NULL, MTX_DEF); + TAILQ_INIT(&sc->queue); + g_error_provider(pp2, 0); + } while (0); + if (buf != NULL) + g_free(buf); + g_access_rel(cp, -1, 0, 0); + + if (!LIST_EMPTY(&gp->provider)) + return (gp); + + g_free(gp->softc); + g_detach(cp); + g_destroy_consumer(cp); + g_destroy_geom(gp); + return (NULL); +} + +static int +g_fox_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) +{ + struct g_fox_softc *sc; + + g_topology_assert(); + sc = gp->softc; + gp->softc = NULL; + mtx_destroy(&sc->lock); + g_free(gp->softc); + g_wither_geom(gp, ENXIO); + return (0); +} + +static struct g_class g_fox_class = { + .name = FOX_CLASS_NAME, + .taste = g_fox_taste, + .destroy_geom = g_fox_destroy_geom, +}; + +DECLARE_GEOM_CLASS(g_fox_class, g_fox); diff --git a/sys/geom/geom_gpt.c b/sys/geom/geom_gpt.c new file mode 100644 index 0000000..52951c4 --- /dev/null +++ b/sys/geom/geom_gpt.c @@ -0,0 +1,227 @@ +/*- + * Copyright (c) 2002 Marcel Moolenaar + * Copyright (c) 2002 Poul-Henning Kamp + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/bio.h> +#include <sys/lock.h> +#include <sys/mutex.h> + +#include <sys/endian.h> +#include <sys/sbuf.h> +#include <sys/uuid.h> +#include <sys/gpt.h> +#include <geom/geom.h> +#include <geom/geom_slice.h> + +CTASSERT(offsetof(struct gpt_hdr, padding) == 92); +CTASSERT(sizeof(struct gpt_ent) == 128); + +/* + * XXX: GEOM is not dynamic enough. We are forced to use a compile-time + * limit. The minimum number of partitions (128) as required by EFI is + * most of the time just a waste of space. 
+ */ +#define GPT_MAX_SLICES 128 + +struct g_gpt_softc { + struct gpt_ent *part[GPT_MAX_SLICES]; +}; + +static int +is_gpt_hdr(struct gpt_hdr *hdr) +{ + uint32_t crc; + + if (memcmp(hdr->hdr_sig, GPT_HDR_SIG, sizeof(hdr->hdr_sig))) + return (0); + crc = hdr->hdr_crc_self; + hdr->hdr_crc_self = 0; + if (crc32(hdr, hdr->hdr_size) != crc) + return (0); + hdr->hdr_crc_self = crc; + /* We're happy... */ + return (1); +} + +static int +g_gpt_start(struct bio *bp) +{ + + return (0); +} + +static void +g_gpt_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, + struct g_consumer *cp, struct g_provider *pp) +{ + struct g_slicer *gsp = gp->softc; + struct g_gpt_softc *gs = gsp->softc; + struct uuid *uuid; + + g_slice_dumpconf(sb, indent, gp, cp, pp); + + if (pp != NULL) { + uuid = &gs->part[pp->index]->ent_type; + if (indent != NULL) + sbuf_printf(sb, "%s<type>", indent); + else + sbuf_printf(sb, " ty "); + sbuf_printf_uuid(sb, uuid); + if (indent != NULL) + sbuf_printf(sb, "</type>\n"); + } +} + +static struct g_geom * +g_gpt_taste(struct g_class *mp, struct g_provider *pp, int insist) +{ + struct g_consumer *cp; + struct g_geom *gp; + struct g_gpt_softc *gs; + u_char *buf, *mbr; + struct gpt_ent *ent; + struct gpt_hdr *hdr; + u_int i, secsz, tblsz; + int error, ps; + + g_trace(G_T_TOPOLOGY, "g_gpt_taste(%s,%s)", mp->name, pp->name); + g_topology_assert(); + + /* + * XXX: I don't like to hardcode a maximum number of slices, since + * it's wasting space most of the time and insufficient any time. + * It's easier for now... + */ + gp = g_slice_new(mp, GPT_MAX_SLICES, pp, &cp, &gs, sizeof(*gs), + g_gpt_start); + if (gp == NULL) + return (NULL); + + g_topology_unlock(); + gp->dumpconf = g_gpt_dumpconf; + + do { + + mbr = NULL; + + if (gp->rank != 2 && insist == 0) + break; + + secsz = cp->provider->sectorsize; + if (secsz < 512) + break; + + /* XXX: we need to get the media size as well. */ + + /* Read both the MBR sector and the GPT sector. 
*/ + mbr = g_read_data(cp, 0, 2 * secsz, &error); + if (mbr == NULL || error != 0) + break; +#if 0 + /* + * XXX: we should ignore the GPT if there's a MBR and the MBR is + * not a PMBR (Protective MBR). I believe this is what the EFI + * spec is going to say eventually (this is hearsay :-) + * Currently EFI (version 1.02) accepts and uses the GPT even + * though there's a valid MBR. We do this too, because it allows + * us to test this code without first nuking the only partitioning + * scheme we grok until this is working. + */ + if (!is_pmbr((void*)mbr)) + goto out; +#endif + + hdr = (void*)(mbr + secsz); + + /* + * XXX: if we don't have a GPT header at LBA 1, we should check if + * there's a backup GPT at the end of the medium. If we have a valid + * backup GPT, we should restore the primary GPT and claim this lunch. + */ + if (!is_gpt_hdr(hdr)) + break; + + tblsz = (hdr->hdr_entries * hdr->hdr_entsz + secsz - 1) & + ~(secsz - 1); + buf = g_read_data(cp, hdr->hdr_lba_table * secsz, tblsz, &error); + for (i = 0; i < hdr->hdr_entries; i++) { + struct uuid unused = GPT_ENT_TYPE_UNUSED; + struct uuid freebsd = GPT_ENT_TYPE_FREEBSD; + struct uuid tmp; + if (i >= GPT_MAX_SLICES) + break; + ent = (void*)(buf + i * hdr->hdr_entsz); + le_uuid_dec(&ent->ent_type, &tmp); + if (!memcmp(&tmp, &unused, sizeof(unused))) + continue; + /* XXX: This memory leaks */ + gs->part[i] = g_malloc(hdr->hdr_entsz, M_WAITOK); + if (gs->part[i] == NULL) + break; + bcopy(ent, gs->part[i], hdr->hdr_entsz); + ps = (!memcmp(&tmp, &freebsd, sizeof(freebsd))) + ? 
's' : 'p'; + g_topology_lock(); + (void)g_slice_config(gp, i, G_SLICE_CONFIG_SET, + ent->ent_lba_start * secsz, + (1 + ent->ent_lba_end - ent->ent_lba_start) * secsz, + secsz, + "%s%c%d", gp->name, ps, i + 1); + g_topology_unlock(); + } + g_free(buf); + + } while (0); + + if (mbr != NULL) + g_free(mbr); + + g_topology_lock(); + g_access_rel(cp, -1, 0, 0); + if (LIST_EMPTY(&gp->provider)) { + g_slice_spoiled(cp); + return (NULL); + } + return (gp); +} + +static struct g_class g_gpt_class = { + .name = "GPT", + .taste = g_gpt_taste, +}; + +DECLARE_GEOM_CLASS(g_gpt_class, g_gpt); diff --git a/sys/geom/geom_int.h b/sys/geom/geom_int.h new file mode 100644 index 0000000..952b6c6 --- /dev/null +++ b/sys/geom/geom_int.h @@ -0,0 +1,88 @@ +/*- + * Copyright (c) 2002 Poul-Henning Kamp + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Poul-Henning Kamp + * and NAI Labs, the Security Research Division of Network Associates, Inc. + * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +LIST_HEAD(class_list_head, g_class); +TAILQ_HEAD(g_tailq_head, g_geom); + +extern int g_collectstats; +extern int g_debugflags; +/* + * 1 G_T_TOPOLOGY + * 2 G_T_BIO + * 4 G_T_ACCESS + * 8 Enable sanity checks + * 16 Allow footshooting on rank#1 providers + * 32 G_T_DETAILS + */ +#define G_F_DISKIOCTL 64 +#define G_F_CTLDUMP 128 + +/* + * We actually have a number of drivers sharing the same major number + * so we coordinate the major/minor usage here + */ +#define GEOM_MAJOR 4 +#define GEOM_MINOR_STATS 0 +#define GEOM_MINOR_PROVIDERS 10 + +/* geom_dump.c */ +void g_confxml(void *, int flag); +void g_conf_specific(struct sbuf *sb, struct g_class *mp, struct g_geom *gp, struct g_provider *pp, struct g_consumer *cp); +void g_confdot(void *, int flag); +void g_conftxt(void *, int flag); + +/* geom_event.c */ +void g_event_init(void); +void g_run_events(void); +void g_stall_events(void); +void g_release_events(void); + +/* geom_subr.c */ +extern struct class_list_head g_classes; +extern char *g_wait_event, *g_wait_sim, *g_wait_up, *g_wait_down; + +/* geom_io.c */ +void g_io_init(void); +void g_io_schedule_down(struct thread *tp); +void g_io_schedule_up(struct thread *tp); + 
+/* geom_kern.c / geom_kernsim.c */ +void g_init(void); +extern int g_shutdown; + +/* geom_ctl.c */ +void g_ctl_init(void); diff --git a/sys/geom/geom_io.c b/sys/geom/geom_io.c new file mode 100644 index 0000000..4bed6f6 --- /dev/null +++ b/sys/geom/geom_io.c @@ -0,0 +1,416 @@ +/*- + * Copyright (c) 2002 Poul-Henning Kamp + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Poul-Henning Kamp + * and NAI Labs, the Security Research Division of Network Associates, Inc. + * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/bio.h> + +#include <sys/errno.h> +#include <geom/geom.h> +#include <geom/geom_int.h> +#include <sys/devicestat.h> + +#include <vm/uma.h> + +static struct g_bioq g_bio_run_down; +static struct g_bioq g_bio_run_up; + +static u_int pace; +static uma_zone_t biozone; + +#include <machine/atomic.h> + +static void +g_bioq_lock(struct g_bioq *bq) +{ + + mtx_lock(&bq->bio_queue_lock); +} + +static void +g_bioq_unlock(struct g_bioq *bq) +{ + + mtx_unlock(&bq->bio_queue_lock); +} + +#if 0 +static void +g_bioq_destroy(struct g_bioq *bq) +{ + + mtx_destroy(&bq->bio_queue_lock); +} +#endif + +static void +g_bioq_init(struct g_bioq *bq) +{ + + TAILQ_INIT(&bq->bio_queue); + mtx_init(&bq->bio_queue_lock, "bio queue", NULL, MTX_DEF); +} + +static struct bio * +g_bioq_first(struct g_bioq *bq) +{ + struct bio *bp; + + bp = TAILQ_FIRST(&bq->bio_queue); + if (bp != NULL) { + TAILQ_REMOVE(&bq->bio_queue, bp, bio_queue); + bq->bio_queue_length--; + } + return (bp); +} + +static void +g_bioq_enqueue_tail(struct bio *bp, struct g_bioq *rq) +{ + + g_bioq_lock(rq); + TAILQ_INSERT_TAIL(&rq->bio_queue, bp, bio_queue); + rq->bio_queue_length++; + g_bioq_unlock(rq); +} + +struct bio * +g_new_bio(void) +{ + struct bio *bp; + + bp = uma_zalloc(biozone, M_NOWAIT | M_ZERO); + return (bp); +} + +void 
+g_destroy_bio(struct bio *bp) +{ + + uma_zfree(biozone, bp); +} + +struct bio * +g_clone_bio(struct bio *bp) +{ + struct bio *bp2; + + bp2 = uma_zalloc(biozone, M_NOWAIT | M_ZERO); + if (bp2 != NULL) { + bp2->bio_parent = bp; + bp2->bio_cmd = bp->bio_cmd; + bp2->bio_length = bp->bio_length; + bp2->bio_offset = bp->bio_offset; + bp2->bio_data = bp->bio_data; + bp2->bio_attribute = bp->bio_attribute; + bp->bio_children++; + } + return(bp2); +} + +void +g_io_init() +{ + + g_bioq_init(&g_bio_run_down); + g_bioq_init(&g_bio_run_up); + biozone = uma_zcreate("g_bio", sizeof (struct bio), + NULL, NULL, + NULL, NULL, + 0, 0); +} + +int +g_io_getattr(const char *attr, struct g_consumer *cp, int *len, void *ptr) +{ + struct bio *bp; + int error; + + g_trace(G_T_BIO, "bio_getattr(%s)", attr); + bp = g_new_bio(); + bp->bio_cmd = BIO_GETATTR; + bp->bio_done = NULL; + bp->bio_attribute = attr; + bp->bio_length = *len; + bp->bio_data = ptr; + g_io_request(bp, cp); + error = biowait(bp, "ggetattr"); + *len = bp->bio_completed; + g_destroy_bio(bp); + return (error); +} + +static int +g_io_check(struct bio *bp) +{ + struct g_consumer *cp; + struct g_provider *pp; + + cp = bp->bio_from; + pp = bp->bio_to; + + /* Fail if access counters dont allow the operation */ + switch(bp->bio_cmd) { + case BIO_READ: + case BIO_GETATTR: + if (cp->acr == 0) + return (EPERM); + break; + case BIO_WRITE: + case BIO_DELETE: + if (cp->acw == 0) + return (EPERM); + break; + default: + return (EPERM); + } + /* if provider is marked for error, don't disturb. */ + if (pp->error) + return (pp->error); + + switch(bp->bio_cmd) { + case BIO_READ: + case BIO_WRITE: + case BIO_DELETE: + /* Reject I/O not on sector boundary */ + if (bp->bio_offset % pp->sectorsize) + return (EINVAL); + /* Reject I/O not integral sector long */ + if (bp->bio_length % pp->sectorsize) + return (EINVAL); + /* Reject requests past the end of media. 
*/ + if (bp->bio_offset > pp->mediasize) + return (EIO); + break; + default: + break; + } + return (0); +} + +void +g_io_request(struct bio *bp, struct g_consumer *cp) +{ + struct g_provider *pp; + + pp = cp->provider; + KASSERT(cp != NULL, ("NULL cp in g_io_request")); + KASSERT(bp != NULL, ("NULL bp in g_io_request")); + KASSERT(bp->bio_data != NULL, ("NULL bp->data in g_io_request")); + KASSERT(pp != NULL, ("consumer not attached in g_io_request")); + + bp->bio_from = cp; + bp->bio_to = pp; + bp->bio_error = 0; + bp->bio_completed = 0; + + if (g_collectstats) { + devstat_start_transaction_bio(cp->stat, bp); + devstat_start_transaction_bio(pp->stat, bp); + } + cp->nstart++; + pp->nstart++; + + /* Pass it on down. */ + g_trace(G_T_BIO, "bio_request(%p) from %p(%s) to %p(%s) cmd %d", + bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd); + g_bioq_enqueue_tail(bp, &g_bio_run_down); + wakeup(&g_wait_down); +} + +void +g_io_deliver(struct bio *bp, int error) +{ + struct g_consumer *cp; + struct g_provider *pp; + + cp = bp->bio_from; + pp = bp->bio_to; + KASSERT(bp != NULL, ("NULL bp in g_io_deliver")); + KASSERT(cp != NULL, ("NULL bio_from in g_io_deliver")); + KASSERT(cp->geom != NULL, ("NULL bio_from->geom in g_io_deliver")); + KASSERT(pp != NULL, ("NULL bio_to in g_io_deliver")); + + g_trace(G_T_BIO, +"g_io_deliver(%p) from %p(%s) to %p(%s) cmd %d error %d off %jd len %jd", + bp, cp, cp->geom->name, pp, pp->name, bp->bio_cmd, error, + (intmax_t)bp->bio_offset, (intmax_t)bp->bio_length); + + bp->bio_bcount = bp->bio_length; + if (g_collectstats) { + bp->bio_resid = bp->bio_bcount - bp->bio_completed; + devstat_end_transaction_bio(cp->stat, bp); + devstat_end_transaction_bio(pp->stat, bp); + } + cp->nend++; + pp->nend++; + + if (error == ENOMEM) { + if (bootverbose) + printf("ENOMEM %p on %p(%s)\n", bp, pp, pp->name); + g_io_request(bp, cp); + pace++; + return; + } + bp->bio_error = error; + g_bioq_enqueue_tail(bp, &g_bio_run_up); + wakeup(&g_wait_up); +} + +void 
+g_io_schedule_down(struct thread *tp __unused) +{ + struct bio *bp; + off_t excess; + int error; + struct mtx mymutex; + + bzero(&mymutex, sizeof mymutex); + mtx_init(&mymutex, "g_xdown", MTX_DEF, 0); + + for(;;) { + g_bioq_lock(&g_bio_run_down); + bp = g_bioq_first(&g_bio_run_down); + if (bp == NULL) { + msleep(&g_wait_down, &g_bio_run_down.bio_queue_lock, + PRIBIO | PDROP, "-", hz/10); + continue; + } + g_bioq_unlock(&g_bio_run_down); + if (pace > 0) { + msleep(&error, NULL, PRIBIO, "g_down", hz/10); + pace--; + } + error = g_io_check(bp); + if (error) { + g_io_deliver(bp, error); + continue; + } + switch (bp->bio_cmd) { + case BIO_READ: + case BIO_WRITE: + case BIO_DELETE: + /* Truncate requests to the end of providers media. */ + excess = bp->bio_offset + bp->bio_length; + if (excess > bp->bio_to->mediasize) { + excess -= bp->bio_to->mediasize; + bp->bio_length -= excess; + } + /* Deliver zero length transfers right here. */ + if (bp->bio_length == 0) { + g_io_deliver(bp, 0); + continue; + } + break; + default: + break; + } + mtx_lock(&mymutex); + bp->bio_to->geom->start(bp); + mtx_unlock(&mymutex); + } +} + +void +g_io_schedule_up(struct thread *tp __unused) +{ + struct bio *bp; + struct mtx mymutex; + + bzero(&mymutex, sizeof mymutex); + mtx_init(&mymutex, "g_xup", MTX_DEF, 0); + for(;;) { + g_bioq_lock(&g_bio_run_up); + bp = g_bioq_first(&g_bio_run_up); + if (bp != NULL) { + g_bioq_unlock(&g_bio_run_up); + mtx_lock(&mymutex); + biodone(bp); + mtx_unlock(&mymutex); + continue; + } + msleep(&g_wait_up, &g_bio_run_up.bio_queue_lock, + PRIBIO | PDROP, "-", hz/10); + } +} + +void * +g_read_data(struct g_consumer *cp, off_t offset, off_t length, int *error) +{ + struct bio *bp; + void *ptr; + int errorc; + + bp = g_new_bio(); + bp->bio_cmd = BIO_READ; + bp->bio_done = NULL; + bp->bio_offset = offset; + bp->bio_length = length; + ptr = g_malloc(length, M_WAITOK); + bp->bio_data = ptr; + g_io_request(bp, cp); + errorc = biowait(bp, "gread"); + if (error != NULL) + 
*error = errorc; + g_destroy_bio(bp); + if (errorc) { + g_free(ptr); + ptr = NULL; + } + return (ptr); +} + +int +g_write_data(struct g_consumer *cp, off_t offset, void *ptr, off_t length) +{ + struct bio *bp; + int error; + + bp = g_new_bio(); + bp->bio_cmd = BIO_WRITE; + bp->bio_done = NULL; + bp->bio_offset = offset; + bp->bio_length = length; + bp->bio_data = ptr; + g_io_request(bp, cp); + error = biowait(bp, "gwrite"); + g_destroy_bio(bp); + return (error); +} diff --git a/sys/geom/geom_kern.c b/sys/geom/geom_kern.c new file mode 100644 index 0000000..9492241 --- /dev/null +++ b/sys/geom/geom_kern.c @@ -0,0 +1,241 @@ +/*- + * Copyright (c) 2002 Poul-Henning Kamp + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Poul-Henning Kamp + * and NAI Labs, the Security Research Division of Network Associates, Inc. + * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/eventhandler.h>
+#include <sys/malloc.h>
+#include <sys/bio.h>
+#include <sys/sysctl.h>
+#include <sys/proc.h>
+#include <sys/kthread.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/sbuf.h>
+#include <geom/geom.h>
+#include <geom/geom_int.h>
+
+MALLOC_DEFINE(M_GEOM, "GEOM", "Geom data structures");
+
+struct sx topology_lock;
+
+static struct proc *g_up_proc;
+
+int g_debugflags;
+int g_collectstats = 1;
+int g_shutdown;
+
+/*
+ * G_UP and G_DOWN are the two threads which push I/O through the
+ * stack.
+ *
+ * Things are processed in a FIFO order, but these threads could be
+ * part of I/O prioritization by deciding which bios/bioqs to service
+ * in what order.
+ *
+ * We have only one thread in each direction, it is believed that until
+ * a very non-trivial workload in the UP/DOWN path this will be enough,
+ * but more than one can actually be run without problems.
+ *
+ * Holding the "mymutex" is a debugging feature:  It prevents people
+ * from sleeping in the UP/DOWN I/O path by mistake or design (doing
+ * so almost invariably results in deadlocks since it stalls all I/O
+ * processing in the given direction.
+ */ + +static void +g_up_procbody(void) +{ + struct proc *p = g_up_proc; + struct thread *tp = FIRST_THREAD_IN_PROC(p); + + mtx_assert(&Giant, MA_NOTOWNED); + tp->td_base_pri = PRIBIO; + for(;;) { + g_io_schedule_up(tp); + } +} + +struct kproc_desc g_up_kp = { + "g_up", + g_up_procbody, + &g_up_proc, +}; + +static struct proc *g_down_proc; + +static void +g_down_procbody(void) +{ + struct proc *p = g_down_proc; + struct thread *tp = FIRST_THREAD_IN_PROC(p); + + mtx_assert(&Giant, MA_NOTOWNED); + tp->td_base_pri = PRIBIO; + for(;;) { + g_io_schedule_down(tp); + } +} + +struct kproc_desc g_down_kp = { + "g_down", + g_down_procbody, + &g_down_proc, +}; + +static struct proc *g_event_proc; + +static void +g_event_procbody(void) +{ + struct proc *p = g_event_proc; + struct thread *tp = FIRST_THREAD_IN_PROC(p); + + mtx_assert(&Giant, MA_NOTOWNED); + tp->td_base_pri = PRIBIO; + for(;;) { + g_run_events(); + tsleep(&g_wait_event, PRIBIO, "-", hz/10); + } +} + +static struct kproc_desc g_event_kp = { + "g_event", + g_event_procbody, + &g_event_proc, +}; + +static void +geom_shutdown(void *foo __unused) +{ + + g_shutdown = 1; +} + +void +g_init(void) +{ + + g_trace(G_T_TOPOLOGY, "g_ignition"); + sx_init(&topology_lock, "GEOM topology"); + g_io_init(); + g_event_init(); + g_ctl_init(); + mtx_lock(&Giant); + kproc_start(&g_event_kp); + kproc_start(&g_up_kp); + kproc_start(&g_down_kp); + mtx_unlock(&Giant); + EVENTHANDLER_REGISTER(shutdown_pre_sync, geom_shutdown, NULL, + SHUTDOWN_PRI_FIRST); +} + +static int +sysctl_kern_geom_conftxt(SYSCTL_HANDLER_ARGS) +{ + int error; + struct sbuf *sb; + + sb = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND); + sbuf_clear(sb); + g_waitfor_event(g_conftxt, sb, M_WAITOK, NULL); + error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); + sbuf_delete(sb); + return error; +} + +static int +sysctl_kern_geom_confdot(SYSCTL_HANDLER_ARGS) +{ + int error; + struct sbuf *sb; + + sb = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND); + sbuf_clear(sb); + 
g_waitfor_event(g_confdot, sb, M_WAITOK, NULL); + error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); + sbuf_delete(sb); + return error; +} + +static int +sysctl_kern_geom_confxml(SYSCTL_HANDLER_ARGS) +{ + int error; + struct sbuf *sb; + + sb = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND); + sbuf_clear(sb); + g_waitfor_event(g_confxml, sb, M_WAITOK, NULL); + error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); + sbuf_delete(sb); + return error; +} + +SYSCTL_NODE(_kern, OID_AUTO, geom, CTLFLAG_RW, 0, "GEOMetry management"); + +SYSCTL_PROC(_kern_geom, OID_AUTO, confxml, CTLTYPE_STRING|CTLFLAG_RD, + 0, 0, sysctl_kern_geom_confxml, "", + "Dump the GEOM config in XML"); + +SYSCTL_PROC(_kern_geom, OID_AUTO, confdot, CTLTYPE_STRING|CTLFLAG_RD, + 0, 0, sysctl_kern_geom_confdot, "", + "Dump the GEOM config in dot"); + +SYSCTL_PROC(_kern_geom, OID_AUTO, conftxt, CTLTYPE_STRING|CTLFLAG_RD, + 0, 0, sysctl_kern_geom_conftxt, "", + "Dump the GEOM config in txt"); + +SYSCTL_INT(_kern_geom, OID_AUTO, debugflags, CTLFLAG_RW, + &g_debugflags, 0, ""); + +SYSCTL_INT(_kern_geom, OID_AUTO, collectstats, CTLFLAG_RW, + &g_collectstats, 0, ""); + +SYSCTL_INT(_debug_sizeof, OID_AUTO, g_class, CTLFLAG_RD, + 0, sizeof(struct g_class), ""); +SYSCTL_INT(_debug_sizeof, OID_AUTO, g_geom, CTLFLAG_RD, + 0, sizeof(struct g_geom), ""); +SYSCTL_INT(_debug_sizeof, OID_AUTO, g_provider, CTLFLAG_RD, + 0, sizeof(struct g_provider), ""); +SYSCTL_INT(_debug_sizeof, OID_AUTO, g_consumer, CTLFLAG_RD, + 0, sizeof(struct g_consumer), ""); +SYSCTL_INT(_debug_sizeof, OID_AUTO, g_bioq, CTLFLAG_RD, + 0, sizeof(struct g_bioq), ""); diff --git a/sys/geom/geom_mbr.c b/sys/geom/geom_mbr.c new file mode 100644 index 0000000..3abcf76 --- /dev/null +++ b/sys/geom/geom_mbr.c @@ -0,0 +1,451 @@ +/*- + * Copyright (c) 2002 Poul-Henning Kamp + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. 
+ * + * This software was developed for the FreeBSD Project by Poul-Henning Kamp + * and NAI Labs, the Security Research Division of Network Associates, Inc. + * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/endian.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/bio.h> +#include <sys/lock.h> +#include <sys/mutex.h> + +#include <sys/diskmbr.h> +#include <sys/sbuf.h> +#include <geom/geom.h> +#include <geom/geom_slice.h> + +#define MBR_CLASS_NAME "MBR" +#define MBREXT_CLASS_NAME "MBREXT" + +static struct dos_partition historical_bogus_partition_table[NDOSPART] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0x80, 0, 1, 0, DOSPTYP_386BSD, 255, 255, 255, 0, 50000, }, +}; + +static struct dos_partition historical_bogus_partition_table_fixed[NDOSPART] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0x80, 0, 1, 0, DOSPTYP_386BSD, 254, 255, 255, 0, 50000, }, +}; + +static void +g_mbr_print(int i, struct dos_partition *dp) +{ + + printf("[%d] f:%02x typ:%d", i, dp->dp_flag, dp->dp_typ); + printf(" s(CHS):%d/%d/%d", DPCYL(dp->dp_scyl, dp->dp_ssect), + dp->dp_shd, DPSECT(dp->dp_ssect)); + printf(" e(CHS):%d/%d/%d", DPCYL(dp->dp_ecyl, dp->dp_esect), + dp->dp_ehd, DPSECT(dp->dp_esect)); + printf(" s:%d l:%d\n", dp->dp_start, dp->dp_size); +} + +struct g_mbr_softc { + int type [NDOSPART]; + u_int sectorsize; + u_char sec0[512]; +}; + +static int +g_mbr_modify(struct g_geom *gp, struct g_mbr_softc *ms, u_char *sec0) +{ + int i, error; + off_t l[NDOSPART]; + struct dos_partition ndp[NDOSPART], *dp; + + g_topology_assert(); + + if (sec0[0x1fe] != 0x55 && sec0[0x1ff] != 0xaa) + return (EBUSY); + + dp = ndp; + for (i = 0; i < NDOSPART; i++) { + dos_partition_dec( + sec0 + DOSPARTOFF + i * sizeof(struct dos_partition), + dp + i); + if (bootverbose) + g_mbr_print(i, dp + i); + } + if ((!bcmp(dp, historical_bogus_partition_table, + sizeof historical_bogus_partition_table)) || + (!bcmp(dp, 
historical_bogus_partition_table_fixed, + sizeof historical_bogus_partition_table_fixed))) { + /* + * We will not allow people to write these from "the inside", + * Since properly selfdestructing takes too much code. If + * people really want to do this, they cannot have any + * providers of this geom open, and in that case they can just + * as easily overwrite the MBR in the parent device. + */ + return(EBUSY); + } + for (i = 0; i < NDOSPART; i++) { + /* + * A Protective MBR (PMBR) has a single partition of + * type 0xEE spanning the whole disk. Such a MBR + * protects a GPT on the disk from MBR tools that + * don't know anything about GPT. We're interpreting + * it a bit more loosely: any partition of type 0xEE + * is to be skipped as it doesn't contain any data + * that we should care about. We still allow other + * partitions to be present in the MBR. A PMBR will + * be handled correctly anyway. + */ + if (dp[i].dp_typ == DOSPTYP_PMBR) + l[i] = 0; + else if (dp[i].dp_flag != 0 && dp[i].dp_flag != 0x80) + l[i] = 0; + else if (dp[i].dp_typ == 0) + l[i] = 0; + else + l[i] = (off_t)dp[i].dp_size * ms->sectorsize; + error = g_slice_config(gp, i, G_SLICE_CONFIG_CHECK, + (off_t)dp[i].dp_start * ms->sectorsize, l[i], + ms->sectorsize, "%ss%d", gp->name, 1 + i); + if (error) + return (error); + } + for (i = 0; i < NDOSPART; i++) { + ms->type[i] = dp[i].dp_typ; + g_slice_config(gp, i, G_SLICE_CONFIG_SET, + (off_t)dp[i].dp_start * ms->sectorsize, l[i], + ms->sectorsize, "%ss%d", gp->name, 1 + i); + } + bcopy(sec0, ms->sec0, 512); + return (0); +} + +static void +g_mbr_ioctl(void *arg, int flag) +{ + struct bio *bp; + struct g_geom *gp; + struct g_slicer *gsp; + struct g_mbr_softc *ms; + struct g_ioctl *gio; + struct g_consumer *cp; + u_char *sec0; + int error; + + bp = arg; + if (flag == EV_CANCEL) { + g_io_deliver(bp, ENXIO); + return; + } + gp = bp->bio_to->geom; + gsp = gp->softc; + ms = gsp->softc; + gio = (struct g_ioctl *)bp->bio_data; + + /* The disklabel to set is 
the ioctl argument. */ + sec0 = gio->data; + + error = g_mbr_modify(gp, ms, sec0); + if (error) { + g_io_deliver(bp, error); + return; + } + cp = LIST_FIRST(&gp->consumer); + error = g_write_data(cp, 0, sec0, 512); + g_io_deliver(bp, error); +} + + +static int +g_mbr_start(struct bio *bp) +{ + struct g_provider *pp; + struct g_geom *gp; + struct g_mbr_softc *mp; + struct g_slicer *gsp; + struct g_ioctl *gio; + int idx, error; + + pp = bp->bio_to; + idx = pp->index; + gp = pp->geom; + gsp = gp->softc; + mp = gsp->softc; + if (bp->bio_cmd == BIO_GETATTR) { + if (g_handleattr_int(bp, "MBR::type", mp->type[idx])) + return (1); + if (g_handleattr_off_t(bp, "MBR::offset", + gsp->slices[idx].offset)) + return (1); + } + + /* We only handle ioctl(2) requests of the right format. */ + if (strcmp(bp->bio_attribute, "GEOM::ioctl")) + return (0); + else if (bp->bio_length != sizeof(*gio)) + return (0); + + /* Get hold of the ioctl parameters. */ + gio = (struct g_ioctl *)bp->bio_data; + + switch (gio->cmd) { + case DIOCSMBR: + /* + * These we cannot do without the topology lock and + * some I/O requests. Ask the event-handler to schedule + * us in a less restricted environment. + */ + error = g_post_event(g_mbr_ioctl, bp, M_NOWAIT, gp, NULL); + if (error) + g_io_deliver(bp, error); + /* + * We must return non-zero to indicate that we will deal + * with this bio, even though we have not done so yet.
+ */ + return (1); + default: + return (0); + } +} + +static void +g_mbr_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp) +{ + struct g_mbr_softc *mp; + struct g_slicer *gsp; + + gsp = gp->softc; + mp = gsp->softc; + g_slice_dumpconf(sb, indent, gp, cp, pp); + if (pp != NULL) { + if (indent == NULL) + sbuf_printf(sb, " ty %d", mp->type[pp->index]); + else + sbuf_printf(sb, "%s<type>%d</type>\n", indent, + mp->type[pp->index]); + } +} + +static struct g_geom * +g_mbr_taste(struct g_class *mp, struct g_provider *pp, int insist) +{ + struct g_geom *gp; + struct g_consumer *cp; + int error; + struct g_mbr_softc *ms; + u_int fwsectors, sectorsize; + u_char *buf; + + g_trace(G_T_TOPOLOGY, "mbr_taste(%s,%s)", mp->name, pp->name); + g_topology_assert(); + gp = g_slice_new(mp, NDOSPART, pp, &cp, &ms, sizeof *ms, g_mbr_start); + if (gp == NULL) + return (NULL); + g_topology_unlock(); + gp->dumpconf = g_mbr_dumpconf; + do { + if (gp->rank != 2 && insist == 0) + break; + error = g_getattr("GEOM::fwsectors", cp, &fwsectors); + if (error) + fwsectors = 17; + sectorsize = cp->provider->sectorsize; + if (sectorsize < 512) + break; + ms->sectorsize = sectorsize; + buf = g_read_data(cp, 0, sectorsize, &error); + if (buf == NULL || error != 0) + break; + g_topology_lock(); + g_mbr_modify(gp, ms, buf); + g_topology_unlock(); + g_free(buf); + break; + } while (0); + g_topology_lock(); + g_access_rel(cp, -1, 0, 0); + if (LIST_EMPTY(&gp->provider)) { + g_slice_spoiled(cp); + return (NULL); + } + return (gp); +} + +static struct g_class g_mbr_class = { + .name = MBR_CLASS_NAME, + .taste = g_mbr_taste, +}; + +DECLARE_GEOM_CLASS(g_mbr_class, g_mbr); + +#define NDOSEXTPART 32 +struct g_mbrext_softc { + int type [NDOSEXTPART]; +}; + +static int +g_mbrext_start(struct bio *bp) +{ + struct g_provider *pp; + struct g_geom *gp; + struct g_mbrext_softc *mp; + struct g_slicer *gsp; + int idx; + + pp = bp->bio_to; + idx = 
pp->index; + gp = pp->geom; + gsp = gp->softc; + mp = gsp->softc; + if (bp->bio_cmd == BIO_GETATTR) { + if (g_handleattr_int(bp, "MBR::type", mp->type[idx])) + return (1); + } + return (0); +} + +static void +g_mbrext_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp) +{ + struct g_mbrext_softc *mp; + struct g_slicer *gsp; + + g_slice_dumpconf(sb, indent, gp, cp, pp); + gsp = gp->softc; + mp = gsp->softc; + if (pp != NULL) { + if (indent == NULL) + sbuf_printf(sb, " ty %d", mp->type[pp->index]); + else + sbuf_printf(sb, "%s<type>%d</type>\n", indent, + mp->type[pp->index]); + } +} + +static struct g_geom * +g_mbrext_taste(struct g_class *mp, struct g_provider *pp, int insist __unused) +{ + struct g_geom *gp; + struct g_consumer *cp; + int error, i, slice; + struct g_mbrext_softc *ms; + off_t off; + u_char *buf; + struct dos_partition dp[4]; + u_int fwsectors, sectorsize; + + g_trace(G_T_TOPOLOGY, "g_mbrext_taste(%s,%s)", mp->name, pp->name); + g_topology_assert(); + if (strcmp(pp->geom->class->name, MBR_CLASS_NAME)) + return (NULL); + gp = g_slice_new(mp, NDOSEXTPART, pp, &cp, &ms, sizeof *ms, + g_mbrext_start); + if (gp == NULL) + return (NULL); + g_topology_unlock(); + gp->dumpconf = g_mbrext_dumpconf; + off = 0; + slice = 0; + do { + error = g_getattr("MBR::type", cp, &i); + if (error || (i != DOSPTYP_EXT && i != DOSPTYP_EXTLBA)) + break; + error = g_getattr("GEOM::fwsectors", cp, &fwsectors); + if (error) + fwsectors = 17; + sectorsize = cp->provider->sectorsize; + if (sectorsize != 512) + break; + for (;;) { + buf = g_read_data(cp, off, sectorsize, &error); + if (buf == NULL || error != 0) + break; + if (buf[0x1fe] != 0x55 && buf[0x1ff] != 0xaa) { + g_free(buf); + break; + } + for (i = 0; i < NDOSPART; i++) + dos_partition_dec( + buf + DOSPARTOFF + + i * sizeof(struct dos_partition), dp + i); + g_free(buf); + if (bootverbose) { + printf("MBREXT Slice %d on %s:\n", + slice + 5, gp->name); + 
g_mbr_print(0, dp); + g_mbr_print(1, dp + 1); + } + if ((dp[0].dp_flag & 0x7f) == 0 && + dp[0].dp_size != 0 && dp[0].dp_typ != 0) { + g_topology_lock(); + g_slice_config(gp, slice, G_SLICE_CONFIG_SET, + (((off_t)dp[0].dp_start) << 9ULL) + off, + ((off_t)dp[0].dp_size) << 9ULL, + sectorsize, + "%*.*s%d", + strlen(gp->name) - 1, + strlen(gp->name) - 1, + gp->name, + slice + 5); + g_topology_unlock(); + ms->type[slice] = dp[0].dp_typ; + slice++; + } + if (dp[1].dp_flag != 0) + break; + if (dp[1].dp_typ != DOSPTYP_EXT) + break; + if (dp[1].dp_size == 0) + break; + off = ((off_t)dp[1].dp_start) << 9ULL; + } + break; + } while (0); + g_topology_lock(); + g_access_rel(cp, -1, 0, 0); + if (LIST_EMPTY(&gp->provider)) { + g_slice_spoiled(cp); + return (NULL); + } + return (gp); +} + + +static struct g_class g_mbrext_class = { + .name = MBREXT_CLASS_NAME, + .taste = g_mbrext_taste, +}; + +DECLARE_GEOM_CLASS(g_mbrext_class, g_mbrext); diff --git a/sys/geom/geom_mbr_enc.c b/sys/geom/geom_mbr_enc.c new file mode 100644 index 0000000..da5f997 --- /dev/null +++ b/sys/geom/geom_mbr_enc.c @@ -0,0 +1,72 @@ +/*- + * Copyright (c) 2003 Poul-Henning Kamp + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Functions to encode or decode struct dos_partition into a bytestream + * of correct endianness and packing. These functions do no validation + * or sanity checking, they only pack/unpack the fields correctly. + * + * NB! This file must be usable both in kernel and userland. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/diskmbr.h> +#include <sys/endian.h> + +void +dos_partition_dec(void const *pp, struct dos_partition *d) +{ + unsigned char const *p = pp; + + d->dp_flag = p[0]; + d->dp_shd = p[1]; + d->dp_ssect = p[2]; + d->dp_scyl = p[3]; + d->dp_typ = p[4]; + d->dp_ehd = p[5]; + d->dp_esect = p[6]; + d->dp_ecyl = p[7]; + d->dp_start = le32dec(p + 8); + d->dp_size = le32dec(p + 12); +} + +void +dos_partition_enc(void *pp, struct dos_partition *d) +{ + unsigned char *p = pp; + + p[0] = d->dp_flag; + p[1] = d->dp_shd; + p[2] = d->dp_ssect; + p[3] = d->dp_scyl; + p[4] = d->dp_typ; + p[5] = d->dp_ehd; + p[6] = d->dp_esect; + p[7] = d->dp_ecyl; + le32enc(p + 8, d->dp_start); + le32enc(p + 12, d->dp_size); +} diff --git a/sys/geom/geom_mirror.c b/sys/geom/geom_mirror.c new file mode 100644 index 0000000..98111c5 --- /dev/null +++ b/sys/geom/geom_mirror.c @@ -0,0 +1,237 @@ +/*- + * Copyright (c) 2003 Poul-Henning Kamp + * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/conf.h> +#include <sys/bio.h> +#include <sys/malloc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/libkern.h> +#include <sys/endian.h> +#include <sys/md5.h> +#include <sys/errno.h> +#include <geom/geom.h> + +#define MIRROR_MAGIC "GEOM::MIRROR" + +struct g_mirror_softc { + off_t mediasize; + u_int sectorsize; + u_char magic[16]; +}; + + +static int +g_mirror_add(struct g_geom *gp, struct g_provider *pp) +{ + struct g_consumer *cp; + + g_trace(G_T_TOPOLOGY, "g_mirror_add(%s, %s)", gp->name, pp->name); + g_topology_assert(); + cp = g_new_consumer(gp); + g_attach(cp, pp); + return (0); +} + +static void +g_mirror_orphan(struct g_consumer *cp) +{ + struct g_geom *gp; + int error; + + g_topology_assert(); + gp = cp->geom; + g_access_rel(cp, -cp->acr, -cp->acw, -cp->ace); + error = cp->provider->error; + g_detach(cp); + g_destroy_consumer(cp); + if (!LIST_EMPTY(&gp->consumer)) + return; + g_free(gp->softc); + g_wither_geom(gp, error); +} + +static void +g_mirror_done(struct bio *bp) +{ + struct g_geom *gp; + struct g_mirror_softc *sc; + struct g_consumer *cp; + + gp = bp->bio_to->geom; + sc = gp->softc; + cp = LIST_NEXT(bp->bio_from, consumer); + if (cp == NULL) + g_std_done(bp); + else + g_io_request(bp, cp); +} + +static void +g_mirror_start(struct bio *bp) +{ + struct g_geom *gp; + struct bio *bp2; + struct g_mirror_softc *sc; + + gp = bp->bio_to->geom; + sc = gp->softc; + switch(bp->bio_cmd) { + case BIO_READ: + bp2 = g_clone_bio(bp); + bp2->bio_offset += sc->sectorsize; + bp2->bio_done = g_std_done; + g_io_request(bp2, LIST_FIRST(&gp->consumer)); + return; + case BIO_WRITE: + case BIO_DELETE: + bp2 = g_clone_bio(bp); + bp2->bio_offset += sc->sectorsize; + bp2->bio_done = g_mirror_done; + g_io_request(bp2, LIST_FIRST(&gp->consumer)); + return; + default: + g_io_deliver(bp, EOPNOTSUPP); + return; + } +} + 
+static int +g_mirror_access(struct g_provider *pp, int dr, int dw, int de) +{ + struct g_geom *gp; + struct g_consumer *cp1, *cp2; + int error; + + de += dr; + de += dw; + + gp = pp->geom; + error = ENXIO; + LIST_FOREACH(cp1, &gp->consumer, consumer) { + error = g_access_rel(cp1, dr, dw, de); + if (error) { + LIST_FOREACH(cp2, &gp->consumer, consumer) { + if (cp2 == cp1) + break; + g_access_rel(cp2, -dr, -dw, -de); + } + return (error); + } + } + return (error); +} + +static struct g_geom * +g_mirror_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) +{ + struct g_geom *gp, *gp2; + struct g_provider *pp2; + struct g_consumer *cp; + struct g_mirror_softc *sc; + int error; + u_int sectorsize; + u_char *buf; + + g_trace(G_T_TOPOLOGY, "mirror_taste(%s, %s)", mp->name, pp->name); + g_topology_assert(); + gp = g_new_geomf(mp, "%s.mirror", pp->name); + + gp->start = g_mirror_start; + gp->spoiled = g_mirror_orphan; + gp->orphan = g_mirror_orphan; + gp->access= g_mirror_access; + cp = g_new_consumer(gp); + g_attach(cp, pp); + error = g_access_rel(cp, 1, 0, 0); + if (error) { + g_detach(cp); + g_destroy_consumer(cp); + g_destroy_geom(gp); + return(NULL); + } + g_topology_unlock(); + do { + sectorsize = cp->provider->sectorsize; + buf = g_read_data(cp, 0, sectorsize, &error); + if (buf == NULL || error != 0) + break; + if (memcmp(buf, MIRROR_MAGIC, strlen(MIRROR_MAGIC))) + break; + LIST_FOREACH(gp2, &mp->geom, geom) { + sc = gp2->softc; + if (sc == NULL) + continue; + if (memcmp(buf + 16, sc->magic, sizeof sc->magic)) + continue; + break; + } + /* We found somebody else */ + if (gp2 != NULL) { + g_topology_lock(); + g_mirror_add(gp2, pp); + g_topology_unlock(); + break; + } + gp->softc = g_malloc(sizeof(struct g_mirror_softc), M_WAITOK); + sc = gp->softc; + memcpy(sc->magic, buf + 16, sizeof sc->magic); + g_topology_lock(); + pp2 = g_new_providerf(gp, "%s", gp->name); + pp2->mediasize = sc->mediasize = pp->mediasize - pp->sectorsize; + pp2->sectorsize = 
sc->sectorsize = pp->sectorsize; + g_error_provider(pp2, 0); + g_topology_unlock(); + } while (0); + g_topology_lock(); + if (buf != NULL) + g_free(buf); + g_access_rel(cp, -1, 0, 0); + if (gp->softc != NULL) + return (gp); + g_detach(cp); + g_destroy_consumer(cp); + g_destroy_geom(gp); + return (NULL); +} + +#define MIRROR_CLASS_NAME "MIRROR" + +static struct g_class g_mirror_class = { + .name = MIRROR_CLASS_NAME, + .taste = g_mirror_taste, +}; + +DECLARE_GEOM_CLASS(g_mirror_class, g_mirror); diff --git a/sys/geom/geom_pc98.c b/sys/geom/geom_pc98.c new file mode 100644 index 0000000..b6d2c21 --- /dev/null +++ b/sys/geom/geom_pc98.c @@ -0,0 +1,319 @@ +/*- + * Copyright (c) 2002 Poul-Henning Kamp + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Poul-Henning Kamp + * and NAI Labs, the Security Research Division of Network Associates, Inc. + * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/endian.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/bio.h> +#include <sys/lock.h> +#include <sys/mutex.h> + +#include <sys/diskpc98.h> +#include <geom/geom.h> +#include <geom/geom_slice.h> + +#define PC98_CLASS_NAME "PC98" + +struct g_pc98_softc { + u_int fwsectors, fwheads, sectorsize; + int type[NDOSPART]; + u_char sec[8192]; +}; + +static void +g_pc98_print(int i, struct pc98_partition *dp) +{ + char sname[17]; + + strncpy(sname, dp->dp_name, 16); + sname[16] = '\0'; + + g_hexdump(dp, sizeof(dp[0])); + printf("[%d] mid:%d(0x%x) sid:%d(0x%x)", + i, dp->dp_mid, dp->dp_mid, dp->dp_sid, dp->dp_sid); + printf(" s:%d/%d/%d", dp->dp_scyl, dp->dp_shd, dp->dp_ssect); + printf(" e:%d/%d/%d", dp->dp_ecyl, dp->dp_ehd, dp->dp_esect); + printf(" sname:%s\n", sname); +} + +static int +g_pc98_modify(struct g_geom *gp, struct g_pc98_softc *ms, u_char *sec) +{ + int i, error; + off_t s[NDOSPART], l[NDOSPART]; + struct pc98_partition dp[NDOSPART]; + + g_topology_assert(); + + if (sec[0x1fe] != 0x55 || sec[0x1ff] != 0xaa) + return (EBUSY); + +#if 0 + /* + * XXX: Some sources indicate this is a magic sequence, but appearantly + * XXX: it is not universal. Documentation would be wonderful to have. 
+ */ + if (sec[4] != 'I' || sec[5] != 'P' || sec[6] != 'L' || sec[7] != '1') + return (EBUSY); +#endif + + for (i = 0; i < NDOSPART; i++) + pc98_partition_dec( + sec + 512 + i * sizeof(struct pc98_partition), &dp[i]); + + for (i = 0; i < NDOSPART; i++) { + /* If start and end are identical it's bogus */ + if (dp[i].dp_ssect == dp[i].dp_esect && + dp[i].dp_shd == dp[i].dp_ehd && + dp[i].dp_scyl == dp[i].dp_ecyl) + s[i] = l[i] = 0; + else if (dp[i].dp_ecyl == 0) + s[i] = l[i] = 0; + else { + s[i] = (off_t)dp[i].dp_scyl * + ms->fwsectors * ms->fwheads * ms->sectorsize; + l[i] = (off_t)(dp[i].dp_ecyl - dp[i].dp_scyl + 1) * + ms->fwsectors * ms->fwheads * ms->sectorsize; + } + if (bootverbose) { + printf("PC98 Slice %d on %s:\n", i + 1, gp->name); + g_pc98_print(i, dp + i); + } + if (s[i] < 0 || l[i] < 0) + error = EBUSY; + else + error = g_slice_config(gp, i, G_SLICE_CONFIG_CHECK, + s[i], l[i], ms->sectorsize, + "%ss%d", gp->name, i + 1); + if (error) + return (error); + } + + for (i = 0; i < NDOSPART; i++) { + ms->type[i] = (dp[i].dp_sid << 8) | dp[i].dp_mid; + g_slice_config(gp, i, G_SLICE_CONFIG_SET, s[i], l[i], + ms->sectorsize, "%ss%d", gp->name, i + 1); + } + + bcopy(sec, ms->sec, sizeof (ms->sec)); + + return (0); +} + +static void +g_pc98_ioctl(void *arg, int flag) +{ + struct bio *bp; + struct g_geom *gp; + struct g_slicer *gsp; + struct g_pc98_softc *ms; + struct g_ioctl *gio; + struct g_consumer *cp; + u_char *sec; + int error; + + bp = arg; + if (flag == EV_CANCEL) { + g_io_deliver(bp, ENXIO); + return; + } + gp = bp->bio_to->geom; + gsp = gp->softc; + ms = gsp->softc; + gio = (struct g_ioctl *)bp->bio_data; + + /* The disklabel to set is the ioctl argument. 
*/ + sec = gio->data; + + error = g_pc98_modify(gp, ms, sec); + if (error) { + g_io_deliver(bp, error); + return; + } + cp = LIST_FIRST(&gp->consumer); + error = g_write_data(cp, 0, sec, 8192); + g_io_deliver(bp, error); +} + +static int +g_pc98_start(struct bio *bp) +{ + struct g_provider *pp; + struct g_geom *gp; + struct g_pc98_softc *mp; + struct g_slicer *gsp; + struct g_ioctl *gio; + int idx, error; + + pp = bp->bio_to; + idx = pp->index; + gp = pp->geom; + gsp = gp->softc; + mp = gsp->softc; + if (bp->bio_cmd == BIO_GETATTR) { + if (g_handleattr_int(bp, "PC98::type", mp->type[idx])) + return (1); + if (g_handleattr_off_t(bp, "PC98::offset", + gsp->slices[idx].offset)) + return (1); + } + + /* We only handle ioctl(2) requests of the right format. */ + if (strcmp(bp->bio_attribute, "GEOM::ioctl")) + return (0); + else if (bp->bio_length != sizeof(*gio)) + return (0); + /* Get hold of the ioctl parameters. */ + gio = (struct g_ioctl *)bp->bio_data; + + switch (gio->cmd) { + case DIOCSPC98: + /* + * These we cannot do without the topology lock and + * some I/O requests. Ask the event-handler to schedule + * us in a less restricted environment. + */ + error = g_post_event(g_pc98_ioctl, bp, M_NOWAIT, gp, NULL); + if (error) + g_io_deliver(bp, error); + /* + * We must return non-zero to indicate that we will deal + * with this bio, even though we have not done so yet.
+ */ + return (1); + default: + return (0); + } +} + +static void +g_pc98_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, + struct g_consumer *cp __unused, struct g_provider *pp) +{ + struct g_pc98_softc *mp; + struct g_slicer *gsp; + struct pc98_partition dp; + char sname[17]; + + gsp = gp->softc; + mp = gsp->softc; + g_slice_dumpconf(sb, indent, gp, cp, pp); + if (pp != NULL) { + pc98_partition_dec( + mp->sec + 512 + + pp->index * sizeof(struct pc98_partition), &dp); + strncpy(sname, dp.dp_name, 16); + sname[16] = '\0'; + if (indent == NULL) { + sbuf_printf(sb, " ty %d", mp->type[pp->index]); + sbuf_printf(sb, " sn %s", sname); + } else { + sbuf_printf(sb, "%s<type>%d</type>\n", indent, + mp->type[pp->index]); + sbuf_printf(sb, "%s<sname>%s</sname>\n", indent, + sname); + } + } +} + +static struct g_geom * +g_pc98_taste(struct g_class *mp, struct g_provider *pp, int flags) +{ + struct g_geom *gp; + struct g_consumer *cp; + int error; + struct g_pc98_softc *ms; + u_int fwsectors, fwheads, sectorsize; + u_char *buf; + + g_trace(G_T_TOPOLOGY, "g_pc98_taste(%s,%s)", mp->name, pp->name); + g_topology_assert(); + if (flags == G_TF_NORMAL && + !strcmp(pp->geom->class->name, PC98_CLASS_NAME)) + return (NULL); + gp = g_slice_new(mp, NDOSPART, pp, &cp, &ms, sizeof *ms, g_pc98_start); + if (gp == NULL) + return (NULL); + g_topology_unlock(); + gp->dumpconf = g_pc98_dumpconf; + do { + if (gp->rank != 2 && flags == G_TF_NORMAL) + break; + error = g_getattr("GEOM::fwsectors", cp, &fwsectors); + if (error || fwsectors == 0) { + fwsectors = 17; + if (bootverbose) + printf("g_pc98_taste: guessing %d sectors\n", + fwsectors); + } + error = g_getattr("GEOM::fwheads", cp, &fwheads); + if (error || fwheads == 0) { + fwheads = 8; + if (bootverbose) + printf("g_pc98_taste: guessing %d heads\n", + fwheads); + } + sectorsize = cp->provider->sectorsize; + if (sectorsize < 512) + break; + buf = g_read_data(cp, 0, 8192, &error); + if (buf == NULL || error != 0) + break; + 
ms->fwsectors = fwsectors; + ms->fwheads = fwheads; + ms->sectorsize = sectorsize; + g_topology_lock(); + g_pc98_modify(gp, ms, buf); + g_topology_unlock(); + g_free(buf); + break; + } while (0); + g_topology_lock(); + g_access_rel(cp, -1, 0, 0); + if (LIST_EMPTY(&gp->provider)) { + g_slice_spoiled(cp); + return (NULL); + } + return (gp); +} + +static struct g_class g_pc98_class = { + .name = PC98_CLASS_NAME, + .taste = g_pc98_taste, +}; + +DECLARE_GEOM_CLASS(g_pc98_class, g_pc98); diff --git a/sys/geom/geom_pc98_enc.c b/sys/geom/geom_pc98_enc.c new file mode 100644 index 0000000..04de220 --- /dev/null +++ b/sys/geom/geom_pc98_enc.c @@ -0,0 +1,78 @@ +/*- + * Copyright (c) 2003 TAKAHASHI Yoshihiro + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/diskpc98.h> +#include <sys/endian.h> + +void +pc98_partition_dec(void const *pp, struct pc98_partition *d) +{ + unsigned char const *ptr = pp; + int i; + + d->dp_mid = ptr[0]; + d->dp_sid = ptr[1]; + d->dp_dum1 = ptr[2]; + d->dp_dum2 = ptr[3]; + d->dp_ipl_sct = ptr[4]; + d->dp_ipl_head = ptr[5]; + d->dp_ipl_cyl = le16dec(ptr + 6); + d->dp_ssect = ptr[8]; + d->dp_shd = ptr[9]; + d->dp_scyl = le16dec(ptr + 10); + d->dp_esect = ptr[12]; + d->dp_ehd = ptr[13]; + d->dp_ecyl = le16dec(ptr + 14); + for (i = 0; i < sizeof (d->dp_name); i++) + d->dp_name[i] = ptr[16 + i]; +} + +void +pc98_partition_enc(void *pp, struct pc98_partition *d) +{ + unsigned char *ptr = pp; + int i; + + ptr[0] = d->dp_mid; + ptr[1] = d->dp_sid; + ptr[2] = d->dp_dum1; + ptr[3] = d->dp_dum2; + ptr[4] = d->dp_ipl_sct; + ptr[5] = d->dp_ipl_head; + le16enc(ptr + 6, d->dp_ipl_cyl); + ptr[8] = d->dp_ssect; + ptr[9] = d->dp_shd; + le16enc(ptr + 10, d->dp_scyl); + ptr[12] = d->dp_esect; + ptr[13] = d->dp_ehd; + le16enc(ptr + 14, d->dp_ecyl); + for (i = 0; i < sizeof (d->dp_name); i++) + ptr[16 + i] = d->dp_name[i]; +} diff --git a/sys/geom/geom_slice.c b/sys/geom/geom_slice.c new file mode 100644 index 0000000..64c000e --- /dev/null +++ b/sys/geom/geom_slice.c @@ -0,0 +1,488 @@ +/*- + * Copyright (c) 2002 Poul-Henning Kamp + * Copyright (c) 2002 Networks Associates Technology, Inc. 
+ * All rights reserved. + * + * This software was developed for the FreeBSD Project by Poul-Henning Kamp + * and NAI Labs, the Security Research Division of Network Associates, Inc. + * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/bio.h> +#include <sys/sysctl.h> +#include <sys/proc.h> +#include <sys/kthread.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/errno.h> +#include <sys/sbuf.h> +#include <geom/geom.h> +#include <geom/geom_slice.h> +#include <machine/stdarg.h> + +static g_orphan_t g_slice_orphan; +static g_access_t g_slice_access; +static g_start_t g_slice_start; + +static struct g_slicer * +g_slice_alloc(unsigned nslice, unsigned scsize) +{ + struct g_slicer *gsp; + + gsp = g_malloc(sizeof *gsp, M_WAITOK | M_ZERO); + gsp->softc = g_malloc(scsize, M_WAITOK | M_ZERO); + gsp->slices = g_malloc(nslice * sizeof(struct g_slice), + M_WAITOK | M_ZERO); + gsp->nslice = nslice; + return (gsp); +} + +static void +g_slice_free(struct g_slicer *gsp) +{ + + g_free(gsp->slices); + if (gsp->hotspot != NULL) + g_free(gsp->hotspot); + g_free(gsp->softc); + g_free(gsp); +} + +static int +g_slice_access(struct g_provider *pp, int dr, int dw, int de) +{ + int error; + u_int u; + struct g_geom *gp; + struct g_consumer *cp; + struct g_provider *pp2; + struct g_slicer *gsp; + struct g_slice *gsl, *gsl2; + + gp = pp->geom; + cp = LIST_FIRST(&gp->consumer); + KASSERT (cp != NULL, ("g_slice_access but no consumer")); + gsp = gp->softc; + gsl = &gsp->slices[pp->index]; + for (u = 0; u < gsp->nslice; u++) { + gsl2 = &gsp->slices[u]; + if (gsl2->length == 0) + continue; + if (u == pp->index) + continue; + if (gsl->offset + gsl->length <= gsl2->offset) + continue; + if (gsl2->offset + gsl2->length <= gsl->offset) + continue; + /* overlap */ + pp2 = gsl2->provider; + if ((pp->acw + dw) > 0 && pp2->ace > 0) + return (EPERM); + if ((pp->ace + de) > 0 && pp2->acw > 0) + return (EPERM); + } + /* On first open, grab an extra "exclusive" bit */ + if (cp->acr == 0 && cp->acw == 0 && cp->ace == 0) + de++; + /* ... 
and let go of it on last close */ + if ((cp->acr + dr) == 0 && (cp->acw + dw) == 0 && (cp->ace + de) == 1) + de--; + error = g_access_rel(cp, dr, dw, de); + return (error); +} + +/* + * XXX: It should be possible to specify here if we should finish all of the + * XXX: bio, or only the non-hot bits. This would get messy if there were + * XXX: two hot spots in the same bio, so for now we simply finish off the + * XXX: entire bio. Modifying hot data on the way to disk is frowned on + * XXX: so making that considerably harder is not a bad idea anyway. + */ +void +g_slice_finish_hot(struct bio *bp) +{ + struct bio *bp2; + struct g_geom *gp; + struct g_consumer *cp; + struct g_slicer *gsp; + struct g_slice *gsl; + int idx; + + KASSERT(bp->bio_to != NULL, + ("NULL bio_to in g_slice_finish_hot(%p)", bp)); + KASSERT(bp->bio_from != NULL, + ("NULL bio_from in g_slice_finish_hot(%p)", bp)); + gp = bp->bio_to->geom; + gsp = gp->softc; + cp = LIST_FIRST(&gp->consumer); + KASSERT(cp != NULL, ("NULL consumer in g_slice_finish_hot(%p)", bp)); + idx = bp->bio_to->index; + gsl = &gsp->slices[idx]; + + bp2 = g_clone_bio(bp); + if (bp2 == NULL) { + g_io_deliver(bp, ENOMEM); + return; + } + if (bp2->bio_offset + bp2->bio_length > gsl->length) + bp2->bio_length = gsl->length - bp2->bio_offset; + bp2->bio_done = g_std_done; + bp2->bio_offset += gsl->offset; + g_io_request(bp2, cp); + return; +} + +static void +g_slice_start(struct bio *bp) +{ + struct bio *bp2; + struct g_provider *pp; + struct g_geom *gp; + struct g_consumer *cp; + struct g_slicer *gsp; + struct g_slice *gsl; + struct g_slice_hot *ghp; + int idx, error; + u_int m_index; + off_t t; + + pp = bp->bio_to; + gp = pp->geom; + gsp = gp->softc; + cp = LIST_FIRST(&gp->consumer); + idx = pp->index; + gsl = &gsp->slices[idx]; + switch(bp->bio_cmd) { + case BIO_READ: + case BIO_WRITE: + case BIO_DELETE: + if (bp->bio_offset > gsl->length) { + g_io_deliver(bp, EINVAL); /* XXX: EWHAT ? 
*/ + return; + } + /* + * Check if we collide with any hot spaces, and call the + * method once if so. + */ + t = bp->bio_offset + gsl->offset; + for (m_index = 0; m_index < gsp->nhotspot; m_index++) { + ghp = &gsp->hotspot[m_index]; + if (t >= ghp->offset + ghp->length) + continue; + if (t + bp->bio_length <= ghp->offset) + continue; + switch(bp->bio_cmd) { + case BIO_READ: idx = ghp->ract; break; + case BIO_WRITE: idx = ghp->wact; break; + case BIO_DELETE: idx = ghp->dact; break; + } + switch(idx) { + case G_SLICE_HOT_ALLOW: + /* Fall out and continue normal processing */ + continue; + case G_SLICE_HOT_DENY: + g_io_deliver(bp, EROFS); + return; + case G_SLICE_HOT_START: + error = gsp->start(bp); + if (error && error != EJUSTRETURN) + g_io_deliver(bp, error); + return; + case G_SLICE_HOT_CALL: + error = g_post_event(gsp->hot, bp, M_NOWAIT, + gp, NULL); + if (error) + g_io_deliver(bp, error); + return; + } + break; + } + bp2 = g_clone_bio(bp); + if (bp2 == NULL) { + g_io_deliver(bp, ENOMEM); + return; + } + if (bp2->bio_offset + bp2->bio_length > gsl->length) + bp2->bio_length = gsl->length - bp2->bio_offset; + bp2->bio_done = g_std_done; + bp2->bio_offset += gsl->offset; + g_io_request(bp2, cp); + return; + case BIO_GETATTR: + /* Give the real method a chance to override */ + if (gsp->start != NULL && gsp->start(bp)) + return; + if (!strcmp("GEOM::kerneldump", bp->bio_attribute)) { + struct g_kerneldump *gkd; + + gkd = (struct g_kerneldump *)bp->bio_data; + gkd->offset += gsp->slices[idx].offset; + if (gkd->length > gsp->slices[idx].length) + gkd->length = gsp->slices[idx].length; + /* now, pass it on downwards... 
*/ + } + bp2 = g_clone_bio(bp); + if (bp2 == NULL) { + g_io_deliver(bp, ENOMEM); + return; + } + bp2->bio_done = g_std_done; + g_io_request(bp2, cp); + break; + default: + g_io_deliver(bp, EOPNOTSUPP); + return; + } +} + +void +g_slice_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) +{ + struct g_slicer *gsp; + + gsp = gp->softc; + if (indent == NULL) { + sbuf_printf(sb, " i %u", pp->index); + sbuf_printf(sb, " o %ju", + (uintmax_t)gsp->slices[pp->index].offset); + return; + } + if (pp != NULL) { + sbuf_printf(sb, "%s<index>%u</index>\n", indent, pp->index); + sbuf_printf(sb, "%s<length>%ju</length>\n", + indent, (uintmax_t)gsp->slices[pp->index].length); + sbuf_printf(sb, "%s<seclength>%ju</seclength>\n", indent, + (uintmax_t)gsp->slices[pp->index].length / 512); + sbuf_printf(sb, "%s<offset>%ju</offset>\n", indent, + (uintmax_t)gsp->slices[pp->index].offset); + sbuf_printf(sb, "%s<secoffset>%ju</secoffset>\n", indent, + (uintmax_t)gsp->slices[pp->index].offset / 512); + } +} + +int +g_slice_config(struct g_geom *gp, u_int idx, int how, off_t offset, off_t length, u_int sectorsize, const char *fmt, ...) 
+{ + struct g_provider *pp, *pp2; + struct g_slicer *gsp; + struct g_slice *gsl; + va_list ap; + struct sbuf *sb; + int acc; + + g_trace(G_T_TOPOLOGY, "g_slice_config(%s, %d, %d)", + gp->name, idx, how); + g_topology_assert(); + gsp = gp->softc; + if (idx >= gsp->nslice) + return(EINVAL); + gsl = &gsp->slices[idx]; + pp = gsl->provider; + if (pp != NULL) + acc = pp->acr + pp->acw + pp->ace; + else + acc = 0; + if (acc != 0 && how != G_SLICE_CONFIG_FORCE) { + if (length < gsl->length) + return(EBUSY); + if (offset != gsl->offset) + return(EBUSY); + } + /* XXX: check offset + length <= MEDIASIZE */ + if (how == G_SLICE_CONFIG_CHECK) + return (0); + gsl->length = length; + gsl->offset = offset; + gsl->sectorsize = sectorsize; + if (length == 0) { + if (pp == NULL) + return (0); + if (bootverbose) + printf("GEOM: Deconfigure %s\n", pp->name); + g_orphan_provider(pp, ENXIO); + gsl->provider = NULL; + gsp->nprovider--; + return (0); + } + if (pp != NULL) { + if (bootverbose) + printf("GEOM: Reconfigure %s, start %jd length %jd end %jd\n", + pp->name, (intmax_t)offset, (intmax_t)length, + (intmax_t)(offset + length - 1)); + pp->mediasize = gsl->length; + return (0); + } + sb = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND); + va_start(ap, fmt); + sbuf_vprintf(sb, fmt, ap); + va_end(ap); + sbuf_finish(sb); + pp = g_new_providerf(gp, sbuf_data(sb)); + pp2 = LIST_FIRST(&gp->consumer)->provider; + pp->flags = pp2->flags & G_PF_CANDELETE; + if (pp2->stripesize > 0) { + pp->stripesize = pp2->stripesize; + pp->stripeoffset = (pp2->stripeoffset + offset) % pp->stripesize; + } + if (bootverbose) + printf("GEOM: Configure %s, start %jd length %jd end %jd\n", + pp->name, (intmax_t)offset, (intmax_t)length, + (intmax_t)(offset + length - 1)); + pp->index = idx; + pp->mediasize = gsl->length; + pp->sectorsize = gsl->sectorsize; + gsl->provider = pp; + gsp->nprovider++; + g_error_provider(pp, 0); + sbuf_delete(sb); + return(0); +} + +/* + * Configure "hotspots". 
A hotspot is a piece of the parent device which + * this particular slicer cares about for some reason. Typically because + * it contains meta-data used to configure the slicer. + * A hotspot is identified by its index number. The offset and length are + * relative to the parent device, and the three "?act" fields specify + * what action to take on BIO_READ, BIO_DELETE and BIO_WRITE. + * + * XXX: There may be a race relative to g_slice_start() here, if an existing + * XXX: hotspot is changed wile I/O is happening. Should this become a problem + * XXX: we can protect the hotspot stuff with a mutex. + */ + +int +g_slice_conf_hot(struct g_geom *gp, u_int idx, off_t offset, off_t length, int ract, int dact, int wact) +{ + struct g_slicer *gsp; + struct g_slice_hot *gsl, *gsl2; + + g_trace(G_T_TOPOLOGY, "g_slice_conf_hot(%s, idx: %d, off: %jd, len: %jd)", + gp->name, idx, (intmax_t)offset, (intmax_t)length); + g_topology_assert(); + gsp = gp->softc; + gsl = gsp->hotspot; + if(idx >= gsp->nhotspot) { + gsl2 = g_malloc((idx + 1) * sizeof *gsl2, M_WAITOK | M_ZERO); + if (gsp->hotspot != NULL) + bcopy(gsp->hotspot, gsl2, gsp->nhotspot * sizeof *gsl2); + gsp->hotspot = gsl2; + if (gsp->hotspot != NULL) + g_free(gsl); + gsl = gsl2; + gsp->nhotspot = idx + 1; + } + gsl[idx].offset = offset; + gsl[idx].length = length; + KASSERT(!((ract | dact | wact) & G_SLICE_HOT_START) + || gsp->start != NULL, ("G_SLICE_HOT_START but no slice->start")); + /* XXX: check that we _have_ a start function if HOT_START specified */ + gsl[idx].ract = ract; + gsl[idx].dact = dact; + gsl[idx].wact = wact; + return (0); +} + +void +g_slice_spoiled(struct g_consumer *cp) +{ + struct g_geom *gp; + struct g_slicer *gsp; + + g_topology_assert(); + gp = cp->geom; + g_trace(G_T_TOPOLOGY, "g_slice_spoiled(%p/%s)", cp, gp->name); + gsp = gp->softc; + gp->softc = NULL; + g_slice_free(gsp); + g_wither_geom(gp, ENXIO); +} + +int +g_slice_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) 
+{ + + g_slice_spoiled(LIST_FIRST(&gp->consumer)); + return (0); +} + +struct g_geom * +g_slice_new(struct g_class *mp, u_int slices, struct g_provider *pp, struct g_consumer **cpp, void *extrap, int extra, g_slice_start_t *start) +{ + struct g_geom *gp; + struct g_slicer *gsp; + struct g_consumer *cp; + void **vp; + int error; + + g_topology_assert(); + vp = (void **)extrap; + gp = g_new_geomf(mp, "%s", pp->name); + gsp = g_slice_alloc(slices, extra); + gsp->start = start; + gp->access = g_slice_access; + gp->orphan = g_slice_orphan; + gp->softc = gsp; + gp->start = g_slice_start; + gp->spoiled = g_slice_spoiled; + gp->dumpconf = g_slice_dumpconf; + if (gp->class->destroy_geom == NULL) + gp->class->destroy_geom = g_slice_destroy_geom; + cp = g_new_consumer(gp); + error = g_attach(cp, pp); + if (error == 0) + error = g_access_rel(cp, 1, 0, 0); + if (error) { + g_wither_geom(gp, ENXIO); + return (NULL); + } + *vp = gsp->softc; + *cpp = cp; + return (gp); +} + +static void +g_slice_orphan(struct g_consumer *cp) +{ + + g_trace(G_T_TOPOLOGY, "g_slice_orphan(%p/%s)", cp, cp->provider->name); + g_topology_assert(); + KASSERT(cp->provider->error != 0, + ("g_slice_orphan with error == 0")); + + /* XXX: Not good enough we leak the softc and its suballocations */ + g_slice_free(cp->geom->softc); + g_wither_geom(cp->geom, cp->provider->error); +} diff --git a/sys/geom/geom_slice.h b/sys/geom/geom_slice.h new file mode 100644 index 0000000..4003c8f --- /dev/null +++ b/sys/geom/geom_slice.h @@ -0,0 +1,89 @@ +/*- + * Copyright (c) 2002 Poul-Henning Kamp + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Poul-Henning Kamp + * and NAI Labs, the Security Research Division of Network Associates, Inc. + * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _GEOM_GEOM_SLICE_H_ +#define _GEOM_GEOM_SLICE_H_ + +struct g_slice { + off_t offset; + off_t length; + u_int sectorsize; + struct g_provider *provider; +}; + +struct g_slice_hot { + off_t offset; + off_t length; + int ract; + int dact; + int wact; +}; + +typedef int g_slice_start_t (struct bio *bp); + +struct g_slicer { + u_int nslice; + u_int nprovider; + struct g_slice *slices; + + u_int nhotspot; + struct g_slice_hot *hotspot; + + void *softc; + g_slice_start_t *start; + g_event_t *hot; +}; + +g_dumpconf_t g_slice_dumpconf; +int g_slice_config(struct g_geom *gp, u_int idx, int how, off_t offset, off_t length, u_int sectorsize, const char *fmt, ...); +void g_slice_spoiled(struct g_consumer *cp); +#define G_SLICE_CONFIG_CHECK 0 +#define G_SLICE_CONFIG_SET 1 +#define G_SLICE_CONFIG_FORCE 2 +struct g_geom * g_slice_new(struct g_class *mp, u_int slices, struct g_provider *pp, struct g_consumer **cpp, void *extrap, int extra, g_slice_start_t *start); + +int g_slice_conf_hot(struct g_geom *gp, u_int idx, off_t offset, off_t length, int ract, int dact, int wact); +#define G_SLICE_HOT_ALLOW 1 +#define G_SLICE_HOT_DENY 2 +#define G_SLICE_HOT_START 4 +#define G_SLICE_HOT_CALL 8 + +int g_slice_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); + +void g_slice_finish_hot(struct bio *bp); + +#endif /* _GEOM_GEOM_SLICE_H_ */ diff --git a/sys/geom/geom_subr.c b/sys/geom/geom_subr.c new file mode 100644 index 0000000..98b8f8f --- /dev/null +++ b/sys/geom/geom_subr.c @@ -0,0 +1,809 @@ +/*- + * Copyright (c) 2002 Poul-Henning Kamp + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Poul-Henning Kamp + * and NAI Labs, the Security Research Division of Network Associates, Inc. + * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/devicestat.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/bio.h> +#include <sys/sysctl.h> +#include <sys/proc.h> +#include <sys/kthread.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/errno.h> +#include <sys/sbuf.h> +#include <geom/geom.h> +#include <geom/geom_int.h> +#include <machine/stdarg.h> + +struct class_list_head g_classes = LIST_HEAD_INITIALIZER(g_classes); +static struct g_tailq_head geoms = TAILQ_HEAD_INITIALIZER(geoms); +char *g_wait_event, *g_wait_up, *g_wait_down, *g_wait_sim; + + +struct g_hh00 { + struct g_class *mp; + int error; +}; + +/* + * This event offers a new class a chance to taste all preexisting providers. + */ +static void +g_load_class(void *arg, int flag) +{ + struct g_hh00 *hh; + struct g_class *mp2, *mp; + struct g_geom *gp; + struct g_provider *pp; + + g_topology_assert(); + if (flag == EV_CANCEL) /* XXX: can't happen ? 
*/ + return; + if (g_shutdown) + return; + + hh = arg; + mp = hh->mp; + g_free(hh); + g_trace(G_T_TOPOLOGY, "g_load_class(%s)", mp->name); + LIST_FOREACH(mp2, &g_classes, class) { + KASSERT(mp2 != mp, + ("The GEOM class %s already loaded", mp2->name)); + KASSERT(strcmp(mp2->name, mp->name) != 0, + ("A GEOM class named %s is already loaded", mp2->name)); + } + + if (mp->init != NULL) + mp->init(mp); + LIST_INIT(&mp->geom); + LIST_INSERT_HEAD(&g_classes, mp, class); + if (mp->taste == NULL) + return; + LIST_FOREACH(mp2, &g_classes, class) { + if (mp == mp2) + continue; + LIST_FOREACH(gp, &mp2->geom, geom) { + LIST_FOREACH(pp, &gp->provider, provider) { + mp->taste(mp, pp, 0); + g_topology_assert(); + } + } + } +} + +static void +g_unload_class(void *arg, int flag) +{ + struct g_hh00 *hh; + struct g_class *mp; + struct g_geom *gp; + struct g_provider *pp; + struct g_consumer *cp; + int error; + + g_topology_assert(); + hh = arg; + mp = hh->mp; + g_trace(G_T_TOPOLOGY, "g_unload_class(%s)", mp->name); + if (mp->destroy_geom == NULL) { + hh->error = EOPNOTSUPP; + return; + } + + /* We refuse to unload if anything is open */ + LIST_FOREACH(gp, &mp->geom, geom) { + LIST_FOREACH(pp, &gp->provider, provider) + if (pp->acr || pp->acw || pp->ace) { + hh->error = EBUSY; + return; + } + LIST_FOREACH(cp, &gp->consumer, consumer) + if (cp->acr || cp->acw || cp->ace) { + hh->error = EBUSY; + return; + } + } + + /* Bar new entries */ + mp->taste = NULL; + mp->config = NULL; + + error = 0; + LIST_FOREACH(gp, &mp->geom, geom) { + error = mp->destroy_geom(NULL, mp, gp); + if (error != 0) + break; + } + if (error == 0) { + LIST_REMOVE(mp, class); + if (mp->fini != NULL) + mp->fini(mp); + } + hh->error = error; + return; +} + +int +g_modevent(module_t mod, int type, void *data) +{ + struct g_hh00 *hh; + int error; + static int g_ignition; + + if (!g_ignition) { + g_ignition++; + g_init(); + } + hh = g_malloc(sizeof *hh, M_WAITOK | M_ZERO); + hh->mp = data; + error = EOPNOTSUPP; + switch 
(type) { + case MOD_LOAD: + g_trace(G_T_TOPOLOGY, "g_modevent(%s, LOAD)", hh->mp->name); + g_post_event(g_load_class, hh, M_WAITOK, NULL); + error = 0; + break; + case MOD_UNLOAD: + g_trace(G_T_TOPOLOGY, "g_modevent(%s, UNLOAD)", hh->mp->name); + error = g_waitfor_event(g_unload_class, hh, M_WAITOK, NULL); + if (error == 0) + error = hh->error; + g_waitidle(); + KASSERT(LIST_EMPTY(&hh->mp->geom), + ("Unloaded class (%s) still has geom", hh->mp->name)); + g_free(hh); + break; + } + return (error); +} + +struct g_geom * +g_new_geomf(struct g_class *mp, const char *fmt, ...) +{ + struct g_geom *gp; + va_list ap; + struct sbuf *sb; + + g_topology_assert(); + sb = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND); + va_start(ap, fmt); + sbuf_vprintf(sb, fmt, ap); + va_end(ap); + sbuf_finish(sb); + gp = g_malloc(sizeof *gp, M_WAITOK | M_ZERO); + gp->name = g_malloc(sbuf_len(sb) + 1, M_WAITOK | M_ZERO); + gp->class = mp; + gp->rank = 1; + LIST_INIT(&gp->consumer); + LIST_INIT(&gp->provider); + LIST_INSERT_HEAD(&mp->geom, gp, geom); + TAILQ_INSERT_HEAD(&geoms, gp, geoms); + strcpy(gp->name, sbuf_data(sb)); + sbuf_delete(sb); + return (gp); +} + +void +g_destroy_geom(struct g_geom *gp) +{ + + g_trace(G_T_TOPOLOGY, "g_destroy_geom(%p(%s))", gp, gp->name); + g_topology_assert(); + KASSERT(LIST_EMPTY(&gp->consumer), + ("g_destroy_geom(%s) with consumer(s) [%p]", + gp->name, LIST_FIRST(&gp->consumer))); + KASSERT(LIST_EMPTY(&gp->provider), + ("g_destroy_geom(%s) with provider(s) [%p]", + gp->name, LIST_FIRST(&gp->consumer))); + g_cancel_event(gp); + LIST_REMOVE(gp, geom); + TAILQ_REMOVE(&geoms, gp, geoms); + g_free(gp->name); + g_free(gp); +} + +/* + * This function is called (repeatedly) until has withered away. 
+ */ +void +g_wither_geom(struct g_geom *gp, int error) +{ + struct g_provider *pp, *pp2; + struct g_consumer *cp, *cp2; + static int once_is_enough; + + if (once_is_enough) + return; + once_is_enough = 1; + g_trace(G_T_TOPOLOGY, "g_wither_geom(%p(%s))", gp, gp->name); + g_topology_assert(); + if (!(gp->flags & G_GEOM_WITHER)) { + gp->flags |= G_GEOM_WITHER; + LIST_FOREACH(pp, &gp->provider, provider) + g_orphan_provider(pp, error); + } + for (pp = LIST_FIRST(&gp->provider); pp != NULL; pp = pp2) { + pp2 = LIST_NEXT(pp, provider); + if (!LIST_EMPTY(&pp->consumers)) + continue; + g_destroy_provider(pp); + } + for (cp = LIST_FIRST(&gp->consumer); cp != NULL; cp = cp2) { + cp2 = LIST_NEXT(cp, consumer); + if (cp->acr || cp->acw || cp->ace) + continue; + g_detach(cp); + g_destroy_consumer(cp); + } + if (LIST_EMPTY(&gp->provider) && LIST_EMPTY(&gp->consumer)) + g_destroy_geom(gp); + once_is_enough = 0; +} + +struct g_consumer * +g_new_consumer(struct g_geom *gp) +{ + struct g_consumer *cp; + + g_topology_assert(); + KASSERT(gp->orphan != NULL, + ("g_new_consumer on geom(%s) (class %s) without orphan", + gp->name, gp->class->name)); + + cp = g_malloc(sizeof *cp, M_WAITOK | M_ZERO); + cp->geom = gp; + cp->stat = devstat_new_entry(cp, -1, 0, DEVSTAT_ALL_SUPPORTED, + DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX); + LIST_INSERT_HEAD(&gp->consumer, cp, consumer); + return(cp); +} + +void +g_destroy_consumer(struct g_consumer *cp) +{ + struct g_geom *gp; + + g_trace(G_T_TOPOLOGY, "g_destroy_consumer(%p)", cp); + g_topology_assert(); + KASSERT (cp->provider == NULL, ("g_destroy_consumer but attached")); + KASSERT (cp->acr == 0, ("g_destroy_consumer with acr")); + KASSERT (cp->acw == 0, ("g_destroy_consumer with acw")); + KASSERT (cp->ace == 0, ("g_destroy_consumer with ace")); + g_cancel_event(cp); + gp = cp->geom; + LIST_REMOVE(cp, consumer); + devstat_remove_entry(cp->stat); + g_free(cp); + if (gp->flags & G_GEOM_WITHER) + g_wither_geom(gp, 0); +} + +static void 
+g_new_provider_event(void *arg, int flag) +{ + struct g_class *mp; + struct g_provider *pp; + struct g_consumer *cp; + int i; + + g_topology_assert(); + if (flag == EV_CANCEL) + return; + if (g_shutdown) + return; + pp = arg; + LIST_FOREACH(mp, &g_classes, class) { + if (mp->taste == NULL) + continue; + i = 1; + LIST_FOREACH(cp, &pp->consumers, consumers) + if (cp->geom->class == mp) + i = 0; + if (!i) + continue; + mp->taste(mp, pp, 0); + g_topology_assert(); + } +} + + +struct g_provider * +g_new_providerf(struct g_geom *gp, const char *fmt, ...) +{ + struct g_provider *pp; + struct sbuf *sb; + va_list ap; + + g_topology_assert(); + sb = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND); + va_start(ap, fmt); + sbuf_vprintf(sb, fmt, ap); + va_end(ap); + sbuf_finish(sb); + pp = g_malloc(sizeof *pp + sbuf_len(sb) + 1, M_WAITOK | M_ZERO); + pp->name = (char *)(pp + 1); + strcpy(pp->name, sbuf_data(sb)); + sbuf_delete(sb); + LIST_INIT(&pp->consumers); + pp->error = ENXIO; + pp->geom = gp; + pp->stat = devstat_new_entry(pp, -1, 0, DEVSTAT_ALL_SUPPORTED, + DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX); + LIST_INSERT_HEAD(&gp->provider, pp, provider); + g_post_event(g_new_provider_event, pp, M_WAITOK, pp, NULL); + return (pp); +} + +void +g_error_provider(struct g_provider *pp, int error) +{ + + pp->error = error; +} + +struct g_provider * +g_provider_by_name(char const *arg) +{ + struct g_class *cp; + struct g_geom *gp; + struct g_provider *pp; + + LIST_FOREACH(cp, &g_classes, class) { + LIST_FOREACH(gp, &cp->geom, geom) { + LIST_FOREACH(pp, &gp->provider, provider) { + if (!strcmp(arg, pp->name)) + return (pp); + } + } + } + return (NULL); +} + +void +g_destroy_provider(struct g_provider *pp) +{ + struct g_geom *gp; + + g_topology_assert(); + KASSERT(LIST_EMPTY(&pp->consumers), + ("g_destroy_provider but attached")); + KASSERT (pp->acr == 0, ("g_destroy_provider with acr")); + KASSERT (pp->acw == 0, ("g_destroy_provider with acw")); + KASSERT (pp->acw == 0, ("g_destroy_provider 
with ace")); + g_cancel_event(pp); + LIST_REMOVE(pp, provider); + gp = pp->geom; + devstat_remove_entry(pp->stat); + g_free(pp); + if ((gp->flags & G_GEOM_WITHER)) + g_wither_geom(gp, 0); +} + +/* + * We keep the "geoms" list sorted by topological order (== increasing + * numerical rank) at all times. + * When an attach is done, the attaching geoms rank is invalidated + * and it is moved to the tail of the list. + * All geoms later in the sequence has their ranks reevaluated in + * sequence. If we cannot assign rank to a geom because it's + * prerequisites do not have rank, we move that element to the tail + * of the sequence with invalid rank as well. + * At some point we encounter our original geom and if we stil fail + * to assign it a rank, there must be a loop and we fail back to + * g_attach() which detach again and calls redo_rank again + * to fix up the damage. + * It would be much simpler code wise to do it recursively, but we + * can't risk that on the kernel stack. + */ + +static int +redo_rank(struct g_geom *gp) +{ + struct g_consumer *cp; + struct g_geom *gp1, *gp2; + int n, m; + + g_topology_assert(); + + /* Invalidate this geoms rank and move it to the tail */ + gp1 = TAILQ_NEXT(gp, geoms); + if (gp1 != NULL) { + gp->rank = 0; + TAILQ_REMOVE(&geoms, gp, geoms); + TAILQ_INSERT_TAIL(&geoms, gp, geoms); + } else { + gp1 = gp; + } + + /* re-rank the rest of the sequence */ + for (; gp1 != NULL; gp1 = gp2) { + gp1->rank = 0; + m = 1; + LIST_FOREACH(cp, &gp1->consumer, consumer) { + if (cp->provider == NULL) + continue; + n = cp->provider->geom->rank; + if (n == 0) { + m = 0; + break; + } else if (n >= m) + m = n + 1; + } + gp1->rank = m; + gp2 = TAILQ_NEXT(gp1, geoms); + + /* got a rank, moving on */ + if (m != 0) + continue; + + /* no rank to original geom means loop */ + if (gp == gp1) + return (ELOOP); + + /* no rank, put it at the end move on */ + TAILQ_REMOVE(&geoms, gp1, geoms); + TAILQ_INSERT_TAIL(&geoms, gp1, geoms); + } + return (0); +} + +int 
+g_attach(struct g_consumer *cp, struct g_provider *pp) +{ + int error; + + g_topology_assert(); + KASSERT(cp->provider == NULL, ("attach but attached")); + cp->provider = pp; + LIST_INSERT_HEAD(&pp->consumers, cp, consumers); + error = redo_rank(cp->geom); + if (error) { + LIST_REMOVE(cp, consumers); + cp->provider = NULL; + redo_rank(cp->geom); + } + return (error); +} + +void +g_detach(struct g_consumer *cp) +{ + struct g_provider *pp; + + g_trace(G_T_TOPOLOGY, "g_detach(%p)", cp); + KASSERT(cp != (void*)0xd0d0d0d0, ("ARGH!")); + g_topology_assert(); + KASSERT(cp->provider != NULL, ("detach but not attached")); + KASSERT(cp->acr == 0, ("detach but nonzero acr")); + KASSERT(cp->acw == 0, ("detach but nonzero acw")); + KASSERT(cp->ace == 0, ("detach but nonzero ace")); + KASSERT(cp->nstart == cp->nend, + ("detach with active requests")); + pp = cp->provider; + LIST_REMOVE(cp, consumers); + cp->provider = NULL; + if (pp->geom->flags & G_GEOM_WITHER) + g_wither_geom(pp->geom, 0); + redo_rank(cp->geom); +} + + +/* + * g_access_abs() + * + * Access-check with absolute new values: Just fall through + * and use the relative version. + */ +int +g_access_abs(struct g_consumer *cp, int acr, int acw, int ace) +{ + + g_topology_assert(); + return(g_access_rel(cp, + acr - cp->acr, + acw - cp->acw, + ace - cp->ace)); +} + +/* + * g_access_rel() + * + * Access-check with delta values. The question asked is "can provider + * "cp" change the access counters by the relative amounts dc[rwe] ?" 
+ */ + +int +g_access_rel(struct g_consumer *cp, int dcr, int dcw, int dce) +{ + struct g_provider *pp; + int pr,pw,pe; + int error; + + pp = cp->provider; + + g_trace(G_T_ACCESS, "g_access_rel(%p(%s), %d, %d, %d)", + cp, pp->name, dcr, dcw, dce); + + g_topology_assert(); + KASSERT(cp->provider != NULL, ("access but not attached")); + KASSERT(cp->acr + dcr >= 0, ("access resulting in negative acr")); + KASSERT(cp->acw + dcw >= 0, ("access resulting in negative acw")); + KASSERT(cp->ace + dce >= 0, ("access resulting in negative ace")); + KASSERT(pp->geom->access != NULL, ("NULL geom->access")); + + /* + * If our class cares about being spoiled, and we have been, we + * are probably just ahead of the event telling us that. Fail + * now rather than having to unravel this later. + */ + if (cp->geom->spoiled != NULL && cp->spoiled) { + KASSERT(dcr <= 0, ("spoiled but dcr = %d", dcr)); + KASSERT(dcw <= 0, ("spoiled but dce = %d", dcw)); + KASSERT(dce <= 0, ("spoiled but dcw = %d", dce)); + } + + /* + * Figure out what counts the provider would have had, if this + * consumer had (r0w0e0) at this time. + */ + pr = pp->acr - cp->acr; + pw = pp->acw - cp->acw; + pe = pp->ace - cp->ace; + + g_trace(G_T_ACCESS, + "open delta:[r%dw%de%d] old:[r%dw%de%d] provider:[r%dw%de%d] %p(%s)", + dcr, dcw, dce, + cp->acr, cp->acw, cp->ace, + pp->acr, pp->acw, pp->ace, + pp, pp->name); + + /* If foot-shooting is enabled, any open on rank#1 is OK */ + if ((g_debugflags & 16) && pp->geom->rank == 1) + ; + /* If we try exclusive but already write: fail */ + else if (dce > 0 && pw > 0) + return (EPERM); + /* If we try write but already exclusive: fail */ + else if (dcw > 0 && pe > 0) + return (EPERM); + /* If we try to open more but provider is error'ed: fail */ + else if ((dcr > 0 || dcw > 0 || dce > 0) && pp->error != 0) + return (pp->error); + + /* Ok then... */ + + error = pp->geom->access(pp, dcr, dcw, dce); + if (!error) { + /* + * If we open first write, spoil any partner consumers. 
+ * If we close last write, trigger re-taste. + */ + if (pp->acw == 0 && dcw != 0) + g_spoil(pp, cp); + else if (pp->acw != 0 && pp->acw == -dcw && + !(pp->geom->flags & G_GEOM_WITHER)) + g_post_event(g_new_provider_event, pp, M_WAITOK, + pp, NULL); + + pp->acr += dcr; + pp->acw += dcw; + pp->ace += dce; + cp->acr += dcr; + cp->acw += dcw; + cp->ace += dce; + } + return (error); +} + +int +g_handleattr_int(struct bio *bp, const char *attribute, int val) +{ + + return (g_handleattr(bp, attribute, &val, sizeof val)); +} + +int +g_handleattr_off_t(struct bio *bp, const char *attribute, off_t val) +{ + + return (g_handleattr(bp, attribute, &val, sizeof val)); +} + +int +g_handleattr(struct bio *bp, const char *attribute, void *val, int len) +{ + int error; + + if (strcmp(bp->bio_attribute, attribute)) + return (0); + if (bp->bio_length != len) { + printf("bio_length %jd len %d -> EFAULT\n", + (intmax_t)bp->bio_length, len); + error = EFAULT; + } else { + error = 0; + bcopy(val, bp->bio_data, len); + bp->bio_completed = len; + } + g_io_deliver(bp, error); + return (1); +} + +int +g_std_access(struct g_provider *pp __unused, + int dr __unused, int dw __unused, int de __unused) +{ + + return (0); +} + +void +g_std_done(struct bio *bp) +{ + struct bio *bp2; + + bp2 = bp->bio_parent; + if (bp2->bio_error == 0) + bp2->bio_error = bp->bio_error; + bp2->bio_completed += bp->bio_completed; + g_destroy_bio(bp); + bp2->bio_inbed++; + if (bp2->bio_children == bp2->bio_inbed) + g_io_deliver(bp2, bp2->bio_error); +} + +/* XXX: maybe this is only g_slice_spoiled */ + +void +g_std_spoiled(struct g_consumer *cp) +{ + struct g_geom *gp; + struct g_provider *pp; + + g_trace(G_T_TOPOLOGY, "g_std_spoiled(%p)", cp); + g_topology_assert(); + g_detach(cp); + gp = cp->geom; + LIST_FOREACH(pp, &gp->provider, provider) + g_orphan_provider(pp, ENXIO); + g_destroy_consumer(cp); + if (LIST_EMPTY(&gp->provider) && LIST_EMPTY(&gp->consumer)) + g_destroy_geom(gp); + else + gp->flags |= G_GEOM_WITHER; 
+} + +/* + * Spoiling happens when a provider is opened for writing, but consumers + * which are configured by in-band data are attached (slicers for instance). + * Since the write might potentially change the in-band data, such consumers + * need to re-evaluate their existence after the writing session closes. + * We do this by (offering to) tear them down when the open for write happens + * in return for a re-taste when it closes again. + * Together with the fact that such consumers grab an 'e' bit whenever they + * are open, regardless of mode, this ends up DTRT. + */ + +static void +g_spoil_event(void *arg, int flag) +{ + struct g_provider *pp; + struct g_consumer *cp, *cp2; + + g_topology_assert(); + if (flag == EV_CANCEL) + return; + pp = arg; + for (cp = LIST_FIRST(&pp->consumers); cp != NULL; cp = cp2) { + cp2 = LIST_NEXT(cp, consumers); + if (!cp->spoiled) + continue; + cp->spoiled = 0; + if (cp->geom->spoiled == NULL) + continue; + cp->geom->spoiled(cp); + g_topology_assert(); + } +} + +void +g_spoil(struct g_provider *pp, struct g_consumer *cp) +{ + struct g_consumer *cp2; + + g_topology_assert(); + + LIST_FOREACH(cp2, &pp->consumers, consumers) { + if (cp2 == cp) + continue; +/* + KASSERT(cp2->acr == 0, ("spoiling cp->acr = %d", cp2->acr)); + KASSERT(cp2->acw == 0, ("spoiling cp->acw = %d", cp2->acw)); +*/ + KASSERT(cp2->ace == 0, ("spoiling cp->ace = %d", cp2->ace)); + cp2->spoiled++; + } + g_post_event(g_spoil_event, pp, M_WAITOK, pp, NULL); +} + +int +g_getattr__(const char *attr, struct g_consumer *cp, void *var, int len) +{ + int error, i; + + i = len; + error = g_io_getattr(attr, cp, &i, var); + if (error) + return (error); + if (i != len) + return (EINVAL); + return (0); +} + +/* + * Check if the given pointer is a live object + */ + +void +g_sanity(void const *ptr) +{ + struct g_class *mp; + struct g_geom *gp; + struct g_consumer *cp; + struct g_provider *pp; + + if (!(g_debugflags & 0x8)) + return; + LIST_FOREACH(mp, &g_classes, class) { + 
KASSERT(mp != ptr, ("Ptr is live class")); + LIST_FOREACH(gp, &mp->geom, geom) { + KASSERT(gp != ptr, ("Ptr is live geom")); + KASSERT(gp->name != ptr, ("Ptr is live geom's name")); + LIST_FOREACH(cp, &gp->consumer, consumer) { + KASSERT(cp != ptr, ("Ptr is live consumer")); + } + LIST_FOREACH(pp, &gp->provider, provider) { + KASSERT(pp != ptr, ("Ptr is live provider")); + } + } + } +} + diff --git a/sys/geom/geom_sunlabel.c b/sys/geom/geom_sunlabel.c new file mode 100644 index 0000000..0718056 --- /dev/null +++ b/sys/geom/geom_sunlabel.c @@ -0,0 +1,281 @@ +/*- + * Copyright (c) 2002 Poul-Henning Kamp + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Poul-Henning Kamp + * and NAI Labs, the Security Research Division of Network Associates, Inc. + * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/endian.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/conf.h> +#include <sys/bio.h> +#include <sys/malloc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sun_disklabel.h> +#include <geom/geom.h> +#include <geom/geom_slice.h> +#include <machine/endian.h> + +#define SUNLABEL_CLASS_NAME "SUN" + +struct g_sunlabel_softc { + int sectorsize; + int nheads; + int nsects; + int nalt; +}; + +static int +g_sunlabel_modify(struct g_geom *gp, struct g_sunlabel_softc *ms, u_char *sec0) +{ + int i, error; + u_int u, v, csize; + struct sun_disklabel sl; + + error = sunlabel_dec(sec0, &sl); + if (error) + return (error); + + csize = sl.sl_ntracks * sl.sl_nsectors; + + for (i = 0; i < SUN_NPART; i++) { + v = sl.sl_part[i].sdkp_cyloffset; + u = sl.sl_part[i].sdkp_nsectors; + error = g_slice_config(gp, i, G_SLICE_CONFIG_CHECK, + ((off_t)v * csize) << 9ULL, + ((off_t)u) << 9ULL, + ms->sectorsize, + "%s%c", gp->name, 'a' + i); + if (error) + return (error); + } + for (i = 0; i < SUN_NPART; i++) { + v = sl.sl_part[i].sdkp_cyloffset; + u = sl.sl_part[i].sdkp_nsectors; + g_slice_config(gp, i, G_SLICE_CONFIG_SET, + ((off_t)v * csize) << 9ULL, + ((off_t)u) << 9ULL, + ms->sectorsize, + "%s%c", gp->name, 'a' + i); + } + ms->nalt = sl.sl_acylinders; + ms->nheads = sl.sl_ntracks; + ms->nsects = sl.sl_nsectors; + + return (0); +} + +static 
void +g_sunlabel_hotwrite(void *arg, int flag) +{ + struct bio *bp; + struct g_geom *gp; + struct g_slicer *gsp; + struct g_slice *gsl; + struct g_sunlabel_softc *ms; + u_char *p; + int error; + + KASSERT(flag != EV_CANCEL, ("g_sunlabel_hotwrite cancelled")); + bp = arg; + gp = bp->bio_to->geom; + gsp = gp->softc; + ms = gsp->softc; + gsl = &gsp->slices[bp->bio_to->index]; + /* + * XXX: For all practical purposes, this whould be equvivalent to + * XXX: "p = (u_char *)bp->bio_data;" because the label is always + * XXX: in the first sector and we refuse sectors smaller than the + * XXX: label. + */ + p = (u_char *)bp->bio_data - (bp->bio_offset + gsl->offset); + + error = g_sunlabel_modify(gp, ms, p); + if (error) { + g_io_deliver(bp, EPERM); + return; + } + g_slice_finish_hot(bp); +} + +static void +g_sunlabel_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp) +{ + struct g_slicer *gsp; + struct g_sunlabel_softc *ms; + + gsp = gp->softc; + ms = gsp->softc; + g_slice_dumpconf(sb, indent, gp, cp, pp); + if (indent == NULL) { + sbuf_printf(sb, " sc %u hd %u alt %u", + ms->nsects, ms->nheads, ms->nalt); + } +} + +struct g_hh01 { + struct g_geom *gp; + struct g_sunlabel_softc *ms; + u_char *label; + int error; +}; + +static void +g_sunlabel_callconfig(void *arg, int flag) +{ + struct g_hh01 *hp; + + hp = arg; + hp->error = g_sunlabel_modify(hp->gp, hp->ms, hp->label); + if (!hp->error) + hp->error = g_write_data(LIST_FIRST(&hp->gp->consumer), + 0, hp->label, SUN_SIZE); +} + +/* + * NB! curthread is user process which GCTL'ed. 
+ */ +static void +g_sunlabel_config(struct gctl_req *req, struct g_class *mp, const char *verb) +{ + u_char *label; + int error, i; + struct g_hh01 h0h0; + struct g_slicer *gsp; + struct g_geom *gp; + struct g_consumer *cp; + + g_topology_assert(); + gp = gctl_get_geom(req, mp, "geom"); + if (gp == NULL) + return; + cp = LIST_FIRST(&gp->consumer); + gsp = gp->softc; + if (!strcmp(verb, "write label")) { + label = gctl_get_paraml(req, "label", SUN_SIZE); + if (label == NULL) + return; + h0h0.gp = gp; + h0h0.ms = gsp->softc; + h0h0.label = label; + h0h0.error = -1; + /* XXX: Does this reference register with our selfdestruct code ? */ + error = g_access_rel(cp, 1, 1, 1); + if (error) { + gctl_error(req, "could not access consumer"); + return; + } + g_sunlabel_callconfig(&h0h0, 0); + g_access_rel(cp, -1, -1, -1); + } else if (!strcmp(verb, "write bootcode")) { + label = gctl_get_paraml(req, "bootcode", SUN_BOOTSIZE); + if (label == NULL) + return; + /* XXX: Does this reference register with our selfdestruct code ? 
*/ + error = g_access_rel(cp, 1, 1, 1); + if (error) { + gctl_error(req, "could not access consumer"); + return; + } + for (i = 0; i < SUN_NPART; i++) { + if (gsp->slices[i].length <= SUN_BOOTSIZE) + continue; + g_write_data(cp, + gsp->slices[i].offset + SUN_SIZE, label + SUN_SIZE, + SUN_BOOTSIZE - SUN_SIZE); + } + g_access_rel(cp, -1, -1, -1); + } else { + gctl_error(req, "Unknown verb parameter"); + } +} + +static struct g_geom * +g_sunlabel_taste(struct g_class *mp, struct g_provider *pp, int flags) +{ + struct g_geom *gp; + struct g_consumer *cp; + int error; + u_char *buf; + struct g_sunlabel_softc *ms; + struct g_slicer *gsp; + + g_trace(G_T_TOPOLOGY, "g_sunlabel_taste(%s,%s)", mp->name, pp->name); + g_topology_assert(); + if (flags == G_TF_NORMAL && + !strcmp(pp->geom->class->name, SUNLABEL_CLASS_NAME)) + return (NULL); + gp = g_slice_new(mp, 8, pp, &cp, &ms, sizeof *ms, NULL); + if (gp == NULL) + return (NULL); + gsp = gp->softc; + gp->dumpconf = g_sunlabel_dumpconf; + do { + if (gp->rank != 2 && flags == G_TF_NORMAL) + break; + ms->sectorsize = cp->provider->sectorsize; + if (ms->sectorsize < 512) + break; + g_topology_unlock(); + buf = g_read_data(cp, 0, ms->sectorsize, &error); + g_topology_lock(); + if (buf == NULL || error != 0) + break; + + g_sunlabel_modify(gp, ms, buf); + g_free(buf); + + break; + } while (0); + g_access_rel(cp, -1, 0, 0); + if (LIST_EMPTY(&gp->provider)) { + g_slice_spoiled(cp); + return (NULL); + } + g_slice_conf_hot(gp, 0, 0, SUN_SIZE, + G_SLICE_HOT_ALLOW, G_SLICE_HOT_DENY, G_SLICE_HOT_CALL); + gsp->hot = g_sunlabel_hotwrite; + return (gp); +} + +static struct g_class g_sunlabel_class = { + .name = SUNLABEL_CLASS_NAME, + .taste = g_sunlabel_taste, + .ctlreq = g_sunlabel_config, +}; + +DECLARE_GEOM_CLASS(g_sunlabel_class, g_sunlabel); diff --git a/sys/geom/geom_sunlabel_enc.c b/sys/geom/geom_sunlabel_enc.c new file mode 100644 index 0000000..d153e11 --- /dev/null +++ b/sys/geom/geom_sunlabel_enc.c @@ -0,0 +1,127 @@ +/*- + * 
Copyright (c) 2003 Jake Burkholder + * Copyright (c) 2003 Poul-Henning Kamp + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Functions to encode or decode struct sun_disklabel into a bytestream + * of correct endianess and packing. + * + * NB! This file must be usable both in kernel and userland. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/endian.h> +#include <sys/errno.h> +#include <sys/sun_disklabel.h> + +#define SL_TEXT 0x0 +#define SL_TEXT_SIZEOF 0x80 +#define SL_RPM 0x1a4 +#define SL_PCYLINDERS 0x1a6 +#define SL_SPARESPERCYL 0x1a8 +#define SL_INTERLEAVE 0x1ae +#define SL_NCYLINDERS 0x1b0 +#define SL_ACYLINDERS 0x1b2 +#define SL_NTRACKS 0x1b4 +#define SL_NSECTORS 0x1b6 +#define SL_PART 0x1bc +#define SL_MAGIC 0x1fc +#define SL_CKSUM 0x1fe + +#define SDKP_CYLOFFSET 0 +#define SDKP_NSECTORS 0x4 +#define SDKP_SIZEOF 0x8 + +/* + * Decode the relevant fields of a sun disk label, and return zero if the + * magic and checksum works out OK. + */ +int +sunlabel_dec(void const *pp, struct sun_disklabel *sl) +{ + const uint8_t *p; + size_t i; + u_int u; + + p = pp; + for (i = 0; i < sizeof(sl->sl_text); i++) + sl->sl_text[i] = p[SL_TEXT + i]; + sl->sl_rpm = be16dec(p + SL_RPM); + sl->sl_pcylinders = be16dec(p + SL_PCYLINDERS); + sl->sl_sparespercyl = be16dec(p + SL_SPARESPERCYL); + sl->sl_interleave = be16dec(p + SL_INTERLEAVE); + sl->sl_ncylinders = be16dec(p + SL_NCYLINDERS); + sl->sl_acylinders = be16dec(p + SL_ACYLINDERS); + sl->sl_ntracks = be16dec(p + SL_NTRACKS); + sl->sl_nsectors = be16dec(p + SL_NSECTORS); + for (i = 0; i < SUN_NPART; i++) { + sl->sl_part[i].sdkp_cyloffset = be32dec(p + SL_PART + + (i * SDKP_SIZEOF) + SDKP_CYLOFFSET); + sl->sl_part[i].sdkp_nsectors = be32dec(p + SL_PART + + (i * SDKP_SIZEOF) + SDKP_NSECTORS); + } + sl->sl_magic = be16dec(p + SL_MAGIC); + for (i = u = 0; i < SUN_SIZE; i += 2) + u ^= be16dec(p + i); + if (u == 0 && sl->sl_magic == SUN_DKMAGIC) + return (0); + else + return (EINVAL); +} + +/* + * Encode the relevant fields into a sun disklabel, compute new checksum. 
+ */ +void +sunlabel_enc(void *pp, struct sun_disklabel *sl) +{ + uint8_t *p; + size_t i; + u_int u; + + p = pp; + for (i = 0; i < SL_TEXT_SIZEOF; i++) + p[SL_TEXT + i] = sl->sl_text[i]; + be16enc(p + SL_RPM, sl->sl_rpm); + be16enc(p + SL_PCYLINDERS, sl->sl_pcylinders); + be16enc(p + SL_SPARESPERCYL, sl->sl_sparespercyl); + be16enc(p + SL_INTERLEAVE, sl->sl_interleave); + be16enc(p + SL_NCYLINDERS, sl->sl_ncylinders); + be16enc(p + SL_ACYLINDERS, sl->sl_acylinders); + be16enc(p + SL_NTRACKS, sl->sl_ntracks); + be16enc(p + SL_NSECTORS, sl->sl_nsectors); + for (i = 0; i < SUN_NPART; i++) { + be32enc(p + SL_PART + (i * SDKP_SIZEOF) + SDKP_CYLOFFSET, + sl->sl_part[i].sdkp_cyloffset); + be32enc(p + SL_PART + (i * SDKP_SIZEOF) + SDKP_NSECTORS, + sl->sl_part[i].sdkp_nsectors); + } + be16enc(p + SL_MAGIC, sl->sl_magic); + for (i = u = 0; i < SUN_SIZE; i += 2) + u ^= be16dec(p + i); + be16enc(p + SL_CKSUM, u); +} diff --git a/sys/geom/geom_vol_ffs.c b/sys/geom/geom_vol_ffs.c new file mode 100644 index 0000000..de046ed --- /dev/null +++ b/sys/geom/geom_vol_ffs.c @@ -0,0 +1,143 @@ +/*- + * Copyright (c) 2002, 2003 Gordon Tetlow + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/bio.h> +#include <sys/lock.h> +#include <sys/mutex.h> + +#include <ufs/ufs/dinode.h> +#include <ufs/ffs/fs.h> + +#include <geom/geom.h> +#include <geom/geom_slice.h> + +#define VOL_FFS_CLASS_NAME "VOL_FFS" + +static int superblocks[] = SBLOCKSEARCH; + +struct g_vol_ffs_softc { + char * vol; +}; + +static int +g_vol_ffs_start(struct bio *bp __unused) +{ + return(0); +} + +static struct g_geom * +g_vol_ffs_taste(struct g_class *mp, struct g_provider *pp, int flags) +{ + struct g_geom *gp; + struct g_consumer *cp; + struct g_vol_ffs_softc *ms; + int error, sb, superblock; + struct fs *fs; + + g_trace(G_T_TOPOLOGY, "vol_taste(%s,%s)", mp->name, pp->name); + g_topology_assert(); + + /* + * XXX This is a really weak way to make sure we don't recurse. + * Probably ought to use BIO_GETATTR to check for this. + */ + if (flags == G_TF_NORMAL && + !strcmp(pp->geom->class->name, VOL_FFS_CLASS_NAME)) + return (NULL); + + gp = g_slice_new(mp, 1, pp, &cp, &ms, sizeof(*ms), g_vol_ffs_start); + if (gp == NULL) + return (NULL); + g_topology_unlock(); + /* + * Walk through the standard places that superblocks hide and look + * for UFS magic. If we find magic, then check that the size in the + * superblock corresponds to the size of the underlying provider. 
+ * Finally, look for a volume label and create an appropriate + * provider based on that. + */ + for (sb=0; (superblock = superblocks[sb]) != -1; sb++) { + fs = (struct fs *) g_read_data(cp, superblock, + SBLOCKSIZE, &error); + if (fs == NULL || error != 0) + continue; + /* Check for magic and make sure things are the right size */ + if (fs->fs_magic == FS_UFS1_MAGIC) { + if (fs->fs_old_size * fs->fs_fsize != + (int32_t) pp->mediasize) { + g_free(fs); + continue; + } + } else if (fs->fs_magic == FS_UFS2_MAGIC) { + if (fs->fs_size * fs->fs_fsize != + (int64_t) pp->mediasize) { + g_free(fs); + continue; + } + } else { + g_free(fs); + continue; + } + /* Check for volume label */ + if (fs->fs_volname[0] == '\0') { + g_free(fs); + continue; + } + /* XXX We need to check for namespace conflicts. */ + /* XXX How do you handle a mirror set? */ + /* XXX We don't validate the volume name. */ + g_topology_lock(); + /* Alright, we have a label and a volume name, reconfig. */ + g_slice_config(gp, 0, G_SLICE_CONFIG_SET, (off_t) 0, + pp->mediasize, pp->sectorsize, "vol/%s", + fs->fs_volname); + g_free(fs); + g_topology_unlock(); + break; + } + g_topology_lock(); + g_access_rel(cp, -1, 0, 0); + if (LIST_EMPTY(&gp->provider)) { + g_slice_spoiled(cp); + return (NULL); + } + return (gp); +} + +static struct g_class g_vol_ffs_class = { + .name = VOL_FFS_CLASS_NAME, + .taste = g_vol_ffs_taste, +}; + +DECLARE_GEOM_CLASS(g_vol_ffs_class, g_vol_ffs); diff --git a/sys/geom/notes b/sys/geom/notes new file mode 100644 index 0000000..3b0f811 --- /dev/null +++ b/sys/geom/notes @@ -0,0 +1,140 @@ +$FreeBSD$ + +For the lack of a better place to put them, this file will contain +notes on some of the more intricate details of geom. + +----------------------------------------------------------------------- +Locking of bio_children and bio_inbed + +bio_children is used by g_std_done() and g_clone_bio() to keep track +of children cloned off a request. 
g_clone_bio will increment the
+bio_children counter for each time it is called and g_std_done will
+increment bio_inbed for every call, and if the two counters are
+equal, call g_io_deliver() on the parent bio.
+
+The general assumption is that g_clone_bio() is called only in
+the g_down thread, and g_std_done() only in the g_up thread and
+therefore the two fields do not generally need locking.  These
+restrictions are not enforced by the code, but only with great
+care should they be violated.
+
+It is the responsibility of the class implementation to avoid the
+following race condition: A class intends to split a bio in two
+children.  It clones the bio, and requests I/O on the child.
+This I/O operation completes before the second child is cloned
+and g_std_done() sees the counters both equal 1 and finishes off
+the bio.
+
+There is no race present in the common case where the bio is split
+in multiple parts in the class start method and the I/O is requested
+on another GEOM class below: There is only one g_down thread and
+the class below will not get its start method run until we return
+from our start method, and consequently the I/O cannot complete
+prematurely.
+
+In all other cases, this race needs to be mitigated, for instance
+by cloning all children before I/O is requested on any of them.
+
+Notice that cloning an "extra" child and calling g_std_done() on
+it directly opens another race since the assumption is that
+g_std_done() only is called in the g_up thread.
+
+-----------------------------------------------------------------------
+Statistics collection
+
+Statistics collection can run at three levels controlled by the
+"kern.geom.collectstats" sysctl.
+
+At level zero, only the number of transactions started and completed
+are counted, and this is only because GEOM internally uses the difference
+between these two as sanity checks.
+
+At level one we collect the full statistics.  Higher levels are
+reserved for future use. 
Statistics are collected independently
+on both the provider and the consumer, because multiple consumers
+can be active against the same provider at the same time.
+
+The statistics collection falls in two parts:
+
+The first and simpler part consists of g_io_request() timestamping
+the struct bio when the request is first started and g_io_deliver()
+updating the consumer and provider statistics based on fields in
+the bio when it is completed.  There are no concurrency or locking
+concerns in this part.  The statistics collected consist of number
+of requests, number of bytes, number of ENOMEM errors, number of
+other errors and duration of the request for each of the three
+major request types: BIO_READ, BIO_WRITE and BIO_DELETE.
+
+The second part is trying to keep track of the "busy%".
+
+If in g_io_request() we find that there are no outstanding requests,
+(based on the counters for scheduled and completed requests being
+equal), we set a timestamp in the "wentbusy" field.  Since there
+are no outstanding requests, and as long as there is only one thread
+pushing the g_down queue, we cannot possibly conflict with
+g_io_deliver() until we ship the current request down.
+
+In g_io_deliver() we calculate the delta-T from wentbusy and add this
+to the "bt" field, and set wentbusy to the current timestamp. We
+take care to do this before we increment the "requests completed"
+counter, since that prevents g_io_request() from touching the
+"wentbusy" timestamp concurrently.
+
+The statistics data is made available to userland through the use
+of a special allocator (in geom_stats.c) which through a device
+allows userland to mmap(2) the pages containing the statistics data.
+In order to indicate to userland when the data in a statistics
+structure might be inconsistent, g_io_deliver() atomically sets a
+flag "updating" and resets it when the structure is again consistent. 
+-----------------------------------------------------------------------
+maxsize, stripesize and stripeoffset
+
+maxsize is the biggest request we are willing to handle.  If not
+set there is no upper bound on the size of a request and the code
+is responsible for chopping it up.  Only hardware methods should
+set an upper bound in this field.  Geom_disk will inherit the upper
+bound set by the device driver.
+
+stripesize is the width of any natural request boundaries for the
+device.  This would be the width of a stripe on a raid-5 unit or
+one zone in GBDE.  The idea with this field is to hint to clustering
+type code to not trivially overrun these boundaries.
+
+stripeoffset is the amount of the first stripe which lies before the
+device's beginning.
+
+If we have a device with 64k stripes:
+	[0...64k[
+	[64k...128k[
+	[128k..192k[
+Then it will have stripesize = 64k and stripeoffset = 0.
+
+If we put an MBR on this device, where slice#1 starts on sector#63,
+then this slice will have: stripesize = 64k, stripeoffset = 63 * sectorsize.
+
+If the clustering code wants to widen a request which writes to
+sector#53 of the slice, it can calculate how many bytes till the end of
+the stripe as:
+	stripesize - (53 * sectorsize + stripeoffset) % stripesize. 
+----------------------------------------------------------------------- + +#include file usage: + + geom.h|geom_int.h|geom_ext.h|geom_ctl.h|libgeom.h +----------------+------+----------+----------+----------+--------+ +geom class | | | | | | +implementation | X | | | | | +----------------+------+----------+----------+----------+--------+ +geom kernel | | | | | | +infrastructure | X | X | X | X | | +----------------+------+----------+----------+----------+--------+ +libgeom | | | | | | +implementation | | | X | X | X | +----------------+------+----------+----------+----------+--------+ +geom aware | | | | | | +application | | | | X | X | +----------------+------+----------+----------+----------+--------+ + +geom_slice.h is special in that it documents a "library" for implementing +a specific kind of class, and consequently does not appear in the above +matrix. |