diff options
author | pjd <pjd@FreeBSD.org> | 2008-11-17 20:49:29 +0000 |
---|---|---|
committer | pjd <pjd@FreeBSD.org> | 2008-11-17 20:49:29 +0000 |
commit | bbe899b96e388a8b82439f81ed3707e0d9c6070d (patch) | |
tree | 81b89fa4ac6467771d5aa291a97f4665981a6108 /sys/boot | |
parent | d2f579595c362ce27b4d87e2c40e1c4e09b929e3 (diff) | |
download | FreeBSD-src-bbe899b96e388a8b82439f81ed3707e0d9c6070d.zip FreeBSD-src-bbe899b96e388a8b82439f81ed3707e0d9c6070d.tar.gz |
Update ZFS from version 6 to 13 and bring some FreeBSD-specific changes.
This brings a huge number of changes; I'll enumerate only the user-visible ones:
- Delegated Administration
Allows regular users to perform ZFS operations, like file system
creation, snapshot creation, etc.
- L2ARC
Level 2 cache for ZFS - allows additional disks to be used for cache.
Huge performance improvements, mostly for random reads of mostly
static content.
- slog
Allows additional disks to be used for the ZFS Intent Log to speed up
operations like fsync(2).
- vfs.zfs.super_owner
Allows regular users to perform privileged operations on files stored
on ZFS file systems that they own. Be very careful with this one.
- chflags(2)
Not all the flags are supported. This still needs work.
- ZFSBoot
Support for booting off of a ZFS pool. Not finished, AFAIK.
Submitted by: dfr
- Snapshot properties
- New failure modes
Previously, if a write request failed, the system panicked. Now one
can select one of three failure modes:
- panic - panic on write error
- wait - wait for disk to reappear
- continue - serve read requests if possible, block write requests
- Refquota, refreservation properties
Just like the quota and reservation properties, but they don't count space
consumed by child file systems, clones and snapshots.
- Sparse volumes
ZVOLs that don't reserve space in the pool.
- External attributes
Compatible with extattr(2).
- NFSv4-ACLs
Not sure about the status, might not be complete yet.
Submitted by: trasz
- Creation-time properties
- Regression tests for zpool(8) command.
Obtained from: OpenSolaris
Diffstat (limited to 'sys/boot')
-rw-r--r-- | sys/boot/Makefile | 4 | ||||
-rw-r--r-- | sys/boot/common/bootstrap.h | 1 | ||||
-rw-r--r-- | sys/boot/i386/Makefile | 4 | ||||
-rw-r--r-- | sys/boot/i386/libi386/bootinfo32.c | 1 | ||||
-rw-r--r-- | sys/boot/i386/libi386/devicename.c | 2 | ||||
-rw-r--r-- | sys/boot/i386/loader/Makefile | 10 | ||||
-rw-r--r-- | sys/boot/i386/loader/conf.c | 14 | ||||
-rw-r--r-- | sys/boot/i386/loader/main.c | 51 | ||||
-rw-r--r-- | sys/boot/i386/zfsboot/Makefile | 108 | ||||
-rw-r--r-- | sys/boot/i386/zfsboot/zfsboot.c | 944 | ||||
-rw-r--r-- | sys/boot/i386/zfsboot/zfsldr.S | 402 | ||||
-rw-r--r-- | sys/boot/zfs/Makefile | 29 | ||||
-rw-r--r-- | sys/boot/zfs/zfs.c | 514 | ||||
-rw-r--r-- | sys/boot/zfs/zfsimpl.c | 1443 |
14 files changed, 3512 insertions, 15 deletions
diff --git a/sys/boot/Makefile b/sys/boot/Makefile index 1af1457..27cb7e3 100644 --- a/sys/boot/Makefile +++ b/sys/boot/Makefile @@ -26,6 +26,10 @@ SUBDIR+= ofw SUBDIR+= uboot .endif +.if defined(LOADER_ZFS_SUPPORT) +SUBDIR+= zfs +.endif + # Pick the machine-dependent subdir based on the target architecture. ADIR= ${MACHINE:S/amd64/i386/:S/sun4v/sparc64/} .if exists(${.CURDIR}/${ADIR}/.) diff --git a/sys/boot/common/bootstrap.h b/sys/boot/common/bootstrap.h index 57982d1..5f08480 100644 --- a/sys/boot/common/bootstrap.h +++ b/sys/boot/common/bootstrap.h @@ -43,6 +43,7 @@ struct devdesc #define DEVT_DISK 1 #define DEVT_NET 2 #define DEVT_CD 3 +#define DEVT_ZFS 4 int d_unit; }; diff --git a/sys/boot/i386/Makefile b/sys/boot/i386/Makefile index b89222d..6af8642 100644 --- a/sys/boot/i386/Makefile +++ b/sys/boot/i386/Makefile @@ -1,7 +1,7 @@ # $FreeBSD$ -SUBDIR= mbr pmbr boot0 boot0sio btx boot2 cdboot gptboot kgzldr \ - libi386 libfirewire loader +SUBDIR= mbr pmbr boot0 boot0sio btx boot2 cdboot gptboot zfsboot \ + kgzldr libi386 libfirewire loader # special boot programs, 'self-extracting boot2+loader' SUBDIR+= pxeldr diff --git a/sys/boot/i386/libi386/bootinfo32.c b/sys/boot/i386/libi386/bootinfo32.c index 6b517c5..d434427 100644 --- a/sys/boot/i386/libi386/bootinfo32.c +++ b/sys/boot/i386/libi386/bootinfo32.c @@ -183,6 +183,7 @@ bi_load32(char *args, int *howtop, int *bootdevp, vm_offset_t *bip, vm_offset_t break; case DEVT_NET: + case DEVT_ZFS: break; default: diff --git a/sys/boot/i386/libi386/devicename.c b/sys/boot/i386/libi386/devicename.c index e1035aa..79a562b 100644 --- a/sys/boot/i386/libi386/devicename.c +++ b/sys/boot/i386/libi386/devicename.c @@ -167,6 +167,7 @@ i386_parsedev(struct i386_devdesc **dev, const char *devspec, const char **path) case DEVT_CD: case DEVT_NET: + case DEVT_ZFS: unit = 0; if (*np && (*np != ':')) { @@ -238,6 +239,7 @@ i386_fmtdev(void *vdev) break; case DEVT_NET: + case DEVT_ZFS: sprintf(buf, "%s%d:", dev->d_dev->dv_name, 
dev->d_unit); break; } diff --git a/sys/boot/i386/loader/Makefile b/sys/boot/i386/loader/Makefile index df2ccc0..79aceca 100644 --- a/sys/boot/i386/loader/Makefile +++ b/sys/boot/i386/loader/Makefile @@ -17,6 +17,12 @@ CFLAGS+= -DLOADER_FIREWIRE_SUPPORT LIBFIREWIRE= ${.OBJDIR}/../libfirewire/libfirewire.a .endif +# Put LOADER_ZFS_SUPPORT=yes in /etc/make.conf for ZFS support +.if defined(LOADER_ZFS_SUPPORT) +CFLAGS+= -DLOADER_ZFS_SUPPORT +LIBZFS= ${.OBJDIR}/../../zfs/libzfsboot.a +.endif + # Enable PXE TFTP or NFS support, not both. .if defined(LOADER_TFTP_SUPPORT) CFLAGS+= -DLOADER_TFTP_SUPPORT @@ -98,8 +104,8 @@ FILES+= loader.rc # XXX crt0.o needs to be first for pxeboot(8) to work OBJS= ${BTXCRT} -DPADD= ${LIBFICL} ${LIBFIREWIRE} ${LIBI386} ${LIBSTAND} -LDADD= ${LIBFICL} ${LIBFIREWIRE} ${LIBI386} -lstand +DPADD= ${LIBFICL} ${LIBFIREWIRE} ${LIBZFS} ${LIBI386} ${LIBSTAND} +LDADD= ${LIBFICL} ${LIBFIREWIRE} ${LIBZFS} ${LIBI386} -lstand .include <bsd.prog.mk> diff --git a/sys/boot/i386/loader/conf.c b/sys/boot/i386/loader/conf.c index 245f960..05c9a9e9 100644 --- a/sys/boot/i386/loader/conf.c +++ b/sys/boot/i386/loader/conf.c @@ -50,6 +50,10 @@ __FBSDID("$FreeBSD$"); extern struct devsw fwohci; #endif +#if defined(LOADER_ZFS_SUPPORT) +extern struct devsw zfs_dev; +#endif + /* Exported for libstand */ struct devsw *devsw[] = { &bioscd, @@ -60,15 +64,25 @@ struct devsw *devsw[] = { #if defined(LOADER_FIREWIRE_SUPPORT) &fwohci, #endif +#if defined(LOADER_ZFS_SUPPORT) + &zfs_dev, +#endif NULL }; +#if defined(LOADER_ZFS_SUPPORT) +extern struct fs_ops zfs_fsops; +#endif + struct fs_ops *file_system[] = { &ufs_fsops, &ext2fs_fsops, &dosfs_fsops, &cd9660_fsops, &splitfs_fsops, +#if defined(LOADER_ZFS_SUPPORT) + &zfs_fsops, +#endif #ifdef LOADER_GZIP_SUPPORT &gzipfs_fsops, #endif diff --git a/sys/boot/i386/loader/main.c b/sys/boot/i386/loader/main.c index 5b23670..cac28ae 100644 --- a/sys/boot/i386/loader/main.c +++ b/sys/boot/i386/loader/main.c @@ -44,6 +44,7 @@ 
__FBSDID("$FreeBSD$"); #define KARGS_FLAGS_CD 0x1 #define KARGS_FLAGS_PXE 0x2 +#define KARGS_FLAGS_ZFS 0x4 /* Arguments passed in from the boot1/boot2 loader */ static struct @@ -51,8 +52,13 @@ static struct u_int32_t howto; u_int32_t bootdev; u_int32_t bootflags; - u_int32_t pxeinfo; - u_int32_t res2; + union { + struct { + u_int32_t pxeinfo; + u_int32_t res2; + }; + uint64_t zfspool; + }; u_int32_t bootinfo; } *kargs; @@ -96,7 +102,7 @@ main(void) */ bios_getmem(); -#if defined(LOADER_BZIP2_SUPPORT) || defined(LOADER_FIREWIRE_SUPPORT) +#if defined(LOADER_BZIP2_SUPPORT) || defined(LOADER_FIREWIRE_SUPPORT) || defined(LOADER_ZFS_SUPPORT) heap_top = PTOV(memtop_copyin); memtop_copyin -= 0x300000; heap_bottom = PTOV(memtop_copyin); @@ -145,6 +151,14 @@ main(void) bc_add(initial_bootdev); } + archsw.arch_autoload = i386_autoload; + archsw.arch_getdev = i386_getdev; + archsw.arch_copyin = i386_copyin; + archsw.arch_copyout = i386_copyout; + archsw.arch_readin = i386_readin; + archsw.arch_isainb = isa_inb; + archsw.arch_isaoutb = isa_outb; + /* * March through the device switch probing for things. */ @@ -172,14 +186,6 @@ main(void) bios_getsmap(); - archsw.arch_autoload = i386_autoload; - archsw.arch_getdev = i386_getdev; - archsw.arch_copyin = i386_copyin; - archsw.arch_copyout = i386_copyout; - archsw.arch_readin = i386_readin; - archsw.arch_isainb = isa_inb; - archsw.arch_isaoutb = isa_outb; - interact(); /* doesn't return */ /* if we ever get here, it is an error */ @@ -252,6 +258,29 @@ extract_currdev(void) i386_setcurrdev, env_nounset); env_setenv("loaddev", EV_VOLATILE, i386_fmtdev(&new_currdev), env_noset, env_nounset); + +#ifdef LOADER_ZFS_SUPPORT + /* + * If we were started from a ZFS-aware boot2, we can work out + * which ZFS pool we are booting from. 
+ */ + if (kargs->bootflags & KARGS_FLAGS_ZFS) { + /* + * Dig out the pool guid and convert it to a 'unit number' + */ + uint64_t guid; + int unit; + char devname[32]; + extern int zfs_guid_to_unit(uint64_t); + + guid = kargs->zfspool; + unit = zfs_guid_to_unit(guid); + if (unit >= 0) { + sprintf(devname, "zfs%d", unit); + setenv("currdev", devname, 1); + } + } +#endif } COMMAND_SET(reboot, "reboot", "reboot the system", command_reboot); diff --git a/sys/boot/i386/zfsboot/Makefile b/sys/boot/i386/zfsboot/Makefile new file mode 100644 index 0000000..41f1672 --- /dev/null +++ b/sys/boot/i386/zfsboot/Makefile @@ -0,0 +1,108 @@ +# $FreeBSD$ + +.PATH: ${.CURDIR}/../boot2 + +FILES= zfsboot + +NM?= nm + +# A value of 0x80 enables LBA support. +BOOT_BOOT1_FLAGS?= 0x80 + +BOOT_COMCONSOLE_PORT?= 0x3f8 +BOOT_COMCONSOLE_SPEED?= 9600 +B2SIOFMT?= 0x3 + +REL1= 0x700 +ORG1= 0x7c00 +ORG2= 0x2000 + +CFLAGS= -Os -g \ + -fno-guess-branch-probability \ + -fomit-frame-pointer \ + -fno-unit-at-a-time \ + -mno-align-long-strings \ + -mrtd \ + -mno-mmx -mno-3dnow -mno-sse -mno-sse2 -mno-sse3 \ + -DBOOT2 \ + -DFLAGS=${BOOT_BOOT1_FLAGS} \ + -DSIOPRT=${BOOT_COMCONSOLE_PORT} \ + -DSIOFMT=${B2SIOFMT} \ + -DSIOSPD=${BOOT_COMCONSOLE_SPEED} \ + -I${.CURDIR}/../../zfs \ + -I${.CURDIR}/../../../cddl/boot/zfs \ + -I${.CURDIR}/../btx/lib -I. \ + -I${.CURDIR}/../boot2 \ + -Wall -Waggregate-return -Wbad-function-cast -Wcast-align \ + -Wmissing-declarations -Wmissing-prototypes -Wnested-externs \ + -Wpointer-arith -Wshadow -Wstrict-prototypes -Wwrite-strings \ + -Winline --param max-inline-insns-single=100 + +LDFLAGS=-static -N --gc-sections + +# Pick up ../Makefile.inc early. 
+.include <bsd.init.mk> + +CLEANFILES= zfsboot + +zfsboot: zfsboot1 zfsboot2 + cat zfsboot1 zfsboot2 > zfsboot + +CLEANFILES+= zfsboot1 zfsldr.out zfsldr.o + +zfsboot1: zfsldr.out + objcopy -S -O binary zfsldr.out ${.TARGET} + +zfsldr.out: zfsldr.o + ${LD} ${LDFLAGS} -e start -Ttext ${ORG1} -o ${.TARGET} zfsldr.o + +CLEANFILES+= zfsboot2 zfsboot.ld zfsboot.ldr zfsboot.bin zfsboot.out \ + zfsboot.o zfsboot.s zfsboot.s.tmp zfsboot.h sio.o + +# We currently allow 32768 bytes for zfsboot - in practice it could be +# any size up to 3.5Mb but keeping it fixed size simplifies zfsldr. +# +BOOT2SIZE= 32768 + +zfsboot2: zfsboot.ld + @set -- `ls -l zfsboot.ld`; x=$$((${BOOT2SIZE}-$$5)); \ + echo "$$x bytes available"; test $$x -ge 0 + dd if=zfsboot.ld of=${.TARGET} obs=${BOOT2SIZE} conv=osync + +zfsboot.ld: zfsboot.ldr zfsboot.bin ${BTXKERN} + btxld -v -E ${ORG2} -f bin -b ${BTXKERN} -l zfsboot.ldr \ + -o ${.TARGET} -P 1 zfsboot.bin + +zfsboot.ldr: + cp /dev/null ${.TARGET} + +zfsboot.bin: zfsboot.out + objcopy -S -O binary zfsboot.out ${.TARGET} + +zfsboot.out: ${BTXCRT} zfsboot.o sio.o + ${LD} ${LDFLAGS} -Ttext ${ORG2} -o ${.TARGET} ${.ALLSRC} + +zfsboot.o: zfsboot.s + +SRCS= zfsboot.c zfsboot.h + +zfsboot.s: zfsboot.c zfsboot.h ${.CURDIR}/../../zfs/zfsimpl.c + ${CC} ${CFLAGS} -S -o zfsboot.s.tmp ${.CURDIR}/zfsboot.c + sed -e '/align/d' -e '/nop/d' < zfsboot.s.tmp > zfsboot.s + rm -f zfsboot.s.tmp + +zfsboot.h: zfsldr.out + ${NM} -t d ${.ALLSRC} | awk '/([0-9])+ T xread/ \ + { x = $$1 - ORG1; \ + printf("#define XREADORG %#x\n", REL1 + x) }' \ + ORG1=`printf "%d" ${ORG1}` \ + REL1=`printf "%d" ${REL1}` > ${.TARGET} + +.if ${MACHINE_ARCH} == "amd64" +beforedepend zfsboot.s: machine +CLEANFILES+= machine +machine: + ln -sf ${.CURDIR}/../../../i386/include machine +.endif + +.include <bsd.prog.mk> diff --git a/sys/boot/i386/zfsboot/zfsboot.c b/sys/boot/i386/zfsboot/zfsboot.c new file mode 100644 index 0000000..9b0a465 --- /dev/null +++ b/sys/boot/i386/zfsboot/zfsboot.c @@ -0,0 
+1,944 @@ +/*- + * Copyright (c) 1998 Robert Nordier + * All rights reserved. + * + * Redistribution and use in source and binary forms are freely + * permitted provided that the above copyright notice and this + * paragraph and the following disclaimer are duplicated in all + * such forms. + * + * This software is provided "AS IS" and without any express or + * implied warranties, including, without limitation, the implied + * warranties of merchantability and fitness for a particular + * purpose. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/diskmbr.h> +#include <sys/reboot.h> +#include <sys/queue.h> + +#include <machine/bootinfo.h> +#include <machine/elf.h> + +#include <stdarg.h> +#include <stddef.h> + +#include <a.out.h> + +#include <btxv86.h> + +#include "zfsboot.h" +#include "lib.h" + +#define IO_KEYBOARD 1 +#define IO_SERIAL 2 + +#define SECOND 18 /* Circa that many ticks in a second. */ + +#define RBX_ASKNAME 0x0 /* -a */ +#define RBX_SINGLE 0x1 /* -s */ +/* 0x2 is reserved for log2(RB_NOSYNC). */ +/* 0x3 is reserved for log2(RB_HALT). */ +/* 0x4 is reserved for log2(RB_INITNAME). */ +#define RBX_DFLTROOT 0x5 /* -r */ +#define RBX_KDB 0x6 /* -d */ +/* 0x7 is reserved for log2(RB_RDONLY). */ +/* 0x8 is reserved for log2(RB_DUMP). */ +/* 0x9 is reserved for log2(RB_MINIROOT). */ +#define RBX_CONFIG 0xa /* -c */ +#define RBX_VERBOSE 0xb /* -v */ +#define RBX_SERIAL 0xc /* -h */ +#define RBX_CDROM 0xd /* -C */ +/* 0xe is reserved for log2(RB_POWEROFF). */ +#define RBX_GDB 0xf /* -g */ +#define RBX_MUTE 0x10 /* -m */ +/* 0x11 is reserved for log2(RB_SELFTEST). */ +/* 0x12 is reserved for boot programs. */ +/* 0x13 is reserved for boot programs. */ +#define RBX_PAUSE 0x14 /* -p */ +#define RBX_QUIET 0x15 /* -q */ +#define RBX_NOINTR 0x1c /* -n */ +/* 0x1d is reserved for log2(RB_MULTIPLE) and is just misnamed here. 
*/ +#define RBX_DUAL 0x1d /* -D */ +/* 0x1f is reserved for log2(RB_BOOTINFO). */ + +/* pass: -a, -s, -r, -d, -c, -v, -h, -C, -g, -m, -p, -D */ +#define RBX_MASK (OPT_SET(RBX_ASKNAME) | OPT_SET(RBX_SINGLE) | \ + OPT_SET(RBX_DFLTROOT) | OPT_SET(RBX_KDB ) | \ + OPT_SET(RBX_CONFIG) | OPT_SET(RBX_VERBOSE) | \ + OPT_SET(RBX_SERIAL) | OPT_SET(RBX_CDROM) | \ + OPT_SET(RBX_GDB ) | OPT_SET(RBX_MUTE) | \ + OPT_SET(RBX_PAUSE) | OPT_SET(RBX_DUAL)) + +/* Hint to loader that we came from ZFS */ +#define KARGS_FLAGS_ZFS 0x4 + +#define PATH_CONFIG "/boot.config" +#define PATH_BOOT3 "/boot/loader" +#define PATH_KERNEL "/boot/kernel/kernel" + +#define ARGS 0x900 +#define NOPT 14 +#define NDEV 3 +#define MEM_BASE 0x12 +#define MEM_EXT 0x15 +#define V86_CY(x) ((x) & 1) +#define V86_ZR(x) ((x) & 0x40) + +#define DRV_HARD 0x80 +#define DRV_MASK 0x7f + +#define TYPE_AD 0 +#define TYPE_DA 1 +#define TYPE_MAXHARD TYPE_DA +#define TYPE_FD 2 + +#define OPT_SET(opt) (1 << (opt)) +#define OPT_CHECK(opt) ((opts) & OPT_SET(opt)) + +extern uint32_t _end; + +static const char optstr[NOPT] = "DhaCcdgmnpqrsv"; /* Also 'P', 'S' */ +static const unsigned char flags[NOPT] = { + RBX_DUAL, + RBX_SERIAL, + RBX_ASKNAME, + RBX_CDROM, + RBX_CONFIG, + RBX_KDB, + RBX_GDB, + RBX_MUTE, + RBX_NOINTR, + RBX_PAUSE, + RBX_QUIET, + RBX_DFLTROOT, + RBX_SINGLE, + RBX_VERBOSE +}; + +static const char *const dev_nm[NDEV] = {"ad", "da", "fd"}; +static const unsigned char dev_maj[NDEV] = {30, 4, 2}; + +struct dsk { + unsigned drive; + unsigned type; + unsigned unit; + unsigned slice; + unsigned part; + unsigned start; + int init; +}; +static char cmd[512]; +static char kname[1024]; +static uint32_t opts; +static int comspeed = SIOSPD; +static struct bootinfo bootinfo; +static uint32_t bootdev; +static uint8_t ioctrl = IO_KEYBOARD; + +/* Buffers that must not span a 64k boundary. 
*/ +#define READ_BUF_SIZE 8192 +struct dmadat { + char rdbuf[READ_BUF_SIZE]; /* for reading large things */ + char secbuf[READ_BUF_SIZE]; /* for MBR/disklabel */ +}; +static struct dmadat *dmadat; + +void exit(int); +static void load(void); +static int parse(void); +static void printf(const char *,...); +static void putchar(int); +static uint32_t memsize(void); +static int drvread(struct dsk *, void *, unsigned, unsigned); +static int keyhit(unsigned); +static int xputc(int); +static int xgetc(int); +static int getc(int); + +static void memcpy(void *, const void *, int); +static void +memcpy(void *dst, const void *src, int len) +{ + const char *s = src; + char *d = dst; + + while (len--) + *d++ = *s++; +} + +static void +strcpy(char *dst, const char *src) +{ + while (*src) + *dst++ = *src++; + *dst++ = 0; +} + +static void +strcat(char *dst, const char *src) +{ + while (*dst) + dst++; + while (*src) + *dst++ = *src++; + *dst++ = 0; +} + +static int +strcmp(const char *s1, const char *s2) +{ + for (; *s1 == *s2 && *s1; s1++, s2++); + return (unsigned char)*s1 - (unsigned char)*s2; +} + +static const char * +strchr(const char *s, char ch) +{ + for (; *s; s++) + if (*s == ch) + return s; + return 0; +} + +static int +memcmp(const void *p1, const void *p2, size_t n) +{ + const char *s1 = (const char *) p1; + const char *s2 = (const char *) p2; + for (; n > 0 && *s1 == *s2; s1++, s2++, n--); + if (n) + return (unsigned char)*s1 - (unsigned char)*s2; + else + return 0; +} + +static void +memset(void *p, char val, size_t n) +{ + char *s = (char *) p; + while (n--) + *s++ = val; +} + +static void * +malloc(size_t n) +{ + static char *heap_next; + static char *heap_end; + + if (!heap_next) { + heap_next = (char *) dmadat + sizeof(*dmadat); + heap_end = (char *) (640*1024); + } + + char *p = heap_next; + if (p + n > heap_end) { + printf("malloc failure\n"); + for (;;) + ; + return 0; + } + heap_next += n; + return p; +} + +static size_t +strlen(const char *s) +{ + size_t len 
= 0; + while (*s++) + len++; + return len; +} + +static char * +strdup(const char *s) +{ + char *p = malloc(strlen(s) + 1); + strcpy(p, s); + return p; +} + +#include "zfsimpl.c" + +/* + * Read from a dnode (which must be from a ZPL filesystem). + */ +static int +zfs_read(spa_t *spa, const dnode_phys_t *dnode, off_t *offp, void *start, size_t size) +{ + const znode_phys_t *zp = (const znode_phys_t *) dnode->dn_bonus; + size_t n; + int rc; + + n = size; + if (*offp + n > zp->zp_size) + n = zp->zp_size - *offp; + + rc = dnode_read(spa, dnode, *offp, start, n); + if (rc) + return (-1); + *offp += n; + + return (n); +} + +/* + * Current ZFS pool + */ +spa_t *spa; + +/* + * A wrapper for dskread that doesn't have to worry about whether the + * buffer pointer crosses a 64k boundary. + */ +static int +vdev_read(vdev_t *vdev, void *priv, off_t off, void *buf, size_t bytes) +{ + char *p; + unsigned int lba, nb; + struct dsk *dsk = (struct dsk *) priv; + + if ((off & (DEV_BSIZE - 1)) || (bytes & (DEV_BSIZE - 1))) + return -1; + + p = buf; + lba = off / DEV_BSIZE; + while (bytes > 0) { + nb = bytes / DEV_BSIZE; + if (nb > READ_BUF_SIZE / DEV_BSIZE) + nb = READ_BUF_SIZE / DEV_BSIZE; + if (drvread(dsk, dmadat->rdbuf, lba, nb)) + return -1; + memcpy(p, dmadat->rdbuf, nb * DEV_BSIZE); + p += nb * DEV_BSIZE; + lba += nb; + bytes -= nb * DEV_BSIZE; + } + + return 0; +} + +static int +xfsread(const dnode_phys_t *dnode, off_t *offp, void *buf, size_t nbyte) +{ + if ((size_t)zfs_read(spa, dnode, offp, buf, nbyte) != nbyte) { + printf("Invalid %s\n", "format"); + return -1; + } + return 0; +} + +static inline uint32_t +memsize(void) +{ + v86.addr = MEM_EXT; + v86.eax = 0x8800; + v86int(); + return v86.eax; +} + +static inline void +getstr(void) +{ + char *s; + int c; + + s = cmd; + for (;;) { + switch (c = xgetc(0)) { + case 0: + break; + case '\177': + case '\b': + if (s > cmd) { + s--; + printf("\b \b"); + } + break; + case '\n': + case '\r': + *s = 0; + return; + default: + if (s - 
cmd < sizeof(cmd) - 1) + *s++ = c; + putchar(c); + } + } +} + +static inline void +putc(int c) +{ + v86.addr = 0x10; + v86.eax = 0xe00 | (c & 0xff); + v86.ebx = 0x7; + v86int(); +} + +/* + * Try to detect a device supported by the legacy int13 BIOS + */ +static int +int13probe(int drive) +{ + v86.ctl = V86_FLAGS; + v86.addr = 0x13; + v86.eax = 0x800; + v86.edx = drive; + v86int(); + + if (!(v86.efl & 0x1) && /* carry clear */ + ((v86.edx & 0xff) != (drive & DRV_MASK))) { /* unit # OK */ + if ((v86.ecx & 0x3f) == 0) { /* absurd sector size */ + return(0); /* skip device */ + } + return (1); + } + return(0); +} + +static void +probe_drive(struct dsk *dsk, spa_t **spap) +{ + struct dos_partition *dp; + char *sec; + unsigned i; + + if (!int13probe(dsk->drive)) + return; + + /* + * If we find a vdev on the whole disk, stop here. Otherwise dig + * out the MBR and probe each slice in turn for a vdev. + */ + if (vdev_probe(vdev_read, dsk, spap) == 0) + return; + + sec = dmadat->secbuf; + dsk->start = 0; + if (drvread(dsk, sec, DOSBBSECTOR, 1)) + return; + dp = (void *)(sec + DOSPARTOFF); + + for (i = 0; i < NDOSPART; i++) { + if (!dp[i].dp_typ) + continue; + dsk->start = dp[i].dp_start; + if (vdev_probe(vdev_read, dsk, spap) == 0) { + /* + * We record the first pool we find (we will try to boot + * from that one. + */ + spap = 0; + + /* + * This slice had a vdev. We need a new dsk structure now + * sice the vdev now owns this one. + */ + struct dsk *newdsk; + newdsk = malloc(sizeof(struct dsk)); + *newdsk = *dsk; + dsk = newdsk; + } + } +} + +int +main(void) +{ + int autoboot, i; + dnode_phys_t dn; + off_t off; + struct dsk *dsk; + + dmadat = (void *)(roundup2(__base + (int32_t)&_end, 0x10000) - __base); + v86.ctl = V86_FLAGS; + + dsk = malloc(sizeof(struct dsk)); + dsk->drive = *(uint8_t *)PTOV(ARGS); + dsk->type = dsk->drive & DRV_HARD ? 
TYPE_AD : TYPE_FD; + dsk->unit = dsk->drive & DRV_MASK; + dsk->slice = *(uint8_t *)PTOV(ARGS + 1) + 1; + dsk->part = 0; + dsk->start = 0; + dsk->init = 0; + + bootinfo.bi_version = BOOTINFO_VERSION; + bootinfo.bi_size = sizeof(bootinfo); + bootinfo.bi_basemem = 0; /* XXX will be filled by loader or kernel */ + bootinfo.bi_extmem = memsize(); + bootinfo.bi_memsizes_valid++; + bootinfo.bi_bios_dev = dsk->drive; + + bootdev = MAKEBOOTDEV(dev_maj[dsk->type], + dsk->slice, dsk->unit, dsk->part), + + /* Process configuration file */ + + autoboot = 1; + + zfs_init(); + + /* + * Probe the boot drive first - we will try to boot from whatever + * pool we find on that drive. + */ + probe_drive(dsk, &spa); + + /* + * Probe the rest of the drives that the bios knows about. This + * will find any other available pools and it may fill in missing + * vdevs for the boot pool. + */ + for (i = 0; i < 4; i++) { + if ((i | DRV_HARD) == *(uint8_t *)PTOV(ARGS)) + continue; + + dsk = malloc(sizeof(struct dsk)); + dsk->drive = i | DRV_HARD; + dsk->type = dsk->drive & TYPE_AD; + dsk->unit = i; + dsk->slice = 0; + dsk->part = 0; + dsk->start = 0; + dsk->init = 0; + probe_drive(dsk, 0); + } + + /* + * If we didn't find a pool on the boot drive, default to the + * first pool we found, if any. + */ + if (!spa) { + spa = STAILQ_FIRST(&zfs_pools); + if (!spa) { + printf("No ZFS pools located, can't boot\n"); + for (;;) + ; + } + } + + zfs_mount_pool(spa); + + if (zfs_lookup(spa, PATH_CONFIG, &dn) == 0) { + off = 0; + xfsread(&dn, &off, cmd, sizeof(cmd)); + } + + if (*cmd) { + if (parse()) + autoboot = 0; + if (!OPT_CHECK(RBX_QUIET)) + printf("%s: %s", PATH_CONFIG, cmd); + /* Do not process this command twice */ + *cmd = 0; + } + + /* + * Try to exec stage 3 boot loader. If interrupted by a keypress, + * or in case of failure, try to load a kernel directly instead. 
+ */ + + if (autoboot && !*kname) { + memcpy(kname, PATH_BOOT3, sizeof(PATH_BOOT3)); + if (!keyhit(3*SECOND)) { + load(); + memcpy(kname, PATH_KERNEL, sizeof(PATH_KERNEL)); + } + } + + /* Present the user with the boot2 prompt. */ + + for (;;) { + if (!autoboot || !OPT_CHECK(RBX_QUIET)) + printf("\nFreeBSD/i386 boot\n" + "Default: %s:%s\n" + "boot: ", + spa->spa_name, kname); + if (ioctrl & IO_SERIAL) + sio_flush(); + if (!autoboot || keyhit(5*SECOND)) + getstr(); + else if (!autoboot || !OPT_CHECK(RBX_QUIET)) + putchar('\n'); + autoboot = 0; + if (parse()) + putchar('\a'); + else + load(); + } +} + +/* XXX - Needed for btxld to link the boot2 binary; do not remove. */ +void +exit(int x) +{ +} + +static void +load(void) +{ + union { + struct exec ex; + Elf32_Ehdr eh; + } hdr; + static Elf32_Phdr ep[2]; + static Elf32_Shdr es[2]; + caddr_t p; + dnode_phys_t dn; + off_t off; + uint32_t addr, x; + int fmt, i, j; + + if (zfs_lookup(spa, kname, &dn)) { + return; + } + off = 0; + if (xfsread(&dn, &off, &hdr, sizeof(hdr))) + return; + if (N_GETMAGIC(hdr.ex) == ZMAGIC) + fmt = 0; + else if (IS_ELF(hdr.eh)) + fmt = 1; + else { + printf("Invalid %s\n", "format"); + return; + } + if (fmt == 0) { + addr = hdr.ex.a_entry & 0xffffff; + p = PTOV(addr); + off = PAGE_SIZE; + if (xfsread(&dn, &off, p, hdr.ex.a_text)) + return; + p += roundup2(hdr.ex.a_text, PAGE_SIZE); + if (xfsread(&dn, &off, p, hdr.ex.a_data)) + return; + p += hdr.ex.a_data + roundup2(hdr.ex.a_bss, PAGE_SIZE); + bootinfo.bi_symtab = VTOP(p); + memcpy(p, &hdr.ex.a_syms, sizeof(hdr.ex.a_syms)); + p += sizeof(hdr.ex.a_syms); + if (hdr.ex.a_syms) { + if (xfsread(&dn, &off, p, hdr.ex.a_syms)) + return; + p += hdr.ex.a_syms; + if (xfsread(&dn, &off, p, sizeof(int))) + return; + x = *(uint32_t *)p; + p += sizeof(int); + x -= sizeof(int); + if (xfsread(&dn, &off, p, x)) + return; + p += x; + } + } else { + off = hdr.eh.e_phoff; + for (j = i = 0; i < hdr.eh.e_phnum && j < 2; i++) { + if (xfsread(&dn, &off, ep + j, 
sizeof(ep[0]))) + return; + if (ep[j].p_type == PT_LOAD) + j++; + } + for (i = 0; i < 2; i++) { + p = PTOV(ep[i].p_paddr & 0xffffff); + off = ep[i].p_offset; + if (xfsread(&dn, &off, p, ep[i].p_filesz)) + return; + } + p += roundup2(ep[1].p_memsz, PAGE_SIZE); + bootinfo.bi_symtab = VTOP(p); + if (hdr.eh.e_shnum == hdr.eh.e_shstrndx + 3) { + off = hdr.eh.e_shoff + sizeof(es[0]) * + (hdr.eh.e_shstrndx + 1); + if (xfsread(&dn, &off, &es, sizeof(es))) + return; + for (i = 0; i < 2; i++) { + memcpy(p, &es[i].sh_size, sizeof(es[i].sh_size)); + p += sizeof(es[i].sh_size); + off = es[i].sh_offset; + if (xfsread(&dn, &off, p, es[i].sh_size)) + return; + p += es[i].sh_size; + } + } + addr = hdr.eh.e_entry & 0xffffff; + } + bootinfo.bi_esymtab = VTOP(p); + bootinfo.bi_kernelname = VTOP(kname); + __exec((caddr_t)addr, RB_BOOTINFO | (opts & RBX_MASK), + bootdev, + KARGS_FLAGS_ZFS, + (uint32_t) spa->spa_guid, + (uint32_t) (spa->spa_guid >> 32), + VTOP(&bootinfo)); +} + +static int +parse() +{ + char *arg = cmd; + char *ep, *p, *q; + const char *cp; + //unsigned int drv; + int c, i, j; + + while ((c = *arg++)) { + if (c == ' ' || c == '\t' || c == '\n') + continue; + for (p = arg; *p && *p != '\n' && *p != ' ' && *p != '\t'; p++); + ep = p; + if (*p) + *p++ = 0; + if (c == '-') { + while ((c = *arg++)) { + if (c == 'P') { + if (*(uint8_t *)PTOV(0x496) & 0x10) { + cp = "yes"; + } else { + opts |= OPT_SET(RBX_DUAL) | OPT_SET(RBX_SERIAL); + cp = "no"; + } + printf("Keyboard: %s\n", cp); + continue; + } else if (c == 'S') { + j = 0; + while ((unsigned int)(i = *arg++ - '0') <= 9) + j = j * 10 + i; + if (j > 0 && i == -'0') { + comspeed = j; + break; + } + /* Fall through to error below ('S' not in optstr[]). */ + } + for (i = 0; c != optstr[i]; i++) + if (i == NOPT - 1) + return -1; + opts ^= OPT_SET(flags[i]); + } + ioctrl = OPT_CHECK(RBX_DUAL) ? (IO_SERIAL|IO_KEYBOARD) : + OPT_CHECK(RBX_SERIAL) ? 
IO_SERIAL : IO_KEYBOARD; + if (ioctrl & IO_SERIAL) + sio_init(115200 / comspeed); + } if (c == '?') { + dnode_phys_t dn; + + if (zfs_lookup(spa, arg, &dn) == 0) { + zap_list(spa, &dn); + } + return -1; + } else { + arg--; + + /* + * Report pool status if the comment is 'status'. Lets + * hope no-one wants to load /status as a kernel. + */ + if (!strcmp(arg, "status")) { + spa_all_status(); + return -1; + } + + /* + * If there is a colon, switch pools. + */ + q = (char *) strchr(arg, ':'); + if (q) { + spa_t *newspa; + + *q++ = 0; + newspa = spa_find_by_name(arg); + if (newspa) { + spa = newspa; + zfs_mount_pool(spa); + } else { + printf("\nCan't find ZFS pool %s\n", arg); + return -1; + } + arg = q; + } + if ((i = ep - arg)) { + if ((size_t)i >= sizeof(kname)) + return -1; + memcpy(kname, arg, i + 1); + } + } + arg = p; + } + return 0; +} + +static void +printf(const char *fmt,...) +{ + va_list ap; + char buf[10]; + char *s; + unsigned u; + int c; + int minus; + int prec; + int len; + int pad; + + va_start(ap, fmt); + while ((c = *fmt++)) { + if (c == '%') { + minus = 0; + prec = 0; + nextfmt: + c = *fmt++; + switch (c) { + case '-': + minus = 1; + goto nextfmt; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + prec = 10 * prec + (c - '0'); + goto nextfmt; + case 'c': + putchar(va_arg(ap, int)); + continue; + case 's': + s = va_arg(ap, char *); + if (prec) { + len = strlen(s); + if (len < prec) + pad = prec - len; + else + pad = 0; + if (minus) + while (pad--) + putchar(' '); + for (; *s; s++) + putchar(*s); + if (!minus) + while (pad--) + putchar(' '); + } else { + for (; *s; s++) + putchar(*s); + } + continue; + case 'u': + u = va_arg(ap, unsigned); + s = buf; + do + *s++ = '0' + u % 10U; + while (u /= 10U); + while (--s >= buf) + putchar(*s); + continue; + } + } + putchar(c); + } + va_end(ap); + return; +} + +static void +putchar(int c) +{ + if (c == '\n') + xputc('\r'); + xputc(c); +} + 
+static int +drvread(struct dsk *dsk, void *buf, unsigned lba, unsigned nblk) +{ + static unsigned c = 0x2d5c7c2f; + + lba += dsk->start; + if (!OPT_CHECK(RBX_QUIET)) + printf("%c\b", c = c << 8 | c >> 24); + v86.ctl = V86_ADDR | V86_CALLF | V86_FLAGS; + v86.addr = XREADORG; /* call to xread in boot1 */ + v86.es = VTOPSEG(buf); + v86.eax = lba; + v86.ebx = VTOPOFF(buf); + v86.ecx = lba >> 16; + v86.edx = nblk << 8 | dsk->drive; + v86int(); + v86.ctl = V86_FLAGS; + if (V86_CY(v86.efl)) { + printf("error %u lba %u\n", v86.eax >> 8 & 0xff, lba); + return -1; + } + return 0; +} + +static int +keyhit(unsigned ticks) +{ + uint32_t t0, t1; + + if (OPT_CHECK(RBX_NOINTR)) + return 0; + t0 = 0; + for (;;) { + if (xgetc(1)) + return 1; + t1 = *(uint32_t *)PTOV(0x46c); + if (!t0) + t0 = t1; + if (t1 < t0 || t1 >= t0 + ticks) + return 0; + } +} + +static int +xputc(int c) +{ + if (ioctrl & IO_KEYBOARD) + putc(c); + if (ioctrl & IO_SERIAL) + sio_putc(c); + return c; +} + +static int +xgetc(int fn) +{ + if (OPT_CHECK(RBX_NOINTR)) + return 0; + for (;;) { + if (ioctrl & IO_KEYBOARD && getc(1)) + return fn ? 1 : getc(0); + if (ioctrl & IO_SERIAL && sio_ischar()) + return fn ? 1 : sio_getc(); + if (fn) + return 0; + } +} + +static int +getc(int fn) +{ + /* + * The extra comparison against zero is an attempt to work around + * what appears to be a bug in QEMU and Bochs. Both emulators + * sometimes report a key-press with scancode one and ascii zero + * when no such key is pressed in reality. As far as I can tell, + * this only happens shortly after a reboot. + */ + v86.addr = 0x16; + v86.eax = fn << 8; + v86int(); + return fn == 0 ? v86.eax & 0xff : (!V86_ZR(v86.efl) && (v86.eax & 0xff)); +} diff --git a/sys/boot/i386/zfsboot/zfsldr.S b/sys/boot/i386/zfsboot/zfsldr.S new file mode 100644 index 0000000..a256d30 --- /dev/null +++ b/sys/boot/i386/zfsboot/zfsldr.S @@ -0,0 +1,402 @@ +/* + * Copyright (c) 1998 Robert Nordier + * All rights reserved. 
+ *
+ * Redistribution and use in source and binary forms are freely
+ * permitted provided that the above copyright notice and this
+ * paragraph and the following disclaimer are duplicated in all
+ * such forms.
+ *
+ * This software is provided "AS IS" and without any express or
+ * implied warranties, including, without limitation, the implied
+ * warranties of merchantability and fitness for a particular
+ * purpose.
+ *
+ * $FreeBSD$
+ */
+
+/* Memory Locations */
+		.set MEM_REL,0x700		# Relocation address
+		.set MEM_ARG,0x900		# Arguments
+		.set MEM_ORG,0x7c00		# Origin
+		.set MEM_BUF,0x8000		# Load area
+		.set MEM_BTX,0x9000		# BTX start
+		.set MEM_JMP,0x9010		# BTX entry point
+		.set MEM_USR,0xa000		# Client start
+		.set BDA_BOOT,0x472		# Boot howto flag
+
+/* Partition Constants */
+		.set PRT_OFF,0x1be		# Partition offset
+		.set PRT_NUM,0x4		# Partitions
+		.set PRT_BSD,0xa5		# Partition type
+
+/* Flag Bits */
+		.set FL_PACKET,0x80		# Packet mode
+
+/* Misc. Constants */
+		.set SIZ_PAG,0x1000		# Page size
+		.set SIZ_SEC,0x200		# Sector size
+
+		.set NSECT,0x40			/* Sectors of boot2 loaded by main.5 */
+		.globl start
+		.globl xread
+		.code16
+
+start:		jmp main			# Start recognizably
+
+/*
+ * This is the start of a standard BIOS Parameter Block (BPB). Most bootable
+ * FAT disks have this at the start of their MBR. While normal BIOS's will
+ * work fine without this section, IBM's El Torito emulation "fixes" up the
+ * BPB by writing into the memory copy of the MBR. Rather than have data
+ * written into our xread routine, we'll define a BPB to work around it.
+ * The data marked with (T) indicates a field required for a ThinkPad to
+ * recognize the disk and (W) indicates fields written from IBM BIOS code.
+ * The use of the BPB is based on what OpenBSD and NetBSD implemented in
+ * their boot code but the required fields were determined by trial and error.
+ *
+ * Note: If additional space is needed in boot1, one solution would be to
+ * move the "prompt" message data (below) to replace the OEM ID.
+ */
+		.org 0x03, 0x00
+oemid:		.space 0x08, 0x00	# OEM ID
+
+		.org 0x0b, 0x00
+bpb:		.word 512		# sector size (T)
+		.byte 0			# sectors/clustor
+		.word 0			# reserved sectors
+		.byte 0			# number of FATs
+		.word 0			# root entries
+		.word 0			# small sectors
+		.byte 0			# media type (W)
+		.word 0			# sectors/fat
+		.word 18		# sectors per track (T)
+		.word 2			# number of heads (T)
+		.long 0			# hidden sectors (W)
+		.long 0			# large sectors
+
+		.org 0x24, 0x00
+ebpb:		.byte 0			# BIOS physical drive number (W)
+
+		.org 0x25,0x90
+/*
+ * Trampoline used by boot2 to call read to read data from the disk via
+ * the BIOS. Call with:
+ *
+ * %cx:%ax	- long    - LBA to read in
+ * %es:(%bx)	- caddr_t - buffer to read data into
+ * %dl		- byte    - drive to read from
+ * %dh		- byte    - num sectors to read
+ */
+
+xread:		push %ss			# Address
+		pop %ds				#  data
+/*
+ * Setup an EDD disk packet and pass it to read
+ */
+xread.1:					# Starting
+		pushl $0x0			#  absolute
+		push %cx			#  block
+		push %ax			#  number
+		push %es			# Address of
+		push %bx			#  transfer buffer
+		xor %ax,%ax			# Number of
+		movb %dh,%al			#  blocks to
+		push %ax			#  transfer
+		push $0x10			# Size of packet
+		mov %sp,%bp			# Packet pointer
+		callw read			# Read from disk
+		lea 0x10(%bp),%sp		# Clear stack
+		lret				# To far caller
+/*
+ * Load the rest of boot2 and BTX up, copy the parts to the right locations,
+ * and start it all up.
+ */
+
+/*
+ * Setup the segment registers to flat addressing (segment 0) and setup the
+ * stack to end just below the start of our code.
+ */
+main:		cld				# String ops inc
+		xor %cx,%cx			# Zero
+		mov %cx,%es			# Address
+		mov %cx,%ds			#  data
+		mov %cx,%ss			# Set up
+		mov $start,%sp			#  stack
+/*
+ * Relocate ourself to MEM_REL.  Since %cx == 0, the inc %ch sets
+ * %cx == 0x100.
+ */
+		mov %sp,%si			# Source
+		mov $MEM_REL,%di		# Destination
+		incb %ch			# Word count
+		rep				# Copy
+		movsw				#  code
+/*
+ * If we are on a hard drive, then load the MBR and look for the first
+ * FreeBSD slice.  We use the fake partition entry below that points to
+ * the MBR when we call nread.  The first pass looks for the first active
+ * FreeBSD slice.  The second pass looks for the first non-active FreeBSD
+ * slice if the first one fails.
+ */
+		mov $part4,%si			# Partition
+		cmpb $0x80,%dl			# Hard drive?
+		jb main.4			# No
+		movb $0x1,%dh			# Block count
+		callw nread			# Read MBR
+		mov $0x1,%cx			# Two passes
+main.1:		mov $MEM_BUF+PRT_OFF,%si	# Partition table
+		movb $0x1,%dh			# Partition
+main.2:		cmpb $PRT_BSD,0x4(%si)		# Our partition type?
+		jne main.3			# No
+		jcxz main.5			# If second pass
+		testb $0x80,(%si)		# Active?
+		jnz main.5			# Yes
+main.3:		add $0x10,%si			# Next entry
+		incb %dh			# Partition
+		cmpb $0x1+PRT_NUM,%dh		# In table?
+		jb main.2			# Yes
+		dec %cx				# Do two
+		jcxz main.1			#  passes
+/*
+ * If we get here, we didn't find any FreeBSD slices at all, so print an
+ * error message and die.
+ */
+		mov $msg_part,%si		# Message
+		jmp error			# Error
+/*
+ * Floppies use partition 0 of drive 0.
+ */
+main.4:		xor %dx,%dx			# Partition:drive
+
+/*
+ * Ok, we have a slice and drive in %dx now, so use that to locate and
+ * load boot2.  %si references the start of the slice we are looking
+ * for, so go ahead and load up the 64 sectors starting at sector 1024
+ * (i.e. after the two vdev labels).  We don't have to do anything fancy
+ * here to allow for an extra copy of boot1 and a partition table
+ * (compare to this section of the UFS bootstrap) so we just load it
+ * all at 0x8000. The first part of boot2 is BTX, which wants to run
+ * at 0x9000. The boot2.bin binary starts right after the end of BTX,
+ * so we have to figure out where the start of it is and then move the
+ * binary to 0xc000.  After we have moved the client, we relocate BTX
+ * itself to 0x9000 - doing it in this order means that none of the
+ * memcpy regions overlap which would corrupt the copy.  Normally, BTX
+ * clients start at MEM_USR, or 0xa000, but when we use btxld to
+ * create boot2, we use an entry point of 0x2000.  That entry point is
+ * relative to MEM_USR; thus boot2.bin starts at 0xc000.
+ *
+ * The load area and the target area for the client overlap so we have
+ * to use a decrementing string move. We also play segment register
+ * games with the destination address for the move so that the client
+ * can be larger than 16k (which would overflow the zero segment since
+ * the client starts at 0xc000). Relocating BTX is easy since the load
+ * area and target area do not overlap.
+ */
+main.5:		mov %dx,MEM_ARG			# Save args
+		movb $NSECT,%dh			# Sector count
+		movw $1024,%ax			# Offset to boot2
+		callw nread.1			# Read disk
+main.6:		mov $MEM_BUF,%si		# BTX (before reloc)
+		mov 0xa(%si),%bx		# Get BTX length and set
+		mov $NSECT*SIZ_SEC-1,%di	# Size of load area (less one)
+		mov %di,%si			# End of load
+		add $MEM_BUF,%si		#  area
+		sub %bx,%di			# End of client, 0xc000 rel
+		mov %di,%cx			# Size of
+		inc %cx				#  client
+		mov $(MEM_USR+2*SIZ_PAG)>>4,%dx	# Segment
+		mov %dx,%es			#  addressing 0xc000
+		std				# Move with decrement
+		rep				# Relocate
+		movsb				#  client
+		mov %ds,%dx			# Back to
+		mov %dx,%es			#  zero segment
+		mov $MEM_BUF,%si		# BTX (before reloc)
+		mov $MEM_BTX,%di		# BTX
+		mov %bx,%cx			# Get BTX length
+		cld				# Increment this time
+		rep				# Relocate
+		movsb				#  BTX
+
+/*
+ * Enable A20 so we can access memory above 1 meg.
+ * Use the zero-valued %cx as a timeout for embedded hardware which do not
+ * have a keyboard controller.
+ */
+seta20:		cli				# Disable interrupts
+seta20.1:	dec %cx				# Timeout?
+		jz seta20.3			# Yes
+		inb $0x64,%al			# Get status
+		testb $0x2,%al			# Busy?
+		jnz seta20.1			# Yes
+		movb $0xd1,%al			# Command: Write
+		outb %al,$0x64			#  output port
+seta20.2:	inb $0x64,%al			# Get status
+		testb $0x2,%al			# Busy?
+		jnz seta20.2			# Yes
+		movb $0xdf,%al			# Enable
+		outb %al,$0x60			#  A20
+seta20.3:	sti				# Enable interrupts
+
+		jmp start+MEM_JMP-MEM_ORG	# Start BTX
+
+
+/*
+ * Trampoline used to call read from within boot1.
+ *
+ * nread starts at sector offset 0 of the partition entry at (%si);
+ * nread.1 expects the sector offset in %ax.  The LBA handed to xread.1
+ * is %cx:%ax and the data lands in MEM_BUF.
+ *
+ * NOTE(review): the offset is added to the low 16 bits only; a carry
+ * out of %ax is not propagated, because %cx is then loaded straight
+ * from 0xa(%si).
+ */
+nread:		xor %ax,%ax			# Sector offset in partition
+nread.1:	mov $MEM_BUF,%bx		# Transfer buffer
+		add 0x8(%si),%ax		# Get
+		mov 0xa(%si),%cx		#  LBA
+		push %cs			# Read from
+		callw xread.1			#  disk
+		jnc return			# If success, return
+		mov $msg_read,%si		# Otherwise, set the error
+						#  message and fall through to
+						#  the error routine
+/*
+ * Print out the error message pointed to by %ds:(%si) followed
+ * by a prompt, wait for a keypress, and then reboot the machine.
+ */
+error:		callw putstr			# Display message
+		mov $prompt,%si			# Display
+		callw putstr			#  prompt
+		xorb %ah,%ah			# BIOS: Get
+		int $0x16			#  keypress
+		movw $0x1234, BDA_BOOT		# Do a warm boot
+		ljmp $0xffff,$0x0		# reboot the machine
+/*
+ * Display a null-terminated string using the BIOS output.
+ */
+putstr.0:	mov $0x7,%bx			# Page:attribute
+		movb $0xe,%ah			# BIOS: Display
+		int $0x10			#  character
+putstr:		lodsb				# Get char
+		testb %al,%al			# End of string?
+		jne putstr.0			# No
+
+/*
+ * Overused return code.  ereturn is used to return an error from the
+ * read function.  Since we assume putstr succeeds, we (ab)use the
+ * same code when we return from putstr.
+ */
+ereturn:	movb $0x1,%ah			# Invalid
+		stc				#  argument
+return:		retw				# To caller
+/*
+ * Reads sectors from the disk.  If EDD is enabled, then check if it is
+ * installed and use it if it is.  If it is not installed or not enabled, then
+ * fall back to using CHS.  Since we use a LBA, if we are using CHS, we have to
+ * fetch the drive parameters from the BIOS and divide it out ourselves.
+ * Call with:
+ *
+ * %dl	- byte     - drive number
+ * stack - 10 bytes - EDD Packet
+ *
+ * In this file the CHS fallback is compiled out (#if 0): read.1 only
+ * prints msg_chs and jumps to the error routine.
+ */
+read:		testb $FL_PACKET,%cs:MEM_REL+flags-start # LBA support enabled?
+		jz read.1			# No, use CHS
+		cmpb $0x80,%dl			# Hard drive?
+		jb read.1			# No, use CHS
+		mov $0x55aa,%bx			# Magic
+		push %dx			# Save
+		movb $0x41,%ah			# BIOS: Check
+		int $0x13			#  extensions present
+		pop %dx				# Restore
+		jc read.1			# If error, use CHS
+		cmp $0xaa55,%bx			# Magic?
+		jne read.1			# No, so use CHS
+		testb $0x1,%cl			# Packet interface?
+		jz read.1			# No, so use CHS
+		mov %bp,%si			# Disk packet
+		movb $0x42,%ah			# BIOS: Extended
+		int $0x13			#  read
+		retw				# To caller
+#if 0
+read.1:		push %dx			# Save
+		movb $0x8,%ah			# BIOS: Get drive
+		int $0x13			#  parameters
+		movb %dh,%ch			# Max head number
+		pop %dx				# Restore
+		jc return			# If error
+		andb $0x3f,%cl			# Sectors per track
+		jz ereturn			# If zero
+		cli				# Disable interrupts
+		mov 0x8(%bp),%eax		# Get LBA
+		push %dx			# Save
+		movzbl %cl,%ebx			# Divide by
+		xor %edx,%edx			#  sectors
+		div %ebx			#  per track
+		movb %ch,%bl			# Max head number
+		movb %dl,%ch			# Sector number
+		inc %bx				# Divide by
+		xorb %dl,%dl			#  number
+		div %ebx			#  of heads
+		movb %dl,%bh			# Head number
+		pop %dx				# Restore
+		cmpl $0x3ff,%eax		# Cylinder number supportable?
+		sti				# Enable interrupts
+		ja ereturn			# No, return an error
+		xchgb %al,%ah			# Set up cylinder
+		rorb $0x2,%al			#  number
+		orb %ch,%al			# Merge
+		inc %ax				#  sector
+		xchg %ax,%cx			#  number
+		movb %bh,%dh			# Head number
+		subb %ah,%al			# Sectors this track
+		mov 0x2(%bp),%ah		# Blocks to read
+		cmpb %ah,%al			# To read
+		jb read.2			#  this
+#ifdef	TRACK_AT_A_TIME
+		movb %ah,%al			#  track
+#else
+		movb $1,%al			#  one sector
+#endif
+read.2:		mov $0x5,%di			# Try count
+read.3:		les 0x4(%bp),%bx		# Transfer buffer
+		push %ax			# Save
+		movb $0x2,%ah			# BIOS: Read
+		int $0x13			#  from disk
+		pop %bx				# Restore
+		jnc read.4			# If success
+		dec %di				# Retry?
+		jz read.6			# No
+		xorb %ah,%ah			# BIOS: Reset
+		int $0x13			#  disk system
+		xchg %bx,%ax			# Block count
+		jmp read.3			# Continue
+read.4:		movzbw %bl,%ax			# Sectors read
+		add %ax,0x8(%bp)		# Adjust
+		jnc read.5			#  LBA,
+		incw 0xa(%bp)			#  transfer
+read.5:		shlb %bl			#  buffer
+		add %bl,0x5(%bp)		#  pointer,
+		sub %al,0x2(%bp)		#  block count
+		ja read.1			# If not done
+read.6:		retw				# To caller
+#else
+read.1:		mov $msg_chs,%si
+		jmp error
+msg_chs:	.asciz "CHS not supported"
+#endif
+
+/* Messages */
+
+msg_read:	.asciz "Read"
+msg_part:	.asciz "Boot"
+
+prompt:		.asciz " error\r\n"
+
+flags:		.byte FLAGS			# Flags
+
+		.org PRT_OFF,0x90
+
+/* Partition table */
+
+		.fill 0x30,0x1,0x0
+part4:		.byte 0x80, 0x00, 0x01, 0x00
+		.byte 0xa5, 0xfe, 0xff, 0xff
+		.byte 0x00, 0x00, 0x00, 0x00
+		.byte 0x50, 0xc3, 0x00, 0x00	# 50000 sectors long, bleh
+
+		.word 0xaa55			# Magic number
diff --git a/sys/boot/zfs/Makefile b/sys/boot/zfs/Makefile
new file mode 100644
index 0000000..723233c
--- /dev/null
+++ b/sys/boot/zfs/Makefile
@@ -0,0 +1,29 @@
+# $FreeBSD$
+
+LIB=		zfsboot
+INTERNALLIB=
+
+SRCS+=		zfs.c
+
+CFLAGS+=	-I${.CURDIR}/../common -I${.CURDIR}/../.. -I.
+CFLAGS+=	-I${.CURDIR}/../../../lib/libstand
+CFLAGS+=	-I${.CURDIR}/../../cddl/boot/zfs
+
+# XXX need arch-specific bootstrap CFLAGS here
+#
+CFLAGS+=	-ffreestanding -mpreferred-stack-boundary=2 \
+		-mno-mmx -mno-3dnow -mno-sse -mno-sse2 -mno-sse3
+
+CFLAGS+=	-Wformat -Wall
+
+.if ${MACHINE_ARCH} == "amd64"
+CLEANFILES+=    machine
+machine:
+	ln -sf ${.CURDIR}/../../../i386/include machine
+.endif
+
+.include <bsd.lib.mk>
+
+.if ${MACHINE_ARCH} == "amd64"
+beforedepend ${OBJS}: machine
+.endif
diff --git a/sys/boot/zfs/zfs.c b/sys/boot/zfs/zfs.c
new file mode 100644
index 0000000..cf0bb9c
--- /dev/null
+++ b/sys/boot/zfs/zfs.c
@@ -0,0 +1,514 @@
+/*-
+ * Copyright (c) 2007 Doug Rabson
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/* + * Stand-alone file reading package. 
+ */
+
+#include <sys/param.h>
+#include <sys/disklabel.h>
+#include <sys/time.h>
+#include <sys/queue.h>
+#include <stddef.h>
+#include <stdarg.h>
+#include <string.h>
+#include <stand.h>
+#include <bootstrap.h>
+
+#include "zfsimpl.c"	/* the reader implementation is compiled into this unit */
+
+static int	zfs_open(const char *path, struct open_file *f);
+static int	zfs_write(struct open_file *f, void *buf, size_t size, size_t *resid);
+static int	zfs_close(struct open_file *f);
+static int	zfs_read(struct open_file *f, void *buf, size_t size, size_t *resid);
+static off_t	zfs_seek(struct open_file *f, off_t offset, int where);
+static int	zfs_stat(struct open_file *f, struct stat *sb);
+static int	zfs_readdir(struct open_file *f, struct dirent *d);
+
+struct devsw zfs_dev;	/* tentative definition; initialised at the end of the file */
+
+struct fs_ops zfs_fsops = {
+	"zfs",
+	zfs_open,
+	zfs_close,
+	zfs_read,
+	zfs_write,
+	zfs_seek,
+	zfs_stat,
+	zfs_readdir
+};
+
+/*
+ * In-core open file.
+ *
+ * One of these is allocated by zfs_open() and hung off f->f_fsdata.
+ * For regular reads f_seekp is a byte offset into the object; while
+ * zfs_readdir() is iterating it doubles as the ZAP cursor.  The
+ * f_zap_* members are only filled in by zfs_readdir().
+ */
+struct file {
+	off_t		f_seekp;	/* seek pointer */
+	dnode_phys_t	f_dnode;	/* dnode filled in by zfs_lookup() */
+	uint64_t	f_zap_type;	/* zap type for readdir */
+	uint64_t	f_num_leafs;	/* number of fzap leaf blocks */
+	zap_leaf_phys_t	*f_zap_leaf;	/* zap leaf buffer */
+};
+
+/*
+ * Open a file.
+ */ +static int +zfs_open(const char *upath, struct open_file *f) +{ + spa_t *spa = (spa_t *) f->f_devdata; + struct file *fp; + int rc; + + if (f->f_dev != &zfs_dev) + return (EINVAL); + + rc = zfs_mount_pool(spa); + if (rc) + return (rc); + + /* allocate file system specific data structure */ + fp = malloc(sizeof(struct file)); + bzero(fp, sizeof(struct file)); + f->f_fsdata = (void *)fp; + + if (spa->spa_root_objset.os_type != DMU_OST_ZFS) { + printf("Unexpected object set type %lld\n", + spa->spa_root_objset.os_type); + rc = EIO; + goto out; + } + + rc = zfs_lookup(spa, upath, &fp->f_dnode); + if (rc) + goto out; + + fp->f_seekp = 0; +out: + if (rc) { + f->f_fsdata = NULL; + free(fp); + } + return (rc); +} + +static int +zfs_close(struct open_file *f) +{ + struct file *fp = (struct file *)f->f_fsdata; + + dnode_cache_obj = 0; + f->f_fsdata = (void *)0; + if (fp == (struct file *)0) + return (0); + + free(fp); + return (0); +} + +/* + * Copy a portion of a file into kernel memory. + * Cross block boundaries when necessary. + */ +static int +zfs_read(struct open_file *f, void *start, size_t size, size_t *resid /* out */) +{ + spa_t *spa = (spa_t *) f->f_devdata; + struct file *fp = (struct file *)f->f_fsdata; + const znode_phys_t *zp = (const znode_phys_t *) fp->f_dnode.dn_bonus; + size_t n; + int rc; + + n = size; + if (fp->f_seekp + n > zp->zp_size) + n = zp->zp_size - fp->f_seekp; + + rc = dnode_read(spa, &fp->f_dnode, fp->f_seekp, start, n); + if (rc) + return (rc); + + if (0) { + int i; + for (i = 0; i < n; i++) + putchar(((char*) start)[i]); + } + fp->f_seekp += n; + if (resid) + *resid = size - n; + + return (0); +} + +/* + * Don't be silly - the bootstrap has no business writing anything. 
+ */ +static int +zfs_write(struct open_file *f, void *start, size_t size, size_t *resid /* out */) +{ + + return (EROFS); +} + +static off_t +zfs_seek(struct open_file *f, off_t offset, int where) +{ + struct file *fp = (struct file *)f->f_fsdata; + znode_phys_t *zp = (znode_phys_t *) fp->f_dnode.dn_bonus; + + switch (where) { + case SEEK_SET: + fp->f_seekp = offset; + break; + case SEEK_CUR: + fp->f_seekp += offset; + break; + case SEEK_END: + fp->f_seekp = zp->zp_size - offset; + break; + default: + errno = EINVAL; + return (-1); + } + return (fp->f_seekp); +} + +static int +zfs_stat(struct open_file *f, struct stat *sb) +{ + struct file *fp = (struct file *)f->f_fsdata; + znode_phys_t *zp = (znode_phys_t *) fp->f_dnode.dn_bonus; + + /* only important stuff */ + sb->st_mode = zp->zp_mode; + sb->st_uid = zp->zp_uid; + sb->st_gid = zp->zp_gid; + sb->st_size = zp->zp_size; + + return (0); +} + +static int +zfs_readdir(struct open_file *f, struct dirent *d) +{ + spa_t *spa = (spa_t *) f->f_devdata; + struct file *fp = (struct file *)f->f_fsdata; + znode_phys_t *zp = (znode_phys_t *) fp->f_dnode.dn_bonus; + mzap_ent_phys_t mze; + size_t bsize = fp->f_dnode.dn_datablkszsec << SPA_MINBLOCKSHIFT; + int rc; + + if ((zp->zp_mode >> 12) != 0x4) { + return (ENOTDIR); + } + + /* + * If this is the first read, get the zap type. 
+ */ + if (fp->f_seekp == 0) { + rc = dnode_read(spa, &fp->f_dnode, + 0, &fp->f_zap_type, sizeof(fp->f_zap_type)); + if (rc) + return (rc); + + if (fp->f_zap_type == ZBT_MICRO) { + fp->f_seekp = offsetof(mzap_phys_t, mz_chunk); + } else { + rc = dnode_read(spa, &fp->f_dnode, + offsetof(zap_phys_t, zap_num_leafs), + &fp->f_num_leafs, + sizeof(fp->f_num_leafs)); + if (rc) + return (rc); + + fp->f_seekp = bsize; + fp->f_zap_leaf = (zap_leaf_phys_t *)malloc(bsize); + rc = dnode_read(spa, &fp->f_dnode, + fp->f_seekp, + fp->f_zap_leaf, + bsize); + if (rc) + return (rc); + } + } + + if (fp->f_zap_type == ZBT_MICRO) { + mzap_next: + if (fp->f_seekp >= bsize) + return (ENOENT); + + rc = dnode_read(spa, &fp->f_dnode, + fp->f_seekp, &mze, sizeof(mze)); + fp->f_seekp += sizeof(mze); + + if (!mze.mze_name[0]) + goto mzap_next; + + d->d_fileno = ZFS_DIRENT_OBJ(mze.mze_value); + d->d_type = ZFS_DIRENT_TYPE(mze.mze_value); + strcpy(d->d_name, mze.mze_name); + d->d_namlen = strlen(d->d_name); + return (0); + } else { + zap_leaf_t zl; + zap_leaf_chunk_t *zc, *nc; + int chunk; + size_t namelen; + char *p; + uint64_t value; + + /* + * Initialise this so we can use the ZAP size + * calculating macros. + */ + zl.l_bs = ilog2(bsize); + zl.l_phys = fp->f_zap_leaf; + + /* + * Figure out which chunk we are currently looking at + * and consider seeking to the next leaf. We use the + * low bits of f_seekp as a simple chunk index. + */ + fzap_next: + chunk = fp->f_seekp & (bsize - 1); + if (chunk == ZAP_LEAF_NUMCHUNKS(&zl)) { + fp->f_seekp = (fp->f_seekp & ~(bsize - 1)) + bsize; + chunk = 0; + + /* + * Check for EOF and read the new leaf. 
+ */ + if (fp->f_seekp >= bsize * fp->f_num_leafs) + return (ENOENT); + + rc = dnode_read(spa, &fp->f_dnode, + fp->f_seekp, + fp->f_zap_leaf, + bsize); + if (rc) + return (rc); + } + + zc = &ZAP_LEAF_CHUNK(&zl, chunk); + fp->f_seekp++; + if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY) + goto fzap_next; + + namelen = zc->l_entry.le_name_length; + if (namelen > sizeof(d->d_name)) + namelen = sizeof(d->d_name); + + /* + * Paste the name back together. + */ + nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk); + p = d->d_name; + while (namelen > 0) { + int len; + len = namelen; + if (len > ZAP_LEAF_ARRAY_BYTES) + len = ZAP_LEAF_ARRAY_BYTES; + memcpy(p, nc->l_array.la_array, len); + p += len; + namelen -= len; + nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next); + } + d->d_name[sizeof(d->d_name) - 1] = 0; + + /* + * Assume the first eight bytes of the value are + * a uint64_t. + */ + value = fzap_leaf_value(&zl, zc); + + d->d_fileno = ZFS_DIRENT_OBJ(value); + d->d_type = ZFS_DIRENT_TYPE(value); + d->d_namlen = strlen(d->d_name); + + return (0); + } +} + +static int +vdev_read(vdev_t *vdev, void *priv, off_t offset, void *buf, size_t size) +{ + int fd; + + fd = (uintptr_t) priv; + lseek(fd, offset, SEEK_SET); + if (read(fd, buf, size) == size) { + return 0; + } else { + return (EIO); + } +} + +/* + * Convert a pool guid to a 'unit number' suitable for use with zfs_dev_open. + */ +int +zfs_guid_to_unit(uint64_t guid) +{ + spa_t *spa; + int unit; + + unit = 0; + STAILQ_FOREACH(spa, &zfs_pools, spa_link) { + if (spa->spa_guid == guid) + return unit; + unit++; + } + return (-1); +} + +static int +zfs_dev_init(void) +{ + char devname[512]; + int unit, slice; + int fd; + + /* + * Open all the disks we can find and see if we can reconstruct + * ZFS pools from them. Bogusly assumes that the disks are named + * diskN or diskNsM. 
+ */ + zfs_init(); + for (unit = 0; unit < 32 /* XXX */; unit++) { + sprintf(devname, "disk%d:", unit); + fd = open(devname, O_RDONLY); + if (fd == -1) + continue; + + /* + * If we find a vdev, the zfs code will eat the fd, otherwise + * we close it. + */ + if (vdev_probe(vdev_read, (void*) (uintptr_t) fd, 0)) + close(fd); + + for (slice = 1; slice <= 4; slice++) { + sprintf(devname, "disk%ds%d:", unit, slice); + fd = open(devname, O_RDONLY); + if (fd == -1) + continue; + if (vdev_probe(vdev_read, (void*) (uintptr_t) fd, 0)) + close(fd); + } + } + + return (0); +} + +/* + * Print information about ZFS pools + */ +static void +zfs_dev_print(int verbose) +{ + spa_t *spa; + char line[80]; + int unit; + + if (verbose) { + spa_all_status(); + return; + } + unit = 0; + STAILQ_FOREACH(spa, &zfs_pools, spa_link) { + sprintf(line, " zfs%d: %s\n", unit, spa->spa_name); + pager_output(line); + unit++; + } +} + +/* + * Attempt to open the pool described by (dev) for use by (f). + */ +static int +zfs_dev_open(struct open_file *f, ...) +{ + va_list args; + struct devdesc *dev; + int unit, i; + spa_t *spa; + + va_start(args, f); + dev = va_arg(args, struct devdesc*); + va_end(args); + + /* + * We mostly ignore the stuff that devopen sends us. For now, + * use the unit to find a pool - later we will override the + * devname parsing so that we can name a pool and a fs within + * the pool. 
+ */ + unit = dev->d_unit; + free(dev); + + i = 0; + STAILQ_FOREACH(spa, &zfs_pools, spa_link) { + if (i == unit) + break; + i++; + } + if (!spa) { + return (ENXIO); + } + + f->f_devdata = spa; + return (0); +} + +static int +zfs_dev_close(struct open_file *f) +{ + + f->f_devdata = NULL; + return (0); +} + +static int +zfs_dev_strategy(void *devdata, int rw, daddr_t dblk, size_t size, char *buf, size_t *rsize) +{ + + return (ENOSYS); +} + +struct devsw zfs_dev = { + .dv_name = "zfs", + .dv_type = DEVT_ZFS, + .dv_init = zfs_dev_init, + .dv_strategy = zfs_dev_strategy, + .dv_open = zfs_dev_open, + .dv_close = zfs_dev_close, + .dv_ioctl = noioctl, + .dv_print = zfs_dev_print, + .dv_cleanup = NULL +}; diff --git a/sys/boot/zfs/zfsimpl.c b/sys/boot/zfs/zfsimpl.c new file mode 100644 index 0000000..5bbc351 --- /dev/null +++ b/sys/boot/zfs/zfsimpl.c @@ -0,0 +1,1443 @@ +/*- + * Copyright (c) 2007 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/* + * Stand-alone ZFS file reader. + */ + +#include "zfsimpl.h" +#include "zfssubr.c" + +/* + * List of all vdevs, chained through v_alllink. + */ +static vdev_list_t zfs_vdevs; + +/* + * List of all pools, chained through spa_link. + */ +static spa_list_t zfs_pools; + +static uint64_t zfs_crc64_table[256]; +static char *zfs_decomp_buf; +static const dnode_phys_t *dnode_cache_obj = 0; +static uint64_t dnode_cache_bn; +static char *dnode_cache_buf; +static char *zap_scratch; + +/* + * Forward declarations. 
+ */ +static int zio_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf, off_t offset); + +static void +zfs_init(void) +{ + STAILQ_INIT(&zfs_vdevs); + STAILQ_INIT(&zfs_pools); + + zfs_decomp_buf = malloc(128*1024); + dnode_cache_buf = malloc(128*1024); + zap_scratch = malloc(128*1024); + + zfs_init_crc(); +} + +static int +xdr_int(const unsigned char **xdr, int *ip) +{ + *ip = ((*xdr)[0] << 24) + | ((*xdr)[1] << 16) + | ((*xdr)[2] << 8) + | ((*xdr)[3] << 0); + (*xdr) += 4; + return (0); +} + +static int +xdr_u_int(const unsigned char **xdr, u_int *ip) +{ + *ip = ((*xdr)[0] << 24) + | ((*xdr)[1] << 16) + | ((*xdr)[2] << 8) + | ((*xdr)[3] << 0); + (*xdr) += 4; + return (0); +} + +static int +xdr_uint64_t(const unsigned char **xdr, uint64_t *lp) +{ + u_int hi, lo; + + xdr_u_int(xdr, &hi); + xdr_u_int(xdr, &lo); + *lp = (((uint64_t) hi) << 32) | lo; + return (0); +} + +static int +nvlist_find(const unsigned char *nvlist, const char *name, int type, + int* elementsp, void *valuep) +{ + const unsigned char *p, *pair; + int junk; + int encoded_size, decoded_size; + + p = nvlist; + xdr_int(&p, &junk); + xdr_int(&p, &junk); + + pair = p; + xdr_int(&p, &encoded_size); + xdr_int(&p, &decoded_size); + while (encoded_size && decoded_size) { + int namelen, pairtype, elements; + const char *pairname; + + xdr_int(&p, &namelen); + pairname = (const char*) p; + p += roundup(namelen, 4); + xdr_int(&p, &pairtype); + + if (!memcmp(name, pairname, namelen) && type == pairtype) { + xdr_int(&p, &elements); + if (elementsp) + *elementsp = elements; + if (type == DATA_TYPE_UINT64) { + xdr_uint64_t(&p, (uint64_t *) valuep); + return (0); + } else if (type == DATA_TYPE_STRING) { + int len; + xdr_int(&p, &len); + (*(const char**) valuep) = (const char*) p; + return (0); + } else if (type == DATA_TYPE_NVLIST + || type == DATA_TYPE_NVLIST_ARRAY) { + (*(const unsigned char**) valuep) = + (const unsigned char*) p; + return (0); + } else { + return (EIO); + } + } else { + /* + * Not the pair we 
are looking for, skip to the next one. + */ + p = pair + encoded_size; + } + + pair = p; + xdr_int(&p, &encoded_size); + xdr_int(&p, &decoded_size); + } + + return (EIO); +} + +/* + * Return the next nvlist in an nvlist array. + */ +static const unsigned char * +nvlist_next(const unsigned char *nvlist) +{ + const unsigned char *p, *pair; + int junk; + int encoded_size, decoded_size; + + p = nvlist; + xdr_int(&p, &junk); + xdr_int(&p, &junk); + + pair = p; + xdr_int(&p, &encoded_size); + xdr_int(&p, &decoded_size); + while (encoded_size && decoded_size) { + p = pair + encoded_size; + + pair = p; + xdr_int(&p, &encoded_size); + xdr_int(&p, &decoded_size); + } + + return p; +} + +#ifdef TEST + +static const unsigned char * +nvlist_print(const unsigned char *nvlist, unsigned int indent) +{ + static const char* typenames[] = { + "DATA_TYPE_UNKNOWN", + "DATA_TYPE_BOOLEAN", + "DATA_TYPE_BYTE", + "DATA_TYPE_INT16", + "DATA_TYPE_UINT16", + "DATA_TYPE_INT32", + "DATA_TYPE_UINT32", + "DATA_TYPE_INT64", + "DATA_TYPE_UINT64", + "DATA_TYPE_STRING", + "DATA_TYPE_BYTE_ARRAY", + "DATA_TYPE_INT16_ARRAY", + "DATA_TYPE_UINT16_ARRAY", + "DATA_TYPE_INT32_ARRAY", + "DATA_TYPE_UINT32_ARRAY", + "DATA_TYPE_INT64_ARRAY", + "DATA_TYPE_UINT64_ARRAY", + "DATA_TYPE_STRING_ARRAY", + "DATA_TYPE_HRTIME", + "DATA_TYPE_NVLIST", + "DATA_TYPE_NVLIST_ARRAY", + "DATA_TYPE_BOOLEAN_VALUE", + "DATA_TYPE_INT8", + "DATA_TYPE_UINT8", + "DATA_TYPE_BOOLEAN_ARRAY", + "DATA_TYPE_INT8_ARRAY", + "DATA_TYPE_UINT8_ARRAY" + }; + + unsigned int i, j; + const unsigned char *p, *pair; + int junk; + int encoded_size, decoded_size; + + p = nvlist; + xdr_int(&p, &junk); + xdr_int(&p, &junk); + + pair = p; + xdr_int(&p, &encoded_size); + xdr_int(&p, &decoded_size); + while (encoded_size && decoded_size) { + int namelen, pairtype, elements; + const char *pairname; + + xdr_int(&p, &namelen); + pairname = (const char*) p; + p += roundup(namelen, 4); + xdr_int(&p, &pairtype); + + for (i = 0; i < indent; i++) + printf(" "); + 
printf("%s %s", typenames[pairtype], pairname); + + xdr_int(&p, &elements); + switch (pairtype) { + case DATA_TYPE_UINT64: { + uint64_t val; + xdr_uint64_t(&p, &val); + printf(" = 0x%llx\n", val); + break; + } + + case DATA_TYPE_STRING: { + int len; + xdr_int(&p, &len); + printf(" = \"%s\"\n", p); + break; + } + + case DATA_TYPE_NVLIST: + printf("\n"); + nvlist_print(p, indent + 1); + break; + + case DATA_TYPE_NVLIST_ARRAY: + for (j = 0; j < elements; j++) { + printf("[%d]\n", j); + p = nvlist_print(p, indent + 1); + if (j != elements - 1) { + for (i = 0; i < indent; i++) + printf(" "); + printf("%s %s", typenames[pairtype], pairname); + } + } + break; + + default: + printf("\n"); + } + + p = pair + encoded_size; + + pair = p; + xdr_int(&p, &encoded_size); + xdr_int(&p, &decoded_size); + } + + return p; +} + +#endif + +static int +vdev_mirror_read(vdev_t *vdev, void *priv, off_t offset, void *buf, size_t size) +{ + vdev_t *kid; + int rc; + + rc = EIO; + STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) { + if (kid->v_state != VDEV_STATE_HEALTHY) + continue; + rc = kid->v_read(kid, kid->v_read_priv, offset, buf, size); + if (!rc) + return (0); + } + + return (rc); +} + +static vdev_t * +vdev_find(uint64_t guid) +{ + vdev_t *vdev; + + STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink) + if (vdev->v_guid == guid) + return (vdev); + + return (0); +} + +static vdev_t * +vdev_create(uint64_t guid, vdev_read_t *read, void *read_priv) +{ + vdev_t *vdev; + + vdev = malloc(sizeof(vdev_t)); + memset(vdev, 0, sizeof(vdev_t)); + STAILQ_INIT(&vdev->v_children); + vdev->v_guid = guid; + vdev->v_state = VDEV_STATE_OFFLINE; + vdev->v_read = read; + vdev->v_read_priv = read_priv; + STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink); + + return (vdev); +} + +static int +vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t **vdevp) +{ + int rc; + uint64_t guid, id; + const char *type; + const char *path; + vdev_t *vdev, *kid; + const unsigned char *kids; + int nkids, i; + + if 
(nvlist_find(nvlist, ZPOOL_CONFIG_GUID, + DATA_TYPE_UINT64, 0, &guid) + || nvlist_find(nvlist, ZPOOL_CONFIG_ID, + DATA_TYPE_UINT64, 0, &id) + || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, + DATA_TYPE_STRING, 0, &type)) { + printf("ZFS: can't find vdev details\n"); + return (ENOENT); + } + + /* + * Assume that if we've seen this vdev tree before, this one + * will be identical. + */ + vdev = vdev_find(guid); + if (vdev) { + if (vdevp) + *vdevp = vdev; + return (0); + } + + if (strcmp(type, VDEV_TYPE_MIRROR) + && strcmp(type, VDEV_TYPE_DISK)) { + printf("ZFS: can only boot from disk or mirror vdevs\n"); + return (EIO); + } + + if (!strcmp(type, VDEV_TYPE_MIRROR)) + vdev = vdev_create(guid, vdev_mirror_read, 0); + else + vdev = vdev_create(guid, 0, 0); + + + if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH, + DATA_TYPE_STRING, 0, &path) == 0) { + if (strlen(path) > 5 + && path[0] == '/' + && path[1] == 'd' + && path[2] == 'e' + && path[3] == 'v' + && path[4] == '/') + path += 5; + vdev->v_name = strdup(path); + } else { + vdev->v_name = strdup(type); + } + vdev->v_id = id; + rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, + DATA_TYPE_NVLIST_ARRAY, &nkids, &kids); + /* + * Its ok if we don't have any kids. + */ + if (rc == 0) { + for (i = 0; i < nkids; i++) { + rc = vdev_init_from_nvlist(kids, &kid); + if (rc) + return (rc); + STAILQ_INSERT_TAIL(&vdev->v_children, kid, v_childlink); + kids = nvlist_next(kids); + } + } + + if (vdevp) + *vdevp = vdev; + return (0); +} + +static void +vdev_set_state(vdev_t *vdev) +{ + vdev_t *kid; + int good_kids; + int bad_kids; + + /* + * We assume that if we have kids, we are a mirror. A mirror + * is healthy if all its kids are healthy. Its degraded (but + * working) if at least one kid is healty. 
+ */ + + if (STAILQ_FIRST(&vdev->v_children)) { + good_kids = 0; + bad_kids = 0; + STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) { + if (kid->v_state == VDEV_STATE_HEALTHY) + good_kids++; + else + bad_kids++; + } + if (good_kids) { + if (!bad_kids && good_kids) + vdev->v_state = VDEV_STATE_HEALTHY; + else + vdev->v_state = VDEV_STATE_DEGRADED; + } else { + vdev->v_state = VDEV_STATE_OFFLINE; + } + } +} + +static spa_t * +spa_find_by_guid(uint64_t guid) +{ + spa_t *spa; + + STAILQ_FOREACH(spa, &zfs_pools, spa_link) + if (spa->spa_guid == guid) + return (spa); + + return (0); +} + +#ifdef BOOT2 + +static spa_t * +spa_find_by_name(const char *name) +{ + spa_t *spa; + + STAILQ_FOREACH(spa, &zfs_pools, spa_link) + if (!strcmp(spa->spa_name, name)) + return (spa); + + return (0); +} + +#endif + +static spa_t * +spa_create(uint64_t guid) +{ + spa_t *spa; + + spa = malloc(sizeof(spa_t)); + memset(spa, 0, sizeof(spa_t)); + STAILQ_INIT(&spa->spa_vdevs); + spa->spa_guid = guid; + STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link); + + return (spa); +} + +static const char * +state_name(vdev_state_t state) +{ + static const char* names[] = { + "UNKNOWN", + "CLOSED", + "OFFLINE", + "CANT_OPEN", + "DEGRADED", + "ONLINE" + }; + return names[state]; +} + +#ifdef BOOT2 + +#define pager_printf printf + +#else + +static void +pager_printf(const char *fmt, ...) 
+{ + char line[80]; + va_list args; + + va_start(args, fmt); + vsprintf(line, fmt, args); + va_end(args); + pager_output(line); +} + +#endif + +#define STATUS_FORMAT " %-16s %-10s\n" + +static void +print_state(int indent, const char *name, vdev_state_t state) +{ + int i; + char buf[512]; + + buf[0] = 0; + for (i = 0; i < indent; i++) + strcat(buf, " "); + strcat(buf, name); + pager_printf(STATUS_FORMAT, buf, state_name(state)); + +} + +static void +vdev_status(vdev_t *vdev, int indent) +{ + vdev_t *kid; + print_state(indent, vdev->v_name, vdev->v_state); + + STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) { + vdev_status(kid, indent + 1); + } +} + +static void +spa_status(spa_t *spa) +{ + vdev_t *vdev; + int good_kids, bad_kids, degraded_kids; + vdev_state_t state; + + pager_printf(" pool: %s\n", spa->spa_name); + pager_printf("config:\n\n"); + pager_printf(STATUS_FORMAT, "NAME", "STATE"); + + good_kids = 0; + degraded_kids = 0; + bad_kids = 0; + STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) { + if (vdev->v_state == VDEV_STATE_HEALTHY) + good_kids++; + else if (vdev->v_state == VDEV_STATE_DEGRADED) + degraded_kids++; + else + bad_kids++; + } + + state = VDEV_STATE_CLOSED; + if (good_kids > 0 && (degraded_kids + bad_kids) == 0) + state = VDEV_STATE_HEALTHY; + else if ((good_kids + degraded_kids) > 0) + state = VDEV_STATE_DEGRADED; + + print_state(0, spa->spa_name, state); + STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) { + vdev_status(vdev, 1); + } +} + +static void +spa_all_status(void) +{ + spa_t *spa; + int first = 1; + + STAILQ_FOREACH(spa, &zfs_pools, spa_link) { + if (!first) + pager_printf("\n"); + first = 0; + spa_status(spa); + } +} + +static int +vdev_probe(vdev_read_t *read, void *read_priv, spa_t **spap) +{ + vdev_t vtmp; + vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch; + spa_t *spa; + vdev_t *vdev, *top_vdev, *pool_vdev; + off_t off; + blkptr_t bp; + const unsigned char *nvlist; + uint64_t val; + uint64_t guid; + uint64_t 
pool_txg, pool_guid; + const char *pool_name; + const unsigned char *vdevs; + int i; + char upbuf[1024]; + const struct uberblock *up; + + /* + * Load the vdev label and figure out which + * uberblock is most current. + */ + memset(&vtmp, 0, sizeof(vtmp)); + vtmp.v_read = read; + vtmp.v_read_priv = read_priv; + off = offsetof(vdev_label_t, vl_vdev_phys); + BP_ZERO(&bp); + BP_SET_LSIZE(&bp, sizeof(vdev_phys_t)); + BP_SET_PSIZE(&bp, sizeof(vdev_phys_t)); + BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL); + BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); + ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0); + if (zio_read_phys(&vtmp, &bp, vdev_label, off)) + return (EIO); + + if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) { + return (EIO); + } + + nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4; + + if (nvlist_find(nvlist, + ZPOOL_CONFIG_VERSION, + DATA_TYPE_UINT64, 0, &val)) { + return (EIO); + } + + if (val != ZFS_VERSION) { + printf("ZFS: unsupported ZFS version %d\n", (int) val); + return (EIO); + } + + if (nvlist_find(nvlist, + ZPOOL_CONFIG_POOL_STATE, + DATA_TYPE_UINT64, 0, &val)) { + return (EIO); + } + + if (val != POOL_STATE_ACTIVE) { + /* + * Don't print a message here. If we happen to reboot + * while where is an exported pool around, we don't + * need a cascade of confusing messages during boot. + */ + /*printf("ZFS: pool is not active\n");*/ + return (EIO); + } + + if (nvlist_find(nvlist, + ZPOOL_CONFIG_POOL_TXG, + DATA_TYPE_UINT64, 0, &pool_txg) + || nvlist_find(nvlist, + ZPOOL_CONFIG_POOL_GUID, + DATA_TYPE_UINT64, 0, &pool_guid) + || nvlist_find(nvlist, + ZPOOL_CONFIG_POOL_NAME, + DATA_TYPE_STRING, 0, &pool_name)) { + printf("ZFS: can't find pool details\n"); + return (EIO); + } + + /* + * Create the pool if this is the first time we've seen it. 
+ */ + spa = spa_find_by_guid(pool_guid); + if (!spa) { + spa = spa_create(pool_guid); + spa->spa_name = strdup(pool_name); + } + if (pool_txg > spa->spa_txg) + spa->spa_txg = pool_txg; + + /* + * Get the vdev tree and create our in-core copy of it. + * If we already have a healthy vdev with this guid, this must + * be some kind of alias (overlapping slices, dangerously dedicated + * disks etc). + */ + if (nvlist_find(nvlist, + ZPOOL_CONFIG_GUID, + DATA_TYPE_UINT64, 0, &guid)) { + return (EIO); + } + vdev = vdev_find(guid); + if (vdev && vdev->v_state == VDEV_STATE_HEALTHY) { + return (EIO); + } + + if (nvlist_find(nvlist, + ZPOOL_CONFIG_VDEV_TREE, + DATA_TYPE_NVLIST, 0, &vdevs)) { + return (EIO); + } + vdev_init_from_nvlist(vdevs, &top_vdev); + + /* + * Add the toplevel vdev to the pool if its not already there. + */ + STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink) + if (top_vdev == pool_vdev) + break; + if (!pool_vdev && top_vdev) + STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink); + + /* + * We should already have created an incomplete vdev for this + * vdev. Find it and initialise it with our read proc. + */ + vdev = vdev_find(guid); + if (vdev) { + vdev->v_read = read; + vdev->v_read_priv = read_priv; + vdev->v_state = VDEV_STATE_HEALTHY; + } else { + printf("ZFS: inconsistent nvlist contents\n"); + return (EIO); + } + + /* + * Re-evaluate top-level vdev state. + */ + vdev_set_state(top_vdev); + + /* + * Ok, we are happy with the pool so far. Lets find + * the best uberblock and then we can actually access + * the contents of the pool. 
+ */ + for (i = 0; + i < VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT; + i++) { + off = offsetof(vdev_label_t, vl_uberblock); + off += i << UBERBLOCK_SHIFT; + BP_ZERO(&bp); + DVA_SET_OFFSET(&bp.blk_dva[0], off); + BP_SET_LSIZE(&bp, 1 << UBERBLOCK_SHIFT); + BP_SET_PSIZE(&bp, 1 << UBERBLOCK_SHIFT); + BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL); + BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); + ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0); + if (zio_read_phys(vdev, &bp, upbuf, off)) + continue; + + up = (const struct uberblock *) upbuf; + if (up->ub_magic != UBERBLOCK_MAGIC) + continue; + if (up->ub_txg < spa->spa_txg) + continue; + if (up->ub_txg > spa->spa_uberblock.ub_txg) { + spa->spa_uberblock = *up; + } else if (up->ub_txg == spa->spa_uberblock.ub_txg) { + if (up->ub_timestamp > spa->spa_uberblock.ub_timestamp) + spa->spa_uberblock = *up; + } + } + + if (spap) + *spap = spa; + return (0); +} + +static int +ilog2(int n) +{ + int v; + + for (v = 0; v < 32; v++) + if (n == (1 << v)) + return v; + return -1; +} + +static int +zio_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf, off_t offset) +{ + int cpfunc = BP_GET_COMPRESS(bp); + size_t lsize = BP_GET_LSIZE(bp); + size_t psize = BP_GET_PSIZE(bp); + int rc; + + /*printf("ZFS: reading %d bytes at 0x%llx to %p\n", psize, offset, buf);*/ + if (cpfunc != ZIO_COMPRESS_OFF) { + rc = vdev->v_read(vdev, vdev->v_read_priv, offset, zfs_decomp_buf, psize); + if (rc) + return (rc); + if (zio_checksum_error(bp, zfs_decomp_buf)) + return (EIO); + if (zio_decompress_data(cpfunc, zfs_decomp_buf, psize, + buf, lsize)) + return (EIO); + } else { + rc = vdev->v_read(vdev, vdev->v_read_priv, offset, buf, psize); + if (rc) + return (rc); + + if (zio_checksum_error(bp, buf)) + return (EIO); + } + return (0); +} + +static int +zio_read(spa_t *spa, const blkptr_t *bp, void *buf) +{ + int i; + + for (i = 0; i < SPA_DVAS_PER_BP; i++) { + const dva_t *dva = &bp->blk_dva[i]; + vdev_t *vdev; + int vdevid; + off_t offset; + + if (!dva->dva_word[0] && 
!dva->dva_word[1]) + continue; + + vdevid = DVA_GET_VDEV(dva); + offset = DVA_GET_OFFSET(dva) + VDEV_LABEL_START_SIZE; + STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) + if (vdev->v_id == vdevid) + break; + if (!vdev || !vdev->v_read) + continue; + if (zio_read_phys(vdev, bp, buf, offset)) + continue; + + return (0); + } + printf("ZFS: i/o error - all block copies unavailable\n"); + + return (EIO); +} + +static int +dnode_read(spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf, size_t buflen) +{ + int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT; + int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; + int nlevels = dnode->dn_nlevels; + int i, rc; + + /* + * We truncate the offset to 32bits, mainly so that I don't + * have to find a copy of __divdi3 to put into the bootstrap. + * I don't think the bootstrap needs to access anything bigger + * than 2G anyway. Note that block addresses are still 64bit + * so it doesn't affect the possible size of the media. + * We still use 64bit block numbers so that the bitshifts + * work correctly. Note: bsize may not be a power of two here. + */ + while (buflen > 0) { + uint64_t bn = ((int) offset) / bsize; + int boff = ((int) offset) % bsize; + int ibn; + const blkptr_t *indbp; + blkptr_t bp; + + if (bn > dnode->dn_maxblkid) + return (EIO); + + if (dnode == dnode_cache_obj && bn == dnode_cache_bn) + goto cached; + + indbp = dnode->dn_blkptr; + for (i = 0; i < nlevels; i++) { + /* + * Copy the bp from the indirect array so that + * we can re-use the scratch buffer for multi-level + * objects. + */ + ibn = bn >> ((nlevels - i - 1) * ibshift); + ibn &= ((1 << ibshift) - 1); + bp = indbp[ibn]; + rc = zio_read(spa, &bp, dnode_cache_buf); + if (rc) + return (rc); + indbp = (const blkptr_t *) dnode_cache_buf; + } + dnode_cache_obj = dnode; + dnode_cache_bn = bn; + cached: + + /* + * The buffer contains our data block. Copy what we + * need from it and loop. 
+ */ + i = bsize - boff; + if (i > buflen) i = buflen; + memcpy(buf, &dnode_cache_buf[boff], i); + buf = ((char*) buf) + i; + offset += i; + buflen -= i; + } + + return (0); +} + +/* + * Lookup a value in a microzap directory. Assumes that the zap + * scratch buffer contains the directory contents. + */ +static int +mzap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value) +{ + const mzap_phys_t *mz; + const mzap_ent_phys_t *mze; + size_t size; + int chunks, i; + + /* + * Microzap objects use exactly one block. Read the whole + * thing. + */ + size = dnode->dn_datablkszsec * 512; + + mz = (const mzap_phys_t *) zap_scratch; + chunks = size / MZAP_ENT_LEN - 1; + + for (i = 0; i < chunks; i++) { + mze = &mz->mz_chunk[i]; + if (!strcmp(mze->mze_name, name)) { + *value = mze->mze_value; + return (0); + } + } + + return (ENOENT); +} + +/* + * Compare a name with a zap leaf entry. Return non-zero if the name + * matches. + */ +static int +fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *name) +{ + size_t namelen; + const zap_leaf_chunk_t *nc; + const char *p; + + namelen = zc->l_entry.le_name_length; + + nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk); + p = name; + while (namelen > 0) { + size_t len; + len = namelen; + if (len > ZAP_LEAF_ARRAY_BYTES) + len = ZAP_LEAF_ARRAY_BYTES; + if (memcmp(p, nc->l_array.la_array, len)) + return (0); + p += len; + namelen -= len; + nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next); + } + + return 1; +} + +/* + * Extract a uint64_t value from a zap leaf entry. + */ +static uint64_t +fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc) +{ + const zap_leaf_chunk_t *vc; + int i; + uint64_t value; + const uint8_t *p; + + vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk); + for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) { + value = (value << 8) | p[i]; + } + + return value; +} + +/* + * Lookup a value in a fatzap directory. 
Assumes that the zap scratch + * buffer contains the directory header. + */ +static int +fzap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value) +{ + int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; + zap_phys_t zh = *(zap_phys_t *) zap_scratch; + fat_zap_t z; + uint64_t *ptrtbl; + uint64_t hash; + int rc; + + if (zh.zap_magic != ZAP_MAGIC) + return (EIO); + + z.zap_block_shift = ilog2(bsize); + z.zap_phys = (zap_phys_t *) zap_scratch; + + /* + * Figure out where the pointer table is and read it in if necessary. + */ + if (zh.zap_ptrtbl.zt_blk) { + rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize, + zap_scratch, bsize); + if (rc) + return (rc); + ptrtbl = (uint64_t *) zap_scratch; + } else { + ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0); + } + + hash = zap_hash(zh.zap_salt, name); + + zap_leaf_t zl; + zl.l_bs = z.zap_block_shift; + + off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs; + zap_leaf_chunk_t *zc; + + rc = dnode_read(spa, dnode, off, zap_scratch, bsize); + if (rc) + return (rc); + + zl.l_phys = (zap_leaf_phys_t *) zap_scratch; + + /* + * Make sure this chunk matches our hash. + */ + if (zl.l_phys->l_hdr.lh_prefix_len > 0 + && zl.l_phys->l_hdr.lh_prefix + != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len)) + return (ENOENT); + + /* + * Hash within the chunk to find our entry. + */ + int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len); + int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1); + h = zl.l_phys->l_hash[h]; + if (h == 0xffff) + return (ENOENT); + zc = &ZAP_LEAF_CHUNK(&zl, h); + while (zc->l_entry.le_hash != hash) { + if (zc->l_entry.le_next == 0xffff) { + zc = 0; + break; + } + zc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_next); + } + if (fzap_name_equal(&zl, zc, name)) { + *value = fzap_leaf_value(&zl, zc); + return (0); + } + + return (ENOENT); +} + +/* + * Lookup a name in a zap object and return its value as a uint64_t. 
+ */ +static int +zap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value) +{ + int rc; + uint64_t zap_type; + size_t size = dnode->dn_datablkszsec * 512; + + rc = dnode_read(spa, dnode, 0, zap_scratch, size); + if (rc) + return (rc); + + zap_type = *(uint64_t *) zap_scratch; + if (zap_type == ZBT_MICRO) + return mzap_lookup(spa, dnode, name, value); + else + return fzap_lookup(spa, dnode, name, value); +} + +#ifdef BOOT2 + +/* + * List a microzap directory. Assumes that the zap scratch buffer contains + * the directory contents. + */ +static int +mzap_list(spa_t *spa, const dnode_phys_t *dnode) +{ + const mzap_phys_t *mz; + const mzap_ent_phys_t *mze; + size_t size; + int chunks, i; + + /* + * Microzap objects use exactly one block. Read the whole + * thing. + */ + size = dnode->dn_datablkszsec * 512; + mz = (const mzap_phys_t *) zap_scratch; + chunks = size / MZAP_ENT_LEN - 1; + + for (i = 0; i < chunks; i++) { + mze = &mz->mz_chunk[i]; + if (mze->mze_name[0]) + //printf("%-32s 0x%llx\n", mze->mze_name, mze->mze_value); + printf("%s\n", mze->mze_name); + } + + return (0); +} + +/* + * List a fatzap directory. Assumes that the zap scratch buffer contains + * the directory header. + */ +static int +fzap_list(spa_t *spa, const dnode_phys_t *dnode) +{ + int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; + zap_phys_t zh = *(zap_phys_t *) zap_scratch; + fat_zap_t z; + int i, j; + + if (zh.zap_magic != ZAP_MAGIC) + return (EIO); + + z.zap_block_shift = ilog2(bsize); + z.zap_phys = (zap_phys_t *) zap_scratch; + + /* + * This assumes that the leaf blocks start at block 1. The + * documentation isn't exactly clear on this. 
+ */ + zap_leaf_t zl; + zl.l_bs = z.zap_block_shift; + for (i = 0; i < zh.zap_num_leafs; i++) { + off_t off = (i + 1) << zl.l_bs; + char name[256], *p; + uint64_t value; + + if (dnode_read(spa, dnode, off, zap_scratch, bsize)) + return (EIO); + + zl.l_phys = (zap_leaf_phys_t *) zap_scratch; + + for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) { + zap_leaf_chunk_t *zc, *nc; + int namelen; + + zc = &ZAP_LEAF_CHUNK(&zl, j); + if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY) + continue; + namelen = zc->l_entry.le_name_length; + if (namelen > sizeof(name)) + namelen = sizeof(name); + + /* + * Paste the name back together. + */ + nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk); + p = name; + while (namelen > 0) { + int len; + len = namelen; + if (len > ZAP_LEAF_ARRAY_BYTES) + len = ZAP_LEAF_ARRAY_BYTES; + memcpy(p, nc->l_array.la_array, len); + p += len; + namelen -= len; + nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next); + } + + /* + * Assume the first eight bytes of the value are + * a uint64_t. + */ + value = fzap_leaf_value(&zl, zc); + + printf("%-32s 0x%llx\n", name, value); + } + } + + return (0); +} + +/* + * List a zap directory. 
+ */ +static int +zap_list(spa_t *spa, const dnode_phys_t *dnode) +{ + uint64_t zap_type; + size_t size = dnode->dn_datablkszsec * 512; + + if (dnode_read(spa, dnode, 0, zap_scratch, size)) + return (EIO); + + zap_type = *(uint64_t *) zap_scratch; + if (zap_type == ZBT_MICRO) + return mzap_list(spa, dnode); + else + return fzap_list(spa, dnode); +} + +#endif + +static int +objset_get_dnode(spa_t *spa, const objset_phys_t *os, uint64_t objnum, dnode_phys_t *dnode) +{ + off_t offset; + + offset = objnum * sizeof(dnode_phys_t); + return dnode_read(spa, &os->os_meta_dnode, offset, + dnode, sizeof(dnode_phys_t)); +} + +/* + * Find the object set given the object number of its dataset object + * and return its details in *objset + */ +static int +zfs_mount_dataset(spa_t *spa, uint64_t objnum, objset_phys_t *objset) +{ + dnode_phys_t dataset; + dsl_dataset_phys_t *ds; + + if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) { + printf("ZFS: can't find dataset %lld\n", objnum); + return (EIO); + } + + ds = (dsl_dataset_phys_t *) &dataset.dn_bonus; + if (zio_read(spa, &ds->ds_bp, objset)) { + printf("ZFS: can't read object set for dataset %lld\n", objnum); + return (EIO); + } + + return (0); +} + +/* + * Find the object set pointed to by the BOOTFS property or the root + * dataset if there is none and return its details in *objset + */ +static int +zfs_mount_root(spa_t *spa, objset_phys_t *objset) +{ + dnode_phys_t dir, propdir; + uint64_t props, bootfs, root; + + /* + * Start with the MOS directory object. + */ + if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) { + printf("ZFS: can't read MOS object directory\n"); + return (EIO); + } + + /* + * Lookup the pool_props and see if we can find a bootfs. 
+ */ + if (zap_lookup(spa, &dir, DMU_POOL_PROPS, &props) == 0 + && objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0 + && zap_lookup(spa, &propdir, "bootfs", &bootfs) == 0) + return zfs_mount_dataset(spa, bootfs, objset); + + /* + * Lookup the root dataset directory + */ + if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, &root) + || objset_get_dnode(spa, &spa->spa_mos, root, &dir)) { + printf("ZFS: can't find root dsl_dir\n"); + return (EIO); + } + + /* + * Use the information from the dataset directory's bonus buffer + * to find the dataset object and from that the object set itself. + */ + dsl_dir_phys_t *dd = (dsl_dir_phys_t *) &dir.dn_bonus; + return zfs_mount_dataset(spa, dd->dd_head_dataset_obj, objset); +} + +static int +zfs_mount_pool(spa_t *spa) +{ + /* + * Find the MOS and work our way in from there. + */ + if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) { + printf("ZFS: can't read MOS\n"); + return (EIO); + } + + /* + * Find the root object set + */ + if (zfs_mount_root(spa, &spa->spa_root_objset)) { + printf("Can't find root filesystem - giving up\n"); + return (EIO); + } + + return (0); +} + +/* + * Lookup a file and return its dnode. + */ +static int +zfs_lookup(spa_t *spa, const char *upath, dnode_phys_t *dnode) +{ + int rc; + uint64_t objnum, rootnum, parentnum; + dnode_phys_t dn; + const znode_phys_t *zp = (const znode_phys_t *) dn.dn_bonus; + const char *p, *q; + char element[256]; + char path[1024]; + int symlinks_followed = 0; + + if (spa->spa_root_objset.os_type != DMU_OST_ZFS) { + printf("ZFS: unexpected object set type %lld\n", + spa->spa_root_objset.os_type); + return (EIO); + } + + /* + * Get the root directory dnode. 
+ */ + rc = objset_get_dnode(spa, &spa->spa_root_objset, MASTER_NODE_OBJ, &dn); + if (rc) + return (rc); + + rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, &rootnum); + if (rc) + return (rc); + + rc = objset_get_dnode(spa, &spa->spa_root_objset, rootnum, &dn); + if (rc) + return (rc); + + objnum = rootnum; + p = upath; + while (p && *p) { + while (*p == '/') + p++; + if (!*p) + break; + q = strchr(p, '/'); + if (q) { + memcpy(element, p, q - p); + element[q - p] = 0; + p = q; + } else { + strcpy(element, p); + p = 0; + } + + if ((zp->zp_mode >> 12) != 0x4) { + return (ENOTDIR); + } + + parentnum = objnum; + rc = zap_lookup(spa, &dn, element, &objnum); + if (rc) + return (rc); + objnum = ZFS_DIRENT_OBJ(objnum); + + rc = objset_get_dnode(spa, &spa->spa_root_objset, objnum, &dn); + if (rc) + return (rc); + + /* + * Check for symlink. + */ + if ((zp->zp_mode >> 12) == 0xa) { + if (symlinks_followed > 10) + return (EMLINK); + symlinks_followed++; + + /* + * Read the link value and copy the tail of our + * current path onto the end. + */ + if (p) + strcpy(&path[zp->zp_size], p); + else + path[zp->zp_size] = 0; + if (zp->zp_size + sizeof(znode_phys_t) <= dn.dn_bonuslen) { + memcpy(path, &dn.dn_bonus[sizeof(znode_phys_t)], + zp->zp_size); + } else { + rc = dnode_read(spa, &dn, 0, path, zp->zp_size); + if (rc) + return (rc); + } + + /* + * Restart with the new path, starting either at + * the root or at the parent depending whether or + * not the link is relative. + */ + p = path; + if (*p == '/') + objnum = rootnum; + else + objnum = parentnum; + objset_get_dnode(spa, &spa->spa_root_objset, objnum, &dn); + } + } + + *dnode = dn; + return (0); +} |