Update ZFS from version 6 to 13 and bring some FreeBSD-specific changes.

This brings a huge amount of changes; I'll enumerate only the user-visible ones: - Delegated Administration Allows regular users to perform ZFS operations, like file system creation, snapshot creation, etc. - L2ARC Level 2 cache for ZFS - allows the use of additional disks for cache. Huge performance improvements, mostly for random reads of mostly static content. - slog Allows the use of additional disks for the ZFS Intent Log to speed up operations like fsync(2). - vfs.zfs.super_owner Allows regular users to perform privileged operations on files stored on ZFS file systems owned by them. Be very careful with this one. - chflags(2) Not all the flags are supported. This still needs work. - ZFSBoot Support for booting off of a ZFS pool. Not finished, AFAIK. Submitted by: dfr - Snapshot properties - New failure modes Previously, if a write request failed, the system panicked. Now one can select one of three failure modes: - panic - panic on write error - wait - wait for disk to reappear - continue - serve read requests if possible, block write requests - Refquota, refreservation properties Just like the quota and reservation properties, but they don't count space consumed by child file systems, clones and snapshots. - Sparse volumes ZVOLs that don't reserve space in the pool. - Extended attributes Compatible with extattr(2). - NFSv4-ACLs Not sure about the status, might not be complete yet. Submitted by: trasz - Creation-time properties - Regression tests for zpool(8) command. Obtained from: OpenSolaris
author: pjd <pjd@FreeBSD.org> 2008-11-17 20:49:29 +0000
committer: pjd <pjd@FreeBSD.org> 2008-11-17 20:49:29 +0000
commit: bbe899b96e388a8b82439f81ed3707e0d9c6070d (patch)
tree: 81b89fa4ac6467771d5aa291a97f4665981a6108 /sys/boot
parent: d2f579595c362ce27b4d87e2c40e1c4e09b929e3 (diff)
download: FreeBSD-src-bbe899b96e388a8b82439f81ed3707e0d9c6070d.zip
FreeBSD-src-bbe899b96e388a8b82439f81ed3707e0d9c6070d.tar.gz
14 files changed, 3512 insertions, 15 deletions
diff --git a/sys/boot/Makefile b/sys/boot/Makefile
index 1af1457..27cb7e3 100644
--- a/sys/boot/Makefile
+++ b/sys/boot/Makefile
@@ -26,6 +26,10 @@ SUBDIR+=		ofw
 SUBDIR+=		uboot
 .endif
 
+.if defined(LOADER_ZFS_SUPPORT)
+SUBDIR+=		zfs
+.endif
+
 # Pick the machine-dependent subdir based on the target architecture.
 ADIR=			${MACHINE:S/amd64/i386/:S/sun4v/sparc64/}
 .if exists(${.CURDIR}/${ADIR}/.)
diff --git a/sys/boot/common/bootstrap.h b/sys/boot/common/bootstrap.h
index 57982d1..5f08480 100644
--- a/sys/boot/common/bootstrap.h
+++ b/sys/boot/common/bootstrap.h
@@ -43,6 +43,7 @@ struct devdesc
 #define DEVT_DISK	1
 #define DEVT_NET	2
 #define	DEVT_CD		3
+#define DEVT_ZFS	4
     int			d_unit;
 };
 
diff --git a/sys/boot/i386/Makefile b/sys/boot/i386/Makefile
index b89222d..6af8642 100644
--- a/sys/boot/i386/Makefile
+++ b/sys/boot/i386/Makefile
@@ -1,7 +1,7 @@
 # $FreeBSD$
 
-SUBDIR=		mbr pmbr boot0 boot0sio btx boot2 cdboot gptboot kgzldr \
-		libi386 libfirewire loader
+SUBDIR=		mbr pmbr boot0 boot0sio btx boot2 cdboot gptboot zfsboot \
+		kgzldr libi386 libfirewire loader
 
 # special boot programs, 'self-extracting boot2+loader'
 SUBDIR+=	pxeldr
diff --git a/sys/boot/i386/libi386/bootinfo32.c b/sys/boot/i386/libi386/bootinfo32.c
index 6b517c5..d434427 100644
--- a/sys/boot/i386/libi386/bootinfo32.c
+++ b/sys/boot/i386/libi386/bootinfo32.c
@@ -183,6 +183,7 @@ bi_load32(char *args, int *howtop, int *bootdevp, vm_offset_t *bip, vm_offset_t
 	break;
 
     case DEVT_NET:
+    case DEVT_ZFS:
 	    break;
 	    
     default:
diff --git a/sys/boot/i386/libi386/devicename.c b/sys/boot/i386/libi386/devicename.c
index e1035aa..79a562b 100644
--- a/sys/boot/i386/libi386/devicename.c
+++ b/sys/boot/i386/libi386/devicename.c
@@ -167,6 +167,7 @@ i386_parsedev(struct i386_devdesc **dev, const char *devspec, const char **path)
 
     case DEVT_CD:
     case DEVT_NET:
+    case DEVT_ZFS:
 	unit = 0;
 
 	if (*np && (*np != ':')) {
@@ -238,6 +239,7 @@ i386_fmtdev(void *vdev)
 	break;
 
     case DEVT_NET:
+    case DEVT_ZFS:
 	sprintf(buf, "%s%d:", dev->d_dev->dv_name, dev->d_unit);
 	break;
     }
diff --git a/sys/boot/i386/loader/Makefile b/sys/boot/i386/loader/Makefile
index df2ccc0..79aceca 100644
--- a/sys/boot/i386/loader/Makefile
+++ b/sys/boot/i386/loader/Makefile
@@ -17,6 +17,12 @@ CFLAGS+=	-DLOADER_FIREWIRE_SUPPORT
 LIBFIREWIRE=	${.OBJDIR}/../libfirewire/libfirewire.a
 .endif
 
+# Put LOADER_ZFS_SUPPORT=yes in /etc/make.conf for ZFS support
+.if defined(LOADER_ZFS_SUPPORT)
+CFLAGS+=	-DLOADER_ZFS_SUPPORT
+LIBZFS=		${.OBJDIR}/../../zfs/libzfsboot.a
+.endif
+
 # Enable PXE TFTP or NFS support, not both.
 .if defined(LOADER_TFTP_SUPPORT)
 CFLAGS+=	-DLOADER_TFTP_SUPPORT
@@ -98,8 +104,8 @@ FILES+=	loader.rc
 # XXX crt0.o needs to be first for pxeboot(8) to work
 OBJS=	${BTXCRT} 
 
-DPADD=	${LIBFICL} ${LIBFIREWIRE} ${LIBI386} ${LIBSTAND}
-LDADD=	${LIBFICL} ${LIBFIREWIRE} ${LIBI386} -lstand
+DPADD=	${LIBFICL} ${LIBFIREWIRE} ${LIBZFS} ${LIBI386} ${LIBSTAND}
+LDADD=	${LIBFICL} ${LIBFIREWIRE} ${LIBZFS} ${LIBI386} -lstand
 
 .include <bsd.prog.mk>
 
diff --git a/sys/boot/i386/loader/conf.c b/sys/boot/i386/loader/conf.c
index 245f960..05c9a9e9 100644
--- a/sys/boot/i386/loader/conf.c
+++ b/sys/boot/i386/loader/conf.c
@@ -50,6 +50,10 @@ __FBSDID("$FreeBSD$");
 extern struct devsw fwohci;
 #endif
 
+#if defined(LOADER_ZFS_SUPPORT)
+extern struct devsw zfs_dev;
+#endif
+
 /* Exported for libstand */
 struct devsw *devsw[] = {
     &bioscd,
@@ -60,15 +64,25 @@ struct devsw *devsw[] = {
 #if defined(LOADER_FIREWIRE_SUPPORT)
     &fwohci,
 #endif
+#if defined(LOADER_ZFS_SUPPORT)
+    &zfs_dev,
+#endif
     NULL
 };
 
+#if defined(LOADER_ZFS_SUPPORT)
+extern struct fs_ops zfs_fsops;
+#endif
+
 struct fs_ops *file_system[] = {
     &ufs_fsops,
     &ext2fs_fsops,
     &dosfs_fsops,
     &cd9660_fsops,
     &splitfs_fsops,
+#if defined(LOADER_ZFS_SUPPORT)
+    &zfs_fsops,
+#endif
 #ifdef LOADER_GZIP_SUPPORT
     &gzipfs_fsops,
 #endif
diff --git a/sys/boot/i386/loader/main.c b/sys/boot/i386/loader/main.c
index 5b23670..cac28ae 100644
--- a/sys/boot/i386/loader/main.c
+++ b/sys/boot/i386/loader/main.c
@@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
 
 #define	KARGS_FLAGS_CD		0x1
 #define	KARGS_FLAGS_PXE		0x2
+#define	KARGS_FLAGS_ZFS		0x4
 
 /* Arguments passed in from the boot1/boot2 loader */
 static struct 
@@ -51,8 +52,13 @@ static struct
     u_int32_t	howto;
     u_int32_t	bootdev;
     u_int32_t	bootflags;
-    u_int32_t	pxeinfo;
-    u_int32_t	res2;
+    union {
+	struct {
+	    u_int32_t	pxeinfo;
+	    u_int32_t	res2;
+	};
+	uint64_t	zfspool;
+    };
     u_int32_t	bootinfo;
 } *kargs;
 
@@ -96,7 +102,7 @@ main(void)
      */
     bios_getmem();
 
-#if defined(LOADER_BZIP2_SUPPORT) || defined(LOADER_FIREWIRE_SUPPORT)
+#if defined(LOADER_BZIP2_SUPPORT) || defined(LOADER_FIREWIRE_SUPPORT) || defined(LOADER_ZFS_SUPPORT)
     heap_top = PTOV(memtop_copyin);
     memtop_copyin -= 0x300000;
     heap_bottom = PTOV(memtop_copyin);
@@ -145,6 +151,14 @@ main(void)
 	    bc_add(initial_bootdev);
     }
 
+    archsw.arch_autoload = i386_autoload;
+    archsw.arch_getdev = i386_getdev;
+    archsw.arch_copyin = i386_copyin;
+    archsw.arch_copyout = i386_copyout;
+    archsw.arch_readin = i386_readin;
+    archsw.arch_isainb = isa_inb;
+    archsw.arch_isaoutb = isa_outb;
+
     /*
      * March through the device switch probing for things.
      */
@@ -172,14 +186,6 @@ main(void)
     
     bios_getsmap();
 
-    archsw.arch_autoload = i386_autoload;
-    archsw.arch_getdev = i386_getdev;
-    archsw.arch_copyin = i386_copyin;
-    archsw.arch_copyout = i386_copyout;
-    archsw.arch_readin = i386_readin;
-    archsw.arch_isainb = isa_inb;
-    archsw.arch_isaoutb = isa_outb;
-
     interact();			/* doesn't return */
 
     /* if we ever get here, it is an error */
@@ -252,6 +258,29 @@ extract_currdev(void)
 	       i386_setcurrdev, env_nounset);
     env_setenv("loaddev", EV_VOLATILE, i386_fmtdev(&new_currdev), env_noset,
 	       env_nounset);
+
+#ifdef LOADER_ZFS_SUPPORT
+    /*
+     * If we were started from a ZFS-aware boot2, we can work out
+     * which ZFS pool we are booting from.
+     */
+    if (kargs->bootflags & KARGS_FLAGS_ZFS) {
+	/*
+	 * Dig out the pool guid and convert it to a 'unit number'
+	 */
+	uint64_t guid;
+	int unit;
+	char devname[32];
+	extern int zfs_guid_to_unit(uint64_t);
+
+	guid = kargs->zfspool;
+	unit = zfs_guid_to_unit(guid);
+	if (unit >= 0) {
+	    sprintf(devname, "zfs%d", unit);
+	    setenv("currdev", devname, 1);
+	}
+    }
+#endif
 }
 
 COMMAND_SET(reboot, "reboot", "reboot the system", command_reboot);
diff --git a/sys/boot/i386/zfsboot/Makefile b/sys/boot/i386/zfsboot/Makefile
new file mode 100644
index 0000000..41f1672
--- /dev/null
+++ b/sys/boot/i386/zfsboot/Makefile
@@ -0,0 +1,108 @@
+# $FreeBSD$
+
+.PATH:		${.CURDIR}/../boot2
+
+FILES=		zfsboot
+
+NM?=		nm
+
+# A value of 0x80 enables LBA support.
+BOOT_BOOT1_FLAGS?=	0x80
+
+BOOT_COMCONSOLE_PORT?= 0x3f8
+BOOT_COMCONSOLE_SPEED?= 9600
+B2SIOFMT?=	0x3
+
+REL1=	0x700
+ORG1=	0x7c00
+ORG2=	0x2000
+
+CFLAGS=	-Os -g \
+	-fno-guess-branch-probability \
+	-fomit-frame-pointer \
+	-fno-unit-at-a-time \
+	-mno-align-long-strings \
+	-mrtd \
+	-mno-mmx -mno-3dnow -mno-sse -mno-sse2 -mno-sse3 \
+	-DBOOT2 \
+	-DFLAGS=${BOOT_BOOT1_FLAGS} \
+	-DSIOPRT=${BOOT_COMCONSOLE_PORT} \
+	-DSIOFMT=${B2SIOFMT} \
+	-DSIOSPD=${BOOT_COMCONSOLE_SPEED} \
+	-I${.CURDIR}/../../zfs \
+	-I${.CURDIR}/../../../cddl/boot/zfs \
+	-I${.CURDIR}/../btx/lib -I. \
+	-I${.CURDIR}/../boot2 \
+	-Wall -Waggregate-return -Wbad-function-cast -Wcast-align \
+	-Wmissing-declarations -Wmissing-prototypes -Wnested-externs \
+	-Wpointer-arith -Wshadow -Wstrict-prototypes -Wwrite-strings \
+	-Winline --param max-inline-insns-single=100
+
+LDFLAGS=-static -N --gc-sections
+
+# Pick up ../Makefile.inc early.
+.include <bsd.init.mk>
+
+CLEANFILES=	zfsboot
+
+zfsboot: zfsboot1 zfsboot2
+	cat zfsboot1 zfsboot2 > zfsboot
+
+CLEANFILES+=	zfsboot1 zfsldr.out zfsldr.o
+
+zfsboot1: zfsldr.out
+	objcopy -S -O binary zfsldr.out ${.TARGET}
+
+zfsldr.out: zfsldr.o
+	${LD} ${LDFLAGS} -e start -Ttext ${ORG1} -o ${.TARGET} zfsldr.o
+
+CLEANFILES+=	zfsboot2 zfsboot.ld zfsboot.ldr zfsboot.bin zfsboot.out \
+		zfsboot.o zfsboot.s zfsboot.s.tmp zfsboot.h sio.o
+
+# We currently allow 32768 bytes for zfsboot - in practice it could be
+# any size up to 3.5Mb but keeping it fixed size simplifies zfsldr.
+# 
+BOOT2SIZE=	32768
+
+zfsboot2: zfsboot.ld
+	@set -- `ls -l zfsboot.ld`; x=$$((${BOOT2SIZE}-$$5)); \
+	    echo "$$x bytes available"; test $$x -ge 0
+	dd if=zfsboot.ld of=${.TARGET} obs=${BOOT2SIZE} conv=osync
+
+zfsboot.ld: zfsboot.ldr zfsboot.bin ${BTXKERN}
+	btxld -v -E ${ORG2} -f bin -b ${BTXKERN} -l zfsboot.ldr \
+	    -o ${.TARGET} -P 1 zfsboot.bin
+
+zfsboot.ldr:
+	cp /dev/null ${.TARGET}
+
+zfsboot.bin: zfsboot.out
+	objcopy -S -O binary zfsboot.out ${.TARGET}
+
+zfsboot.out: ${BTXCRT} zfsboot.o sio.o
+	${LD} ${LDFLAGS} -Ttext ${ORG2} -o ${.TARGET} ${.ALLSRC}
+
+zfsboot.o: zfsboot.s
+
+SRCS=	zfsboot.c zfsboot.h
+
+zfsboot.s: zfsboot.c zfsboot.h ${.CURDIR}/../../zfs/zfsimpl.c
+	${CC} ${CFLAGS} -S -o zfsboot.s.tmp ${.CURDIR}/zfsboot.c
+	sed -e '/align/d' -e '/nop/d' < zfsboot.s.tmp > zfsboot.s
+	rm -f zfsboot.s.tmp
+
+# Generate zfsboot.h: find the address of the xread symbol in zfsldr.out
+# and define XREADORG as that address rebased from ORG1 to REL1, i.e.
+# REL1 + (xread - ORG1).
+zfsboot.h: zfsldr.out
+	${NM} -t d ${.ALLSRC} | awk '/([0-9])+ T xread/ \
+	    { x = $$1 - ORG1; \
+	    printf("#define XREADORG %#x\n", REL1 + x) }' \
+	    ORG1=`printf "%d" ${ORG1}` \
+	    REL1=`printf "%d" ${REL1}` > ${.TARGET}
+
+.if ${MACHINE_ARCH} == "amd64"
+beforedepend zfsboot.s: machine
+CLEANFILES+=	machine
+machine:
+	ln -sf ${.CURDIR}/../../../i386/include machine
+.endif
+
+.include <bsd.prog.mk>
diff --git a/sys/boot/i386/zfsboot/zfsboot.c b/sys/boot/i386/zfsboot/zfsboot.c
new file mode 100644
index 0000000..9b0a465
--- /dev/null
+++ b/sys/boot/i386/zfsboot/zfsboot.c
@@ -0,0 +1,944 @@
+/*-
+ * Copyright (c) 1998 Robert Nordier
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms are freely
+ * permitted provided that the above copyright notice and this
+ * paragraph and the following disclaimer are duplicated in all
+ * such forms.
+ *
+ * This software is provided "AS IS" and without any express or
+ * implied warranties, including, without limitation, the implied
+ * warranties of merchantability and fitness for a particular
+ * purpose.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/diskmbr.h>
+#include <sys/reboot.h>
+#include <sys/queue.h>
+
+#include <machine/bootinfo.h>
+#include <machine/elf.h>
+
+#include <stdarg.h>
+#include <stddef.h>
+
+#include <a.out.h>
+
+#include <btxv86.h>
+
+#include "zfsboot.h"
+#include "lib.h"
+
+#define IO_KEYBOARD	1
+#define IO_SERIAL	2
+
+#define SECOND		18	/* Circa that many ticks in a second. */
+
+#define RBX_ASKNAME	0x0	/* -a */
+#define RBX_SINGLE	0x1	/* -s */
+/* 0x2 is reserved for log2(RB_NOSYNC). */
+/* 0x3 is reserved for log2(RB_HALT). */
+/* 0x4 is reserved for log2(RB_INITNAME). */
+#define RBX_DFLTROOT	0x5	/* -r */
+#define RBX_KDB 	0x6	/* -d */
+/* 0x7 is reserved for log2(RB_RDONLY). */
+/* 0x8 is reserved for log2(RB_DUMP). */
+/* 0x9 is reserved for log2(RB_MINIROOT). */
+#define RBX_CONFIG	0xa	/* -c */
+#define RBX_VERBOSE	0xb	/* -v */
+#define RBX_SERIAL	0xc	/* -h */
+#define RBX_CDROM	0xd	/* -C */
+/* 0xe is reserved for log2(RB_POWEROFF). */
+#define RBX_GDB 	0xf	/* -g */
+#define RBX_MUTE	0x10	/* -m */
+/* 0x11 is reserved for log2(RB_SELFTEST). */
+/* 0x12 is reserved for boot programs. */
+/* 0x13 is reserved for boot programs. */
+#define RBX_PAUSE	0x14	/* -p */
+#define RBX_QUIET	0x15	/* -q */
+#define RBX_NOINTR	0x1c	/* -n */
+/* 0x1d is reserved for log2(RB_MULTIPLE) and is just misnamed here. */
+#define RBX_DUAL	0x1d	/* -D */
+/* 0x1f is reserved for log2(RB_BOOTINFO). */
+
+/* pass: -a, -s, -r, -d, -c, -v, -h, -C, -g, -m, -p, -D */
+#define RBX_MASK	(OPT_SET(RBX_ASKNAME) | OPT_SET(RBX_SINGLE) | \
+			OPT_SET(RBX_DFLTROOT) | OPT_SET(RBX_KDB ) | \
+			OPT_SET(RBX_CONFIG) | OPT_SET(RBX_VERBOSE) | \
+			OPT_SET(RBX_SERIAL) | OPT_SET(RBX_CDROM) | \
+			OPT_SET(RBX_GDB ) | OPT_SET(RBX_MUTE) | \
+			OPT_SET(RBX_PAUSE) | OPT_SET(RBX_DUAL))
+
+/* Hint to loader that we came from ZFS */
+#define	KARGS_FLAGS_ZFS		0x4
+
+#define PATH_CONFIG	"/boot.config"
+#define PATH_BOOT3	"/boot/loader"
+#define PATH_KERNEL	"/boot/kernel/kernel"
+
+#define ARGS		0x900
+#define NOPT		14
+#define NDEV		3
+#define MEM_BASE	0x12
+#define MEM_EXT 	0x15
+#define V86_CY(x)	((x) & 1)
+#define V86_ZR(x)	((x) & 0x40)
+
+#define DRV_HARD	0x80
+#define DRV_MASK	0x7f
+
+#define TYPE_AD		0
+#define TYPE_DA		1
+#define TYPE_MAXHARD	TYPE_DA
+#define TYPE_FD		2
+
+#define OPT_SET(opt)	(1 << (opt))
+#define OPT_CHECK(opt)	((opts) & OPT_SET(opt))
+
+extern uint32_t _end;
+
+/*
+ * Option letters accepted after '-' on the boot command line.  parse()
+ * looks a letter up in optstr[] and toggles the option bit named by the
+ * entry at the same index in flags[], so the two tables must be kept in
+ * the same order.  'P' and 'S' are handled separately by parse().
+ */
+static const char optstr[NOPT] = "DhaCcdgmnpqrsv"; /* Also 'P', 'S' */
+static const unsigned char flags[NOPT] = {
+    RBX_DUAL,
+    RBX_SERIAL,
+    RBX_ASKNAME,
+    RBX_CDROM,
+    RBX_CONFIG,
+    RBX_KDB,
+    RBX_GDB,
+    RBX_MUTE,
+    RBX_NOINTR,
+    RBX_PAUSE,
+    RBX_QUIET,
+    RBX_DFLTROOT,
+    RBX_SINGLE,
+    RBX_VERBOSE
+};
+
+static const char *const dev_nm[NDEV] = {"ad", "da", "fd"};
+static const unsigned char dev_maj[NDEV] = {30, 4, 2};
+
+/*
+ * Description of one BIOS disk (or a slice of it) as used by drvread().
+ */
+struct dsk {
+    unsigned drive;		/* BIOS drive number handed to the BIOS calls */
+    unsigned type;		/* TYPE_AD / TYPE_DA / TYPE_FD, indexes dev_maj[] */
+    unsigned unit;
+    unsigned slice;
+    unsigned part;
+    unsigned start;		/* sector offset added to every lba in drvread() */
+    int init;
+};
+static char cmd[512];		/* command line: boot.config or typed by user */
+static char kname[1024];	/* path of the file load() will try to boot */
+static uint32_t opts;		/* bitmask of OPT_SET(RBX_*) options */
+static int comspeed = SIOSPD;	/* serial speed; sio_init() gets 115200 / comspeed */
+static struct bootinfo bootinfo;
+static uint32_t bootdev;
+static uint8_t ioctrl = IO_KEYBOARD;	/* IO_KEYBOARD and/or IO_SERIAL */
+
+/* Buffers that must not span a 64k boundary. */
+#define READ_BUF_SIZE	8192
+struct dmadat {
+	char rdbuf[READ_BUF_SIZE];	/* for reading large things */
+	char secbuf[READ_BUF_SIZE];	/* for MBR/disklabel */
+};
+/* Set up by main() at the first 64k boundary past the end of the image. */
+static struct dmadat *dmadat;
+
+void exit(int);
+static void load(void);
+static int parse(void);
+static void printf(const char *,...);
+static void putchar(int);
+static uint32_t memsize(void);
+static int drvread(struct dsk *, void *, unsigned, unsigned);
+static int keyhit(unsigned);
+static int xputc(int);
+static int xgetc(int);
+static int getc(int);
+
+static void memcpy(void *, const void *, int);
+/* Copy len bytes from src to dst, lowest address first. */
+static void
+memcpy(void *dst, const void *src, int len)
+{
+    const char *from = src;
+    char *to = dst;
+
+    for (; len != 0; len--)
+	*to++ = *from++;
+}
+
+/* Copy the NUL-terminated string src, terminator included, to dst. */
+static void
+strcpy(char *dst, const char *src)
+{
+    char ch;
+
+    do {
+	ch = *src++;
+	*dst++ = ch;
+    } while (ch != 0);
+}
+
+/* Append the NUL-terminated string src to the string already in dst. */
+static void
+strcat(char *dst, const char *src)
+{
+    char *tail = dst;
+
+    for (; *tail != 0; tail++)
+	;
+    for (; *src != 0; src++)
+	*tail++ = *src;
+    *tail = 0;
+}
+
+/*
+ * Compare two NUL-terminated strings as unsigned bytes.  Returns zero if
+ * they are equal, otherwise the difference of the first mismatching pair.
+ */
+static int
+strcmp(const char *s1, const char *s2)
+{
+    while (*s1 != 0 && *s1 == *s2) {
+	s1++;
+	s2++;
+    }
+    return (unsigned char)*s1 - (unsigned char)*s2;
+}
+
+/*
+ * Return a pointer to the first occurrence of ch in s, or 0 if there is
+ * none.  The terminating NUL is never matched.
+ */
+static const char *
+strchr(const char *s, char ch)
+{
+    while (*s != 0) {
+	if (*s == ch)
+	    return s;
+	s++;
+    }
+    return 0;
+}
+
+/*
+ * Compare the first n bytes of p1 and p2 as unsigned bytes.  Returns zero
+ * if they match, otherwise the difference of the first mismatching pair.
+ */
+static int
+memcmp(const void *p1, const void *p2, size_t n)
+{
+    const char *a = (const char *) p1;
+    const char *b = (const char *) p2;
+
+    while (n > 0) {
+	if (*a != *b)
+	    return (unsigned char)*a - (unsigned char)*b;
+	a++;
+	b++;
+	n--;
+    }
+    return 0;
+}
+
+/* Store val into each of the first n bytes at p. */
+static void
+memset(void *p, char val, size_t n)
+{
+    char *bytes = (char *) p;
+    size_t i;
+
+    for (i = 0; i < n; i++)
+	bytes[i] = val;
+}
+
+/*
+ * Minimal bump allocator for the boot block.  The heap starts right
+ * after the dmadat buffers and ends at the 640k mark; memory is never
+ * freed.  If a request does not fit, a message is printed and the
+ * machine hangs, so callers never see a null return.
+ */
+static void *
+malloc(size_t n)
+{
+	static char *heap_next;
+	static char *heap_end;
+	char *p;
+
+	if (!heap_next) {
+		heap_next = (char *) dmadat + sizeof(*dmadat);
+		heap_end = (char *) (640*1024);
+	}
+
+	/*
+	 * Compare the request against the space that is left instead of
+	 * forming p + n, which could wrap around for a huge n.
+	 */
+	p = heap_next;
+	if (p > heap_end || n > (size_t)(heap_end - p)) {
+		printf("malloc failure\n");
+		for (;;)
+		    ;
+	}
+	heap_next += n;
+	return p;
+}
+
+/* Return the number of bytes in s before its terminating NUL. */
+static size_t
+strlen(const char *s)
+{
+	const char *end = s;
+
+	while (*end != 0)
+		end++;
+	return end - s;
+}
+
+/* Return a copy of s in storage obtained from malloc(). */
+static char *
+strdup(const char *s)
+{
+	size_t size = strlen(s) + 1;
+	char *copy = malloc(size);
+
+	strcpy(copy, s);
+	return copy;
+}
+
+#include "zfsimpl.c"
+
+/*
+ * Read from a dnode (which must be from a ZPL filesystem).
+ *
+ * Reads up to 'size' bytes at offset *offp into 'start', clamping the
+ * request to the file size recorded in the znode kept in the dnode's
+ * bonus buffer.  On success *offp is advanced and the number of bytes
+ * read is returned (0 at or past end of file); -1 is returned if
+ * dnode_read() fails.
+ */
+static int
+zfs_read(spa_t *spa, const dnode_phys_t *dnode, off_t *offp, void *start, size_t size)
+{
+	const znode_phys_t *zp = (const znode_phys_t *) dnode->dn_bonus;
+	size_t n;
+	int rc;
+
+	/*
+	 * An offset at or beyond end of file (or a negative one) leaves
+	 * nothing to read.  Without this check zp_size - *offp below
+	 * would wrap around and yield a bogus, huge length.
+	 */
+	if (*offp < 0 || (uint64_t)*offp >= zp->zp_size)
+		return (0);
+
+	n = size;
+	if (n > zp->zp_size - *offp)
+		n = zp->zp_size - *offp;
+
+	rc = dnode_read(spa, dnode, *offp, start, n);
+	if (rc)
+		return (-1);
+	*offp += n;
+
+	return (n);
+}
+
+/*
+ * Current ZFS pool
+ */
+spa_t *spa;
+
+/*
+ * Read callback handed to vdev_probe(): read 'bytes' bytes starting at
+ * byte offset 'off' of the disk described by priv (a struct dsk *) into
+ * buf.  Both off and bytes must be multiples of DEV_BSIZE.  The data is
+ * staged through dmadat->rdbuf in chunks of at most READ_BUF_SIZE, so
+ * the caller's buffer does not have to worry about crossing a 64k
+ * boundary.  Returns 0 on success, -1 on misaligned arguments or when
+ * drvread() fails.
+ */
+static int
+vdev_read(vdev_t *vdev, void *priv, off_t off, void *buf, size_t bytes)
+{
+	const unsigned int maxblk = READ_BUF_SIZE / DEV_BSIZE;
+	struct dsk *disk = (struct dsk *) priv;
+	char *dst = buf;
+	unsigned int blkno, cnt;
+
+	if ((off & (DEV_BSIZE - 1)) != 0)
+		return -1;
+	if ((bytes & (DEV_BSIZE - 1)) != 0)
+		return -1;
+
+	for (blkno = off / DEV_BSIZE; bytes > 0; blkno += cnt) {
+		/* Transfer as many sectors as fit in the bounce buffer. */
+		cnt = bytes / DEV_BSIZE;
+		if (cnt > maxblk)
+			cnt = maxblk;
+		if (drvread(disk, dmadat->rdbuf, blkno, cnt))
+			return -1;
+		memcpy(dst, dmadat->rdbuf, cnt * DEV_BSIZE);
+		dst += cnt * DEV_BSIZE;
+		bytes -= cnt * DEV_BSIZE;
+	}
+
+	return 0;
+}
+
+/*
+ * Read exactly nbyte bytes of the file described by dnode from the
+ * current pool.  Anything other than a full read is reported on the
+ * console and returned as -1; a full read returns 0.
+ */
+static int
+xfsread(const dnode_phys_t *dnode, off_t *offp, void *buf, size_t nbyte)
+{
+    int got = zfs_read(spa, dnode, offp, buf, nbyte);
+
+    if ((size_t)got == nbyte)
+	return 0;
+    printf("Invalid %s\n", "format");
+    return -1;
+}
+
+/*
+ * Invoke interrupt MEM_EXT (0x15) with %eax = 0x8800 and return whatever
+ * the BIOS leaves in %eax.  main() stores the result in
+ * bootinfo.bi_extmem.
+ */
+static inline uint32_t
+memsize(void)
+{
+    v86.addr = MEM_EXT;
+    v86.eax = 0x8800;
+    v86int();
+    return v86.eax;
+}
+
+/*
+ * Read a line from the console into cmd[], echoing as we go.  Backspace
+ * and DEL erase the previous character; CR or LF terminates the line.
+ * Input beyond sizeof(cmd) - 1 characters is echoed but not stored.
+ */
+static inline void
+getstr(void)
+{
+    char *end = cmd;
+    int ch;
+
+    for (;;) {
+	ch = xgetc(0);
+	if (ch == 0)
+	    continue;
+	if (ch == '\n' || ch == '\r') {
+	    *end = 0;
+	    return;
+	}
+	if (ch == '\b' || ch == '\177') {
+	    if (end > cmd) {
+		end--;
+		printf("\b \b");
+	    }
+	    continue;
+	}
+	if (end - cmd < sizeof(cmd) - 1)
+	    *end++ = ch;
+	putchar(ch);
+    }
+}
+
+/*
+ * Write one character to the screen through BIOS interrupt 0x10,
+ * function 0x0e (selected by the 0xe00 in %eax), with %ebx set to 7.
+ */
+static inline void
+putc(int c)
+{
+    v86.addr = 0x10;
+    v86.eax = 0xe00 | (c & 0xff);
+    v86.ebx = 0x7;
+    v86int();
+}
+
+/*
+ * Try to detect a device supported by the legacy int13 BIOS.
+ *
+ * Issues int 0x13 with %eax = 0x800 for the given drive and returns 1 if
+ * the answer looks usable, 0 otherwise.
+ */
+static int
+int13probe(int drive)
+{
+    v86.ctl = V86_FLAGS;
+    v86.addr = 0x13;
+    v86.eax = 0x800;
+    v86.edx = drive;
+    v86int();
+
+    if (v86.efl & 0x1)					/* carry set */
+	return(0);
+    if ((v86.edx & 0xff) == (drive & DRV_MASK))		/* unit # not OK */
+	return(0);
+    if ((v86.ecx & 0x3f) == 0)				/* absurd sector size */
+	return(0);					/* skip device */
+    return (1);
+}
+
+/*
+ * Probe one BIOS drive for ZFS vdevs, first on the whole disk and then
+ * on each MBR slice.  spap is handed to vdev_probe(); callers pass 0
+ * when they are not interested in the pool that was found.
+ */
+static void
+probe_drive(struct dsk *dsk, spa_t **spap)
+{
+    struct dos_partition *dp;
+    char *sec;
+    unsigned i;
+
+    if (!int13probe(dsk->drive))
+	return;
+
+    /*
+     * If we find a vdev on the whole disk, stop here. Otherwise dig
+     * out the MBR and probe each slice in turn for a vdev.
+     */
+    if (vdev_probe(vdev_read, dsk, spap) == 0)
+	return;
+
+    sec = dmadat->secbuf;
+    dsk->start = 0;
+    if (drvread(dsk, sec, DOSBBSECTOR, 1))
+	return;
+    dp = (void *)(sec + DOSPARTOFF);
+
+    for (i = 0; i < NDOSPART; i++) {
+	if (!dp[i].dp_typ)
+	    continue;
+	/* drvread() adds dsk->start, so reads become slice-relative. */
+	dsk->start = dp[i].dp_start;
+	if (vdev_probe(vdev_read, dsk, spap) == 0) {
+	    /*
+	     * We record the first pool we find (we will try to boot
+	     * from that one).
+	     */
+	    spap = 0;
+
+	    /*
+	     * This slice had a vdev. We need a new dsk structure now
+	     * since the vdev now owns this one.
+	     */
+	    struct dsk *newdsk;
+	    newdsk = malloc(sizeof(struct dsk));
+	    *newdsk = *dsk;
+	    dsk = newdsk;
+	}
+    }
+}
+
+/*
+ * Entry point of the ZFS boot block.
+ *
+ * Sets up the DMA buffers and bootinfo, probes the boot drive and the
+ * other BIOS hard drives for ZFS pools, mounts the chosen pool, applies
+ * /boot.config if present, then tries to run /boot/loader.  If that
+ * fails or is interrupted, the user is prompted for what to boot.  Never
+ * returns.
+ */
+int
+main(void)
+{
+    int autoboot, i, n;
+    dnode_phys_t dn;
+    off_t off;
+    struct dsk *dsk;
+
+    /* Place the DMA buffers at the first 64k boundary past the image. */
+    dmadat = (void *)(roundup2(__base + (int32_t)&_end, 0x10000) - __base);
+    v86.ctl = V86_FLAGS;
+
+    dsk = malloc(sizeof(struct dsk));
+    dsk->drive = *(uint8_t *)PTOV(ARGS);
+    dsk->type = dsk->drive & DRV_HARD ? TYPE_AD : TYPE_FD;
+    dsk->unit = dsk->drive & DRV_MASK;
+    dsk->slice = *(uint8_t *)PTOV(ARGS + 1) + 1;
+    dsk->part = 0;
+    dsk->start = 0;
+    dsk->init = 0;
+
+    bootinfo.bi_version = BOOTINFO_VERSION;
+    bootinfo.bi_size = sizeof(bootinfo);
+    bootinfo.bi_basemem = 0;	/* XXX will be filled by loader or kernel */
+    bootinfo.bi_extmem = memsize();
+    bootinfo.bi_memsizes_valid++;
+    bootinfo.bi_bios_dev = dsk->drive;
+
+    bootdev = MAKEBOOTDEV(dev_maj[dsk->type],
+			  dsk->slice, dsk->unit, dsk->part);
+
+    /* Process configuration file */
+
+    autoboot = 1;
+
+    zfs_init();
+
+    /*
+     * Probe the boot drive first - we will try to boot from whatever
+     * pool we find on that drive.
+     */
+    probe_drive(dsk, &spa);
+
+    /*
+     * Probe the rest of the drives that the bios knows about. This
+     * will find any other available pools and it may fill in missing
+     * vdevs for the boot pool.
+     */
+    for (i = 0; i < 4; i++) {
+	if ((i | DRV_HARD) == *(uint8_t *)PTOV(ARGS))
+	    continue;
+
+	dsk = malloc(sizeof(struct dsk));
+	dsk->drive = i | DRV_HARD;
+	/* Only hard drives are probed here, so the type is always TYPE_AD. */
+	dsk->type = TYPE_AD;
+	dsk->unit = i;
+	dsk->slice = 0;
+	dsk->part = 0;
+	dsk->start = 0;
+	dsk->init = 0;
+	probe_drive(dsk, 0);
+    }
+
+    /*
+     * If we didn't find a pool on the boot drive, default to the
+     * first pool we found, if any.
+     */
+    if (!spa) {
+	spa = STAILQ_FIRST(&zfs_pools);
+	if (!spa) {
+	    printf("No ZFS pools located, can't boot\n");
+	    for (;;)
+		;
+	}
+    }
+
+    zfs_mount_pool(spa);
+
+    if (zfs_lookup(spa, PATH_CONFIG, &dn) == 0) {
+	/*
+	 * Read at most sizeof(cmd) - 1 bytes and terminate the result,
+	 * so parse() always sees a NUL-terminated string.  zfs_read() is
+	 * called directly because boot.config is normally shorter than
+	 * the buffer, which xfsread() would report as "Invalid format".
+	 */
+	off = 0;
+	n = zfs_read(spa, &dn, &off, cmd, sizeof(cmd) - 1);
+	if (n < 0)
+	    n = 0;
+	cmd[n] = 0;
+    }
+
+    if (*cmd) {
+	if (parse())
+	    autoboot = 0;
+	if (!OPT_CHECK(RBX_QUIET))
+	    printf("%s: %s", PATH_CONFIG, cmd);
+	/* Do not process this command twice */
+	*cmd = 0;
+    }
+
+    /*
+     * Try to exec stage 3 boot loader. If interrupted by a keypress,
+     * or in case of failure, try to load a kernel directly instead.
+     */
+
+    if (autoboot && !*kname) {
+	memcpy(kname, PATH_BOOT3, sizeof(PATH_BOOT3));
+	if (!keyhit(3*SECOND)) {
+	    load();
+	    memcpy(kname, PATH_KERNEL, sizeof(PATH_KERNEL));
+	}
+    }
+
+    /* Present the user with the boot2 prompt. */
+
+    for (;;) {
+	if (!autoboot || !OPT_CHECK(RBX_QUIET))
+	    printf("\nFreeBSD/i386 boot\n"
+		   "Default: %s:%s\n"
+		   "boot: ",
+		   spa->spa_name, kname);
+	if (ioctrl & IO_SERIAL)
+	    sio_flush();
+	if (!autoboot || keyhit(5*SECOND))
+	    getstr();
+	else if (!autoboot || !OPT_CHECK(RBX_QUIET))
+	    putchar('\n');
+	autoboot = 0;
+	if (parse())
+	    putchar('\a');
+	else
+	    load();
+    }
+}
+
+/*
+ * XXX - Needed for btxld to link the boot2 binary; do not remove.
+ * Intentionally empty: the argument is ignored.
+ */
+void
+exit(int x)
+{
+}
+
+/*
+ * Load the file named by kname from the current pool and start it.
+ *
+ * The file may be an a.out ZMAGIC image or an ELF image.  On any lookup
+ * or read failure the function simply returns (xfsread() has already
+ * printed a message); on success control passes to the image through
+ * __exec().
+ */
+static void
+load(void)
+{
+    union {
+	struct exec ex;
+	Elf32_Ehdr eh;
+    } hdr;
+    static Elf32_Phdr ep[2];
+    static Elf32_Shdr es[2];
+    caddr_t p;
+    dnode_phys_t dn;
+    off_t off;
+    uint32_t addr, x;
+    int fmt, i, j;
+
+    if (zfs_lookup(spa, kname, &dn)) {
+	return;
+    }
+    off = 0;
+    if (xfsread(&dn, &off, &hdr, sizeof(hdr)))
+	return;
+    /* fmt: 0 = a.out, 1 = ELF. */
+    if (N_GETMAGIC(hdr.ex) == ZMAGIC)
+	fmt = 0;
+    else if (IS_ELF(hdr.eh))
+	fmt = 1;
+    else {
+	printf("Invalid %s\n", "format");
+	return;
+    }
+    if (fmt == 0) {
+	/*
+	 * a.out: text is read from file offset PAGE_SIZE to the entry
+	 * address (masked to 24 bits), data follows at the next page
+	 * boundary, and room for bss is skipped.
+	 */
+	addr = hdr.ex.a_entry & 0xffffff;
+	p = PTOV(addr);
+	off = PAGE_SIZE;
+	if (xfsread(&dn, &off, p, hdr.ex.a_text))
+	    return;
+	p += roundup2(hdr.ex.a_text, PAGE_SIZE);
+	if (xfsread(&dn, &off, p, hdr.ex.a_data))
+	    return;
+	p += hdr.ex.a_data + roundup2(hdr.ex.a_bss, PAGE_SIZE);
+	bootinfo.bi_symtab = VTOP(p);
+	/* Store the symbol table size word, then the symbols themselves. */
+	memcpy(p, &hdr.ex.a_syms, sizeof(hdr.ex.a_syms));
+	p += sizeof(hdr.ex.a_syms);
+	if (hdr.ex.a_syms) {
+	    if (xfsread(&dn, &off, p, hdr.ex.a_syms))
+		return;
+	    p += hdr.ex.a_syms;
+	    /*
+	     * The next word gives the length of what follows, counting
+	     * the word itself (presumably the string table - confirm).
+	     */
+	    if (xfsread(&dn, &off, p, sizeof(int)))
+		return;
+	    x = *(uint32_t *)p;
+	    p += sizeof(int);
+	    x -= sizeof(int);
+	    if (xfsread(&dn, &off, p, x))
+		return;
+	    p += x;
+	}
+    } else {
+	/* ELF: collect the first two PT_LOAD program headers. */
+	off = hdr.eh.e_phoff;
+	for (j = i = 0; i < hdr.eh.e_phnum && j < 2; i++) {
+	    if (xfsread(&dn, &off, ep + j, sizeof(ep[0])))
+		return;
+	    if (ep[j].p_type == PT_LOAD)
+		j++;
+	}
+	/* Load both segments at their physical address, masked to 24 bits. */
+	for (i = 0; i < 2; i++) {
+	    p = PTOV(ep[i].p_paddr & 0xffffff);
+	    off = ep[i].p_offset;
+	    if (xfsread(&dn, &off, p, ep[i].p_filesz))
+		return;
+	}
+	p += roundup2(ep[1].p_memsz, PAGE_SIZE);
+	bootinfo.bi_symtab = VTOP(p);
+	/*
+	 * If exactly two section headers follow the section name string
+	 * table, copy each of them as a size word followed by its
+	 * contents (presumably symbol and string tables - confirm).
+	 */
+	if (hdr.eh.e_shnum == hdr.eh.e_shstrndx + 3) {
+	    off = hdr.eh.e_shoff + sizeof(es[0]) *
+		(hdr.eh.e_shstrndx + 1);
+	    if (xfsread(&dn, &off, &es, sizeof(es)))
+		return;
+	    for (i = 0; i < 2; i++) {
+		memcpy(p, &es[i].sh_size, sizeof(es[i].sh_size));
+		p += sizeof(es[i].sh_size);
+		off = es[i].sh_offset;
+		if (xfsread(&dn, &off, p, es[i].sh_size))
+		    return;
+		p += es[i].sh_size;
+	    }
+	}
+	addr = hdr.eh.e_entry & 0xffffff;
+    }
+    bootinfo.bi_esymtab = VTOP(p);
+    bootinfo.bi_kernelname = VTOP(kname);
+    /*
+     * Hand over: KARGS_FLAGS_ZFS tells the loader we came from ZFS and
+     * the pool guid is passed as two 32-bit halves, low half first.
+     */
+    __exec((caddr_t)addr, RB_BOOTINFO | (opts & RBX_MASK),
+	   bootdev,
+	   KARGS_FLAGS_ZFS,
+	   (uint32_t) spa->spa_guid,
+	   (uint32_t) (spa->spa_guid >> 32),
+	   VTOP(&bootinfo));
+}
+
+/*
+ * Parse the command line held in cmd[].
+ *
+ * The line is split into words at spaces, tabs and newlines (cmd[] is
+ * modified in place).  Each word is one of:
+ *   -opts       option letters from optstr[], plus 'P' (probe keyboard)
+ *               and 'S<speed>' (serial speed); each letter toggles its
+ *               bit in opts
+ *   ?path       list the directory 'path' on the current pool
+ *   status      print the status of all pools
+ *   [pool:]path switch to 'pool' if given and remember 'path' in kname
+ *
+ * Returns 0 if the line was accepted, -1 on error or after a command
+ * ('?', "status") that must not be followed by a boot attempt.
+ */
+static int
+parse(void)
+{
+    char *arg = cmd;
+    char *ep, *p, *q;
+    const char *cp;
+    int c, i, j;
+
+    while ((c = *arg++)) {
+	if (c == ' ' || c == '\t' || c == '\n')
+	    continue;
+	/* Find the end of this word and terminate it. */
+	for (p = arg; *p && *p != '\n' && *p != ' ' && *p != '\t'; p++);
+	ep = p;
+	if (*p)
+	    *p++ = 0;
+	if (c == '-') {
+	    while ((c = *arg++)) {
+		if (c == 'P') {
+		    if (*(uint8_t *)PTOV(0x496) & 0x10) {
+			cp = "yes";
+		    } else {
+			opts |= OPT_SET(RBX_DUAL) | OPT_SET(RBX_SERIAL);
+			cp = "no";
+		    }
+		    printf("Keyboard: %s\n", cp);
+		    continue;
+		} else if (c == 'S') {
+		    j = 0;
+		    while ((unsigned int)(i = *arg++ - '0') <= 9)
+			j = j * 10 + i;
+		    /* Accept only digits running up to the end of the word. */
+		    if (j > 0 && i == -'0') {
+			comspeed = j;
+			break;
+		    }
+		    /* Fall through to error below ('S' not in optstr[]). */
+		}
+		for (i = 0; c != optstr[i]; i++)
+		    if (i == NOPT - 1)
+			return -1;
+		opts ^= OPT_SET(flags[i]);
+	    }
+	    ioctrl = OPT_CHECK(RBX_DUAL) ? (IO_SERIAL|IO_KEYBOARD) :
+		     OPT_CHECK(RBX_SERIAL) ? IO_SERIAL : IO_KEYBOARD;
+	    if (ioctrl & IO_SERIAL)
+	        sio_init(115200 / comspeed);
+	} else if (c == '?') {
+	    dnode_phys_t dn;
+
+	    if (zfs_lookup(spa, arg, &dn) == 0) {
+		zap_list(spa, &dn);
+	    }
+	    return -1;
+	} else {
+	    /* Step back so that arg covers the first character again. */
+	    arg--;
+
+	    /*
+	     * Report pool status if the command is 'status'. Lets
+	     * hope no-one wants to load /status as a kernel.
+	     */
+	    if (!strcmp(arg, "status")) {
+		spa_all_status();
+		return -1;
+	    }
+
+	    /*
+	     * If there is a colon, switch pools.
+	     */
+	    q = (char *) strchr(arg, ':');
+	    if (q) {
+		spa_t *newspa;
+
+		*q++ = 0;
+		newspa = spa_find_by_name(arg);
+		if (newspa) {
+		    spa = newspa;
+		    zfs_mount_pool(spa);
+		} else {
+		    printf("\nCan't find ZFS pool %s\n", arg);
+		    return -1;
+		}
+		arg = q;
+	    }
+	    /* Copy the path, including its terminator, into kname. */
+	    if ((i = ep - arg)) {
+		if ((size_t)i >= sizeof(kname))
+		    return -1;
+		memcpy(kname, arg, i + 1);
+	    }
+	}
+	arg = p;
+    }
+    return 0;
+}
+
+/*
+ * Minimal printf for the boot block; output goes through putchar().
+ *
+ * Supported conversions:
+ *   %c   character
+ *   %s   string, with an optional decimal field width; the padding
+ *        spaces are written before the string if the '-' flag is given
+ *        and after it otherwise
+ *   %u   unsigned decimal
+ * Any other character following '%' is printed as is.  A format that
+ * ends in the middle of a conversion stops the output there.
+ */
+static void
+printf(const char *fmt,...)
+{
+    va_list ap;
+    char buf[10];
+    char *s;
+    unsigned u;
+    int c;
+    int minus;
+    int prec;
+    int len;
+    int pad;
+
+    va_start(ap, fmt);
+    while ((c = *fmt++)) {
+	if (c == '%') {
+	    minus = 0;
+	    prec = 0;
+	nextfmt:
+	    c = *fmt++;
+	    switch (c) {
+	    case '\0':
+		/* Do not scan past the end of a truncated format. */
+		va_end(ap);
+		return;
+	    case '-':
+		minus = 1;
+		goto nextfmt;
+	    case '0':
+	    case '1':
+	    case '2':
+	    case '3':
+	    case '4':
+	    case '5':
+	    case '6':
+	    case '7':
+	    case '8':
+	    case '9':
+		prec = 10 * prec + (c - '0');
+		goto nextfmt;
+	    case 'c':
+		putchar(va_arg(ap, int));
+		continue;
+	    case 's':
+		s = va_arg(ap, char *);
+		if (prec) {
+		    len = strlen(s);
+		    if (len < prec)
+			pad = prec - len;
+		    else
+			pad = 0;
+		    if (minus)
+			while (pad--)
+			    putchar(' ');
+		    for (; *s; s++)
+			putchar(*s);
+		    if (!minus)
+			while (pad--)
+			    putchar(' ');
+		} else {
+		    for (; *s; s++)
+			putchar(*s);
+		}
+		continue;
+	    case 'u':
+		/* Build the digits in reverse, then print them back. */
+		u = va_arg(ap, unsigned);
+		s = buf;
+		do
+		    *s++ = '0' + u % 10U;
+		while (u /= 10U);
+		/* Stop at buf itself; never step to one before the array. */
+		while (s > buf)
+		    putchar(*--s);
+		continue;
+	    }
+	}
+	putchar(c);
+    }
+    va_end(ap);
+    return;
+}
+
+/*
+ * Write one character to the active console(s) through xputc(),
+ * emitting a carriage return before every newline.
+ */
+static void
+putchar(int c)
+{
+    if (c == '\n')
+	xputc('\r');
+    xputc(c);
+}
+
+/*
+ * Read nblk sectors starting at sector lba (relative to dsk->start) from
+ * the drive described by dsk into buf, by calling the xread routine of
+ * boot1 through the v86 interface.  Returns 0 on success; on failure the
+ * error is printed and -1 is returned.
+ */
+static int
+drvread(struct dsk *dsk, void *buf, unsigned lba, unsigned nblk)
+{
+    /* Progress spinner: the four bytes are '-', '\', '|' and '/'. */
+    static unsigned c = 0x2d5c7c2f;
+
+    lba += dsk->start;
+    if (!OPT_CHECK(RBX_QUIET))
+	/* Rotate by one byte and print the new low byte, then back up. */
+	printf("%c\b", c = c << 8 | c >> 24);
+    v86.ctl = V86_ADDR | V86_CALLF | V86_FLAGS;
+    v86.addr = XREADORG;		/* call to xread in boot1 */
+    v86.es = VTOPSEG(buf);
+    v86.eax = lba;
+    v86.ebx = VTOPOFF(buf);
+    v86.ecx = lba >> 16;
+    v86.edx = nblk << 8 | dsk->drive;
+    v86int();
+    v86.ctl = V86_FLAGS;
+    /* Carry set means failure; bits 8-15 of %eax are printed as the code. */
+    if (V86_CY(v86.efl)) {
+	printf("error %u lba %u\n", v86.eax >> 8 & 0xff, lba);
+	return -1;
+    }
+    return 0;
+}
+
+/*
+ * Wait up to 'ticks' timer ticks for input.  Returns 1 as soon as a
+ * character is available and 0 if the time runs out, if the tick counter
+ * read from 0x46c goes backwards, or if RBX_NOINTR is set.
+ */
+static int
+keyhit(unsigned ticks)
+{
+    uint32_t start = 0, now;
+
+    if (OPT_CHECK(RBX_NOINTR))
+	return 0;
+    while (!xgetc(1)) {
+	now = *(uint32_t *)PTOV(0x46c);
+	/* The first reading becomes the reference point. */
+	if (start == 0)
+	    start = now;
+	if (now < start || now >= start + ticks)
+	    return 0;
+    }
+    return 1;
+}
+
+/*
+ * Write c to every console enabled in ioctrl: the screen for
+ * IO_KEYBOARD, the serial port for IO_SERIAL.  Returns c.
+ */
+static int
+xputc(int c)
+{
+    if (ioctrl & IO_KEYBOARD)
+	putc(c);
+    if (ioctrl & IO_SERIAL)
+	sio_putc(c);
+    return c;
+}
+
+/*
+ * Console input from whichever of keyboard and serial port is enabled
+ * in ioctrl.
+ *
+ * With fn non-zero this is a poll: it returns 1 if a character is
+ * waiting and 0 otherwise.  With fn zero it waits for a character and
+ * returns it.  Always returns 0 when RBX_NOINTR is set.
+ */
+static int
+xgetc(int fn)
+{
+    if (OPT_CHECK(RBX_NOINTR))
+	return 0;
+    for (;;) {
+	if (ioctrl & IO_KEYBOARD && getc(1))
+	    return fn ? 1 : getc(0);
+	if (ioctrl & IO_SERIAL && sio_ischar())
+	    return fn ? 1 : sio_getc();
+	if (fn)
+	    return 0;
+    }
+}
+
+/*
+ * Keyboard access through BIOS interrupt 0x16; fn is placed in bits 8-15
+ * of %eax to select the function.  With fn == 0 the low byte of %eax is
+ * returned (the character read).  Otherwise the result is non-zero only
+ * if the zero flag is clear and that low byte is non-zero, i.e. a key
+ * with a non-zero ASCII code is waiting.
+ */
+static int
+getc(int fn)
+{
+    /*
+     * The extra comparison against zero is an attempt to work around
+     * what appears to be a bug in QEMU and Bochs. Both emulators
+     * sometimes report a key-press with scancode one and ascii zero
+     * when no such key is pressed in reality. As far as I can tell,
+     * this only happens shortly after a reboot.
+     */
+    v86.addr = 0x16;
+    v86.eax = fn << 8;
+    v86int();
+    return fn == 0 ? v86.eax & 0xff : (!V86_ZR(v86.efl) && (v86.eax & 0xff));
+}
diff --git a/sys/boot/i386/zfsboot/zfsldr.S b/sys/boot/i386/zfsboot/zfsldr.S
new file mode 100644
index 0000000..a256d30
--- /dev/null
+++ b/sys/boot/i386/zfsboot/zfsldr.S
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 1998 Robert Nordier
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms are freely
+ * permitted provided that the above copyright notice and this
+ * paragraph and the following disclaimer are duplicated in all
+ * such forms.
+ *
+ * This software is provided "AS IS" and without any express or
+ * implied warranties, including, without limitation, the implied
+ * warranties of merchantability and fitness for a particular
+ * purpose.
+ *
+ * $FreeBSD$
+ */
+
+/* Memory Locations */
+		.set MEM_REL,0x700		# Relocation address
+		.set MEM_ARG,0x900		# Arguments
+		.set MEM_ORG,0x7c00		# Origin
+		.set MEM_BUF,0x8000		# Load area
+		.set MEM_BTX,0x9000		# BTX start
+		.set MEM_JMP,0x9010		# BTX entry point
+		.set MEM_USR,0xa000		# Client start
+		.set BDA_BOOT,0x472		# Boot howto flag
+	
+/* Partition Constants */
+		.set PRT_OFF,0x1be		# Partition offset
+		.set PRT_NUM,0x4		# Partitions
+		.set PRT_BSD,0xa5		# Partition type
+
+/* Flag Bits */
+		.set FL_PACKET,0x80		# Packet mode
+
+/* Misc. Constants */
+		.set SIZ_PAG,0x1000		# Page size
+		.set SIZ_SEC,0x200		# Sector size
+
+		.set NSECT,0x40
+		.globl start
+		.globl xread
+		.code16
+
+start:		jmp main			# Start recognizably
+
+/*
+ * This is the start of a standard BIOS Parameter Block (BPB). Most bootable
+ * FAT disks have this at the start of their MBR. While normal BIOS's will
+ * work fine without this section, IBM's El Torito emulation "fixes" up the
+ * BPB by writing into the memory copy of the MBR. Rather than have data
+ * written into our xread routine, we'll define a BPB to work around it.
+ * The data marked with (T) indicates a field required for a ThinkPad to
+ * recognize the disk and (W) indicates fields written from IBM BIOS code.
+ * The use of the BPB is based on what OpenBSD and NetBSD implemented in
+ * their boot code but the required fields were determined by trial and error.
+ *
+ * Note: If additional space is needed in boot1, one solution would be to
+ * move the "prompt" message data (below) to replace the OEM ID.
+ */
+		.org 0x03, 0x00
+oemid:		.space 0x08, 0x00	# OEM ID
+
+		.org 0x0b, 0x00
+bpb:		.word   512		# sector size (T)
+		.byte	0		# sectors/cluster
+		.word	0		# reserved sectors
+		.byte	0		# number of FATs
+		.word	0		# root entries
+		.word	0		# small sectors
+		.byte	0		# media type (W)
+		.word	0		# sectors/fat
+		.word	18		# sectors per track (T)
+		.word	2		# number of heads (T)
+		.long	0		# hidden sectors (W)
+		.long	0		# large sectors
+
+		.org 0x24, 0x00
+ebpb:		.byte	0		# BIOS physical drive number (W)
+
+		.org 0x25,0x90
+/*
+ * Trampoline used by boot2 to call read to read data from the disk via
+ * the BIOS.  Call with:
+ *
+ * %cx:%ax	- long    - LBA to read in
+ * %es:(%bx)	- caddr_t - buffer to read data into
+ * %dl		- byte    - drive to read from
+ * %dh		- byte    - num sectors to read
+ *
+ * Entered with a far call and left with lret.  %ds is reloaded from %ss
+ * on the xread entry (xread.1, used by nread, skips that).  %ax and %bp
+ * are overwritten; nread tests the carry flag after the call to detect
+ * a failed read.
+ */
+
+xread:		push %ss			# Address
+		pop %ds				#  data
+/*
+ * Setup an EDD disk packet and pass it to read.  The pushes build the
+ * 0x10-byte packet on the stack from its last field to its first:
+ * 64-bit block number (upper 32 bits zero), transfer buffer
+ * segment:offset, block count, packet size.  %bp is left pointing at the
+ * packet for read to use.
+ */
+xread.1:					# Starting
+		pushl $0x0			#  absolute
+		push %cx			#  block
+		push %ax			#  number
+		push %es			# Address of
+		push %bx			#  transfer buffer
+		xor %ax,%ax			# Number of
+		movb %dh,%al			#  blocks to
+		push %ax			#  transfer
+		push $0x10			# Size of packet
+		mov %sp,%bp			# Packet pointer
+		callw read			# Read from disk
+		lea 0x10(%bp),%sp		# Clear stack
+		lret				# To far caller
+/*
+ * Load the rest of boot2 and BTX up, copy the parts to the right locations,
+ * and start it all up.
+ */
+
+/*
+ * Setup the segment registers to flat addressing (segment 0) and setup the
+ * stack to end just below the start of our code.
+ */
+main:		cld				# String ops inc
+		xor %cx,%cx			# Zero
+		mov %cx,%es			# Address
+		mov %cx,%ds			#  data
+		mov %cx,%ss			# Set up
+		mov $start,%sp			#  stack
+/*
+ * Relocate ourself to MEM_REL.  Since %cx == 0, the inc %ch sets
+ * %cx == 0x100, i.e. 0x100 words (one SIZ_SEC sector) are copied.
+ */
+		mov %sp,%si			# Source
+		mov $MEM_REL,%di		# Destination
+		incb %ch			# Word count
+		rep				# Copy
+		movsw				#  code
+/*
+ * If we are on a hard drive, then load the MBR and look for the first
+ * FreeBSD slice.  We use the fake partition entry below that points to
+ * the MBR when we call nread.  The first pass looks for the first active
+ * FreeBSD slice.  The second pass looks for the first non-active FreeBSD
+ * slice if the first one fails.
+ *
+ * %cx is the pass indicator: 1 during the first pass, 0 during the
+ * second (where the jcxz at main.2 accepts any FreeBSD slice).  After the
+ * second pass the dec leaves %cx non-zero, so the jcxz back to main.1 is
+ * not taken and control reaches the error path.  %dh counts slices
+ * from 1.
+ */
+		mov $part4,%si			# Partition
+		cmpb $0x80,%dl			# Hard drive?
+		jb main.4			# No
+		movb $0x1,%dh			# Block count
+		callw nread			# Read MBR
+		mov $0x1,%cx	 		# Two passes
+main.1: 	mov $MEM_BUF+PRT_OFF,%si	# Partition table
+		movb $0x1,%dh			# Partition
+main.2: 	cmpb $PRT_BSD,0x4(%si)		# Our partition type?
+		jne main.3			# No
+		jcxz main.5			# If second pass
+		testb $0x80,(%si)		# Active?
+		jnz main.5			# Yes
+main.3: 	add $0x10,%si	 		# Next entry
+		incb %dh			# Partition
+		cmpb $0x1+PRT_NUM,%dh		# In table?
+		jb main.2			# Yes
+		dec %cx				# Do two
+		jcxz main.1			#  passes
+/*
+ * If we get here, we didn't find any FreeBSD slices at all, so print an
+ * error message and die.
+ */
+		mov $msg_part,%si		# Message
+		jmp error			# Error
+/*
+ * Floppies use partition 0 of drive 0.
+ */
+main.4: 	xor %dx,%dx			# Partition:drive
+
+/*
+ * Ok, we have a slice and drive in %dx now, so use that to locate and
+ * load boot2.  %si references the start of the slice we are looking
+ * for, so go ahead and load up the 64 sectors starting at sector 1024
+ * (i.e. after the two vdev labels).  We don't have to do anything fancy
+ * here to allow for an extra copy of boot1 and a partition table
+ * (compare to this section of the UFS bootstrap) so we just load it
+ * all at 0x8000. The first part of boot2 is BTX, which wants to run
+ * at 0x9000. The boot2.bin binary starts right after the end of BTX,
+ * so we have to figure out where the start of it is and then move the
+ * binary to 0xc000. After we have moved the client, we relocate BTX
+ * itself to 0x9000 - doing it in this order means that none of the
+ * memcpy regions overlap which would corrupt the copy.  Normally, BTX
+ * clients start at MEM_USR, or 0xa000, but when we use btxld to
+ * create boot2, we use an entry point of 0x2000.  That entry point is
+ * relative to MEM_USR; thus boot2.bin starts at 0xc000.
+ *
+ * The load area and the target area for the client overlap so we have
+ * to use a decrementing string move. We also play segment register
+ * games with the destination address for the move so that the client
+ * can be larger than 16k (which would overflow the zero segment since
+ * the client starts at 0xc000). Relocating BTX is easy since the load
+ * area and target area do not overlap.
+ *
+ * The BTX length is taken from the word at offset 0xa of the loaded
+ * image and kept in %bx across the client move.
+ */
+main.5: 	mov %dx,MEM_ARG			# Save args
+		movb $NSECT,%dh			# Sector count
+		movw $1024,%ax			# Offset to boot2
+		callw nread.1			# Read disk
+main.6:		mov $MEM_BUF,%si		# BTX (before reloc)
+		mov 0xa(%si),%bx		# Get BTX length and set
+		mov $NSECT*SIZ_SEC-1,%di	# Size of load area (less one)
+		mov %di,%si			# End of load
+		add $MEM_BUF,%si		#  area
+		sub %bx,%di			# End of client, 0xc000 rel
+		mov %di,%cx			# Size of
+		inc %cx				#  client
+		mov $(MEM_USR+2*SIZ_PAG)>>4,%dx	# Segment
+		mov %dx,%es			#   addressing 0xc000
+		std				# Move with decrement
+		rep				# Relocate
+		movsb				#  client
+		mov %ds,%dx			# Back to
+		mov %dx,%es			#  zero segment
+		mov $MEM_BUF,%si		# BTX (before reloc)
+		mov $MEM_BTX,%di		# BTX
+		mov %bx,%cx			# Get BTX length
+		cld				# Increment this time
+		rep				# Relocate
+		movsb				#  BTX
+
+/*
+ * Enable A20 so we can access memory above 1 meg.
+ * Use the zero-valued %cx as a timeout for embedded hardware which do not
+ * have a keyboard controller.  (%cx is zero here because the rep movsb
+ * just above ran it down.)  If the first busy-wait times out, the enable
+ * sequence is skipped entirely and BTX is started anyway.  Only the
+ * first wait loop is bounded; seta20.2 spins until the busy bit clears.
+ */
+seta20: 	cli				# Disable interrupts
+seta20.1:	dec %cx				# Timeout?
+		jz seta20.3			# Yes
+		inb $0x64,%al			# Get status
+		testb $0x2,%al			# Busy?
+		jnz seta20.1			# Yes
+		movb $0xd1,%al			# Command: Write
+		outb %al,$0x64			#  output port
+seta20.2:	inb $0x64,%al			# Get status
+		testb $0x2,%al			# Busy?
+		jnz seta20.2			# Yes
+		movb $0xdf,%al			# Enable
+		outb %al,$0x60			#  A20
+seta20.3:	sti				# Enable interrupts
+
+		jmp start+MEM_JMP-MEM_ORG	# Start BTX
+
+
+/*
+ * Trampoline used to call read from within boot1.
+ *
+ * nread reads from the first sector of the partition entry at %ds:(%si);
+ * nread.1 is entered with a sector offset within that partition already
+ * in %ax.  %dl is the drive and %dh the sector count, as for xread, and
+ * the data is placed at MEM_BUF.  Returns to the caller on success; on
+ * failure it falls through to the error routine with the "Read" message.
+ *
+ * The partition's 32-bit start LBA is at 0x8(%si).  The offset is added
+ * to its low word, and the carry out of that addition must be folded
+ * into the high word, otherwise a slice whose start plus offset crosses
+ * a 64K-sector boundary is read from the wrong place.  The mov between
+ * the add and the adc does not modify the flags.
+ */
+nread:		xor %ax,%ax			# Sector offset in partition
+nread.1:	mov $MEM_BUF,%bx		# Transfer buffer
+		add 0x8(%si),%ax		# Get LBA: low word + offset
+		mov 0xa(%si),%cx		#  high word
+		adc $0x0,%cx			#  plus carry from low word
+		push %cs			# Read from
+		callw xread.1	 		#  disk
+		jnc return			# If success, return
+		mov $msg_read,%si		# Otherwise, set the error
+						#  message and fall through to
+						#  the error routine
+/*
+ * Print out the error message pointed to by %ds:(%si) followed
+ * by a prompt, wait for a keypress, and then reboot the machine.
+ * Does not return.  nread falls through to here after a failed read;
+ * main and read.1 jump here with their own messages.
+ */
+error:		callw putstr			# Display message
+		mov $prompt,%si			# Display
+		callw putstr			#  prompt
+		xorb %ah,%ah			# BIOS: Get
+		int $0x16			#  keypress
+		movw $0x1234, BDA_BOOT		# Do a warm boot
+		ljmp $0xffff,$0x0		# reboot the machine
+/*
+ * Display a null-terminated string using the BIOS output.
+ * The entry point is putstr (not putstr.0), with the string at
+ * %ds:(%si); putstr.0 is the loop body that prints the character just
+ * loaded into %al.  On the terminating NUL, execution drops into ereturn
+ * below, so putstr comes back with %ah == 1 and the carry flag set.
+ */
+putstr.0:	mov $0x7,%bx	 		# Page:attribute
+		movb $0xe,%ah			# BIOS: Display
+		int $0x10			#  character
+putstr: 	lodsb				# Get char
+		testb %al,%al			# End of string?
+		jne putstr.0			# No
+
+/*
+ * Overused return code.  ereturn is used to return an error from the
+ * read function.  Since we assume putstr succeeds, we (ab)use the
+ * same code when we return from putstr.
+ */
+ereturn:	movb $0x1,%ah			# Invalid
+		stc				#  argument
+return: 	retw				# To caller
+/*
+ * Reads sectors from the disk.  If EDD is enabled, then check if it is
+ * installed and use it if it is.  If it is not installed or not enabled,
+ * control goes to read.1.  The CHS implementation of read.1 (fetch the
+ * drive parameters from the BIOS and divide the LBA out ourselves) is
+ * compiled out with "#if 0" below; the active read.1 reports
+ * "CHS not supported" through the error routine instead.
+ * Call with:
+ *
+ * %dl	- byte     - drive number
+ * %bp	- pointer  - EDD packet on the stack (0x10 bytes, as built by
+ *		     xread.1)
+ */
+read:		testb $FL_PACKET,%cs:MEM_REL+flags-start # LBA support enabled?
+		jz read.1			# No, use CHS
+		cmpb $0x80,%dl			# Hard drive?
+		jb read.1			# No, use CHS
+		mov $0x55aa,%bx			# Magic
+		push %dx			# Save
+		movb $0x41,%ah			# BIOS: Check
+		int $0x13			#  extensions present
+		pop %dx				# Restore
+		jc read.1			# If error, use CHS
+		cmp $0xaa55,%bx			# Magic?
+		jne read.1			# No, so use CHS
+		testb $0x1,%cl			# Packet interface?
+		jz read.1			# No, so use CHS
+		mov %bp,%si			# Disk packet
+		movb $0x42,%ah			# BIOS: Extended
+		int $0x13			#  read
+		retw				# To caller
+#if 0	
+read.1:	 	push %dx			# Save
+		movb $0x8,%ah			# BIOS: Get drive
+		int $0x13			#  parameters
+		movb %dh,%ch			# Max head number
+		pop %dx				# Restore
+		jc return			# If error
+		andb $0x3f,%cl			# Sectors per track
+		jz ereturn			# If zero
+		cli				# Disable interrupts
+		mov 0x8(%bp),%eax		# Get LBA
+		push %dx			# Save
+		movzbl %cl,%ebx			# Divide by
+		xor %edx,%edx			#  sectors
+		div %ebx			#  per track
+		movb %ch,%bl			# Max head number
+		movb %dl,%ch			# Sector number
+		inc %bx				# Divide by
+		xorb %dl,%dl			#  number
+		div %ebx			#  of heads
+		movb %dl,%bh			# Head number
+		pop %dx				# Restore
+		cmpl $0x3ff,%eax		# Cylinder number supportable?
+		sti				# Enable interrupts
+		ja ereturn			# No, return an error
+		xchgb %al,%ah			# Set up cylinder
+		rorb $0x2,%al			#  number
+		orb %ch,%al			# Merge
+		inc %ax				#  sector
+		xchg %ax,%cx	 		#  number
+		movb %bh,%dh			# Head number
+		subb %ah,%al			# Sectors this track
+		mov 0x2(%bp),%ah		# Blocks to read
+		cmpb %ah,%al			# To read
+		jb read.2			#  this
+#ifdef	TRACK_AT_A_TIME
+		movb %ah,%al			#  track
+#else
+		movb $1,%al			#  one sector
+#endif
+read.2: 	mov $0x5,%di	 		# Try count
+read.3: 	les 0x4(%bp),%bx		# Transfer buffer
+		push %ax			# Save
+		movb $0x2,%ah			# BIOS: Read
+		int $0x13			#  from disk
+		pop %bx				# Restore
+		jnc read.4			# If success
+		dec %di				# Retry?
+		jz read.6			# No
+		xorb %ah,%ah			# BIOS: Reset
+		int $0x13			#  disk system
+		xchg %bx,%ax	 		# Block count
+		jmp read.3			# Continue
+read.4: 	movzbw %bl,%ax	 		# Sectors read
+		add %ax,0x8(%bp)		# Adjust
+		jnc read.5			#  LBA,
+		incw 0xa(%bp)	 		#  transfer
+read.5: 	shlb %bl			#  buffer
+		add %bl,0x5(%bp)		#  pointer,
+		sub %al,0x2(%bp)		#  block count
+		ja read.1			# If not done
+read.6: 	retw				# To caller
+#else
+read.1:		mov $msg_chs,%si		# No CHS path: report it
+		jmp error			#  and reboot
+msg_chs:	.asciz "CHS not supported"
+#endif
+
+/* Messages */
+
+msg_read:	.asciz "Read"
+msg_part:	.asciz "Boot"
+
+prompt: 	.asciz " error\r\n"
+
+flags:		.byte FLAGS			# Flags
+
+		.org PRT_OFF,0x90
+
+/* Partition table */
+
+		.fill 0x30,0x1,0x0
+part4:		.byte 0x80, 0x00, 0x01, 0x00
+		.byte 0xa5, 0xfe, 0xff, 0xff
+		.byte 0x00, 0x00, 0x00, 0x00
+		.byte 0x50, 0xc3, 0x00, 0x00	# 50000 sectors long, bleh
+
+		.word 0xaa55			# Magic number
diff --git a/sys/boot/zfs/Makefile b/sys/boot/zfs/Makefile
new file mode 100644
index 0000000..723233c
--- /dev/null
+++ b/sys/boot/zfs/Makefile
@@ -0,0 +1,29 @@
+# $FreeBSD$
+#
+# Builds the internal library "zfsboot" from zfs.c.
+
+LIB=		zfsboot
+INTERNALLIB=
+
+SRCS+=		zfs.c
+
+# Header search path: the boot "common" directory, the top of sys/,
+# the object directory, libstand, and the CDDL ZFS boot headers.
+CFLAGS+=	-I${.CURDIR}/../common -I${.CURDIR}/../.. -I.
+CFLAGS+=	-I${.CURDIR}/../../../lib/libstand
+CFLAGS+=	-I${.CURDIR}/../../cddl/boot/zfs
+
+# XXX need arch-specific bootstrap CFLAGS here
+# (the flags below are x86-specific)
+CFLAGS+=	-ffreestanding -mpreferred-stack-boundary=2 \
+		-mno-mmx -mno-3dnow -mno-sse -mno-sse2 -mno-sse3
+
+CFLAGS+=	-Wformat -Wall
+
+# On amd64 a "machine" symlink to the i386 headers is created in the
+# object directory (found through the -I. above).
+.if ${MACHINE_ARCH} == "amd64"
+CLEANFILES+=    machine
+machine:
+	ln -sf ${.CURDIR}/../../../i386/include machine
+.endif
+
+.include <bsd.lib.mk>
+
+# The symlink has to exist before dependencies or objects are built.
+.if ${MACHINE_ARCH} == "amd64"
+beforedepend ${OBJS}: machine
+.endif
diff --git a/sys/boot/zfs/zfs.c b/sys/boot/zfs/zfs.c
new file mode 100644
index 0000000..cf0bb9c
--- /dev/null
+++ b/sys/boot/zfs/zfs.c
@@ -0,0 +1,514 @@
+/*-
+ * Copyright (c) 2007 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	$FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ *	Stand-alone file reading package.
+ */
+
+#include <sys/param.h>
+#include <sys/disklabel.h>
+#include <sys/time.h>
+#include <sys/queue.h>
+#include <stddef.h>
+#include <stdarg.h>
+#include <string.h>
+#include <stand.h>
+#include <bootstrap.h>
+
+#include "zfsimpl.c"
+
+static int	zfs_open(const char *path, struct open_file *f);
+static int	zfs_write(struct open_file *f, void *buf, size_t size, size_t *resid);
+static int	zfs_close(struct open_file *f);
+static int	zfs_read(struct open_file *f, void *buf, size_t size, size_t *resid);
+static off_t	zfs_seek(struct open_file *f, off_t offset, int where);
+static int	zfs_stat(struct open_file *f, struct stat *sb);
+static int	zfs_readdir(struct open_file *f, struct dirent *d);
+
+struct devsw zfs_dev;
+
+/*
+ * File system switch entry for ZFS.  The initializer is positional: the
+ * name string followed by the handlers defined in this file.
+ */
+struct fs_ops zfs_fsops = {
+	"zfs",
+	zfs_open,
+	zfs_close,
+	zfs_read,
+	zfs_write,	/* always fails with EROFS */
+	zfs_seek,
+	zfs_stat,
+	zfs_readdir
+};
+
+/*
+ * In-core open file.
+ *
+ * Allocated and zeroed by zfs_open() and hung off open_file.f_fsdata.
+ * The f_zap_* and f_num_leafs members are only filled in by
+ * zfs_readdir(), which also reuses f_seekp as its directory cursor.
+ */
+struct file {
+	off_t		f_seekp;	/* seek pointer */
+	dnode_phys_t	f_dnode;	/* copy of the file's dnode, set by zfs_lookup() */
+	uint64_t	f_zap_type;	/* zap type for readdir */
+	uint64_t	f_num_leafs;	/* number of fzap leaf blocks */
+	zap_leaf_phys_t	*f_zap_leaf;	/* zap leaf buffer */
+};
+
+/*
+ * Open a file.
+ *
+ * f->f_devdata is taken to be the spa_t installed by the zfs device open
+ * routine.  The pool is mounted with zfs_mount_pool(), a zeroed struct
+ * file is allocated and published in f->f_fsdata, and upath is resolved
+ * into that structure's dnode with zfs_lookup().
+ *
+ * Returns 0 on success.  Fails with EINVAL if f is not backed by zfs_dev,
+ * ENOMEM if the per-file state cannot be allocated, EIO if the root
+ * object set is not of type DMU_OST_ZFS, or with the error reported by
+ * zfs_mount_pool()/zfs_lookup().  When a failure happens after the
+ * allocation, the state is freed and f->f_fsdata is reset to NULL.
+ */
+static int
+zfs_open(const char *upath, struct open_file *f)
+{
+	spa_t *spa = (spa_t *) f->f_devdata;
+	struct file *fp;
+	int rc;
+
+	if (f->f_dev != &zfs_dev)
+		return (EINVAL);
+
+	rc = zfs_mount_pool(spa);
+	if (rc)
+		return (rc);
+
+	/* allocate file system specific data structure */
+	fp = malloc(sizeof(*fp));
+	if (fp == NULL)
+		return (ENOMEM);
+	bzero(fp, sizeof(*fp));
+	f->f_fsdata = (void *)fp;
+
+	if (spa->spa_root_objset.os_type != DMU_OST_ZFS) {
+		printf("Unexpected object set type %lld\n",
+		    spa->spa_root_objset.os_type);
+		rc = EIO;
+		goto out;
+	}
+
+	rc = zfs_lookup(spa, upath, &fp->f_dnode);
+	if (rc)
+		goto out;
+
+	fp->f_seekp = 0;
+out:
+	if (rc) {
+		f->f_fsdata = NULL;
+		free(fp);
+	}
+	return (rc);
+}
+
+/*
+ * Close a file: release the per-file state created by zfs_open(),
+ * including the fat-ZAP leaf buffer that zfs_readdir() may have attached
+ * to it.  Always returns 0; closing a file with no state is a no-op.
+ */
+static int
+zfs_close(struct open_file *f)
+{
+	struct file *fp = (struct file *)f->f_fsdata;
+
+	/*
+	 * Reset the dnode block cache key (presumably because it may refer
+	 * to the dnode embedded in the structure about to be freed).
+	 */
+	dnode_cache_obj = 0;
+	f->f_fsdata = NULL;
+	if (fp == NULL)
+		return (0);
+
+	/* NULL unless zfs_readdir() walked a fat ZAP on this file. */
+	if (fp->f_zap_leaf != NULL)
+		free(fp->f_zap_leaf);
+	free(fp);
+	return (0);
+}
+
+/*
+ * Copy a portion of a file into kernel memory.
+ * Cross block boundaries when necessary.
+ *
+ * Reads up to size bytes at the current seek pointer into start and
+ * advances the seek pointer by the amount read.  The request is clipped
+ * at the file size taken from the znode; a seek pointer at or beyond the
+ * end of the file reads nothing.  If resid is not NULL it receives the
+ * number of requested bytes that were not read.
+ *
+ * Returns 0 on success (including a zero-length read at EOF), EINVAL if
+ * the seek pointer is negative, or the error from dnode_read().
+ */
+static int
+zfs_read(struct open_file *f, void *start, size_t size, size_t *resid	/* out */)
+{
+	spa_t *spa = (spa_t *) f->f_devdata;
+	struct file *fp = (struct file *)f->f_fsdata;
+	const znode_phys_t *zp = (const znode_phys_t *) fp->f_dnode.dn_bonus;
+	size_t n;
+	int rc;
+
+	if (fp->f_seekp < 0)
+		return (EINVAL);
+
+	/*
+	 * Clip against the bytes left in the file.  Comparing the seek
+	 * pointer with the size first keeps the subtraction from wrapping
+	 * when the pointer has been moved past EOF.
+	 */
+	n = size;
+	if ((uint64_t)fp->f_seekp >= zp->zp_size)
+		n = 0;
+	else if (n > zp->zp_size - fp->f_seekp)
+		n = zp->zp_size - fp->f_seekp;
+
+	if (n > 0) {
+		rc = dnode_read(spa, &fp->f_dnode, fp->f_seekp, start, n);
+		if (rc)
+			return (rc);
+	}
+
+	fp->f_seekp += n;
+	if (resid)
+		*resid = size - n;
+
+	return (0);
+}
+
+/*
+ * Writing is not supported from the bootstrap: every request is refused
+ * with EROFS and none of the arguments are examined.
+ */
+static int
+zfs_write(struct open_file *f, void *start, size_t size, size_t *resid	/* out */)
+{
+	(void)f;
+	(void)start;
+	(void)size;
+	(void)resid;
+
+	return (EROFS);
+}
+
+/*
+ * Reposition the seek pointer of an open file and return the new value.
+ * An unknown `where' leaves the pointer alone, sets errno to EINVAL and
+ * returns -1.
+ *
+ * NOTE(review): SEEK_END computes size - offset, whereas POSIX lseek()
+ * uses size + offset.  The two agree for offset == 0; confirm what the
+ * callers expect before changing it.
+ */
+static off_t
+zfs_seek(struct open_file *f, off_t offset, int where)
+{
+	struct file *fp = (struct file *)f->f_fsdata;
+	znode_phys_t *zp = (znode_phys_t *) fp->f_dnode.dn_bonus;
+	off_t newpos;
+
+	if (where == SEEK_SET) {
+		newpos = offset;
+	} else if (where == SEEK_CUR) {
+		newpos = fp->f_seekp + offset;
+	} else if (where == SEEK_END) {
+		newpos = zp->zp_size - offset;
+	} else {
+		errno = EINVAL;
+		return (-1);
+	}
+
+	fp->f_seekp = newpos;
+	return (newpos);
+}
+
+/*
+ * Fill in the mode, owner, group and size members of *sb from the znode
+ * stored in the file's dnode bonus area.  No other member of *sb is
+ * written.  Always returns 0.
+ */
+static int
+zfs_stat(struct open_file *f, struct stat *sb)
+{
+	struct file *file = (struct file *)f->f_fsdata;
+	znode_phys_t *znode = (znode_phys_t *) file->f_dnode.dn_bonus;
+
+	/* only important stuff */
+	sb->st_size = znode->zp_size;
+	sb->st_mode = znode->zp_mode;
+	sb->st_uid = znode->zp_uid;
+	sb->st_gid = znode->zp_gid;
+
+	return (0);
+}
+
+/*
+ * Return the next entry of the open directory f in *d.
+ *
+ * f_seekp doubles as the iteration cursor.  For a micro ZAP it is the
+ * byte offset of the next mzap_ent_phys_t in the directory object.  For
+ * a fat ZAP the bits above the block size select the leaf block being
+ * walked and the low bits are a chunk index within that leaf, whose
+ * contents are held in f_zap_leaf.  A call with f_seekp == 0 reads the
+ * ZAP type and positions the cursor on the first candidate entry.
+ *
+ * Returns 0 with *d filled in, ENOENT at the end of the directory,
+ * ENOTDIR if f is not a directory, ENOMEM if the leaf buffer cannot be
+ * allocated, or the error from dnode_read().
+ */
+static int
+zfs_readdir(struct open_file *f, struct dirent *d)
+{
+	spa_t *spa = (spa_t *) f->f_devdata;
+	struct file *fp = (struct file *)f->f_fsdata;
+	znode_phys_t *zp = (znode_phys_t *) fp->f_dnode.dn_bonus;
+	mzap_ent_phys_t mze;
+	size_t bsize = fp->f_dnode.dn_datablkszsec << SPA_MINBLOCKSHIFT;
+	int rc;
+
+	/* File type lives in the top four bits of the mode; 0x4 is a directory. */
+	if ((zp->zp_mode >> 12) != 0x4) {
+		return (ENOTDIR);
+	}
+
+	/*
+	 * If this is the first read, get the zap type.
+	 */
+	if (fp->f_seekp == 0) {
+		rc = dnode_read(spa, &fp->f_dnode,
+				0, &fp->f_zap_type, sizeof(fp->f_zap_type));
+		if (rc)
+			return (rc);
+
+		if (fp->f_zap_type == ZBT_MICRO) {
+			fp->f_seekp = offsetof(mzap_phys_t, mz_chunk);
+		} else {
+			rc = dnode_read(spa, &fp->f_dnode,
+					offsetof(zap_phys_t, zap_num_leafs),
+					&fp->f_num_leafs,
+					sizeof(fp->f_num_leafs));
+			if (rc)
+				return (rc);
+
+			/*
+			 * Reuse the leaf buffer if the directory has been
+			 * rewound rather than allocating a second one.
+			 */
+			if (fp->f_zap_leaf == NULL)
+				fp->f_zap_leaf =
+				    (zap_leaf_phys_t *)malloc(bsize);
+			if (fp->f_zap_leaf == NULL)
+				return (ENOMEM);
+
+			/*
+			 * Load the first leaf (block 1) and only then move
+			 * the cursor, so that a failed read is retried from
+			 * scratch instead of walking an unfilled buffer.
+			 */
+			rc = dnode_read(spa, &fp->f_dnode,
+					bsize,
+					fp->f_zap_leaf,
+					bsize);
+			if (rc)
+				return (rc);
+			fp->f_seekp = bsize;
+		}
+	}
+
+	if (fp->f_zap_type == ZBT_MICRO) {
+	mzap_next:
+		if (fp->f_seekp >= bsize)
+			return (ENOENT);
+
+		rc = dnode_read(spa, &fp->f_dnode,
+				fp->f_seekp, &mze, sizeof(mze));
+		if (rc)
+			return (rc);
+		fp->f_seekp += sizeof(mze);
+
+		/* An empty name marks an unused slot. */
+		if (!mze.mze_name[0])
+			goto mzap_next;
+
+		d->d_fileno = ZFS_DIRENT_OBJ(mze.mze_value);
+		d->d_type = ZFS_DIRENT_TYPE(mze.mze_value);
+		strcpy(d->d_name, mze.mze_name);
+		d->d_namlen = strlen(d->d_name);
+		return (0);
+	} else {
+		zap_leaf_t zl;
+		zap_leaf_chunk_t *zc, *nc;
+		int chunk;
+		size_t namelen;
+		char *p;
+		uint64_t value;
+
+		/*
+		 * Initialise this so we can use the ZAP size
+		 * calculating macros.
+		 */
+		zl.l_bs = ilog2(bsize);
+		zl.l_phys = fp->f_zap_leaf;
+
+		/*
+		 * Figure out which chunk we are currently looking at
+		 * and consider seeking to the next leaf. We use the
+		 * low bits of f_seekp as a simple chunk index.
+		 */
+	fzap_next:
+		chunk = fp->f_seekp & (bsize - 1);
+		if (chunk == ZAP_LEAF_NUMCHUNKS(&zl)) {
+			fp->f_seekp = (fp->f_seekp & ~(bsize - 1)) + bsize;
+			chunk = 0;
+
+			/*
+			 * Check for EOF and read the new leaf.
+			 */
+			if (fp->f_seekp >= bsize * fp->f_num_leafs)
+				return (ENOENT);
+
+			rc = dnode_read(spa, &fp->f_dnode,
+					fp->f_seekp,
+					fp->f_zap_leaf,
+					bsize);
+			if (rc)
+				return (rc);
+		}
+
+		zc = &ZAP_LEAF_CHUNK(&zl, chunk);
+		fp->f_seekp++;
+		if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
+			goto fzap_next;
+
+		namelen = zc->l_entry.le_name_length;
+		if (namelen > sizeof(d->d_name))
+			namelen = sizeof(d->d_name);
+
+		/*
+		 * Paste the name back together.
+		 */
+		nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
+		p = d->d_name;
+		while (namelen > 0) {
+			int len;
+			len = namelen;
+			if (len > ZAP_LEAF_ARRAY_BYTES)
+				len = ZAP_LEAF_ARRAY_BYTES;
+			memcpy(p, nc->l_array.la_array, len);
+			p += len;
+			namelen -= len;
+			nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
+		}
+		d->d_name[sizeof(d->d_name) - 1] = 0;
+
+		/*
+		 * Assume the first eight bytes of the value are
+		 * a uint64_t.
+		 */
+		value = fzap_leaf_value(&zl, zc);
+
+		d->d_fileno = ZFS_DIRENT_OBJ(value);
+		d->d_type = ZFS_DIRENT_TYPE(value);
+		d->d_namlen = strlen(d->d_name);
+
+		return (0);
+	}
+}
+
+/*
+ * vdev read callback: read size bytes at byte offset `offset' of the
+ * descriptor stored in priv (an fd cast to a pointer by zfs_dev_init())
+ * into buf.  The vdev argument is not used.
+ *
+ * Returns 0 if the full amount was read, EIO if the seek fails or the
+ * read comes up short.
+ */
+static int
+vdev_read(vdev_t *vdev, void *priv, off_t offset, void *buf, size_t size)
+{
+	int fd;
+
+	fd = (uintptr_t) priv;
+	/* A failed seek would otherwise return data from the wrong place. */
+	if (lseek(fd, offset, SEEK_SET) == -1)
+		return (EIO);
+	if (read(fd, buf, size) != size)
+		return (EIO);
+	return (0);
+}
+
+/*
+ * Map a pool guid to its 'unit number' for zfs_dev_open(): the zero-based
+ * position of the pool in the zfs_pools list.  Returns -1 when no pool
+ * has that guid.
+ */
+int
+zfs_guid_to_unit(uint64_t guid)
+{
+	spa_t *pool;
+	int index = 0;
+
+	for (pool = STAILQ_FIRST(&zfs_pools); pool != NULL;
+	    pool = STAILQ_NEXT(pool, spa_link)) {
+		if (pool->spa_guid == guid)
+			return (index);
+		index++;
+	}
+	return (-1);
+}
+
+/*
+ * Open devname read-only and offer it to vdev_probe().
+ *
+ * If we find a vdev, the zfs code will eat the fd, otherwise we close
+ * it.  Returns -1 if the device could not be opened and 0 otherwise.
+ */
+static int
+zfs_dev_probe_name(const char *devname)
+{
+	int fd;
+
+	fd = open(devname, O_RDONLY);
+	if (fd == -1)
+		return (-1);
+	if (vdev_probe(vdev_read, (void*) (uintptr_t) fd, 0))
+		close(fd);
+	return (0);
+}
+
+/*
+ * Device init hook: open all the disks we can find and see if we can
+ * reconstruct ZFS pools from them.  Bogusly assumes that the disks are
+ * named diskN or diskNsM.  Slices 1-4 of a unit are only tried when the
+ * whole-disk device of that unit could be opened.  Always returns 0.
+ */
+static int
+zfs_dev_init(void) 
+{
+	char devname[512];
+	int unit, slice;
+
+	zfs_init();
+	for (unit = 0; unit < 32 /* XXX */; unit++) {
+		sprintf(devname, "disk%d:", unit);
+		if (zfs_dev_probe_name(devname) == -1)
+			continue;
+
+		for (slice = 1; slice <= 4; slice++) {
+			sprintf(devname, "disk%ds%d:", unit, slice);
+			(void)zfs_dev_probe_name(devname);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Print information about ZFS pools
+ *
+ * With verbose set, the work is delegated to spa_all_status().
+ * Otherwise one "zfsN:   <pool name>" line per pool is written through
+ * the pager, N being the pool's position in zfs_pools.
+ */
+static void
+zfs_dev_print(int verbose)
+{
+	spa_t *spa;
+	char line[80];
+	int unit;
+
+	if (verbose) {
+		spa_all_status();
+		return;
+	}
+	unit = 0;
+	STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
+		/*
+		 * Only the fixed-size prefix goes through line[].  The pool
+		 * name has no length bound here, so it is handed to the
+		 * pager directly rather than formatted into the buffer.
+		 */
+		sprintf(line, "    zfs%d:   ", unit);
+		pager_output(line);
+		pager_output(spa->spa_name);
+		pager_output("\n");
+		unit++;
+	}
+}
+
+/*
+ * Attempt to open the pool described by (dev) for use by (f).
+ *
+ * The single variadic argument is a struct devdesc *; only its d_unit is
+ * used, and the descriptor is freed here.  The unit selects the pool at
+ * that position in zfs_pools, which is stored in f->f_devdata.
+ * Returns 0 on success or ENXIO if there is no such pool.
+ */
+static int 
+zfs_dev_open(struct open_file *f, ...)
+{
+	va_list		ap;
+	struct devdesc	*desc;
+	int		wanted, pos;
+	spa_t		*pool;
+
+	va_start(ap, f);
+	desc = va_arg(ap, struct devdesc*);
+	va_end(ap);
+
+	/*
+	 * We mostly ignore the stuff that devopen sends us. For now,
+	 * use the unit to find a pool - later we will override the
+	 * devname parsing so that we can name a pool and a fs within
+	 * the pool.
+	 */
+	wanted = desc->d_unit;
+	free(desc);
+
+	/* Step through the list until the wanted index or the end is hit. */
+	pool = STAILQ_FIRST(&zfs_pools);
+	for (pos = 0; pool != NULL && pos != wanted; pos++)
+		pool = STAILQ_NEXT(pool, spa_link);
+	if (pool == NULL)
+		return (ENXIO);
+
+	f->f_devdata = pool;
+	return (0);
+}
+
+/*
+ * Device close hook: detach the pool pointer from f.  The pool itself is
+ * left untouched.  Always returns 0.
+ */
+static int 
+zfs_dev_close(struct open_file *f)
+{
+	f->f_devdata = NULL;
+
+	return (0);
+}
+
+/*
+ * Block I/O hook of the zfs pseudo-device.  Not implemented: every call
+ * fails with ENOSYS and the arguments are ignored.
+ */
+static int 
+zfs_dev_strategy(void *devdata, int rw, daddr_t dblk, size_t size, char *buf, size_t *rsize)
+{
+	(void)devdata;
+	(void)rw;
+	(void)dblk;
+	(void)size;
+	(void)buf;
+	(void)rsize;
+
+	return (ENOSYS);
+}
+
+/*
+ * Device switch entry for the "zfs" pseudo-device.  zfs_open() compares
+ * an open file's f_dev against the address of this object to decide
+ * whether the file lives on a ZFS pool.
+ */
+struct devsw zfs_dev = {
+	.dv_name = "zfs", 
+	.dv_type = DEVT_ZFS, 
+	.dv_init = zfs_dev_init,		/* probes disks for vdevs */
+	.dv_strategy = zfs_dev_strategy, 	/* always ENOSYS */
+	.dv_open = zfs_dev_open, 
+	.dv_close = zfs_dev_close, 
+	.dv_ioctl = noioctl,
+	.dv_print = zfs_dev_print,
+	.dv_cleanup = NULL
+};
diff --git a/sys/boot/zfs/zfsimpl.c b/sys/boot/zfs/zfsimpl.c
new file mode 100644
index 0000000..5bbc351
--- /dev/null
+++ b/sys/boot/zfs/zfsimpl.c
@@ -0,0 +1,1443 @@
+/*-
+ * Copyright (c) 2007 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ *	Stand-alone ZFS file reader.
+ */
+
+#include "zfsimpl.h"
+#include "zfssubr.c"
+
+/*
+ * List of all vdevs, chained through v_alllink.
+ */
+static vdev_list_t zfs_vdevs;
+
+/*
+ * List of all pools, chained through spa_link.
+ */
+static spa_list_t zfs_pools;
+
+/*
+ * File-scope working state.  The three buffers are allocated once, 128KB
+ * each, by zfs_init().  How each one is used is not visible in this part
+ * of the file; the notes below are inferred from the names.
+ */
+static uint64_t zfs_crc64_table[256];
+static char *zfs_decomp_buf;		/* presumably decompression scratch */
+static const dnode_phys_t *dnode_cache_obj = 0;	/* reset to 0 by zfs_close() */
+static uint64_t dnode_cache_bn;
+static char *dnode_cache_buf;
+static char *zap_scratch;
+
+/*
+ * Forward declarations.
+ */
+static int zio_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf, off_t offset);
+
+/*
+ * One-time set-up: empty the vdev and pool lists, allocate the three
+ * 128KB working buffers and call zfs_init_crc().  Allocation results are
+ * not checked.
+ */
+static void
+zfs_init(void)
+{
+	const int bufsize = 128*1024;
+
+	STAILQ_INIT(&zfs_vdevs);
+	STAILQ_INIT(&zfs_pools);
+
+	zfs_decomp_buf = malloc(bufsize);
+	dnode_cache_buf = malloc(bufsize);
+	zap_scratch = malloc(bufsize);
+
+	zfs_init_crc();
+}
+
+/*
+ * Decode one big-endian 32-bit signed integer from the XDR stream at
+ * *xdr into *ip and advance *xdr by four bytes.  Always returns 0.
+ *
+ * The bytes are assembled in an unsigned 32-bit value: shifting a
+ * promoted unsigned char of 0x80 or more left by 24 as a plain int
+ * would overflow the signed type.
+ */
+static int
+xdr_int(const unsigned char **xdr, int *ip)
+{
+	const unsigned char *p = *xdr;
+	uint32_t v;
+
+	v = ((uint32_t)p[0] << 24)
+		| ((uint32_t)p[1] << 16)
+		| ((uint32_t)p[2] << 8)
+		| ((uint32_t)p[3] << 0);
+	*ip = (int)v;
+	(*xdr) += 4;
+	return (0);
+}
+
+/*
+ * Decode one big-endian 32-bit unsigned integer from the XDR stream at
+ * *xdr into *ip and advance *xdr by four bytes.  Always returns 0.
+ *
+ * Each byte is widened to an unsigned 32-bit value before shifting so
+ * that a top byte of 0x80 or more is not shifted into the sign bit of
+ * an int.
+ */
+static int
+xdr_u_int(const unsigned char **xdr, u_int *ip)
+{
+	const unsigned char *p = *xdr;
+	uint32_t v;
+
+	v = ((uint32_t)p[0] << 24)
+		| ((uint32_t)p[1] << 16)
+		| ((uint32_t)p[2] << 8)
+		| ((uint32_t)p[3] << 0);
+	*ip = v;
+	(*xdr) += 4;
+	return (0);
+}
+
+/*
+ * Decode a 64-bit unsigned integer stored as two consecutive 32-bit
+ * words, most significant word first, advancing *xdr past both.
+ * Always returns 0.
+ */
+static int
+xdr_uint64_t(const unsigned char **xdr, uint64_t *lp)
+{
+	u_int upper, lower;
+	uint64_t value;
+
+	xdr_u_int(xdr, &upper);
+	xdr_u_int(xdr, &lower);
+
+	value = (uint64_t) upper;
+	value <<= 32;
+	value |= lower;
+	*lp = value;
+	return (0);
+}
+
+/*
+ * Look up the pair called `name' with data type `type' in an XDR-encoded
+ * nvlist.
+ *
+ * Layout as walked here: two header words (skipped), then a sequence of
+ * pairs, each starting with its encoded and decoded sizes and followed
+ * by name length, name (padded to a multiple of four bytes), type,
+ * element count and value.  A pair whose encoded or decoded size is zero
+ * ends the list.
+ *
+ * On a match the element count is stored through elementsp (if not
+ * NULL) and the value through valuep:
+ *   DATA_TYPE_UINT64                 - the decoded value (uint64_t *)
+ *   DATA_TYPE_STRING                 - pointer to the string bytes inside
+ *                                      the nvlist (const char **); the
+ *                                      length word is read and discarded
+ *   DATA_TYPE_NVLIST / _NVLIST_ARRAY - pointer to the embedded data
+ *                                      (const unsigned char **)
+ *
+ * Returns 0 on success, EIO if no pair matches or the type is not one of
+ * the above.
+ */
+static int
+nvlist_find(const unsigned char *nvlist, const char *name, int type,
+	    int* elementsp, void *valuep)
+{
+	const unsigned char *p, *pair;
+	int junk;
+	int encoded_size, decoded_size;
+
+	p = nvlist;
+	xdr_int(&p, &junk);
+	xdr_int(&p, &junk);
+
+	pair = p;
+	xdr_int(&p, &encoded_size);
+	xdr_int(&p, &decoded_size);
+	while (encoded_size && decoded_size) {
+		int namelen, pairtype, elements;
+		const char *pairname;
+
+		xdr_int(&p, &namelen);
+		pairname = (const char*) p;
+		p += roundup(namelen, 4);
+		xdr_int(&p, &pairtype);
+
+		/*
+		 * The names must have the same length as well as the same
+		 * leading bytes; comparing only namelen bytes would let a
+		 * pair called "foo" satisfy a lookup for "foobar".
+		 */
+		if (type == pairtype
+		    && !memcmp(name, pairname, namelen)
+		    && name[namelen] == '\0') {
+			xdr_int(&p, &elements);
+			if (elementsp)
+				*elementsp = elements;
+			if (type == DATA_TYPE_UINT64) {
+				xdr_uint64_t(&p, (uint64_t *) valuep);
+				return (0);
+			} else if (type == DATA_TYPE_STRING) {
+				int len;
+				xdr_int(&p, &len);
+				(*(const char**) valuep) = (const char*) p;
+				return (0);
+			} else if (type == DATA_TYPE_NVLIST
+				   || type == DATA_TYPE_NVLIST_ARRAY) {
+				(*(const unsigned char**) valuep) =
+					 (const unsigned char*) p;
+				return (0);
+			} else {
+				return (EIO);
+			}
+		} else {
+			/*
+			 * Not the pair we are looking for, skip to the next one.
+			 */
+			p = pair + encoded_size;
+		}
+
+		pair = p;
+		xdr_int(&p, &encoded_size);
+		xdr_int(&p, &decoded_size);
+	}
+
+	return (EIO);
+}
+
+/*
+ * Return the next nvlist in an nvlist array.
+ *
+ * Skips the two header words of the nvlist at `nvlist', then hops from
+ * pair to pair using each pair's encoded size until a pair with a zero
+ * encoded or decoded size is read.  The result points just past that
+ * terminating pair of size words.
+ */
+static const unsigned char *
+nvlist_next(const unsigned char *nvlist)
+{
+	const unsigned char *cursor, *entry;
+	int unused;
+	int enc_size, dec_size;
+
+	cursor = nvlist;
+	xdr_int(&cursor, &unused);
+	xdr_int(&cursor, &unused);
+
+	for (;;) {
+		entry = cursor;
+		xdr_int(&cursor, &enc_size);
+		xdr_int(&cursor, &dec_size);
+		if (enc_size == 0 || dec_size == 0)
+			break;
+		cursor = entry + enc_size;
+	}
+
+	return (cursor);
+}
+
+#ifdef TEST
+
+/*
+ * Debugging aid (TEST builds only): dump an XDR-encoded nvlist to
+ * stdout, one pair per line, indented by `indent' spaces.  Values are
+ * printed for uint64 and string pairs; nested nvlists and nvlist arrays
+ * are printed recursively one level deeper.  Returns a pointer just past
+ * the list terminator, which is what lets the array case step from one
+ * element to the next.
+ *
+ * Type codes outside the typenames table are printed with a placeholder
+ * instead of indexing past the table, and names and strings are printed
+ * with an explicit length because the encoding does not guarantee a
+ * terminating NUL.
+ */
+static const unsigned char *
+nvlist_print(const unsigned char *nvlist, unsigned int indent)
+{
+	static const char* typenames[] = {
+		"DATA_TYPE_UNKNOWN",
+		"DATA_TYPE_BOOLEAN",
+		"DATA_TYPE_BYTE",
+		"DATA_TYPE_INT16",
+		"DATA_TYPE_UINT16",
+		"DATA_TYPE_INT32",
+		"DATA_TYPE_UINT32",
+		"DATA_TYPE_INT64",
+		"DATA_TYPE_UINT64",
+		"DATA_TYPE_STRING",
+		"DATA_TYPE_BYTE_ARRAY",
+		"DATA_TYPE_INT16_ARRAY",
+		"DATA_TYPE_UINT16_ARRAY",
+		"DATA_TYPE_INT32_ARRAY",
+		"DATA_TYPE_UINT32_ARRAY",
+		"DATA_TYPE_INT64_ARRAY",
+		"DATA_TYPE_UINT64_ARRAY",
+		"DATA_TYPE_STRING_ARRAY",
+		"DATA_TYPE_HRTIME",
+		"DATA_TYPE_NVLIST",
+		"DATA_TYPE_NVLIST_ARRAY",
+		"DATA_TYPE_BOOLEAN_VALUE",
+		"DATA_TYPE_INT8",
+		"DATA_TYPE_UINT8",
+		"DATA_TYPE_BOOLEAN_ARRAY",
+		"DATA_TYPE_INT8_ARRAY",
+		"DATA_TYPE_UINT8_ARRAY"
+	};
+	const int ntypenames = sizeof(typenames) / sizeof(typenames[0]);
+
+	unsigned int i;
+	int j;
+	const unsigned char *p, *pair;
+	int junk;
+	int encoded_size, decoded_size;
+
+	p = nvlist;
+	xdr_int(&p, &junk);
+	xdr_int(&p, &junk);
+
+	pair = p;
+	xdr_int(&p, &encoded_size);
+	xdr_int(&p, &decoded_size);
+	while (encoded_size && decoded_size) {
+		int namelen, pairtype, elements;
+		const char *pairname;
+		const char *tname;
+
+		xdr_int(&p, &namelen);
+		pairname = (const char*) p;
+		p += roundup(namelen, 4);
+		xdr_int(&p, &pairtype);
+
+		if (pairtype >= 0 && pairtype < ntypenames)
+			tname = typenames[pairtype];
+		else
+			tname = "DATA_TYPE_???";
+
+		for (i = 0; i < indent; i++)
+			printf(" ");
+		printf("%s %.*s", tname, namelen, pairname);
+
+		xdr_int(&p, &elements);
+		switch (pairtype) {
+		case DATA_TYPE_UINT64: {
+			uint64_t val;
+			xdr_uint64_t(&p, &val);
+			printf(" = 0x%llx\n", (unsigned long long) val);
+			break;
+		}
+
+		case DATA_TYPE_STRING: {
+			int len;
+			xdr_int(&p, &len);
+			printf(" = \"%.*s\"\n", len, (const char *) p);
+			break;
+		}
+
+		case DATA_TYPE_NVLIST:
+			printf("\n");
+			nvlist_print(p, indent + 1);
+			break;
+
+		case DATA_TYPE_NVLIST_ARRAY:
+			for (j = 0; j < elements; j++) {
+				printf("[%d]\n", j);
+				p = nvlist_print(p, indent + 1);
+				if (j != elements - 1) {
+					for (i = 0; i < indent; i++)
+						printf(" ");
+					printf("%s %.*s", tname, namelen,
+					    pairname);
+				}
+			}
+			break;
+
+		default:
+			printf("\n");
+		}
+
+		p = pair + encoded_size;
+
+		pair = p;
+		xdr_int(&p, &encoded_size);
+		xdr_int(&p, &decoded_size);
+	}
+
+	return p;
+}
+
+#endif
+
+/*
+ * Read callback for mirror vdevs: offer the request to each healthy
+ * child in list order and stop at the first one that succeeds.
+ * Returns 0 on success, the error of the last child tried, or EIO
+ * when no child is healthy.
+ */
+static int
+vdev_mirror_read(vdev_t *vdev, void *priv, off_t offset, void *buf, size_t size)
+{
+	vdev_t *child;
+	int error = EIO;
+
+	STAILQ_FOREACH(child, &vdev->v_children, v_childlink) {
+		if (child->v_state == VDEV_STATE_HEALTHY) {
+			error = child->v_read(child, child->v_read_priv,
+			    offset, buf, size);
+			if (error == 0)
+				break;
+		}
+	}
+
+	return (error);
+}
+
+/*
+ * Look up a vdev by guid on the global zfs_vdevs list.
+ * Returns the matching vdev, or a null pointer if there is none.
+ */
+static vdev_t *
+vdev_find(uint64_t guid)
+{
+	vdev_t *candidate;
+
+	STAILQ_FOREACH(candidate, &zfs_vdevs, v_alllink) {
+		if (candidate->v_guid == guid)
+			return (candidate);
+	}
+
+	return (0);
+}
+
+/*
+ * Allocate a zeroed vdev with the given guid and read callback, mark
+ * it VDEV_STATE_OFFLINE and append it to the global zfs_vdevs list.
+ * Returns the new vdev, or a null pointer if the allocation fails
+ * (in which case nothing is added to the list).
+ */
+static vdev_t *
+vdev_create(uint64_t guid, vdev_read_t *read, void *read_priv)
+{
+	vdev_t *vdev;
+
+	vdev = malloc(sizeof(vdev_t));
+	if (!vdev)
+		return (0);
+	memset(vdev, 0, sizeof(vdev_t));
+	STAILQ_INIT(&vdev->v_children);
+	vdev->v_guid = guid;
+	vdev->v_state = VDEV_STATE_OFFLINE;
+	vdev->v_read = read;
+	vdev->v_read_priv = read_priv;
+	STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
+
+	return (vdev);
+}
+
+/*
+ * Build the in-core vdev described by 'nvlist', recursing into its
+ * children if it has any.  Only "disk" and "mirror" vdevs are
+ * accepted.  A vdev whose guid is already known is returned as is,
+ * without looking at the rest of the nvlist.
+ *
+ * On success returns 0 and, if vdevp is not null, stores the vdev
+ * there.  Returns ENOENT if the guid, id or type is missing, EIO for
+ * an unsupported vdev type, ENOMEM if the vdev cannot be allocated,
+ * or the error from a failing child.
+ *
+ * NOTE(review): when a child fails, the partially built parent stays
+ * on the global vdev list; the strdup() results are not checked.
+ */
+static int
+vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t **vdevp)
+{
+	int rc;
+	uint64_t guid, id;
+	const char *type;
+	const char *path;
+	vdev_t *vdev, *kid;
+	const unsigned char *kids;
+	int nkids, i;
+	int is_mirror;
+
+	if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID,
+			DATA_TYPE_UINT64, 0, &guid)
+	    || nvlist_find(nvlist, ZPOOL_CONFIG_ID,
+			   DATA_TYPE_UINT64, 0, &id)
+	    || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE,
+			   DATA_TYPE_STRING, 0, &type)) {
+		printf("ZFS: can't find vdev details\n");
+		return (ENOENT);
+	}
+
+	/*
+	 * Assume that if we've seen this vdev tree before, this one
+	 * will be identical.
+	 */
+	vdev = vdev_find(guid);
+	if (vdev) {
+		if (vdevp)
+			*vdevp = vdev;
+		return (0);
+	}
+
+	is_mirror = !strcmp(type, VDEV_TYPE_MIRROR);
+	if (!is_mirror && strcmp(type, VDEV_TYPE_DISK)) {
+		printf("ZFS: can only boot from disk or mirror vdevs\n");
+		return (EIO);
+	}
+
+	/*
+	 * Mirrors read through their children; a disk gets its read
+	 * callback later, once the device itself has been probed.
+	 */
+	if (is_mirror)
+		vdev = vdev_create(guid, vdev_mirror_read, 0);
+	else
+		vdev = vdev_create(guid, 0, 0);
+	if (!vdev)
+		return (ENOMEM);
+
+	/*
+	 * Name the vdev after its device path (minus any "/dev/"
+	 * prefix) or, if it has no path, after its type.
+	 */
+	if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
+			DATA_TYPE_STRING, 0, &path) == 0) {
+		if (strlen(path) > 5 && !memcmp(path, "/dev/", 5))
+			path += 5;
+		vdev->v_name = strdup(path);
+	} else {
+		vdev->v_name = strdup(type);
+	}
+	vdev->v_id = id;
+	rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN,
+			 DATA_TYPE_NVLIST_ARRAY, &nkids, &kids);
+	/*
+	 * Its ok if we don't have any kids.
+	 */
+	if (rc == 0) {
+		for (i = 0; i < nkids; i++) {
+			rc = vdev_init_from_nvlist(kids, &kid);
+			if (rc)
+				return (rc);
+			STAILQ_INSERT_TAIL(&vdev->v_children, kid, v_childlink);
+			kids = nvlist_next(kids);
+		}
+	}
+
+	if (vdevp)
+		*vdevp = vdev;
+	return (0);
+}
+
+/*
+ * Recompute the state of a vdev from the states of its children.
+ * A vdev without children is left untouched.
+ */
+static void
+vdev_set_state(vdev_t *vdev)
+{
+	vdev_t *child;
+	int healthy = 0;
+	int unhealthy = 0;
+
+	/*
+	 * Anything that has children is assumed to be a mirror: it is
+	 * healthy when every child is healthy, degraded (but still
+	 * usable) when at least one child is healthy, and offline
+	 * when none is.
+	 */
+	if (!STAILQ_FIRST(&vdev->v_children))
+		return;
+
+	STAILQ_FOREACH(child, &vdev->v_children, v_childlink) {
+		if (child->v_state == VDEV_STATE_HEALTHY)
+			healthy++;
+		else
+			unhealthy++;
+	}
+
+	if (healthy == 0)
+		vdev->v_state = VDEV_STATE_OFFLINE;
+	else if (unhealthy == 0)
+		vdev->v_state = VDEV_STATE_HEALTHY;
+	else
+		vdev->v_state = VDEV_STATE_DEGRADED;
+}
+
+/*
+ * Look up a pool by guid on the global zfs_pools list.
+ * Returns the matching pool, or a null pointer if there is none.
+ */
+static spa_t *
+spa_find_by_guid(uint64_t guid)
+{
+	spa_t *pool;
+
+	STAILQ_FOREACH(pool, &zfs_pools, spa_link) {
+		if (pool->spa_guid == guid)
+			return (pool);
+	}
+
+	return (0);
+}
+
+#ifdef BOOT2
+
+/*
+ * Look up a pool by name on the global zfs_pools list.
+ * Returns the first pool whose name compares equal, or a null
+ * pointer if there is none.
+ */
+static spa_t *
+spa_find_by_name(const char *name)
+{
+	spa_t *pool;
+
+	STAILQ_FOREACH(pool, &zfs_pools, spa_link) {
+		if (strcmp(pool->spa_name, name) == 0)
+			return (pool);
+	}
+
+	return (0);
+}
+
+#endif
+
+/*
+ * Allocate a zeroed pool descriptor with the given guid and an empty
+ * vdev list, and append it to the global zfs_pools list.
+ * Returns the new pool, or a null pointer if the allocation fails
+ * (in which case nothing is added to the list).
+ */
+static spa_t *
+spa_create(uint64_t guid)
+{
+	spa_t *spa;
+
+	spa = malloc(sizeof(spa_t));
+	if (!spa)
+		return (0);
+	memset(spa, 0, sizeof(spa_t));
+	STAILQ_INIT(&spa->spa_vdevs);
+	spa->spa_guid = guid;
+	STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
+
+	return (spa);
+}
+
+/*
+ * Map a vdev state to the label shown in status output.  The table
+ * is indexed by the numeric value of the state; any value outside
+ * the table yields "UNKNOWN" instead of reading past its end.
+ *
+ * NOTE(review): the table order presumably mirrors the vdev_state_t
+ * declaration -- confirm against the header.
+ */
+static const char *
+state_name(vdev_state_t state)
+{
+	static const char* names[] = {
+		"UNKNOWN",
+		"CLOSED",
+		"OFFLINE",
+		"CANT_OPEN",
+		"DEGRADED",
+		"ONLINE"
+	};
+
+	if ((unsigned int) state >= sizeof(names) / sizeof(names[0]))
+		return ("UNKNOWN");
+	return names[state];
+}
+
+#ifdef BOOT2
+
+#define pager_printf printf
+
+#else
+
+/*
+ * printf-style front end for pager_output(), used when BOOT2 is not
+ * defined: the message is formatted into a local buffer and the
+ * result is handed to pager_output().
+ *
+ * NOTE(review): vsprintf() does not bound its output, so a formatted
+ * message longer than sizeof(line) - 1 characters overruns the
+ * buffer.  Callers in this file pass pool and vdev names here --
+ * confirm those are always short enough, or switch to a bounded
+ * formatter if this runtime provides one.
+ */
+static void
+pager_printf(const char *fmt, ...)
+{
+	char line[80];
+	va_list args;
+
+	va_start(args, fmt);
+	vsprintf(line, fmt, args);
+	va_end(args);
+	pager_output(line);
+}
+
+#endif
+
+#define STATUS_FORMAT	"        %-16s %-10s\n"
+
+/*
+ * Print one row of the status table: 'name' prefixed with two spaces
+ * per indent level, followed by the label for 'state'.
+ *
+ * NOTE(review): the label is assembled with strcat() and no length
+ * check, so indent and name together must fit the local buffer.
+ */
+static void
+print_state(int indent, const char *name, vdev_state_t state)
+{
+	char label[512];
+	int depth;
+
+	label[0] = 0;
+	for (depth = indent; depth > 0; depth--)
+		strcat(label, "  ");
+	strcat(label, name);
+
+	pager_printf(STATUS_FORMAT, label, state_name(state));
+}
+
+/*
+ * Print the status row for a vdev, then recursively the rows for
+ * all of its children, one indent level deeper.
+ */
+static void
+vdev_status(vdev_t *vdev, int indent)
+{
+	vdev_t *child;
+
+	print_state(indent, vdev->v_name, vdev->v_state);
+
+	STAILQ_FOREACH(child, &vdev->v_children, v_childlink)
+		vdev_status(child, indent + 1);
+}
+
+/*
+ * Print the status report for one pool: a header, a summary row for
+ * the pool itself and the tree of its vdevs.
+ *
+ * The pool-level state is derived from its top-level vdevs: healthy
+ * when at least one exists and all are healthy, degraded when at
+ * least one is healthy or degraded, closed otherwise.
+ */
+static void
+spa_status(spa_t *spa)
+{
+	vdev_t *top;
+	int healthy = 0, degraded = 0, broken = 0;
+	vdev_state_t summary;
+
+	pager_printf("  pool: %s\n", spa->spa_name);
+	pager_printf("config:\n\n");
+	pager_printf(STATUS_FORMAT, "NAME", "STATE");
+
+	STAILQ_FOREACH(top, &spa->spa_vdevs, v_childlink) {
+		switch (top->v_state) {
+		case VDEV_STATE_HEALTHY:
+			healthy++;
+			break;
+		case VDEV_STATE_DEGRADED:
+			degraded++;
+			break;
+		default:
+			broken++;
+			break;
+		}
+	}
+
+	if (healthy > 0 && degraded == 0 && broken == 0)
+		summary = VDEV_STATE_HEALTHY;
+	else if (healthy > 0 || degraded > 0)
+		summary = VDEV_STATE_DEGRADED;
+	else
+		summary = VDEV_STATE_CLOSED;
+
+	print_state(0, spa->spa_name, summary);
+	STAILQ_FOREACH(top, &spa->spa_vdevs, v_childlink)
+		vdev_status(top, 1);
+}
+
+/*
+ * Print the status report of every known pool, with a blank line
+ * between consecutive reports.
+ */
+static void
+spa_all_status(void)
+{
+	spa_t *pool;
+	int printed = 0;
+
+	STAILQ_FOREACH(pool, &zfs_pools, spa_link) {
+		if (printed)
+			pager_printf("\n");
+		spa_status(pool);
+		printed = 1;
+	}
+}
+
+/*
+ * Probe a device for ZFS pool membership using the supplied read
+ * callback.  The vdev label is read and checked (encoding, version,
+ * pool state), the pool and its vdev tree are registered in core,
+ * the probed vdev is bound to the read callback and marked healthy,
+ * and the label's uberblock slots are scanned so that the pool keeps
+ * the most recent uberblock seen so far.
+ *
+ * Returns 0 and, if spap is not null, stores the pool there.  Any
+ * failure returns non-zero (EIO in most cases, or the error reported
+ * while building the vdev tree).
+ */
+static int
+vdev_probe(vdev_read_t *read, void *read_priv, spa_t **spap)
+{
+	vdev_t vtmp;
+	vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
+	spa_t *spa;
+	vdev_t *vdev, *top_vdev, *pool_vdev;
+	off_t off;
+	blkptr_t bp;
+	const unsigned char *nvlist;
+	uint64_t val;
+	uint64_t guid;
+	uint64_t pool_txg, pool_guid;
+	const char *pool_name;
+	const unsigned char *vdevs;
+	int i, rc;
+	char upbuf[1024];
+	const struct uberblock *up;
+
+	/*
+	 * Load the vdev label and figure out which
+	 * uberblock is most current.
+	 */
+	memset(&vtmp, 0, sizeof(vtmp));
+	vtmp.v_read = read;
+	vtmp.v_read_priv = read_priv;
+	off = offsetof(vdev_label_t, vl_vdev_phys);
+	BP_ZERO(&bp);
+	BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
+	BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
+	BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
+	BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
+	ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
+	if (zio_read_phys(&vtmp, &bp, vdev_label, off))
+		return (EIO);
+
+	if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) {
+		return (EIO);
+	}
+
+	nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4;
+
+	if (nvlist_find(nvlist,
+			ZPOOL_CONFIG_VERSION,
+			DATA_TYPE_UINT64, 0, &val)) {
+		return (EIO);
+	}
+
+	if (val != ZFS_VERSION) {
+		printf("ZFS: unsupported ZFS version %d\n", (int) val);
+		return (EIO);
+	}
+
+	if (nvlist_find(nvlist,
+			ZPOOL_CONFIG_POOL_STATE,
+			DATA_TYPE_UINT64, 0, &val)) {
+		return (EIO);
+	}
+
+	if (val != POOL_STATE_ACTIVE) {
+		/*
+		 * Don't print a message here. If we happen to reboot
+		 * while there is an exported pool around, we don't
+		 * need a cascade of confusing messages during boot.
+		 */
+		return (EIO);
+	}
+
+	if (nvlist_find(nvlist,
+			ZPOOL_CONFIG_POOL_TXG,
+			DATA_TYPE_UINT64, 0, &pool_txg)
+	    || nvlist_find(nvlist,
+			   ZPOOL_CONFIG_POOL_GUID,
+			   DATA_TYPE_UINT64, 0, &pool_guid)
+	    || nvlist_find(nvlist,
+			   ZPOOL_CONFIG_POOL_NAME,
+			   DATA_TYPE_STRING, 0, &pool_name)) {
+		printf("ZFS: can't find pool details\n");
+		return (EIO);
+	}
+
+	/*
+	 * Create the pool if this is the first time we've seen it.
+	 */
+	spa = spa_find_by_guid(pool_guid);
+	if (!spa) {
+		spa = spa_create(pool_guid);
+		if (!spa)
+			return (EIO);
+		spa->spa_name = strdup(pool_name);
+	}
+	if (pool_txg > spa->spa_txg)
+		spa->spa_txg = pool_txg;
+
+	/*
+	 * Get the vdev tree and create our in-core copy of it.
+	 * If we already have a healthy vdev with this guid, this must
+	 * be some kind of alias (overlapping slices, dangerously dedicated
+	 * disks etc).
+	 */
+	if (nvlist_find(nvlist,
+			ZPOOL_CONFIG_GUID,
+			DATA_TYPE_UINT64, 0, &guid)) {
+		return (EIO);
+	}
+	vdev = vdev_find(guid);
+	if (vdev && vdev->v_state == VDEV_STATE_HEALTHY) {
+		return (EIO);
+	}
+
+	if (nvlist_find(nvlist,
+			ZPOOL_CONFIG_VDEV_TREE,
+			DATA_TYPE_NVLIST, 0, &vdevs)) {
+		return (EIO);
+	}
+
+	/*
+	 * top_vdev is only set when the tree could be built, so a
+	 * failure here must not fall through to the code below.
+	 */
+	top_vdev = 0;
+	rc = vdev_init_from_nvlist(vdevs, &top_vdev);
+	if (rc)
+		return (rc);
+
+	/*
+	 * Add the toplevel vdev to the pool if its not already there.
+	 */
+	STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
+		if (top_vdev == pool_vdev)
+			break;
+	if (!pool_vdev && top_vdev)
+		STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
+
+	/*
+	 * We should already have created an incomplete vdev for this
+	 * vdev. Find it and initialise it with our read proc.
+	 */
+	vdev = vdev_find(guid);
+	if (vdev) {
+		vdev->v_read = read;
+		vdev->v_read_priv = read_priv;
+		vdev->v_state = VDEV_STATE_HEALTHY;
+	} else {
+		printf("ZFS: inconsistent nvlist contents\n");
+		return (EIO);
+	}
+
+	/*
+	 * Re-evaluate top-level vdev state.
+	 */
+	vdev_set_state(top_vdev);
+
+	/*
+	 * Ok, we are happy with the pool so far. Lets find
+	 * the best uberblock and then we can actually access
+	 * the contents of the pool.
+	 */
+	for (i = 0;
+	     i < VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT;
+	     i++) {
+		off = offsetof(vdev_label_t, vl_uberblock);
+		off += i << UBERBLOCK_SHIFT;
+		BP_ZERO(&bp);
+		DVA_SET_OFFSET(&bp.blk_dva[0], off);
+		BP_SET_LSIZE(&bp, 1 << UBERBLOCK_SHIFT);
+		BP_SET_PSIZE(&bp, 1 << UBERBLOCK_SHIFT);
+		BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
+		BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
+		ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
+		if (zio_read_phys(vdev, &bp, upbuf, off))
+			continue;
+
+		/*
+		 * Skip slots that are not uberblocks or that are older
+		 * than the pool txg; among the rest keep the highest
+		 * txg, breaking ties by timestamp.
+		 */
+		up = (const struct uberblock *) upbuf;
+		if (up->ub_magic != UBERBLOCK_MAGIC)
+			continue;
+		if (up->ub_txg < spa->spa_txg)
+			continue;
+		if (up->ub_txg > spa->spa_uberblock.ub_txg) {
+			spa->spa_uberblock = *up;
+		} else if (up->ub_txg == spa->spa_uberblock.ub_txg) {
+			if (up->ub_timestamp > spa->spa_uberblock.ub_timestamp)
+				spa->spa_uberblock = *up;
+		}
+	}
+
+	if (spap)
+		*spap = spa;
+	return (0);
+}
+
+/*
+ * Return the base-2 logarithm of n if n has exactly one bit set
+ * within its low 32 bits, or -1 otherwise (including n == 0).
+ * The comparison is done on unsigned values so that testing bit 31
+ * does not shift a 1 into the sign bit of a signed int.
+ */
+static int
+ilog2(int n)
+{
+	int v;
+
+	for (v = 0; v < 32; v++)
+		if ((unsigned int) n == (1U << v))
+			return v;
+	return -1;
+}
+
+/*
+ * Read the block described by 'bp' from 'vdev' at byte 'offset' into
+ * 'buf'.  The physical size is read and checksummed; a compressed
+ * block is staged in zfs_decomp_buf and then decompressed into 'buf'.
+ * Returns 0 on success, the read callback's error, or EIO on a
+ * checksum or decompression failure.
+ */
+static int
+zio_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf, off_t offset)
+{
+	int compression = BP_GET_COMPRESS(bp);
+	size_t lsize = BP_GET_LSIZE(bp);
+	size_t psize = BP_GET_PSIZE(bp);
+	int error;
+
+	if (compression == ZIO_COMPRESS_OFF) {
+		/* Uncompressed: read straight into the caller's buffer. */
+		error = vdev->v_read(vdev, vdev->v_read_priv, offset, buf, psize);
+		if (error)
+			return (error);
+		if (zio_checksum_error(bp, buf))
+			return (EIO);
+		return (0);
+	}
+
+	error = vdev->v_read(vdev, vdev->v_read_priv, offset, zfs_decomp_buf, psize);
+	if (error)
+		return (error);
+	if (zio_checksum_error(bp, zfs_decomp_buf))
+		return (EIO);
+	if (zio_decompress_data(compression, zfs_decomp_buf, psize,
+		buf, lsize))
+		return (EIO);
+	return (0);
+}
+
+/*
+ * Read the block described by 'bp' into 'buf', trying each of its
+ * DVAs in order until one can be read successfully.  Empty DVAs and
+ * DVAs naming a vdev that is not in the pool, or that has no read
+ * callback, are skipped.  Returns 0 on success, or EIO (after
+ * printing a message) if no copy could be read.
+ */
+static int
+zio_read(spa_t *spa, const blkptr_t *bp, void *buf)
+{
+	const dva_t *dva;
+	vdev_t *vdev;
+	off_t offset;
+	int vdevid;
+	int copy;
+
+	for (copy = 0; copy < SPA_DVAS_PER_BP; copy++) {
+		dva = &bp->blk_dva[copy];
+		if (dva->dva_word[0] == 0 && dva->dva_word[1] == 0)
+			continue;
+
+		vdevid = DVA_GET_VDEV(dva);
+		offset = DVA_GET_OFFSET(dva) + VDEV_LABEL_START_SIZE;
+		STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
+			if (vdev->v_id == vdevid)
+				break;
+		}
+
+		if (vdev && vdev->v_read
+		    && zio_read_phys(vdev, bp, buf, offset) == 0)
+			return (0);
+	}
+
+	printf("ZFS: i/o error - all block copies unavailable\n");
+	return (EIO);
+}
+
+/*
+ * Copy 'buflen' bytes starting at byte 'offset' of the object
+ * described by 'dnode' into 'buf'.  Data is fetched one block at a
+ * time through the block pointer tree into dnode_cache_buf, which
+ * doubles as a one-block cache keyed by (dnode pointer, block
+ * number).  Returns 0 on success, EIO if the object has a zero block
+ * size or the offset lies beyond its last block, or the error from
+ * zio_read().
+ */
+static int
+dnode_read(spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf, size_t buflen)
+{
+	int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
+	int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+	int nlevels = dnode->dn_nlevels;
+	int i, rc;
+
+	/*
+	 * A zero block size would make the divisions below trap.
+	 */
+	if (bsize == 0)
+		return (EIO);
+
+	/*
+	 * We truncate the offset to 32bits, mainly so that I don't
+	 * have to find a copy of __divdi3 to put into the bootstrap.
+	 * I don't think the bootstrap needs to access anything bigger
+	 * than 2G anyway. Note that block addresses are still 64bit
+	 * so it doesn't affect the possible size of the media.
+	 * We still use 64bit block numbers so that the bitshifts
+	 * work correctly. Note: bsize may not be a power of two here.
+	 */
+	while (buflen > 0) {
+		uint64_t bn = ((int) offset) / bsize;
+		int boff = ((int) offset) % bsize;
+		int ibn;
+		const blkptr_t *indbp;
+		blkptr_t bp;
+
+		if (bn > dnode->dn_maxblkid)
+			return (EIO);
+
+		if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
+			goto cached;
+
+		/*
+		 * The walk below overwrites the cache buffer, so drop
+		 * the cache tag first; otherwise a failed read would
+		 * leave the old tag describing clobbered contents.
+		 */
+		dnode_cache_obj = 0;
+
+		indbp = dnode->dn_blkptr;
+		for (i = 0; i < nlevels; i++) {
+			/*
+			 * Copy the bp from the indirect array so that
+			 * we can re-use the scratch buffer for multi-level
+			 * objects.
+			 */
+			ibn = bn >> ((nlevels - i - 1) * ibshift);
+			ibn &= ((1 << ibshift) - 1);
+			bp = indbp[ibn];
+			rc = zio_read(spa, &bp, dnode_cache_buf);
+			if (rc)
+				return (rc);
+			indbp = (const blkptr_t *) dnode_cache_buf;
+		}
+		dnode_cache_obj = dnode;
+		dnode_cache_bn = bn;
+	cached:
+
+		/*
+		 * The buffer contains our data block. Copy what we
+		 * need from it and loop.
+		 */
+		i = bsize - boff;
+		if (i > buflen) i = buflen;
+		memcpy(buf, &dnode_cache_buf[boff], i);
+		buf = ((char*) buf) + i;
+		offset += i;
+		buflen -= i;
+	}
+
+	return (0);
+}
+
+/*
+ * Look up 'name' in a microzap directory whose contents are already
+ * in the zap scratch buffer.  On a match the entry's value is stored
+ * in *value and 0 is returned; otherwise ENOENT.
+ */
+static int
+mzap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
+{
+	const mzap_phys_t *mz = (const mzap_phys_t *) zap_scratch;
+	const mzap_ent_phys_t *ent;
+	size_t blksize;
+	int nents, idx;
+
+	/*
+	 * A microzap occupies exactly one block, so the number of
+	 * entry slots follows from the block size.
+	 */
+	blksize = dnode->dn_datablkszsec * 512;
+	nents = blksize / MZAP_ENT_LEN - 1;
+
+	for (idx = 0; idx < nents; idx++) {
+		ent = &mz->mz_chunk[idx];
+		if (strcmp(ent->mze_name, name) == 0) {
+			*value = ent->mze_value;
+			return (0);
+		}
+	}
+
+	return (ENOENT);
+}
+
+/*
+ * Compare 'name' with the name stored for zap leaf entry 'zc'.  The
+ * stored name is spread over a chain of array chunks; each piece is
+ * compared against the matching part of 'name'.  Returns non-zero if
+ * every piece matches, zero at the first difference.
+ */
+static int
+fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *name)
+{
+	size_t remaining = zc->l_entry.le_name_length;
+	const zap_leaf_chunk_t *chunk;
+	const char *cursor = name;
+	size_t piece;
+
+	chunk = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
+	for (; remaining > 0; remaining -= piece) {
+		piece = remaining;
+		if (piece > ZAP_LEAF_ARRAY_BYTES)
+			piece = ZAP_LEAF_ARRAY_BYTES;
+		if (memcmp(cursor, chunk->l_array.la_array, piece) != 0)
+			return (0);
+		cursor += piece;
+		chunk = &ZAP_LEAF_CHUNK(zl, chunk->l_array.la_next);
+	}
+
+	return 1;
+}
+
+/*
+ * Return the value of zap leaf entry 'zc' as a uint64_t, assembled
+ * from the first eight bytes of its value chunk, most significant
+ * byte first.
+ */
+static uint64_t
+fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
+{
+	const zap_leaf_chunk_t *vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
+	const uint8_t *bytes = vc->l_array.la_array;
+	uint64_t value = 0;
+	int n;
+
+	for (n = 0; n < 8; n++) {
+		value <<= 8;
+		value |= bytes[n];
+	}
+
+	return value;
+}
+
+/*
+ * Look up 'name' in a fatzap directory whose header block is already
+ * in the zap scratch buffer.  The name is hashed, the pointer table
+ * selects a leaf block, and the hash chain inside that leaf is
+ * searched for an entry with the same hash and the same name.
+ *
+ * On a match the entry's value is stored in *value and 0 is
+ * returned.  Returns ENOENT if there is no such entry, EIO if the
+ * header or leaf looks invalid, or the error from dnode_read().
+ */
+static int
+fzap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
+{
+	int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+	zap_phys_t zh = *(zap_phys_t *) zap_scratch;
+	fat_zap_t z;
+	zap_leaf_t zl;
+	zap_leaf_chunk_t *zc;
+	uint64_t *ptrtbl;
+	uint64_t hash;
+	off_t off;
+	int shift, h, n;
+	int rc;
+
+	if (zh.zap_magic != ZAP_MAGIC)
+		return (EIO);
+
+	/*
+	 * ilog2() reports -1 for a block size that is not a power of
+	 * two; such a value must not be used as a shift count.
+	 */
+	z.zap_block_shift = ilog2(bsize);
+	if (z.zap_block_shift < 0)
+		return (EIO);
+	z.zap_phys = (zap_phys_t *) zap_scratch;
+
+	/*
+	 * Figure out where the pointer table is and read it in if necessary.
+	 */
+	if (zh.zap_ptrtbl.zt_blk) {
+		rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize,
+			       zap_scratch, bsize);
+		if (rc)
+			return (rc);
+		ptrtbl = (uint64_t *) zap_scratch;
+	} else {
+		ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0);
+	}
+
+	hash = zap_hash(zh.zap_salt, name);
+
+	zl.l_bs = z.zap_block_shift;
+
+	/*
+	 * The pointer table lives in the scratch buffer, so pick the
+	 * leaf offset out of it before the leaf read overwrites it.
+	 */
+	off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs;
+
+	rc = dnode_read(spa, dnode, off, zap_scratch, bsize);
+	if (rc)
+		return (rc);
+
+	zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
+
+	/*
+	 * Make sure this chunk matches our hash.
+	 */
+	if (zl.l_phys->l_hdr.lh_prefix_len > 0
+	    && zl.l_phys->l_hdr.lh_prefix
+	    != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len))
+		return (ENOENT);
+
+	/*
+	 * Hash within the chunk to find our entry.
+	 */
+	shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len);
+	h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1);
+	h = zl.l_phys->l_hash[h];
+
+	/*
+	 * Walk the collision chain.  Several names can share a hash,
+	 * so keep going until both hash and name match.  The walk is
+	 * limited to the number of chunks in the leaf so that a
+	 * corrupt (cyclic) chain cannot loop forever.
+	 */
+	for (n = 0; h != 0xffff && n < ZAP_LEAF_NUMCHUNKS(&zl); n++) {
+		if (h >= ZAP_LEAF_NUMCHUNKS(&zl))
+			return (EIO);
+		zc = &ZAP_LEAF_CHUNK(&zl, h);
+		if (zc->l_entry.le_hash == hash
+		    && fzap_name_equal(&zl, zc, name)) {
+			*value = fzap_leaf_value(&zl, zc);
+			return (0);
+		}
+		h = zc->l_entry.le_next;
+	}
+
+	return (ENOENT);
+}
+
+/*
+ * Look up 'name' in the zap object described by 'dnode' and return
+ * its value as a uint64_t in *value.  The first block of the object
+ * is read into the zap scratch buffer and the lookup is dispatched
+ * on the zap type found in its first word.
+ */
+static int
+zap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
+{
+	size_t blksize = dnode->dn_datablkszsec * 512;
+	int error;
+
+	error = dnode_read(spa, dnode, 0, zap_scratch, blksize);
+	if (error)
+		return (error);
+
+	if (*(uint64_t *) zap_scratch == ZBT_MICRO)
+		return mzap_lookup(spa, dnode, name, value);
+
+	return fzap_lookup(spa, dnode, name, value);
+}
+
+#ifdef BOOT2
+
+/*
+ * Print the name of every used entry of a microzap directory, one
+ * per line.  The directory contents must already be in the zap
+ * scratch buffer.  Always returns 0.
+ */
+static int
+mzap_list(spa_t *spa, const dnode_phys_t *dnode)
+{
+	const mzap_phys_t *mz = (const mzap_phys_t *) zap_scratch;
+	const mzap_ent_phys_t *ent;
+	size_t blksize;
+	int nents, idx;
+
+	/*
+	 * A microzap occupies exactly one block, so the number of
+	 * entry slots follows from the block size.
+	 */
+	blksize = dnode->dn_datablkszsec * 512;
+	nents = blksize / MZAP_ENT_LEN - 1;
+
+	for (idx = 0; idx < nents; idx++) {
+		ent = &mz->mz_chunk[idx];
+		/* Slots with an empty name are not listed. */
+		if (ent->mze_name[0] == 0)
+			continue;
+		printf("%s\n", ent->mze_name);
+	}
+
+	return (0);
+}
+
+/*
+ * Print the name and value of every entry of a fatzap directory.
+ * The directory header must already be in the zap scratch buffer.
+ * Returns 0 on success, or EIO if the header magic is wrong or a
+ * leaf block cannot be read.
+ */
+static int
+fzap_list(spa_t *spa, const dnode_phys_t *dnode)
+{
+	int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+	zap_phys_t zh = *(zap_phys_t *) zap_scratch;
+	fat_zap_t z;
+	int i, j;
+
+	if (zh.zap_magic != ZAP_MAGIC)
+		return (EIO);
+
+	z.zap_block_shift = ilog2(bsize);
+	z.zap_phys = (zap_phys_t *) zap_scratch;
+
+	/*
+	 * This assumes that the leaf blocks start at block 1. The
+	 * documentation isn't exactly clear on this.
+	 */
+	zap_leaf_t zl;
+	zl.l_bs = z.zap_block_shift;
+	for (i = 0; i < zh.zap_num_leafs; i++) {
+		off_t off = (i + 1) << zl.l_bs;
+		char name[256], *p;
+		uint64_t value;
+
+		if (dnode_read(spa, dnode, off, zap_scratch, bsize))
+			return (EIO);
+
+		zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
+
+		for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
+			zap_leaf_chunk_t *zc, *nc;
+			int namelen;
+
+			zc = &ZAP_LEAF_CHUNK(&zl, j);
+			if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
+				continue;
+
+			/*
+			 * Leave room for a terminator: the name is
+			 * printed as a C string below.
+			 */
+			namelen = zc->l_entry.le_name_length;
+			if (namelen > (int) sizeof(name) - 1)
+				namelen = sizeof(name) - 1;
+
+			/*
+			 * Paste the name back together.
+			 */
+			nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
+			p = name;
+			while (namelen > 0) {
+				int len;
+				len = namelen;
+				if (len > ZAP_LEAF_ARRAY_BYTES)
+					len = ZAP_LEAF_ARRAY_BYTES;
+				memcpy(p, nc->l_array.la_array, len);
+				p += len;
+				namelen -= len;
+				nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
+			}
+			*p = 0;
+
+			/*
+			 * Assume the first eight bytes of the value are
+			 * a uint64_t.
+			 */
+			value = fzap_leaf_value(&zl, zc);
+
+			printf("%-32s 0x%llx\n", name, value);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Print the contents of the zap directory described by 'dnode'.  The
+ * first block is read into the zap scratch buffer and the listing is
+ * dispatched on the zap type found in its first word.  Returns EIO
+ * if that block cannot be read.
+ */
+static int
+zap_list(spa_t *spa, const dnode_phys_t *dnode)
+{
+	size_t blksize = dnode->dn_datablkszsec * 512;
+
+	if (dnode_read(spa, dnode, 0, zap_scratch, blksize) != 0)
+		return (EIO);
+
+	if (*(uint64_t *) zap_scratch == ZBT_MICRO)
+		return mzap_list(spa, dnode);
+
+	return fzap_list(spa, dnode);
+}
+
+#endif
+
+/*
+ * Fetch the dnode of object 'objnum' from object set 'os' into
+ * *dnode by reading the matching slot of the object set's meta
+ * dnode.  Returns the result of dnode_read().
+ */
+static int
+objset_get_dnode(spa_t *spa, const objset_phys_t *os, uint64_t objnum, dnode_phys_t *dnode)
+{
+	off_t slot = objnum * sizeof(dnode_phys_t);
+
+	return dnode_read(spa, &os->os_meta_dnode, slot,
+		dnode, sizeof(dnode_phys_t));
+}
+
+/*
+ * Load the object set of the dataset whose dataset object in the MOS
+ * is 'objnum' into *objset.  The block pointer of the object set is
+ * taken from the dataset's bonus buffer.  Returns 0 on success, or
+ * EIO (after printing a message) on failure.
+ */
+static int
+zfs_mount_dataset(spa_t *spa, uint64_t objnum, objset_phys_t *objset)
+{
+	dnode_phys_t dsnode;
+	dsl_dataset_phys_t *ds = (dsl_dataset_phys_t *) &dsnode.dn_bonus;
+
+	if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dsnode) != 0) {
+		printf("ZFS: can't find dataset %lld\n", objnum);
+		return (EIO);
+	}
+
+	if (zio_read(spa, &ds->ds_bp, objset) != 0) {
+		printf("ZFS: can't read object set for dataset %lld\n", objnum);
+		return (EIO);
+	}
+
+	return (0);
+}
+
+/*
+ * Load the object set to boot from into *objset: the dataset named
+ * by the pool's "bootfs" property if that can be looked up, else the
+ * head dataset of the pool's root dataset directory.  Returns 0 on
+ * success or EIO on failure.
+ */
+static int
+zfs_mount_root(spa_t *spa, objset_phys_t *objset)
+{
+	dnode_phys_t dir, propdir;
+	uint64_t props, bootfs, root;
+	dsl_dir_phys_t *dd;
+
+	/*
+	 * Everything hangs off the MOS object directory.
+	 */
+	if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir) != 0) {
+		printf("ZFS: can't read MOS object directory\n");
+		return (EIO);
+	}
+
+	/*
+	 * Prefer the bootfs pool property when all three lookups
+	 * leading to it succeed.
+	 */
+	if (zap_lookup(spa, &dir, DMU_POOL_PROPS, &props) == 0) {
+		if (objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0) {
+			if (zap_lookup(spa, &propdir, "bootfs", &bootfs) == 0)
+				return zfs_mount_dataset(spa, bootfs, objset);
+		}
+	}
+
+	/*
+	 * Otherwise fall back to the root dataset directory.
+	 */
+	if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, &root) != 0
+	    || objset_get_dnode(spa, &spa->spa_mos, root, &dir) != 0) {
+		printf("ZFS: can't find root dsl_dir\n");
+		return (EIO);
+	}
+
+	/*
+	 * The directory's bonus buffer names its head dataset object,
+	 * which in turn leads to the object set itself.
+	 */
+	dd = (dsl_dir_phys_t *) &dir.dn_bonus;
+	return zfs_mount_dataset(spa, dd->dd_head_dataset_obj, objset);
+}
+
+/*
+ * Make a pool usable: read the MOS through the root block pointer of
+ * the pool's uberblock, then locate the root object set.  Returns 0
+ * on success, or EIO (after printing a message) on failure.
+ */
+static int
+zfs_mount_pool(spa_t *spa)
+{
+	/*
+	 * The uberblock points at the MOS; everything else is found
+	 * from there.
+	 */
+	if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos) != 0) {
+		printf("ZFS: can't read MOS\n");
+		return (EIO);
+	}
+
+	if (zfs_mount_root(spa, &spa->spa_root_objset) != 0) {
+		printf("Can't find root filesystem - giving up\n");
+		return (EIO);
+	}
+
+	return (0);
+}
+
+/*
+ * Look up the file named by 'upath' in the pool's root object set
+ * and return its dnode in *dnode.  The path is resolved one
+ * component at a time starting from the root directory; symbolic
+ * links are followed.
+ *
+ * Returns 0 on success.  Errors: EIO if the root object set is not a
+ * ZFS file system, or if a path component or an expanded symlink
+ * path does not fit the local buffers; ENOTDIR if a non-final
+ * component is not a directory; EMLINK if too many symlinks are
+ * followed; otherwise the error from the failing lookup or read.
+ *
+ * Note: resolving symlinks needs a second path-sized buffer on the
+ * stack.
+ */
+static int
+zfs_lookup(spa_t *spa, const char *upath, dnode_phys_t *dnode)
+{
+	int rc;
+	uint64_t objnum, rootnum, parentnum;
+	dnode_phys_t dn;
+	const znode_phys_t *zp = (const znode_phys_t *) dn.dn_bonus;
+	const char *p, *q;
+	char element[256];
+	char path[1024];
+	char target[sizeof(path)];
+	size_t len, taillen;
+	int symlinks_followed = 0;
+
+	if (spa->spa_root_objset.os_type != DMU_OST_ZFS) {
+		printf("ZFS: unexpected object set type %lld\n",
+		       spa->spa_root_objset.os_type);
+		return (EIO);
+	}
+
+	/*
+	 * Get the root directory dnode.
+	 */
+	rc = objset_get_dnode(spa, &spa->spa_root_objset, MASTER_NODE_OBJ, &dn);
+	if (rc)
+		return (rc);
+
+	rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, &rootnum);
+	if (rc)
+		return (rc);
+
+	rc = objset_get_dnode(spa, &spa->spa_root_objset, rootnum, &dn);
+	if (rc)
+		return (rc);
+
+	objnum = rootnum;
+	p = upath;
+	while (p && *p) {
+		while (*p == '/')
+			p++;
+		if (!*p)
+			break;
+
+		/*
+		 * Split off the next component, refusing one that
+		 * would not fit the element buffer.  Afterwards p is
+		 * the rest of the path, or null at the last component.
+		 */
+		q = strchr(p, '/');
+		len = q ? (size_t) (q - p) : strlen(p);
+		if (len >= sizeof(element))
+			return (EIO);
+		memcpy(element, p, len);
+		element[len] = 0;
+		p = q;
+
+		if ((zp->zp_mode >> 12) != 0x4) {
+			return (ENOTDIR);
+		}
+
+		parentnum = objnum;
+		rc = zap_lookup(spa, &dn, element, &objnum);
+		if (rc)
+			return (rc);
+		objnum = ZFS_DIRENT_OBJ(objnum);
+
+		rc = objset_get_dnode(spa, &spa->spa_root_objset, objnum, &dn);
+		if (rc)
+			return (rc);
+
+		/*
+		 * Check for symlink.
+		 */
+		if ((zp->zp_mode >> 12) == 0xa) {
+			if (symlinks_followed > 10)
+				return (EMLINK);
+			symlinks_followed++;
+
+			/*
+			 * The new path is the link value followed by
+			 * the unresolved tail of the current path.
+			 * Both must fit, terminator included.
+			 */
+			taillen = p ? strlen(p) : 0;
+			if (zp->zp_size >= sizeof(path)
+			    || taillen >= sizeof(path) - zp->zp_size)
+				return (EIO);
+			len = zp->zp_size;
+
+			/*
+			 * Build it in a separate buffer: after an
+			 * earlier symlink the tail lives inside 'path'
+			 * itself, so it cannot be assembled in place.
+			 * The link value comes from the bonus buffer if
+			 * it fits there, else from the object's data.
+			 */
+			if (len + sizeof(znode_phys_t) <= dn.dn_bonuslen) {
+				memcpy(target, &dn.dn_bonus[sizeof(znode_phys_t)],
+					len);
+			} else {
+				rc = dnode_read(spa, &dn, 0, target, len);
+				if (rc)
+					return (rc);
+			}
+			if (p)
+				memcpy(&target[len], p, taillen);
+			target[len + taillen] = 0;
+			memcpy(path, target, len + taillen + 1);
+
+			/*
+			 * Restart with the new path, starting either at
+			 * the root or at the parent depending whether or
+			 * not the link is relative.
+			 */
+			p = path;
+			if (*p == '/')
+				objnum = rootnum;
+			else
+				objnum = parentnum;
+			rc = objset_get_dnode(spa, &spa->spa_root_objset, objnum, &dn);
+			if (rc)
+				return (rc);
+		}
+	}
+
+	*dnode = dn;
+	return (0);
+}
author	pjd <pjd@FreeBSD.org>	2008-11-17 20:49:29 +0000
committer	pjd <pjd@FreeBSD.org>	2008-11-17 20:49:29 +0000
commit	bbe899b96e388a8b82439f81ed3707e0d9c6070d (patch)
tree	81b89fa4ac6467771d5aa291a97f4665981a6108 /sys/boot
parent	d2f579595c362ce27b4d87e2c40e1c4e09b929e3 (diff)
download	FreeBSD-src-bbe899b96e388a8b82439f81ed3707e0d9c6070d.zip FreeBSD-src-bbe899b96e388a8b82439f81ed3707e0d9c6070d.tar.gz