summaryrefslogtreecommitdiffstats
path: root/sys/boot
diff options
context:
space:
mode:
authorpjd <pjd@FreeBSD.org>2008-11-17 20:49:29 +0000
committerpjd <pjd@FreeBSD.org>2008-11-17 20:49:29 +0000
commitbbe899b96e388a8b82439f81ed3707e0d9c6070d (patch)
tree81b89fa4ac6467771d5aa291a97f4665981a6108 /sys/boot
parentd2f579595c362ce27b4d87e2c40e1c4e09b929e3 (diff)
downloadFreeBSD-src-bbe899b96e388a8b82439f81ed3707e0d9c6070d.zip
FreeBSD-src-bbe899b96e388a8b82439f81ed3707e0d9c6070d.tar.gz
Update ZFS from version 6 to 13 and bring some FreeBSD-specific changes.
This bring huge amount of changes, I'll enumerate only user-visible changes: - Delegated Administration Allows regular users to perform ZFS operations, like file system creation, snapshot creation, etc. - L2ARC Level 2 cache for ZFS - allows to use additional disks for cache. Huge performance improvements mostly for random read of mostly static content. - slog Allow to use additional disks for ZFS Intent Log to speed up operations like fsync(2). - vfs.zfs.super_owner Allows regular users to perform privileged operations on files stored on ZFS file systems owned by him. Very careful with this one. - chflags(2) Not all the flags are supported. This still needs work. - ZFSBoot Support to boot off of ZFS pool. Not finished, AFAIK. Submitted by: dfr - Snapshot properties - New failure modes Before if write requested failed, system paniced. Now one can select from one of three failure modes: - panic - panic on write error - wait - wait for disk to reappear - continue - serve read requests if possible, block write requests - Refquota, refreservation properties Just quota and reservation properties, but don't count space consumed by children file systems, clones and snapshots. - Sparse volumes ZVOLs that don't reserve space in the pool. - External attributes Compatible with extattr(2). - NFSv4-ACLs Not sure about the status, might not be complete yet. Submitted by: trasz - Creation-time properties - Regression tests for zpool(8) command. Obtained from: OpenSolaris
Diffstat (limited to 'sys/boot')
-rw-r--r--sys/boot/Makefile4
-rw-r--r--sys/boot/common/bootstrap.h1
-rw-r--r--sys/boot/i386/Makefile4
-rw-r--r--sys/boot/i386/libi386/bootinfo32.c1
-rw-r--r--sys/boot/i386/libi386/devicename.c2
-rw-r--r--sys/boot/i386/loader/Makefile10
-rw-r--r--sys/boot/i386/loader/conf.c14
-rw-r--r--sys/boot/i386/loader/main.c51
-rw-r--r--sys/boot/i386/zfsboot/Makefile108
-rw-r--r--sys/boot/i386/zfsboot/zfsboot.c944
-rw-r--r--sys/boot/i386/zfsboot/zfsldr.S402
-rw-r--r--sys/boot/zfs/Makefile29
-rw-r--r--sys/boot/zfs/zfs.c514
-rw-r--r--sys/boot/zfs/zfsimpl.c1443
14 files changed, 3512 insertions, 15 deletions
diff --git a/sys/boot/Makefile b/sys/boot/Makefile
index 1af1457..27cb7e3 100644
--- a/sys/boot/Makefile
+++ b/sys/boot/Makefile
@@ -26,6 +26,10 @@ SUBDIR+= ofw
SUBDIR+= uboot
.endif
+.if defined(LOADER_ZFS_SUPPORT)
+SUBDIR+= zfs
+.endif
+
# Pick the machine-dependent subdir based on the target architecture.
ADIR= ${MACHINE:S/amd64/i386/:S/sun4v/sparc64/}
.if exists(${.CURDIR}/${ADIR}/.)
diff --git a/sys/boot/common/bootstrap.h b/sys/boot/common/bootstrap.h
index 57982d1..5f08480 100644
--- a/sys/boot/common/bootstrap.h
+++ b/sys/boot/common/bootstrap.h
@@ -43,6 +43,7 @@ struct devdesc
#define DEVT_DISK 1
#define DEVT_NET 2
#define DEVT_CD 3
+#define DEVT_ZFS 4
int d_unit;
};
diff --git a/sys/boot/i386/Makefile b/sys/boot/i386/Makefile
index b89222d..6af8642 100644
--- a/sys/boot/i386/Makefile
+++ b/sys/boot/i386/Makefile
@@ -1,7 +1,7 @@
# $FreeBSD$
-SUBDIR= mbr pmbr boot0 boot0sio btx boot2 cdboot gptboot kgzldr \
- libi386 libfirewire loader
+SUBDIR= mbr pmbr boot0 boot0sio btx boot2 cdboot gptboot zfsboot \
+ kgzldr libi386 libfirewire loader
# special boot programs, 'self-extracting boot2+loader'
SUBDIR+= pxeldr
diff --git a/sys/boot/i386/libi386/bootinfo32.c b/sys/boot/i386/libi386/bootinfo32.c
index 6b517c5..d434427 100644
--- a/sys/boot/i386/libi386/bootinfo32.c
+++ b/sys/boot/i386/libi386/bootinfo32.c
@@ -183,6 +183,7 @@ bi_load32(char *args, int *howtop, int *bootdevp, vm_offset_t *bip, vm_offset_t
break;
case DEVT_NET:
+ case DEVT_ZFS:
break;
default:
diff --git a/sys/boot/i386/libi386/devicename.c b/sys/boot/i386/libi386/devicename.c
index e1035aa..79a562b 100644
--- a/sys/boot/i386/libi386/devicename.c
+++ b/sys/boot/i386/libi386/devicename.c
@@ -167,6 +167,7 @@ i386_parsedev(struct i386_devdesc **dev, const char *devspec, const char **path)
case DEVT_CD:
case DEVT_NET:
+ case DEVT_ZFS:
unit = 0;
if (*np && (*np != ':')) {
@@ -238,6 +239,7 @@ i386_fmtdev(void *vdev)
break;
case DEVT_NET:
+ case DEVT_ZFS:
sprintf(buf, "%s%d:", dev->d_dev->dv_name, dev->d_unit);
break;
}
diff --git a/sys/boot/i386/loader/Makefile b/sys/boot/i386/loader/Makefile
index df2ccc0..79aceca 100644
--- a/sys/boot/i386/loader/Makefile
+++ b/sys/boot/i386/loader/Makefile
@@ -17,6 +17,12 @@ CFLAGS+= -DLOADER_FIREWIRE_SUPPORT
LIBFIREWIRE= ${.OBJDIR}/../libfirewire/libfirewire.a
.endif
+# Put LOADER_ZFS_SUPPORT=yes in /etc/make.conf for ZFS support
+.if defined(LOADER_ZFS_SUPPORT)
+CFLAGS+= -DLOADER_ZFS_SUPPORT
+LIBZFS= ${.OBJDIR}/../../zfs/libzfsboot.a
+.endif
+
# Enable PXE TFTP or NFS support, not both.
.if defined(LOADER_TFTP_SUPPORT)
CFLAGS+= -DLOADER_TFTP_SUPPORT
@@ -98,8 +104,8 @@ FILES+= loader.rc
# XXX crt0.o needs to be first for pxeboot(8) to work
OBJS= ${BTXCRT}
-DPADD= ${LIBFICL} ${LIBFIREWIRE} ${LIBI386} ${LIBSTAND}
-LDADD= ${LIBFICL} ${LIBFIREWIRE} ${LIBI386} -lstand
+DPADD= ${LIBFICL} ${LIBFIREWIRE} ${LIBZFS} ${LIBI386} ${LIBSTAND}
+LDADD= ${LIBFICL} ${LIBFIREWIRE} ${LIBZFS} ${LIBI386} -lstand
.include <bsd.prog.mk>
diff --git a/sys/boot/i386/loader/conf.c b/sys/boot/i386/loader/conf.c
index 245f960..05c9a9e9 100644
--- a/sys/boot/i386/loader/conf.c
+++ b/sys/boot/i386/loader/conf.c
@@ -50,6 +50,10 @@ __FBSDID("$FreeBSD$");
extern struct devsw fwohci;
#endif
+#if defined(LOADER_ZFS_SUPPORT)
+extern struct devsw zfs_dev;
+#endif
+
/* Exported for libstand */
struct devsw *devsw[] = {
&bioscd,
@@ -60,15 +64,25 @@ struct devsw *devsw[] = {
#if defined(LOADER_FIREWIRE_SUPPORT)
&fwohci,
#endif
+#if defined(LOADER_ZFS_SUPPORT)
+ &zfs_dev,
+#endif
NULL
};
+#if defined(LOADER_ZFS_SUPPORT)
+extern struct fs_ops zfs_fsops;
+#endif
+
struct fs_ops *file_system[] = {
&ufs_fsops,
&ext2fs_fsops,
&dosfs_fsops,
&cd9660_fsops,
&splitfs_fsops,
+#if defined(LOADER_ZFS_SUPPORT)
+ &zfs_fsops,
+#endif
#ifdef LOADER_GZIP_SUPPORT
&gzipfs_fsops,
#endif
diff --git a/sys/boot/i386/loader/main.c b/sys/boot/i386/loader/main.c
index 5b23670..cac28ae 100644
--- a/sys/boot/i386/loader/main.c
+++ b/sys/boot/i386/loader/main.c
@@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
#define KARGS_FLAGS_CD 0x1
#define KARGS_FLAGS_PXE 0x2
+#define KARGS_FLAGS_ZFS 0x4
/* Arguments passed in from the boot1/boot2 loader */
static struct
@@ -51,8 +52,13 @@ static struct
u_int32_t howto;
u_int32_t bootdev;
u_int32_t bootflags;
- u_int32_t pxeinfo;
- u_int32_t res2;
+ union {
+ struct {
+ u_int32_t pxeinfo;
+ u_int32_t res2;
+ };
+ uint64_t zfspool;
+ };
u_int32_t bootinfo;
} *kargs;
@@ -96,7 +102,7 @@ main(void)
*/
bios_getmem();
-#if defined(LOADER_BZIP2_SUPPORT) || defined(LOADER_FIREWIRE_SUPPORT)
+#if defined(LOADER_BZIP2_SUPPORT) || defined(LOADER_FIREWIRE_SUPPORT) || defined(LOADER_ZFS_SUPPORT)
heap_top = PTOV(memtop_copyin);
memtop_copyin -= 0x300000;
heap_bottom = PTOV(memtop_copyin);
@@ -145,6 +151,14 @@ main(void)
bc_add(initial_bootdev);
}
+ archsw.arch_autoload = i386_autoload;
+ archsw.arch_getdev = i386_getdev;
+ archsw.arch_copyin = i386_copyin;
+ archsw.arch_copyout = i386_copyout;
+ archsw.arch_readin = i386_readin;
+ archsw.arch_isainb = isa_inb;
+ archsw.arch_isaoutb = isa_outb;
+
/*
* March through the device switch probing for things.
*/
@@ -172,14 +186,6 @@ main(void)
bios_getsmap();
- archsw.arch_autoload = i386_autoload;
- archsw.arch_getdev = i386_getdev;
- archsw.arch_copyin = i386_copyin;
- archsw.arch_copyout = i386_copyout;
- archsw.arch_readin = i386_readin;
- archsw.arch_isainb = isa_inb;
- archsw.arch_isaoutb = isa_outb;
-
interact(); /* doesn't return */
/* if we ever get here, it is an error */
@@ -252,6 +258,29 @@ extract_currdev(void)
i386_setcurrdev, env_nounset);
env_setenv("loaddev", EV_VOLATILE, i386_fmtdev(&new_currdev), env_noset,
env_nounset);
+
+#ifdef LOADER_ZFS_SUPPORT
+ /*
+ * If we were started from a ZFS-aware boot2, we can work out
+ * which ZFS pool we are booting from.
+ */
+ if (kargs->bootflags & KARGS_FLAGS_ZFS) {
+ /*
+ * Dig out the pool guid and convert it to a 'unit number'
+ */
+ uint64_t guid;
+ int unit;
+ char devname[32];
+ extern int zfs_guid_to_unit(uint64_t);
+
+ guid = kargs->zfspool;
+ unit = zfs_guid_to_unit(guid);
+ if (unit >= 0) {
+ sprintf(devname, "zfs%d", unit);
+ setenv("currdev", devname, 1);
+ }
+ }
+#endif
}
COMMAND_SET(reboot, "reboot", "reboot the system", command_reboot);
diff --git a/sys/boot/i386/zfsboot/Makefile b/sys/boot/i386/zfsboot/Makefile
new file mode 100644
index 0000000..41f1672
--- /dev/null
+++ b/sys/boot/i386/zfsboot/Makefile
@@ -0,0 +1,108 @@
+# $FreeBSD$
+
+.PATH: ${.CURDIR}/../boot2
+
+FILES= zfsboot
+
+NM?= nm
+
+# A value of 0x80 enables LBA support.
+BOOT_BOOT1_FLAGS?= 0x80
+
+BOOT_COMCONSOLE_PORT?= 0x3f8
+BOOT_COMCONSOLE_SPEED?= 9600
+B2SIOFMT?= 0x3
+
+REL1= 0x700
+ORG1= 0x7c00
+ORG2= 0x2000
+
+CFLAGS= -Os -g \
+ -fno-guess-branch-probability \
+ -fomit-frame-pointer \
+ -fno-unit-at-a-time \
+ -mno-align-long-strings \
+ -mrtd \
+ -mno-mmx -mno-3dnow -mno-sse -mno-sse2 -mno-sse3 \
+ -DBOOT2 \
+ -DFLAGS=${BOOT_BOOT1_FLAGS} \
+ -DSIOPRT=${BOOT_COMCONSOLE_PORT} \
+ -DSIOFMT=${B2SIOFMT} \
+ -DSIOSPD=${BOOT_COMCONSOLE_SPEED} \
+ -I${.CURDIR}/../../zfs \
+ -I${.CURDIR}/../../../cddl/boot/zfs \
+ -I${.CURDIR}/../btx/lib -I. \
+ -I${.CURDIR}/../boot2 \
+ -Wall -Waggregate-return -Wbad-function-cast -Wcast-align \
+ -Wmissing-declarations -Wmissing-prototypes -Wnested-externs \
+ -Wpointer-arith -Wshadow -Wstrict-prototypes -Wwrite-strings \
+ -Winline --param max-inline-insns-single=100
+
+LDFLAGS=-static -N --gc-sections
+
+# Pick up ../Makefile.inc early.
+.include <bsd.init.mk>
+
+CLEANFILES= zfsboot
+
+zfsboot: zfsboot1 zfsboot2
+ cat zfsboot1 zfsboot2 > zfsboot
+
+CLEANFILES+= zfsboot1 zfsldr.out zfsldr.o
+
+zfsboot1: zfsldr.out
+ objcopy -S -O binary zfsldr.out ${.TARGET}
+
+zfsldr.out: zfsldr.o
+ ${LD} ${LDFLAGS} -e start -Ttext ${ORG1} -o ${.TARGET} zfsldr.o
+
+CLEANFILES+= zfsboot2 zfsboot.ld zfsboot.ldr zfsboot.bin zfsboot.out \
+ zfsboot.o zfsboot.s zfsboot.s.tmp zfsboot.h sio.o
+
+# We currently allow 32768 bytes for zfsboot - in practice it could be
+# any size up to 3.5Mb but keeping it fixed size simplifies zfsldr.
+#
+BOOT2SIZE= 32768
+
+zfsboot2: zfsboot.ld
+ @set -- `ls -l zfsboot.ld`; x=$$((${BOOT2SIZE}-$$5)); \
+ echo "$$x bytes available"; test $$x -ge 0
+ dd if=zfsboot.ld of=${.TARGET} obs=${BOOT2SIZE} conv=osync
+
+zfsboot.ld: zfsboot.ldr zfsboot.bin ${BTXKERN}
+ btxld -v -E ${ORG2} -f bin -b ${BTXKERN} -l zfsboot.ldr \
+ -o ${.TARGET} -P 1 zfsboot.bin
+
+zfsboot.ldr:
+ cp /dev/null ${.TARGET}
+
+zfsboot.bin: zfsboot.out
+ objcopy -S -O binary zfsboot.out ${.TARGET}
+
+zfsboot.out: ${BTXCRT} zfsboot.o sio.o
+ ${LD} ${LDFLAGS} -Ttext ${ORG2} -o ${.TARGET} ${.ALLSRC}
+
+zfsboot.o: zfsboot.s
+
+SRCS= zfsboot.c zfsboot.h
+
+zfsboot.s: zfsboot.c zfsboot.h ${.CURDIR}/../../zfs/zfsimpl.c
+ ${CC} ${CFLAGS} -S -o zfsboot.s.tmp ${.CURDIR}/zfsboot.c
+ sed -e '/align/d' -e '/nop/d' < zfsboot.s.tmp > zfsboot.s
+ rm -f zfsboot.s.tmp
+
+zfsboot.h: zfsldr.out
+ ${NM} -t d ${.ALLSRC} | awk '/([0-9])+ T xread/ \
+ { x = $$1 - ORG1; \
+ printf("#define XREADORG %#x\n", REL1 + x) }' \
+ ORG1=`printf "%d" ${ORG1}` \
+ REL1=`printf "%d" ${REL1}` > ${.TARGET}
+
+.if ${MACHINE_ARCH} == "amd64"
+beforedepend zfsboot.s: machine
+CLEANFILES+= machine
+machine:
+ ln -sf ${.CURDIR}/../../../i386/include machine
+.endif
+
+.include <bsd.prog.mk>
diff --git a/sys/boot/i386/zfsboot/zfsboot.c b/sys/boot/i386/zfsboot/zfsboot.c
new file mode 100644
index 0000000..9b0a465
--- /dev/null
+++ b/sys/boot/i386/zfsboot/zfsboot.c
@@ -0,0 +1,944 @@
+/*-
+ * Copyright (c) 1998 Robert Nordier
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms are freely
+ * permitted provided that the above copyright notice and this
+ * paragraph and the following disclaimer are duplicated in all
+ * such forms.
+ *
+ * This software is provided "AS IS" and without any express or
+ * implied warranties, including, without limitation, the implied
+ * warranties of merchantability and fitness for a particular
+ * purpose.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/diskmbr.h>
+#include <sys/reboot.h>
+#include <sys/queue.h>
+
+#include <machine/bootinfo.h>
+#include <machine/elf.h>
+
+#include <stdarg.h>
+#include <stddef.h>
+
+#include <a.out.h>
+
+#include <btxv86.h>
+
+#include "zfsboot.h"
+#include "lib.h"
+
+#define IO_KEYBOARD 1
+#define IO_SERIAL 2
+
+#define SECOND 18 /* Circa that many ticks in a second. */
+
+#define RBX_ASKNAME 0x0 /* -a */
+#define RBX_SINGLE 0x1 /* -s */
+/* 0x2 is reserved for log2(RB_NOSYNC). */
+/* 0x3 is reserved for log2(RB_HALT). */
+/* 0x4 is reserved for log2(RB_INITNAME). */
+#define RBX_DFLTROOT 0x5 /* -r */
+#define RBX_KDB 0x6 /* -d */
+/* 0x7 is reserved for log2(RB_RDONLY). */
+/* 0x8 is reserved for log2(RB_DUMP). */
+/* 0x9 is reserved for log2(RB_MINIROOT). */
+#define RBX_CONFIG 0xa /* -c */
+#define RBX_VERBOSE 0xb /* -v */
+#define RBX_SERIAL 0xc /* -h */
+#define RBX_CDROM 0xd /* -C */
+/* 0xe is reserved for log2(RB_POWEROFF). */
+#define RBX_GDB 0xf /* -g */
+#define RBX_MUTE 0x10 /* -m */
+/* 0x11 is reserved for log2(RB_SELFTEST). */
+/* 0x12 is reserved for boot programs. */
+/* 0x13 is reserved for boot programs. */
+#define RBX_PAUSE 0x14 /* -p */
+#define RBX_QUIET 0x15 /* -q */
+#define RBX_NOINTR 0x1c /* -n */
+/* 0x1d is reserved for log2(RB_MULTIPLE) and is just misnamed here. */
+#define RBX_DUAL 0x1d /* -D */
+/* 0x1f is reserved for log2(RB_BOOTINFO). */
+
+/* pass: -a, -s, -r, -d, -c, -v, -h, -C, -g, -m, -p, -D */
+#define RBX_MASK (OPT_SET(RBX_ASKNAME) | OPT_SET(RBX_SINGLE) | \
+ OPT_SET(RBX_DFLTROOT) | OPT_SET(RBX_KDB ) | \
+ OPT_SET(RBX_CONFIG) | OPT_SET(RBX_VERBOSE) | \
+ OPT_SET(RBX_SERIAL) | OPT_SET(RBX_CDROM) | \
+ OPT_SET(RBX_GDB ) | OPT_SET(RBX_MUTE) | \
+ OPT_SET(RBX_PAUSE) | OPT_SET(RBX_DUAL))
+
+/* Hint to loader that we came from ZFS */
+#define KARGS_FLAGS_ZFS 0x4
+
+#define PATH_CONFIG "/boot.config"
+#define PATH_BOOT3 "/boot/loader"
+#define PATH_KERNEL "/boot/kernel/kernel"
+
+#define ARGS 0x900
+#define NOPT 14
+#define NDEV 3
+#define MEM_BASE 0x12
+#define MEM_EXT 0x15
+#define V86_CY(x) ((x) & 1)
+#define V86_ZR(x) ((x) & 0x40)
+
+#define DRV_HARD 0x80
+#define DRV_MASK 0x7f
+
+#define TYPE_AD 0
+#define TYPE_DA 1
+#define TYPE_MAXHARD TYPE_DA
+#define TYPE_FD 2
+
+#define OPT_SET(opt) (1 << (opt))
+#define OPT_CHECK(opt) ((opts) & OPT_SET(opt))
+
+extern uint32_t _end;
+
+static const char optstr[NOPT] = "DhaCcdgmnpqrsv"; /* Also 'P', 'S' */
+static const unsigned char flags[NOPT] = {
+ RBX_DUAL,
+ RBX_SERIAL,
+ RBX_ASKNAME,
+ RBX_CDROM,
+ RBX_CONFIG,
+ RBX_KDB,
+ RBX_GDB,
+ RBX_MUTE,
+ RBX_NOINTR,
+ RBX_PAUSE,
+ RBX_QUIET,
+ RBX_DFLTROOT,
+ RBX_SINGLE,
+ RBX_VERBOSE
+};
+
+static const char *const dev_nm[NDEV] = {"ad", "da", "fd"};
+static const unsigned char dev_maj[NDEV] = {30, 4, 2};
+
+struct dsk {
+ unsigned drive;
+ unsigned type;
+ unsigned unit;
+ unsigned slice;
+ unsigned part;
+ unsigned start;
+ int init;
+};
+static char cmd[512];
+static char kname[1024];
+static uint32_t opts;
+static int comspeed = SIOSPD;
+static struct bootinfo bootinfo;
+static uint32_t bootdev;
+static uint8_t ioctrl = IO_KEYBOARD;
+
+/* Buffers that must not span a 64k boundary. */
+#define READ_BUF_SIZE 8192
+struct dmadat {
+ char rdbuf[READ_BUF_SIZE]; /* for reading large things */
+ char secbuf[READ_BUF_SIZE]; /* for MBR/disklabel */
+};
+static struct dmadat *dmadat;
+
+void exit(int);
+static void load(void);
+static int parse(void);
+static void printf(const char *,...);
+static void putchar(int);
+static uint32_t memsize(void);
+static int drvread(struct dsk *, void *, unsigned, unsigned);
+static int keyhit(unsigned);
+static int xputc(int);
+static int xgetc(int);
+static int getc(int);
+
+static void memcpy(void *, const void *, int);
+static void
+memcpy(void *dst, const void *src, int len)
+{
+ const char *s = src;
+ char *d = dst;
+
+ while (len--)
+ *d++ = *s++;
+}
+
+static void
+strcpy(char *dst, const char *src)
+{
+ while (*src)
+ *dst++ = *src++;
+ *dst++ = 0;
+}
+
+static void
+strcat(char *dst, const char *src)
+{
+ while (*dst)
+ dst++;
+ while (*src)
+ *dst++ = *src++;
+ *dst++ = 0;
+}
+
+static int
+strcmp(const char *s1, const char *s2)
+{
+ for (; *s1 == *s2 && *s1; s1++, s2++);
+ return (unsigned char)*s1 - (unsigned char)*s2;
+}
+
+static const char *
+strchr(const char *s, char ch)
+{
+ for (; *s; s++)
+ if (*s == ch)
+ return s;
+ return 0;
+}
+
+static int
+memcmp(const void *p1, const void *p2, size_t n)
+{
+ const char *s1 = (const char *) p1;
+ const char *s2 = (const char *) p2;
+ for (; n > 0 && *s1 == *s2; s1++, s2++, n--);
+ if (n)
+ return (unsigned char)*s1 - (unsigned char)*s2;
+ else
+ return 0;
+}
+
+static void
+memset(void *p, char val, size_t n)
+{
+ char *s = (char *) p;
+ while (n--)
+ *s++ = val;
+}
+
+static void *
+malloc(size_t n)
+{
+ static char *heap_next;
+ static char *heap_end;
+
+ if (!heap_next) {
+ heap_next = (char *) dmadat + sizeof(*dmadat);
+ heap_end = (char *) (640*1024);
+ }
+
+ char *p = heap_next;
+ if (p + n > heap_end) {
+ printf("malloc failure\n");
+ for (;;)
+ ;
+ return 0;
+ }
+ heap_next += n;
+ return p;
+}
+
+static size_t
+strlen(const char *s)
+{
+ size_t len = 0;
+ while (*s++)
+ len++;
+ return len;
+}
+
+static char *
+strdup(const char *s)
+{
+ char *p = malloc(strlen(s) + 1);
+ strcpy(p, s);
+ return p;
+}
+
+#include "zfsimpl.c"
+
+/*
+ * Read from a dnode (which must be from a ZPL filesystem).
+ */
+static int
+zfs_read(spa_t *spa, const dnode_phys_t *dnode, off_t *offp, void *start, size_t size)
+{
+ const znode_phys_t *zp = (const znode_phys_t *) dnode->dn_bonus;
+ size_t n;
+ int rc;
+
+ n = size;
+ if (*offp + n > zp->zp_size)
+ n = zp->zp_size - *offp;
+
+ rc = dnode_read(spa, dnode, *offp, start, n);
+ if (rc)
+ return (-1);
+ *offp += n;
+
+ return (n);
+}
+
+/*
+ * Current ZFS pool
+ */
+spa_t *spa;
+
+/*
+ * A wrapper for dskread that doesn't have to worry about whether the
+ * buffer pointer crosses a 64k boundary.
+ */
+static int
+vdev_read(vdev_t *vdev, void *priv, off_t off, void *buf, size_t bytes)
+{
+ char *p;
+ unsigned int lba, nb;
+ struct dsk *dsk = (struct dsk *) priv;
+
+ if ((off & (DEV_BSIZE - 1)) || (bytes & (DEV_BSIZE - 1)))
+ return -1;
+
+ p = buf;
+ lba = off / DEV_BSIZE;
+ while (bytes > 0) {
+ nb = bytes / DEV_BSIZE;
+ if (nb > READ_BUF_SIZE / DEV_BSIZE)
+ nb = READ_BUF_SIZE / DEV_BSIZE;
+ if (drvread(dsk, dmadat->rdbuf, lba, nb))
+ return -1;
+ memcpy(p, dmadat->rdbuf, nb * DEV_BSIZE);
+ p += nb * DEV_BSIZE;
+ lba += nb;
+ bytes -= nb * DEV_BSIZE;
+ }
+
+ return 0;
+}
+
+static int
+xfsread(const dnode_phys_t *dnode, off_t *offp, void *buf, size_t nbyte)
+{
+ if ((size_t)zfs_read(spa, dnode, offp, buf, nbyte) != nbyte) {
+ printf("Invalid %s\n", "format");
+ return -1;
+ }
+ return 0;
+}
+
+static inline uint32_t
+memsize(void)
+{
+ v86.addr = MEM_EXT;
+ v86.eax = 0x8800;
+ v86int();
+ return v86.eax;
+}
+
+static inline void
+getstr(void)
+{
+ char *s;
+ int c;
+
+ s = cmd;
+ for (;;) {
+ switch (c = xgetc(0)) {
+ case 0:
+ break;
+ case '\177':
+ case '\b':
+ if (s > cmd) {
+ s--;
+ printf("\b \b");
+ }
+ break;
+ case '\n':
+ case '\r':
+ *s = 0;
+ return;
+ default:
+ if (s - cmd < sizeof(cmd) - 1)
+ *s++ = c;
+ putchar(c);
+ }
+ }
+}
+
+static inline void
+putc(int c)
+{
+ v86.addr = 0x10;
+ v86.eax = 0xe00 | (c & 0xff);
+ v86.ebx = 0x7;
+ v86int();
+}
+
+/*
+ * Try to detect a device supported by the legacy int13 BIOS
+ */
+static int
+int13probe(int drive)
+{
+ v86.ctl = V86_FLAGS;
+ v86.addr = 0x13;
+ v86.eax = 0x800;
+ v86.edx = drive;
+ v86int();
+
+ if (!(v86.efl & 0x1) && /* carry clear */
+ ((v86.edx & 0xff) != (drive & DRV_MASK))) { /* unit # OK */
+ if ((v86.ecx & 0x3f) == 0) { /* absurd sector size */
+ return(0); /* skip device */
+ }
+ return (1);
+ }
+ return(0);
+}
+
+static void
+probe_drive(struct dsk *dsk, spa_t **spap)
+{
+ struct dos_partition *dp;
+ char *sec;
+ unsigned i;
+
+ if (!int13probe(dsk->drive))
+ return;
+
+ /*
+ * If we find a vdev on the whole disk, stop here. Otherwise dig
+ * out the MBR and probe each slice in turn for a vdev.
+ */
+ if (vdev_probe(vdev_read, dsk, spap) == 0)
+ return;
+
+ sec = dmadat->secbuf;
+ dsk->start = 0;
+ if (drvread(dsk, sec, DOSBBSECTOR, 1))
+ return;
+ dp = (void *)(sec + DOSPARTOFF);
+
+ for (i = 0; i < NDOSPART; i++) {
+ if (!dp[i].dp_typ)
+ continue;
+ dsk->start = dp[i].dp_start;
+ if (vdev_probe(vdev_read, dsk, spap) == 0) {
+ /*
+ * We record the first pool we find (we will try to boot
+ * from that one.
+ */
+ spap = 0;
+
+ /*
+ * This slice had a vdev. We need a new dsk structure now
+ * sice the vdev now owns this one.
+ */
+ struct dsk *newdsk;
+ newdsk = malloc(sizeof(struct dsk));
+ *newdsk = *dsk;
+ dsk = newdsk;
+ }
+ }
+}
+
+int
+main(void)
+{
+ int autoboot, i;
+ dnode_phys_t dn;
+ off_t off;
+ struct dsk *dsk;
+
+ dmadat = (void *)(roundup2(__base + (int32_t)&_end, 0x10000) - __base);
+ v86.ctl = V86_FLAGS;
+
+ dsk = malloc(sizeof(struct dsk));
+ dsk->drive = *(uint8_t *)PTOV(ARGS);
+ dsk->type = dsk->drive & DRV_HARD ? TYPE_AD : TYPE_FD;
+ dsk->unit = dsk->drive & DRV_MASK;
+ dsk->slice = *(uint8_t *)PTOV(ARGS + 1) + 1;
+ dsk->part = 0;
+ dsk->start = 0;
+ dsk->init = 0;
+
+ bootinfo.bi_version = BOOTINFO_VERSION;
+ bootinfo.bi_size = sizeof(bootinfo);
+ bootinfo.bi_basemem = 0; /* XXX will be filled by loader or kernel */
+ bootinfo.bi_extmem = memsize();
+ bootinfo.bi_memsizes_valid++;
+ bootinfo.bi_bios_dev = dsk->drive;
+
+ bootdev = MAKEBOOTDEV(dev_maj[dsk->type],
+ dsk->slice, dsk->unit, dsk->part),
+
+ /* Process configuration file */
+
+ autoboot = 1;
+
+ zfs_init();
+
+ /*
+ * Probe the boot drive first - we will try to boot from whatever
+ * pool we find on that drive.
+ */
+ probe_drive(dsk, &spa);
+
+ /*
+ * Probe the rest of the drives that the bios knows about. This
+ * will find any other available pools and it may fill in missing
+ * vdevs for the boot pool.
+ */
+ for (i = 0; i < 4; i++) {
+ if ((i | DRV_HARD) == *(uint8_t *)PTOV(ARGS))
+ continue;
+
+ dsk = malloc(sizeof(struct dsk));
+ dsk->drive = i | DRV_HARD;
+ dsk->type = dsk->drive & TYPE_AD;
+ dsk->unit = i;
+ dsk->slice = 0;
+ dsk->part = 0;
+ dsk->start = 0;
+ dsk->init = 0;
+ probe_drive(dsk, 0);
+ }
+
+ /*
+ * If we didn't find a pool on the boot drive, default to the
+ * first pool we found, if any.
+ */
+ if (!spa) {
+ spa = STAILQ_FIRST(&zfs_pools);
+ if (!spa) {
+ printf("No ZFS pools located, can't boot\n");
+ for (;;)
+ ;
+ }
+ }
+
+ zfs_mount_pool(spa);
+
+ if (zfs_lookup(spa, PATH_CONFIG, &dn) == 0) {
+ off = 0;
+ xfsread(&dn, &off, cmd, sizeof(cmd));
+ }
+
+ if (*cmd) {
+ if (parse())
+ autoboot = 0;
+ if (!OPT_CHECK(RBX_QUIET))
+ printf("%s: %s", PATH_CONFIG, cmd);
+ /* Do not process this command twice */
+ *cmd = 0;
+ }
+
+ /*
+ * Try to exec stage 3 boot loader. If interrupted by a keypress,
+ * or in case of failure, try to load a kernel directly instead.
+ */
+
+ if (autoboot && !*kname) {
+ memcpy(kname, PATH_BOOT3, sizeof(PATH_BOOT3));
+ if (!keyhit(3*SECOND)) {
+ load();
+ memcpy(kname, PATH_KERNEL, sizeof(PATH_KERNEL));
+ }
+ }
+
+ /* Present the user with the boot2 prompt. */
+
+ for (;;) {
+ if (!autoboot || !OPT_CHECK(RBX_QUIET))
+ printf("\nFreeBSD/i386 boot\n"
+ "Default: %s:%s\n"
+ "boot: ",
+ spa->spa_name, kname);
+ if (ioctrl & IO_SERIAL)
+ sio_flush();
+ if (!autoboot || keyhit(5*SECOND))
+ getstr();
+ else if (!autoboot || !OPT_CHECK(RBX_QUIET))
+ putchar('\n');
+ autoboot = 0;
+ if (parse())
+ putchar('\a');
+ else
+ load();
+ }
+}
+
+/* XXX - Needed for btxld to link the boot2 binary; do not remove. */
+void
+exit(int x)
+{
+}
+
+static void
+load(void)
+{
+ union {
+ struct exec ex;
+ Elf32_Ehdr eh;
+ } hdr;
+ static Elf32_Phdr ep[2];
+ static Elf32_Shdr es[2];
+ caddr_t p;
+ dnode_phys_t dn;
+ off_t off;
+ uint32_t addr, x;
+ int fmt, i, j;
+
+ if (zfs_lookup(spa, kname, &dn)) {
+ return;
+ }
+ off = 0;
+ if (xfsread(&dn, &off, &hdr, sizeof(hdr)))
+ return;
+ if (N_GETMAGIC(hdr.ex) == ZMAGIC)
+ fmt = 0;
+ else if (IS_ELF(hdr.eh))
+ fmt = 1;
+ else {
+ printf("Invalid %s\n", "format");
+ return;
+ }
+ if (fmt == 0) {
+ addr = hdr.ex.a_entry & 0xffffff;
+ p = PTOV(addr);
+ off = PAGE_SIZE;
+ if (xfsread(&dn, &off, p, hdr.ex.a_text))
+ return;
+ p += roundup2(hdr.ex.a_text, PAGE_SIZE);
+ if (xfsread(&dn, &off, p, hdr.ex.a_data))
+ return;
+ p += hdr.ex.a_data + roundup2(hdr.ex.a_bss, PAGE_SIZE);
+ bootinfo.bi_symtab = VTOP(p);
+ memcpy(p, &hdr.ex.a_syms, sizeof(hdr.ex.a_syms));
+ p += sizeof(hdr.ex.a_syms);
+ if (hdr.ex.a_syms) {
+ if (xfsread(&dn, &off, p, hdr.ex.a_syms))
+ return;
+ p += hdr.ex.a_syms;
+ if (xfsread(&dn, &off, p, sizeof(int)))
+ return;
+ x = *(uint32_t *)p;
+ p += sizeof(int);
+ x -= sizeof(int);
+ if (xfsread(&dn, &off, p, x))
+ return;
+ p += x;
+ }
+ } else {
+ off = hdr.eh.e_phoff;
+ for (j = i = 0; i < hdr.eh.e_phnum && j < 2; i++) {
+ if (xfsread(&dn, &off, ep + j, sizeof(ep[0])))
+ return;
+ if (ep[j].p_type == PT_LOAD)
+ j++;
+ }
+ for (i = 0; i < 2; i++) {
+ p = PTOV(ep[i].p_paddr & 0xffffff);
+ off = ep[i].p_offset;
+ if (xfsread(&dn, &off, p, ep[i].p_filesz))
+ return;
+ }
+ p += roundup2(ep[1].p_memsz, PAGE_SIZE);
+ bootinfo.bi_symtab = VTOP(p);
+ if (hdr.eh.e_shnum == hdr.eh.e_shstrndx + 3) {
+ off = hdr.eh.e_shoff + sizeof(es[0]) *
+ (hdr.eh.e_shstrndx + 1);
+ if (xfsread(&dn, &off, &es, sizeof(es)))
+ return;
+ for (i = 0; i < 2; i++) {
+ memcpy(p, &es[i].sh_size, sizeof(es[i].sh_size));
+ p += sizeof(es[i].sh_size);
+ off = es[i].sh_offset;
+ if (xfsread(&dn, &off, p, es[i].sh_size))
+ return;
+ p += es[i].sh_size;
+ }
+ }
+ addr = hdr.eh.e_entry & 0xffffff;
+ }
+ bootinfo.bi_esymtab = VTOP(p);
+ bootinfo.bi_kernelname = VTOP(kname);
+ __exec((caddr_t)addr, RB_BOOTINFO | (opts & RBX_MASK),
+ bootdev,
+ KARGS_FLAGS_ZFS,
+ (uint32_t) spa->spa_guid,
+ (uint32_t) (spa->spa_guid >> 32),
+ VTOP(&bootinfo));
+}
+
+static int
+parse()
+{
+ char *arg = cmd;
+ char *ep, *p, *q;
+ const char *cp;
+ //unsigned int drv;
+ int c, i, j;
+
+ while ((c = *arg++)) {
+ if (c == ' ' || c == '\t' || c == '\n')
+ continue;
+ for (p = arg; *p && *p != '\n' && *p != ' ' && *p != '\t'; p++);
+ ep = p;
+ if (*p)
+ *p++ = 0;
+ if (c == '-') {
+ while ((c = *arg++)) {
+ if (c == 'P') {
+ if (*(uint8_t *)PTOV(0x496) & 0x10) {
+ cp = "yes";
+ } else {
+ opts |= OPT_SET(RBX_DUAL) | OPT_SET(RBX_SERIAL);
+ cp = "no";
+ }
+ printf("Keyboard: %s\n", cp);
+ continue;
+ } else if (c == 'S') {
+ j = 0;
+ while ((unsigned int)(i = *arg++ - '0') <= 9)
+ j = j * 10 + i;
+ if (j > 0 && i == -'0') {
+ comspeed = j;
+ break;
+ }
+ /* Fall through to error below ('S' not in optstr[]). */
+ }
+ for (i = 0; c != optstr[i]; i++)
+ if (i == NOPT - 1)
+ return -1;
+ opts ^= OPT_SET(flags[i]);
+ }
+ ioctrl = OPT_CHECK(RBX_DUAL) ? (IO_SERIAL|IO_KEYBOARD) :
+ OPT_CHECK(RBX_SERIAL) ? IO_SERIAL : IO_KEYBOARD;
+ if (ioctrl & IO_SERIAL)
+ sio_init(115200 / comspeed);
+ } if (c == '?') {
+ dnode_phys_t dn;
+
+ if (zfs_lookup(spa, arg, &dn) == 0) {
+ zap_list(spa, &dn);
+ }
+ return -1;
+ } else {
+ arg--;
+
+ /*
+ * Report pool status if the comment is 'status'. Lets
+ * hope no-one wants to load /status as a kernel.
+ */
+ if (!strcmp(arg, "status")) {
+ spa_all_status();
+ return -1;
+ }
+
+ /*
+ * If there is a colon, switch pools.
+ */
+ q = (char *) strchr(arg, ':');
+ if (q) {
+ spa_t *newspa;
+
+ *q++ = 0;
+ newspa = spa_find_by_name(arg);
+ if (newspa) {
+ spa = newspa;
+ zfs_mount_pool(spa);
+ } else {
+ printf("\nCan't find ZFS pool %s\n", arg);
+ return -1;
+ }
+ arg = q;
+ }
+ if ((i = ep - arg)) {
+ if ((size_t)i >= sizeof(kname))
+ return -1;
+ memcpy(kname, arg, i + 1);
+ }
+ }
+ arg = p;
+ }
+ return 0;
+}
+
+static void
+printf(const char *fmt,...)
+{
+ va_list ap;
+ char buf[10];
+ char *s;
+ unsigned u;
+ int c;
+ int minus;
+ int prec;
+ int len;
+ int pad;
+
+ va_start(ap, fmt);
+ while ((c = *fmt++)) {
+ if (c == '%') {
+ minus = 0;
+ prec = 0;
+ nextfmt:
+ c = *fmt++;
+ switch (c) {
+ case '-':
+ minus = 1;
+ goto nextfmt;
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ prec = 10 * prec + (c - '0');
+ goto nextfmt;
+ case 'c':
+ putchar(va_arg(ap, int));
+ continue;
+ case 's':
+ s = va_arg(ap, char *);
+ if (prec) {
+ len = strlen(s);
+ if (len < prec)
+ pad = prec - len;
+ else
+ pad = 0;
+ if (minus)
+ while (pad--)
+ putchar(' ');
+ for (; *s; s++)
+ putchar(*s);
+ if (!minus)
+ while (pad--)
+ putchar(' ');
+ } else {
+ for (; *s; s++)
+ putchar(*s);
+ }
+ continue;
+ case 'u':
+ u = va_arg(ap, unsigned);
+ s = buf;
+ do
+ *s++ = '0' + u % 10U;
+ while (u /= 10U);
+ while (--s >= buf)
+ putchar(*s);
+ continue;
+ }
+ }
+ putchar(c);
+ }
+ va_end(ap);
+ return;
+}
+
+static void
+putchar(int c)
+{
+ if (c == '\n')
+ xputc('\r');
+ xputc(c);
+}
+
+static int
+drvread(struct dsk *dsk, void *buf, unsigned lba, unsigned nblk)
+{
+ static unsigned c = 0x2d5c7c2f;
+
+ lba += dsk->start;
+ if (!OPT_CHECK(RBX_QUIET))
+ printf("%c\b", c = c << 8 | c >> 24);
+ v86.ctl = V86_ADDR | V86_CALLF | V86_FLAGS;
+ v86.addr = XREADORG; /* call to xread in boot1 */
+ v86.es = VTOPSEG(buf);
+ v86.eax = lba;
+ v86.ebx = VTOPOFF(buf);
+ v86.ecx = lba >> 16;
+ v86.edx = nblk << 8 | dsk->drive;
+ v86int();
+ v86.ctl = V86_FLAGS;
+ if (V86_CY(v86.efl)) {
+ printf("error %u lba %u\n", v86.eax >> 8 & 0xff, lba);
+ return -1;
+ }
+ return 0;
+}
+
+static int
+keyhit(unsigned ticks)
+{
+ uint32_t t0, t1;
+
+ if (OPT_CHECK(RBX_NOINTR))
+ return 0;
+ t0 = 0;
+ for (;;) {
+ if (xgetc(1))
+ return 1;
+ t1 = *(uint32_t *)PTOV(0x46c);
+ if (!t0)
+ t0 = t1;
+ if (t1 < t0 || t1 >= t0 + ticks)
+ return 0;
+ }
+}
+
+static int
+xputc(int c)
+{
+ if (ioctrl & IO_KEYBOARD)
+ putc(c);
+ if (ioctrl & IO_SERIAL)
+ sio_putc(c);
+ return c;
+}
+
+static int
+xgetc(int fn)
+{
+ if (OPT_CHECK(RBX_NOINTR))
+ return 0;
+ for (;;) {
+ if (ioctrl & IO_KEYBOARD && getc(1))
+ return fn ? 1 : getc(0);
+ if (ioctrl & IO_SERIAL && sio_ischar())
+ return fn ? 1 : sio_getc();
+ if (fn)
+ return 0;
+ }
+}
+
+static int
+getc(int fn)
+{
+ /*
+ * The extra comparison against zero is an attempt to work around
+ * what appears to be a bug in QEMU and Bochs. Both emulators
+ * sometimes report a key-press with scancode one and ascii zero
+ * when no such key is pressed in reality. As far as I can tell,
+ * this only happens shortly after a reboot.
+ */
+ v86.addr = 0x16;
+ v86.eax = fn << 8;
+ v86int();
+ return fn == 0 ? v86.eax & 0xff : (!V86_ZR(v86.efl) && (v86.eax & 0xff));
+}
diff --git a/sys/boot/i386/zfsboot/zfsldr.S b/sys/boot/i386/zfsboot/zfsldr.S
new file mode 100644
index 0000000..a256d30
--- /dev/null
+++ b/sys/boot/i386/zfsboot/zfsldr.S
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 1998 Robert Nordier
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms are freely
+ * permitted provided that the above copyright notice and this
+ * paragraph and the following disclaimer are duplicated in all
+ * such forms.
+ *
+ * This software is provided "AS IS" and without any express or
+ * implied warranties, including, without limitation, the implied
+ * warranties of merchantability and fitness for a particular
+ * purpose.
+ *
+ * $FreeBSD$
+ */
+
+/* Memory Locations */
+ .set MEM_REL,0x700 # Relocation address
+ .set MEM_ARG,0x900 # Arguments
+ .set MEM_ORG,0x7c00 # Origin
+ .set MEM_BUF,0x8000 # Load area
+ .set MEM_BTX,0x9000 # BTX start
+ .set MEM_JMP,0x9010 # BTX entry point
+ .set MEM_USR,0xa000 # Client start
+ .set BDA_BOOT,0x472 # Boot howto flag
+
+/* Partition Constants */
+ .set PRT_OFF,0x1be # Partition offset
+ .set PRT_NUM,0x4 # Partitions
+ .set PRT_BSD,0xa5 # Partition type
+
+/* Flag Bits */
+ .set FL_PACKET,0x80 # Packet mode
+
+/* Misc. Constants */
+ .set SIZ_PAG,0x1000 # Page size
+ .set SIZ_SEC,0x200 # Sector size
+
+ .set NSECT,0x40
+ .globl start
+ .globl xread
+ .code16
+
+start: jmp main # Start recognizably
+
+/*
+ * This is the start of a standard BIOS Parameter Block (BPB). Most bootable
+ * FAT disks have this at the start of their MBR. While normal BIOS's will
+ * work fine without this section, IBM's El Torito emulation "fixes" up the
+ * BPB by writing into the memory copy of the MBR. Rather than have data
+ * written into our xread routine, we'll define a BPB to work around it.
+ * The data marked with (T) indicates a field required for a ThinkPad to
+ * recognize the disk and (W) indicates fields written from IBM BIOS code.
+ * The use of the BPB is based on what OpenBSD and NetBSD implemented in
+ * their boot code but the required fields were determined by trial and error.
+ *
+ * Note: If additional space is needed in boot1, one solution would be to
+ * move the "prompt" message data (below) to replace the OEM ID.
+ */
+ .org 0x03, 0x00
+oemid: .space 0x08, 0x00 # OEM ID
+
+ .org 0x0b, 0x00
+bpb: .word 512 # sector size (T)
+ .byte 0 # sectors/clustor
+ .word 0 # reserved sectors
+ .byte 0 # number of FATs
+ .word 0 # root entries
+ .word 0 # small sectors
+ .byte 0 # media type (W)
+ .word 0 # sectors/fat
+ .word 18 # sectors per track (T)
+ .word 2 # number of heads (T)
+ .long 0 # hidden sectors (W)
+ .long 0 # large sectors
+
+ .org 0x24, 0x00
+ebpb: .byte 0 # BIOS physical drive number (W)
+
+ .org 0x25,0x90
+/*
+ * Trampoline used by boot2 to call read to read data from the disk via
+ * the BIOS. Call with:
+ *
+ * %cx:%ax - long - LBA to read in
+ * %es:(%bx) - caddr_t - buffer to read data into
+ * %dl - byte - drive to read from
+ * %dh - byte - num sectors to read
+ */
+
+xread: push %ss # Address
+ pop %ds # data
+/*
+ * Setup an EDD disk packet and pass it to read
+ */
+xread.1: # Starting
+ pushl $0x0 # absolute
+ push %cx # block
+ push %ax # number
+ push %es # Address of
+ push %bx # transfer buffer
+ xor %ax,%ax # Number of
+ movb %dh,%al # blocks to
+ push %ax # transfer
+ push $0x10 # Size of packet
+ mov %sp,%bp # Packet pointer
+ callw read # Read from disk
+ lea 0x10(%bp),%sp # Clear stack
+ lret # To far caller
+/*
+ * Load the rest of boot2 and BTX up, copy the parts to the right locations,
+ * and start it all up.
+ */
+
+/*
+ * Setup the segment registers to flat addressing (segment 0) and setup the
+ * stack to end just below the start of our code.
+ */
+main: cld # String ops inc
+ xor %cx,%cx # Zero
+ mov %cx,%es # Address
+ mov %cx,%ds # data
+ mov %cx,%ss # Set up
+ mov $start,%sp # stack
+/*
+ * Relocate ourself to MEM_REL. Since %cx == 0, the inc %ch sets
+ * %cx == 0x100.
+ */
+ mov %sp,%si # Source
+ mov $MEM_REL,%di # Destination
+ incb %ch # Word count
+ rep # Copy
+ movsw # code
+/*
+ * If we are on a hard drive, then load the MBR and look for the first
+ * FreeBSD slice. We use the fake partition entry below that points to
+ * the MBR when we call nread. The first pass looks for the first active
+ * FreeBSD slice. The second pass looks for the first non-active FreeBSD
+ * slice if the first one fails.
+ */
+ mov $part4,%si # Partition
+ cmpb $0x80,%dl # Hard drive?
+ jb main.4 # No
+ movb $0x1,%dh # Block count
+ callw nread # Read MBR
+ mov $0x1,%cx # Two passes
+main.1: mov $MEM_BUF+PRT_OFF,%si # Partition table
+ movb $0x1,%dh # Partition
+main.2: cmpb $PRT_BSD,0x4(%si) # Our partition type?
+ jne main.3 # No
+ jcxz main.5 # If second pass
+ testb $0x80,(%si) # Active?
+ jnz main.5 # Yes
+main.3: add $0x10,%si # Next entry
+ incb %dh # Partition
+ cmpb $0x1+PRT_NUM,%dh # In table?
+ jb main.2 # Yes
+ dec %cx # Do two
+ jcxz main.1 # passes
+/*
+ * If we get here, we didn't find any FreeBSD slices at all, so print an
+ * error message and die.
+ */
+ mov $msg_part,%si # Message
+ jmp error # Error
+/*
+ * Floppies use partition 0 of drive 0.
+ */
+main.4: xor %dx,%dx # Partition:drive
+
+/*
+ * Ok, we have a slice and drive in %dx now, so use that to locate and
+ * load boot2. %si references the start of the slice we are looking
+ * for, so go ahead and load up the 64 sectors starting at sector 1024
+ * (i.e. after the two vdev labels). We don't have do anything fancy
+ * here to allow for an extra copy of boot1 and a partition table
+ * (compare to this section of the UFS bootstrap) so we just load it
+ * all at 0x8000. The first part of boot2 is BTX, which wants to run
+ * at 0x9000. The boot2.bin binary starts right after the end of BTX,
+ * so we have to figure out where the start of it is and then move the
+ * binary to 0xc000. After we have moved the client, we relocate BTX
+ * itself to 0x9000 - doing it in this order means that none of the
+ * memcpy regions overlap which would corrupt the copy. Normally, BTX
+ * clients start at MEM_USR, or 0xa000, but when we use btxld to
+ * create boot2, we use an entry point of 0x2000. That entry point is
+ * relative to MEM_USR; thus boot2.bin starts at 0xc000.
+ *
+ * The load area and the target area for the client overlap so we have
+ * to use a decrementing string move. We also play segment register
+ * games with the destination address for the move so that the client
+ * can be larger than 16k (which would overflow the zero segment since
+ * the client starts at 0xc000). Relocating BTX is easy since the load
+ * area and target area do not overlap.
+ */
+main.5: mov %dx,MEM_ARG # Save args
+ movb $NSECT,%dh # Sector count
+ movw $1024,%ax # Offset to boot2
+ callw nread.1 # Read disk
+main.6: mov $MEM_BUF,%si # BTX (before reloc)
+ mov 0xa(%si),%bx # Get BTX length and set
+ mov $NSECT*SIZ_SEC-1,%di # Size of load area (less one)
+ mov %di,%si # End of load
+ add $MEM_BUF,%si # area
+ sub %bx,%di # End of client, 0xc000 rel
+ mov %di,%cx # Size of
+ inc %cx # client
+ mov $(MEM_USR+2*SIZ_PAG)>>4,%dx # Segment
+ mov %dx,%es # addressing 0xc000
+ std # Move with decrement
+ rep # Relocate
+ movsb # client
+ mov %ds,%dx # Back to
+ mov %dx,%es # zero segment
+ mov $MEM_BUF,%si # BTX (before reloc)
+ mov $MEM_BTX,%di # BTX
+ mov %bx,%cx # Get BTX length
+ cld # Increment this time
+ rep # Relocate
+ movsb # BTX
+
+/*
+ * Enable A20 so we can access memory above 1 meg.
+ * Use the zero-valued %cx as a timeout for embedded hardware which do not
+ * have a keyboard controller.
+ */
+seta20: cli # Disable interrupts
+seta20.1: dec %cx # Timeout?
+ jz seta20.3 # Yes
+ inb $0x64,%al # Get status
+ testb $0x2,%al # Busy?
+ jnz seta20.1 # Yes
+ movb $0xd1,%al # Command: Write
+ outb %al,$0x64 # output port
+seta20.2: inb $0x64,%al # Get status
+ testb $0x2,%al # Busy?
+ jnz seta20.2 # Yes
+ movb $0xdf,%al # Enable
+ outb %al,$0x60 # A20
+seta20.3: sti # Enable interrupts
+
+ jmp start+MEM_JMP-MEM_ORG # Start BTX
+
+
+/*
+ * Trampoline used to call read from within boot1.
+ */
+nread: xor %ax,%ax # Sector offset in partition
+nread.1: mov $MEM_BUF,%bx # Transfer buffer
+ add 0x8(%si),%ax # Get
+ mov 0xa(%si),%cx # LBA
+ push %cs # Read from
+ callw xread.1 # disk
+ jnc return # If success, return
+ mov $msg_read,%si # Otherwise, set the error
+ # message and fall through to
+ # the error routine
+/*
+ * Print out the error message pointed to by %ds:(%si) followed
+ * by a prompt, wait for a keypress, and then reboot the machine.
+ */
+error: callw putstr # Display message
+ mov $prompt,%si # Display
+ callw putstr # prompt
+ xorb %ah,%ah # BIOS: Get
+ int $0x16 # keypress
+ movw $0x1234, BDA_BOOT # Do a warm boot
+ ljmp $0xffff,$0x0 # reboot the machine
+/*
+ * Display a null-terminated string using the BIOS output.
+ */
+putstr.0: mov $0x7,%bx # Page:attribute
+ movb $0xe,%ah # BIOS: Display
+ int $0x10 # character
+putstr: lodsb # Get char
+ testb %al,%al # End of string?
+ jne putstr.0 # No
+
+/*
+ * Overused return code. ereturn is used to return an error from the
+ * read function. Since we assume putstr succeeds, we (ab)use the
+ * same code when we return from putstr.
+ */
+ereturn: movb $0x1,%ah # Invalid
+ stc # argument
+return: retw # To caller
+/*
+ * Reads sectors from the disk. If EDD is enabled, then check if it is
+ * installed and use it if it is. If it is not installed or not enabled, then
+ * fall back to using CHS. Since we use a LBA, if we are using CHS, we have to
+ * fetch the drive parameters from the BIOS and divide it out ourselves.
+ * Call with:
+ *
+ * %dl - byte - drive number
+ * stack - 10 bytes - EDD Packet
+ */
+read: testb $FL_PACKET,%cs:MEM_REL+flags-start # LBA support enabled?
+ jz read.1 # No, use CHS
+ cmpb $0x80,%dl # Hard drive?
+ jb read.1 # No, use CHS
+ mov $0x55aa,%bx # Magic
+ push %dx # Save
+ movb $0x41,%ah # BIOS: Check
+ int $0x13 # extensions present
+ pop %dx # Restore
+ jc read.1 # If error, use CHS
+ cmp $0xaa55,%bx # Magic?
+ jne read.1 # No, so use CHS
+ testb $0x1,%cl # Packet interface?
+ jz read.1 # No, so use CHS
+ mov %bp,%si # Disk packet
+ movb $0x42,%ah # BIOS: Extended
+ int $0x13 # read
+ retw # To caller
+#if 0
+read.1: push %dx # Save
+ movb $0x8,%ah # BIOS: Get drive
+ int $0x13 # parameters
+ movb %dh,%ch # Max head number
+ pop %dx # Restore
+ jc return # If error
+ andb $0x3f,%cl # Sectors per track
+ jz ereturn # If zero
+ cli # Disable interrupts
+ mov 0x8(%bp),%eax # Get LBA
+ push %dx # Save
+ movzbl %cl,%ebx # Divide by
+ xor %edx,%edx # sectors
+ div %ebx # per track
+ movb %ch,%bl # Max head number
+ movb %dl,%ch # Sector number
+ inc %bx # Divide by
+ xorb %dl,%dl # number
+ div %ebx # of heads
+ movb %dl,%bh # Head number
+ pop %dx # Restore
+ cmpl $0x3ff,%eax # Cylinder number supportable?
+ sti # Enable interrupts
+ ja ereturn # No, return an error
+ xchgb %al,%ah # Set up cylinder
+ rorb $0x2,%al # number
+ orb %ch,%al # Merge
+ inc %ax # sector
+ xchg %ax,%cx # number
+ movb %bh,%dh # Head number
+ subb %ah,%al # Sectors this track
+ mov 0x2(%bp),%ah # Blocks to read
+ cmpb %ah,%al # To read
+ jb read.2 # this
+#ifdef TRACK_AT_A_TIME
+ movb %ah,%al # track
+#else
+ movb $1,%al # one sector
+#endif
+read.2: mov $0x5,%di # Try count
+read.3: les 0x4(%bp),%bx # Transfer buffer
+ push %ax # Save
+ movb $0x2,%ah # BIOS: Read
+ int $0x13 # from disk
+ pop %bx # Restore
+ jnc read.4 # If success
+ dec %di # Retry?
+ jz read.6 # No
+ xorb %ah,%ah # BIOS: Reset
+ int $0x13 # disk system
+ xchg %bx,%ax # Block count
+ jmp read.3 # Continue
+read.4: movzbw %bl,%ax # Sectors read
+ add %ax,0x8(%bp) # Adjust
+ jnc read.5 # LBA,
+ incw 0xa(%bp) # transfer
+read.5: shlb %bl # buffer
+ add %bl,0x5(%bp) # pointer,
+ sub %al,0x2(%bp) # block count
+ ja read.1 # If not done
+read.6: retw # To caller
+#else
+read.1: mov $msg_chs,%si
+ jmp error
+msg_chs: .asciz "CHS not supported"
+#endif
+
+/* Messages */
+
+msg_read: .asciz "Read"
+msg_part: .asciz "Boot"
+
+prompt: .asciz " error\r\n"
+
+flags: .byte FLAGS # Flags
+
+ .org PRT_OFF,0x90
+
+/* Partition table */
+
+ .fill 0x30,0x1,0x0
+part4: .byte 0x80, 0x00, 0x01, 0x00
+ .byte 0xa5, 0xfe, 0xff, 0xff
+ .byte 0x00, 0x00, 0x00, 0x00
+ .byte 0x50, 0xc3, 0x00, 0x00 # 50000 sectors long, bleh
+
+ .word 0xaa55 # Magic number
diff --git a/sys/boot/zfs/Makefile b/sys/boot/zfs/Makefile
new file mode 100644
index 0000000..723233c
--- /dev/null
+++ b/sys/boot/zfs/Makefile
@@ -0,0 +1,29 @@
+# $FreeBSD$
+
+LIB= zfsboot
+INTERNALLIB=
+
+SRCS+= zfs.c
+
+CFLAGS+= -I${.CURDIR}/../common -I${.CURDIR}/../.. -I.
+CFLAGS+= -I${.CURDIR}/../../../lib/libstand
+CFLAGS+= -I${.CURDIR}/../../cddl/boot/zfs
+
+# XXX need arch-specific bootstrap CFLAGS here
+#
+CFLAGS+= -ffreestanding -mpreferred-stack-boundary=2 \
+ -mno-mmx -mno-3dnow -mno-sse -mno-sse2 -mno-sse3
+
+CFLAGS+= -Wformat -Wall
+
+.if ${MACHINE_ARCH} == "amd64"
+CLEANFILES+= machine
+machine:
+ ln -sf ${.CURDIR}/../../../i386/include machine
+.endif
+
+.include <bsd.lib.mk>
+
+.if ${MACHINE_ARCH} == "amd64"
+beforedepend ${OBJS}: machine
+.endif
diff --git a/sys/boot/zfs/zfs.c b/sys/boot/zfs/zfs.c
new file mode 100644
index 0000000..cf0bb9c
--- /dev/null
+++ b/sys/boot/zfs/zfs.c
@@ -0,0 +1,514 @@
+/*-
+ * Copyright (c) 2007 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * Stand-alone file reading package.
+ */
+
+#include <sys/param.h>
+#include <sys/disklabel.h>
+#include <sys/time.h>
+#include <sys/queue.h>
+#include <stddef.h>
+#include <stdarg.h>
+#include <string.h>
+#include <stand.h>
+#include <bootstrap.h>
+
+#include "zfsimpl.c"
+
+static int zfs_open(const char *path, struct open_file *f);
+static int zfs_write(struct open_file *f, void *buf, size_t size, size_t *resid);
+static int zfs_close(struct open_file *f);
+static int zfs_read(struct open_file *f, void *buf, size_t size, size_t *resid);
+static off_t zfs_seek(struct open_file *f, off_t offset, int where);
+static int zfs_stat(struct open_file *f, struct stat *sb);
+static int zfs_readdir(struct open_file *f, struct dirent *d);
+
+struct devsw zfs_dev;
+
+struct fs_ops zfs_fsops = {
+ "zfs",
+ zfs_open,
+ zfs_close,
+ zfs_read,
+ zfs_write,
+ zfs_seek,
+ zfs_stat,
+ zfs_readdir
+};
+
+/*
+ * In-core open file.
+ */
+struct file {
+ off_t f_seekp; /* seek pointer */
+ dnode_phys_t f_dnode;
+ uint64_t f_zap_type; /* zap type for readdir */
+ uint64_t f_num_leafs; /* number of fzap leaf blocks */
+ zap_leaf_phys_t *f_zap_leaf; /* zap leaf buffer */
+};
+
+/*
+ * Open a file.
+ */
+static int
+zfs_open(const char *upath, struct open_file *f)
+{
+ spa_t *spa = (spa_t *) f->f_devdata;
+ struct file *fp;
+ int rc;
+
+ if (f->f_dev != &zfs_dev)
+ return (EINVAL);
+
+ rc = zfs_mount_pool(spa);
+ if (rc)
+ return (rc);
+
+ /* allocate file system specific data structure */
+ fp = malloc(sizeof(struct file));
+ bzero(fp, sizeof(struct file));
+ f->f_fsdata = (void *)fp;
+
+ if (spa->spa_root_objset.os_type != DMU_OST_ZFS) {
+ printf("Unexpected object set type %lld\n",
+ spa->spa_root_objset.os_type);
+ rc = EIO;
+ goto out;
+ }
+
+ rc = zfs_lookup(spa, upath, &fp->f_dnode);
+ if (rc)
+ goto out;
+
+ fp->f_seekp = 0;
+out:
+ if (rc) {
+ f->f_fsdata = NULL;
+ free(fp);
+ }
+ return (rc);
+}
+
+static int
+zfs_close(struct open_file *f)
+{
+ struct file *fp = (struct file *)f->f_fsdata;
+
+ dnode_cache_obj = 0;
+ f->f_fsdata = (void *)0;
+ if (fp == (struct file *)0)
+ return (0);
+
+ free(fp);
+ return (0);
+}
+
+/*
+ * Copy a portion of a file into kernel memory.
+ * Cross block boundaries when necessary.
+ */
+static int
+zfs_read(struct open_file *f, void *start, size_t size, size_t *resid /* out */)
+{
+ spa_t *spa = (spa_t *) f->f_devdata;
+ struct file *fp = (struct file *)f->f_fsdata;
+ const znode_phys_t *zp = (const znode_phys_t *) fp->f_dnode.dn_bonus;
+ size_t n;
+ int rc;
+
+ n = size;
+ if (fp->f_seekp + n > zp->zp_size)
+ n = zp->zp_size - fp->f_seekp;
+
+ rc = dnode_read(spa, &fp->f_dnode, fp->f_seekp, start, n);
+ if (rc)
+ return (rc);
+
+ if (0) {
+ int i;
+ for (i = 0; i < n; i++)
+ putchar(((char*) start)[i]);
+ }
+ fp->f_seekp += n;
+ if (resid)
+ *resid = size - n;
+
+ return (0);
+}
+
+/*
+ * Don't be silly - the bootstrap has no business writing anything.
+ */
+static int
+zfs_write(struct open_file *f, void *start, size_t size, size_t *resid /* out */)
+{
+
+ return (EROFS);
+}
+
+static off_t
+zfs_seek(struct open_file *f, off_t offset, int where)
+{
+ struct file *fp = (struct file *)f->f_fsdata;
+ znode_phys_t *zp = (znode_phys_t *) fp->f_dnode.dn_bonus;
+
+ switch (where) {
+ case SEEK_SET:
+ fp->f_seekp = offset;
+ break;
+ case SEEK_CUR:
+ fp->f_seekp += offset;
+ break;
+ case SEEK_END:
+ fp->f_seekp = zp->zp_size - offset;
+ break;
+ default:
+ errno = EINVAL;
+ return (-1);
+ }
+ return (fp->f_seekp);
+}
+
+static int
+zfs_stat(struct open_file *f, struct stat *sb)
+{
+ struct file *fp = (struct file *)f->f_fsdata;
+ znode_phys_t *zp = (znode_phys_t *) fp->f_dnode.dn_bonus;
+
+ /* only important stuff */
+ sb->st_mode = zp->zp_mode;
+ sb->st_uid = zp->zp_uid;
+ sb->st_gid = zp->zp_gid;
+ sb->st_size = zp->zp_size;
+
+ return (0);
+}
+
+static int
+zfs_readdir(struct open_file *f, struct dirent *d)
+{
+ spa_t *spa = (spa_t *) f->f_devdata;
+ struct file *fp = (struct file *)f->f_fsdata;
+ znode_phys_t *zp = (znode_phys_t *) fp->f_dnode.dn_bonus;
+ mzap_ent_phys_t mze;
+ size_t bsize = fp->f_dnode.dn_datablkszsec << SPA_MINBLOCKSHIFT;
+ int rc;
+
+ if ((zp->zp_mode >> 12) != 0x4) {
+ return (ENOTDIR);
+ }
+
+ /*
+ * If this is the first read, get the zap type.
+ */
+ if (fp->f_seekp == 0) {
+ rc = dnode_read(spa, &fp->f_dnode,
+ 0, &fp->f_zap_type, sizeof(fp->f_zap_type));
+ if (rc)
+ return (rc);
+
+ if (fp->f_zap_type == ZBT_MICRO) {
+ fp->f_seekp = offsetof(mzap_phys_t, mz_chunk);
+ } else {
+ rc = dnode_read(spa, &fp->f_dnode,
+ offsetof(zap_phys_t, zap_num_leafs),
+ &fp->f_num_leafs,
+ sizeof(fp->f_num_leafs));
+ if (rc)
+ return (rc);
+
+ fp->f_seekp = bsize;
+ fp->f_zap_leaf = (zap_leaf_phys_t *)malloc(bsize);
+ rc = dnode_read(spa, &fp->f_dnode,
+ fp->f_seekp,
+ fp->f_zap_leaf,
+ bsize);
+ if (rc)
+ return (rc);
+ }
+ }
+
+ if (fp->f_zap_type == ZBT_MICRO) {
+ mzap_next:
+ if (fp->f_seekp >= bsize)
+ return (ENOENT);
+
+ rc = dnode_read(spa, &fp->f_dnode,
+ fp->f_seekp, &mze, sizeof(mze));
+ fp->f_seekp += sizeof(mze);
+
+ if (!mze.mze_name[0])
+ goto mzap_next;
+
+ d->d_fileno = ZFS_DIRENT_OBJ(mze.mze_value);
+ d->d_type = ZFS_DIRENT_TYPE(mze.mze_value);
+ strcpy(d->d_name, mze.mze_name);
+ d->d_namlen = strlen(d->d_name);
+ return (0);
+ } else {
+ zap_leaf_t zl;
+ zap_leaf_chunk_t *zc, *nc;
+ int chunk;
+ size_t namelen;
+ char *p;
+ uint64_t value;
+
+ /*
+ * Initialise this so we can use the ZAP size
+ * calculating macros.
+ */
+ zl.l_bs = ilog2(bsize);
+ zl.l_phys = fp->f_zap_leaf;
+
+ /*
+ * Figure out which chunk we are currently looking at
+ * and consider seeking to the next leaf. We use the
+ * low bits of f_seekp as a simple chunk index.
+ */
+ fzap_next:
+ chunk = fp->f_seekp & (bsize - 1);
+ if (chunk == ZAP_LEAF_NUMCHUNKS(&zl)) {
+ fp->f_seekp = (fp->f_seekp & ~(bsize - 1)) + bsize;
+ chunk = 0;
+
+ /*
+ * Check for EOF and read the new leaf.
+ */
+ if (fp->f_seekp >= bsize * fp->f_num_leafs)
+ return (ENOENT);
+
+ rc = dnode_read(spa, &fp->f_dnode,
+ fp->f_seekp,
+ fp->f_zap_leaf,
+ bsize);
+ if (rc)
+ return (rc);
+ }
+
+ zc = &ZAP_LEAF_CHUNK(&zl, chunk);
+ fp->f_seekp++;
+ if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
+ goto fzap_next;
+
+ namelen = zc->l_entry.le_name_length;
+ if (namelen > sizeof(d->d_name))
+ namelen = sizeof(d->d_name);
+
+ /*
+ * Paste the name back together.
+ */
+ nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
+ p = d->d_name;
+ while (namelen > 0) {
+ int len;
+ len = namelen;
+ if (len > ZAP_LEAF_ARRAY_BYTES)
+ len = ZAP_LEAF_ARRAY_BYTES;
+ memcpy(p, nc->l_array.la_array, len);
+ p += len;
+ namelen -= len;
+ nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
+ }
+ d->d_name[sizeof(d->d_name) - 1] = 0;
+
+ /*
+ * Assume the first eight bytes of the value are
+ * a uint64_t.
+ */
+ value = fzap_leaf_value(&zl, zc);
+
+ d->d_fileno = ZFS_DIRENT_OBJ(value);
+ d->d_type = ZFS_DIRENT_TYPE(value);
+ d->d_namlen = strlen(d->d_name);
+
+ return (0);
+ }
+}
+
+static int
+vdev_read(vdev_t *vdev, void *priv, off_t offset, void *buf, size_t size)
+{
+ int fd;
+
+ fd = (uintptr_t) priv;
+ lseek(fd, offset, SEEK_SET);
+ if (read(fd, buf, size) == size) {
+ return 0;
+ } else {
+ return (EIO);
+ }
+}
+
+/*
+ * Convert a pool guid to a 'unit number' suitable for use with zfs_dev_open.
+ */
+int
+zfs_guid_to_unit(uint64_t guid)
+{
+ spa_t *spa;
+ int unit;
+
+ unit = 0;
+ STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
+ if (spa->spa_guid == guid)
+ return unit;
+ unit++;
+ }
+ return (-1);
+}
+
+static int
+zfs_dev_init(void)
+{
+ char devname[512];
+ int unit, slice;
+ int fd;
+
+ /*
+ * Open all the disks we can find and see if we can reconstruct
+ * ZFS pools from them. Bogusly assumes that the disks are named
+ * diskN or diskNsM.
+ */
+ zfs_init();
+ for (unit = 0; unit < 32 /* XXX */; unit++) {
+ sprintf(devname, "disk%d:", unit);
+ fd = open(devname, O_RDONLY);
+ if (fd == -1)
+ continue;
+
+ /*
+ * If we find a vdev, the zfs code will eat the fd, otherwise
+ * we close it.
+ */
+ if (vdev_probe(vdev_read, (void*) (uintptr_t) fd, 0))
+ close(fd);
+
+ for (slice = 1; slice <= 4; slice++) {
+ sprintf(devname, "disk%ds%d:", unit, slice);
+ fd = open(devname, O_RDONLY);
+ if (fd == -1)
+ continue;
+ if (vdev_probe(vdev_read, (void*) (uintptr_t) fd, 0))
+ close(fd);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Print information about ZFS pools
+ */
+static void
+zfs_dev_print(int verbose)
+{
+ spa_t *spa;
+ char line[80];
+ int unit;
+
+ if (verbose) {
+ spa_all_status();
+ return;
+ }
+ unit = 0;
+ STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
+ sprintf(line, " zfs%d: %s\n", unit, spa->spa_name);
+ pager_output(line);
+ unit++;
+ }
+}
+
+/*
+ * Attempt to open the pool described by (dev) for use by (f).
+ */
+static int
+zfs_dev_open(struct open_file *f, ...)
+{
+ va_list args;
+ struct devdesc *dev;
+ int unit, i;
+ spa_t *spa;
+
+ va_start(args, f);
+ dev = va_arg(args, struct devdesc*);
+ va_end(args);
+
+ /*
+ * We mostly ignore the stuff that devopen sends us. For now,
+ * use the unit to find a pool - later we will override the
+ * devname parsing so that we can name a pool and a fs within
+ * the pool.
+ */
+ unit = dev->d_unit;
+ free(dev);
+
+ i = 0;
+ STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
+ if (i == unit)
+ break;
+ i++;
+ }
+ if (!spa) {
+ return (ENXIO);
+ }
+
+ f->f_devdata = spa;
+ return (0);
+}
+
+static int
+zfs_dev_close(struct open_file *f)
+{
+
+ f->f_devdata = NULL;
+ return (0);
+}
+
+static int
+zfs_dev_strategy(void *devdata, int rw, daddr_t dblk, size_t size, char *buf, size_t *rsize)
+{
+
+ return (ENOSYS);
+}
+
+struct devsw zfs_dev = {
+ .dv_name = "zfs",
+ .dv_type = DEVT_ZFS,
+ .dv_init = zfs_dev_init,
+ .dv_strategy = zfs_dev_strategy,
+ .dv_open = zfs_dev_open,
+ .dv_close = zfs_dev_close,
+ .dv_ioctl = noioctl,
+ .dv_print = zfs_dev_print,
+ .dv_cleanup = NULL
+};
diff --git a/sys/boot/zfs/zfsimpl.c b/sys/boot/zfs/zfsimpl.c
new file mode 100644
index 0000000..5bbc351
--- /dev/null
+++ b/sys/boot/zfs/zfsimpl.c
@@ -0,0 +1,1443 @@
+/*-
+ * Copyright (c) 2007 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * Stand-alone ZFS file reader.
+ */
+
+#include "zfsimpl.h"
+#include "zfssubr.c"
+
+/*
+ * List of all vdevs, chained through v_alllink.
+ */
+static vdev_list_t zfs_vdevs;
+
+/*
+ * List of all pools, chained through spa_link.
+ */
+static spa_list_t zfs_pools;
+
+static uint64_t zfs_crc64_table[256];
+static char *zfs_decomp_buf;
+static const dnode_phys_t *dnode_cache_obj = 0;
+static uint64_t dnode_cache_bn;
+static char *dnode_cache_buf;
+static char *zap_scratch;
+
+/*
+ * Forward declarations.
+ */
+static int zio_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf, off_t offset);
+
+static void
+zfs_init(void)
+{
+ STAILQ_INIT(&zfs_vdevs);
+ STAILQ_INIT(&zfs_pools);
+
+ zfs_decomp_buf = malloc(128*1024);
+ dnode_cache_buf = malloc(128*1024);
+ zap_scratch = malloc(128*1024);
+
+ zfs_init_crc();
+}
+
+static int
+xdr_int(const unsigned char **xdr, int *ip)
+{
+ *ip = ((*xdr)[0] << 24)
+ | ((*xdr)[1] << 16)
+ | ((*xdr)[2] << 8)
+ | ((*xdr)[3] << 0);
+ (*xdr) += 4;
+ return (0);
+}
+
+static int
+xdr_u_int(const unsigned char **xdr, u_int *ip)
+{
+ *ip = ((*xdr)[0] << 24)
+ | ((*xdr)[1] << 16)
+ | ((*xdr)[2] << 8)
+ | ((*xdr)[3] << 0);
+ (*xdr) += 4;
+ return (0);
+}
+
+static int
+xdr_uint64_t(const unsigned char **xdr, uint64_t *lp)
+{
+ u_int hi, lo;
+
+ xdr_u_int(xdr, &hi);
+ xdr_u_int(xdr, &lo);
+ *lp = (((uint64_t) hi) << 32) | lo;
+ return (0);
+}
+
+static int
+nvlist_find(const unsigned char *nvlist, const char *name, int type,
+ int* elementsp, void *valuep)
+{
+ const unsigned char *p, *pair;
+ int junk;
+ int encoded_size, decoded_size;
+
+ p = nvlist;
+ xdr_int(&p, &junk);
+ xdr_int(&p, &junk);
+
+ pair = p;
+ xdr_int(&p, &encoded_size);
+ xdr_int(&p, &decoded_size);
+ while (encoded_size && decoded_size) {
+ int namelen, pairtype, elements;
+ const char *pairname;
+
+ xdr_int(&p, &namelen);
+ pairname = (const char*) p;
+ p += roundup(namelen, 4);
+ xdr_int(&p, &pairtype);
+
+ if (!memcmp(name, pairname, namelen) && type == pairtype) {
+ xdr_int(&p, &elements);
+ if (elementsp)
+ *elementsp = elements;
+ if (type == DATA_TYPE_UINT64) {
+ xdr_uint64_t(&p, (uint64_t *) valuep);
+ return (0);
+ } else if (type == DATA_TYPE_STRING) {
+ int len;
+ xdr_int(&p, &len);
+ (*(const char**) valuep) = (const char*) p;
+ return (0);
+ } else if (type == DATA_TYPE_NVLIST
+ || type == DATA_TYPE_NVLIST_ARRAY) {
+ (*(const unsigned char**) valuep) =
+ (const unsigned char*) p;
+ return (0);
+ } else {
+ return (EIO);
+ }
+ } else {
+ /*
+ * Not the pair we are looking for, skip to the next one.
+ */
+ p = pair + encoded_size;
+ }
+
+ pair = p;
+ xdr_int(&p, &encoded_size);
+ xdr_int(&p, &decoded_size);
+ }
+
+ return (EIO);
+}
+
+/*
+ * Return the next nvlist in an nvlist array.
+ */
+static const unsigned char *
+nvlist_next(const unsigned char *nvlist)
+{
+ const unsigned char *p, *pair;
+ int junk;
+ int encoded_size, decoded_size;
+
+ p = nvlist;
+ xdr_int(&p, &junk);
+ xdr_int(&p, &junk);
+
+ pair = p;
+ xdr_int(&p, &encoded_size);
+ xdr_int(&p, &decoded_size);
+ while (encoded_size && decoded_size) {
+ p = pair + encoded_size;
+
+ pair = p;
+ xdr_int(&p, &encoded_size);
+ xdr_int(&p, &decoded_size);
+ }
+
+ return p;
+}
+
+#ifdef TEST
+
+static const unsigned char *
+nvlist_print(const unsigned char *nvlist, unsigned int indent)
+{
+ static const char* typenames[] = {
+ "DATA_TYPE_UNKNOWN",
+ "DATA_TYPE_BOOLEAN",
+ "DATA_TYPE_BYTE",
+ "DATA_TYPE_INT16",
+ "DATA_TYPE_UINT16",
+ "DATA_TYPE_INT32",
+ "DATA_TYPE_UINT32",
+ "DATA_TYPE_INT64",
+ "DATA_TYPE_UINT64",
+ "DATA_TYPE_STRING",
+ "DATA_TYPE_BYTE_ARRAY",
+ "DATA_TYPE_INT16_ARRAY",
+ "DATA_TYPE_UINT16_ARRAY",
+ "DATA_TYPE_INT32_ARRAY",
+ "DATA_TYPE_UINT32_ARRAY",
+ "DATA_TYPE_INT64_ARRAY",
+ "DATA_TYPE_UINT64_ARRAY",
+ "DATA_TYPE_STRING_ARRAY",
+ "DATA_TYPE_HRTIME",
+ "DATA_TYPE_NVLIST",
+ "DATA_TYPE_NVLIST_ARRAY",
+ "DATA_TYPE_BOOLEAN_VALUE",
+ "DATA_TYPE_INT8",
+ "DATA_TYPE_UINT8",
+ "DATA_TYPE_BOOLEAN_ARRAY",
+ "DATA_TYPE_INT8_ARRAY",
+ "DATA_TYPE_UINT8_ARRAY"
+ };
+
+ unsigned int i, j;
+ const unsigned char *p, *pair;
+ int junk;
+ int encoded_size, decoded_size;
+
+ p = nvlist;
+ xdr_int(&p, &junk);
+ xdr_int(&p, &junk);
+
+ pair = p;
+ xdr_int(&p, &encoded_size);
+ xdr_int(&p, &decoded_size);
+ while (encoded_size && decoded_size) {
+ int namelen, pairtype, elements;
+ const char *pairname;
+
+ xdr_int(&p, &namelen);
+ pairname = (const char*) p;
+ p += roundup(namelen, 4);
+ xdr_int(&p, &pairtype);
+
+ for (i = 0; i < indent; i++)
+ printf(" ");
+ printf("%s %s", typenames[pairtype], pairname);
+
+ xdr_int(&p, &elements);
+ switch (pairtype) {
+ case DATA_TYPE_UINT64: {
+ uint64_t val;
+ xdr_uint64_t(&p, &val);
+ printf(" = 0x%llx\n", val);
+ break;
+ }
+
+ case DATA_TYPE_STRING: {
+ int len;
+ xdr_int(&p, &len);
+ printf(" = \"%s\"\n", p);
+ break;
+ }
+
+ case DATA_TYPE_NVLIST:
+ printf("\n");
+ nvlist_print(p, indent + 1);
+ break;
+
+ case DATA_TYPE_NVLIST_ARRAY:
+ for (j = 0; j < elements; j++) {
+ printf("[%d]\n", j);
+ p = nvlist_print(p, indent + 1);
+ if (j != elements - 1) {
+ for (i = 0; i < indent; i++)
+ printf(" ");
+ printf("%s %s", typenames[pairtype], pairname);
+ }
+ }
+ break;
+
+ default:
+ printf("\n");
+ }
+
+ p = pair + encoded_size;
+
+ pair = p;
+ xdr_int(&p, &encoded_size);
+ xdr_int(&p, &decoded_size);
+ }
+
+ return p;
+}
+
+#endif
+
+static int
+vdev_mirror_read(vdev_t *vdev, void *priv, off_t offset, void *buf, size_t size)
+{
+ vdev_t *kid;
+ int rc;
+
+ rc = EIO;
+ STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
+ if (kid->v_state != VDEV_STATE_HEALTHY)
+ continue;
+ rc = kid->v_read(kid, kid->v_read_priv, offset, buf, size);
+ if (!rc)
+ return (0);
+ }
+
+ return (rc);
+}
+
+static vdev_t *
+vdev_find(uint64_t guid)
+{
+ vdev_t *vdev;
+
+ STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
+ if (vdev->v_guid == guid)
+ return (vdev);
+
+ return (0);
+}
+
+static vdev_t *
+vdev_create(uint64_t guid, vdev_read_t *read, void *read_priv)
+{
+ vdev_t *vdev;
+
+ vdev = malloc(sizeof(vdev_t));
+ memset(vdev, 0, sizeof(vdev_t));
+ STAILQ_INIT(&vdev->v_children);
+ vdev->v_guid = guid;
+ vdev->v_state = VDEV_STATE_OFFLINE;
+ vdev->v_read = read;
+ vdev->v_read_priv = read_priv;
+ STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
+
+ return (vdev);
+}
+
+static int
+vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t **vdevp)
+{
+ int rc;
+ uint64_t guid, id;
+ const char *type;
+ const char *path;
+ vdev_t *vdev, *kid;
+ const unsigned char *kids;
+ int nkids, i;
+
+ if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID,
+ DATA_TYPE_UINT64, 0, &guid)
+ || nvlist_find(nvlist, ZPOOL_CONFIG_ID,
+ DATA_TYPE_UINT64, 0, &id)
+ || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE,
+ DATA_TYPE_STRING, 0, &type)) {
+ printf("ZFS: can't find vdev details\n");
+ return (ENOENT);
+ }
+
+ /*
+ * Assume that if we've seen this vdev tree before, this one
+ * will be identical.
+ */
+ vdev = vdev_find(guid);
+ if (vdev) {
+ if (vdevp)
+ *vdevp = vdev;
+ return (0);
+ }
+
+ if (strcmp(type, VDEV_TYPE_MIRROR)
+ && strcmp(type, VDEV_TYPE_DISK)) {
+ printf("ZFS: can only boot from disk or mirror vdevs\n");
+ return (EIO);
+ }
+
+ if (!strcmp(type, VDEV_TYPE_MIRROR))
+ vdev = vdev_create(guid, vdev_mirror_read, 0);
+ else
+ vdev = vdev_create(guid, 0, 0);
+
+
+ if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
+ DATA_TYPE_STRING, 0, &path) == 0) {
+ if (strlen(path) > 5
+ && path[0] == '/'
+ && path[1] == 'd'
+ && path[2] == 'e'
+ && path[3] == 'v'
+ && path[4] == '/')
+ path += 5;
+ vdev->v_name = strdup(path);
+ } else {
+ vdev->v_name = strdup(type);
+ }
+ vdev->v_id = id;
+ rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN,
+ DATA_TYPE_NVLIST_ARRAY, &nkids, &kids);
+ /*
+ * Its ok if we don't have any kids.
+ */
+ if (rc == 0) {
+ for (i = 0; i < nkids; i++) {
+ rc = vdev_init_from_nvlist(kids, &kid);
+ if (rc)
+ return (rc);
+ STAILQ_INSERT_TAIL(&vdev->v_children, kid, v_childlink);
+ kids = nvlist_next(kids);
+ }
+ }
+
+ if (vdevp)
+ *vdevp = vdev;
+ return (0);
+}
+
+static void
+vdev_set_state(vdev_t *vdev)
+{
+ vdev_t *kid;
+ int good_kids;
+ int bad_kids;
+
+ /*
+ * We assume that if we have kids, we are a mirror. A mirror
+ * is healthy if all its kids are healthy. Its degraded (but
+ * working) if at least one kid is healty.
+ */
+
+ if (STAILQ_FIRST(&vdev->v_children)) {
+ good_kids = 0;
+ bad_kids = 0;
+ STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
+ if (kid->v_state == VDEV_STATE_HEALTHY)
+ good_kids++;
+ else
+ bad_kids++;
+ }
+ if (good_kids) {
+ if (!bad_kids && good_kids)
+ vdev->v_state = VDEV_STATE_HEALTHY;
+ else
+ vdev->v_state = VDEV_STATE_DEGRADED;
+ } else {
+ vdev->v_state = VDEV_STATE_OFFLINE;
+ }
+ }
+}
+
+static spa_t *
+spa_find_by_guid(uint64_t guid)
+{
+ spa_t *spa;
+
+ STAILQ_FOREACH(spa, &zfs_pools, spa_link)
+ if (spa->spa_guid == guid)
+ return (spa);
+
+ return (0);
+}
+
+#ifdef BOOT2
+
+static spa_t *
+spa_find_by_name(const char *name)
+{
+ spa_t *spa;
+
+ STAILQ_FOREACH(spa, &zfs_pools, spa_link)
+ if (!strcmp(spa->spa_name, name))
+ return (spa);
+
+ return (0);
+}
+
+#endif
+
+static spa_t *
+spa_create(uint64_t guid)
+{
+ spa_t *spa;
+
+ spa = malloc(sizeof(spa_t));
+ memset(spa, 0, sizeof(spa_t));
+ STAILQ_INIT(&spa->spa_vdevs);
+ spa->spa_guid = guid;
+ STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
+
+ return (spa);
+}
+
+static const char *
+state_name(vdev_state_t state)
+{
+ static const char* names[] = {
+ "UNKNOWN",
+ "CLOSED",
+ "OFFLINE",
+ "CANT_OPEN",
+ "DEGRADED",
+ "ONLINE"
+ };
+ return names[state];
+}
+
+#ifdef BOOT2
+
+#define pager_printf printf
+
+#else
+
+static void
+pager_printf(const char *fmt, ...)
+{
+ char line[80];
+ va_list args;
+
+ va_start(args, fmt);
+ vsprintf(line, fmt, args);
+ va_end(args);
+ pager_output(line);
+}
+
+#endif
+
+#define STATUS_FORMAT " %-16s %-10s\n"
+
+static void
+print_state(int indent, const char *name, vdev_state_t state)
+{
+ int i;
+ char buf[512];
+
+ buf[0] = 0;
+ for (i = 0; i < indent; i++)
+ strcat(buf, " ");
+ strcat(buf, name);
+ pager_printf(STATUS_FORMAT, buf, state_name(state));
+
+}
+
+static void
+vdev_status(vdev_t *vdev, int indent)
+{
+ vdev_t *kid;
+ print_state(indent, vdev->v_name, vdev->v_state);
+
+ STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
+ vdev_status(kid, indent + 1);
+ }
+}
+
+static void
+spa_status(spa_t *spa)
+{
+ vdev_t *vdev;
+ int good_kids, bad_kids, degraded_kids;
+ vdev_state_t state;
+
+ pager_printf(" pool: %s\n", spa->spa_name);
+ pager_printf("config:\n\n");
+ pager_printf(STATUS_FORMAT, "NAME", "STATE");
+
+ good_kids = 0;
+ degraded_kids = 0;
+ bad_kids = 0;
+ STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
+ if (vdev->v_state == VDEV_STATE_HEALTHY)
+ good_kids++;
+ else if (vdev->v_state == VDEV_STATE_DEGRADED)
+ degraded_kids++;
+ else
+ bad_kids++;
+ }
+
+ state = VDEV_STATE_CLOSED;
+ if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
+ state = VDEV_STATE_HEALTHY;
+ else if ((good_kids + degraded_kids) > 0)
+ state = VDEV_STATE_DEGRADED;
+
+ print_state(0, spa->spa_name, state);
+ STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
+ vdev_status(vdev, 1);
+ }
+}
+
+static void
+spa_all_status(void)
+{
+ spa_t *spa;
+ int first = 1;
+
+ STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
+ if (!first)
+ pager_printf("\n");
+ first = 0;
+ spa_status(spa);
+ }
+}
+
+static int
+vdev_probe(vdev_read_t *read, void *read_priv, spa_t **spap)
+{
+ vdev_t vtmp;
+ vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
+ spa_t *spa;
+ vdev_t *vdev, *top_vdev, *pool_vdev;
+ off_t off;
+ blkptr_t bp;
+ const unsigned char *nvlist;
+ uint64_t val;
+ uint64_t guid;
+ uint64_t pool_txg, pool_guid;
+ const char *pool_name;
+ const unsigned char *vdevs;
+ int i;
+ char upbuf[1024];
+ const struct uberblock *up;
+
+ /*
+ * Load the vdev label and figure out which
+ * uberblock is most current.
+ */
+ memset(&vtmp, 0, sizeof(vtmp));
+ vtmp.v_read = read;
+ vtmp.v_read_priv = read_priv;
+ off = offsetof(vdev_label_t, vl_vdev_phys);
+ BP_ZERO(&bp);
+ BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
+ BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
+ BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
+ BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
+ ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
+ if (zio_read_phys(&vtmp, &bp, vdev_label, off))
+ return (EIO);
+
+ if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) {
+ return (EIO);
+ }
+
+ nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4;
+
+ if (nvlist_find(nvlist,
+ ZPOOL_CONFIG_VERSION,
+ DATA_TYPE_UINT64, 0, &val)) {
+ return (EIO);
+ }
+
+ if (val != ZFS_VERSION) {
+ printf("ZFS: unsupported ZFS version %d\n", (int) val);
+ return (EIO);
+ }
+
+ if (nvlist_find(nvlist,
+ ZPOOL_CONFIG_POOL_STATE,
+ DATA_TYPE_UINT64, 0, &val)) {
+ return (EIO);
+ }
+
+ if (val != POOL_STATE_ACTIVE) {
+ /*
+ * Don't print a message here. If we happen to reboot
+ * while where is an exported pool around, we don't
+ * need a cascade of confusing messages during boot.
+ */
+ /*printf("ZFS: pool is not active\n");*/
+ return (EIO);
+ }
+
+ if (nvlist_find(nvlist,
+ ZPOOL_CONFIG_POOL_TXG,
+ DATA_TYPE_UINT64, 0, &pool_txg)
+ || nvlist_find(nvlist,
+ ZPOOL_CONFIG_POOL_GUID,
+ DATA_TYPE_UINT64, 0, &pool_guid)
+ || nvlist_find(nvlist,
+ ZPOOL_CONFIG_POOL_NAME,
+ DATA_TYPE_STRING, 0, &pool_name)) {
+ printf("ZFS: can't find pool details\n");
+ return (EIO);
+ }
+
+ /*
+ * Create the pool if this is the first time we've seen it.
+ */
+ spa = spa_find_by_guid(pool_guid);
+ if (!spa) {
+ spa = spa_create(pool_guid);
+ spa->spa_name = strdup(pool_name);
+ }
+ if (pool_txg > spa->spa_txg)
+ spa->spa_txg = pool_txg;
+
+ /*
+ * Get the vdev tree and create our in-core copy of it.
+ * If we already have a healthy vdev with this guid, this must
+ * be some kind of alias (overlapping slices, dangerously dedicated
+ * disks etc).
+ */
+ if (nvlist_find(nvlist,
+ ZPOOL_CONFIG_GUID,
+ DATA_TYPE_UINT64, 0, &guid)) {
+ return (EIO);
+ }
+ vdev = vdev_find(guid);
+ if (vdev && vdev->v_state == VDEV_STATE_HEALTHY) {
+ return (EIO);
+ }
+
+ if (nvlist_find(nvlist,
+ ZPOOL_CONFIG_VDEV_TREE,
+ DATA_TYPE_NVLIST, 0, &vdevs)) {
+ return (EIO);
+ }
+ vdev_init_from_nvlist(vdevs, &top_vdev);
+
+ /*
+ * Add the toplevel vdev to the pool if its not already there.
+ */
+ STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
+ if (top_vdev == pool_vdev)
+ break;
+ if (!pool_vdev && top_vdev)
+ STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
+
+ /*
+ * We should already have created an incomplete vdev for this
+ * vdev. Find it and initialise it with our read proc.
+ */
+ vdev = vdev_find(guid);
+ if (vdev) {
+ vdev->v_read = read;
+ vdev->v_read_priv = read_priv;
+ vdev->v_state = VDEV_STATE_HEALTHY;
+ } else {
+ printf("ZFS: inconsistent nvlist contents\n");
+ return (EIO);
+ }
+
+ /*
+ * Re-evaluate top-level vdev state.
+ */
+ vdev_set_state(top_vdev);
+
+ /*
+ * Ok, we are happy with the pool so far. Lets find
+ * the best uberblock and then we can actually access
+ * the contents of the pool.
+ */
+ for (i = 0;
+ i < VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT;
+ i++) {
+ off = offsetof(vdev_label_t, vl_uberblock);
+ off += i << UBERBLOCK_SHIFT;
+ BP_ZERO(&bp);
+ DVA_SET_OFFSET(&bp.blk_dva[0], off);
+ BP_SET_LSIZE(&bp, 1 << UBERBLOCK_SHIFT);
+ BP_SET_PSIZE(&bp, 1 << UBERBLOCK_SHIFT);
+ BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
+ BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
+ ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
+ if (zio_read_phys(vdev, &bp, upbuf, off))
+ continue;
+
+ up = (const struct uberblock *) upbuf;
+ if (up->ub_magic != UBERBLOCK_MAGIC)
+ continue;
+ if (up->ub_txg < spa->spa_txg)
+ continue;
+ if (up->ub_txg > spa->spa_uberblock.ub_txg) {
+ spa->spa_uberblock = *up;
+ } else if (up->ub_txg == spa->spa_uberblock.ub_txg) {
+ if (up->ub_timestamp > spa->spa_uberblock.ub_timestamp)
+ spa->spa_uberblock = *up;
+ }
+ }
+
+ if (spap)
+ *spap = spa;
+ return (0);
+}
+
+static int
+ilog2(int n)
+{
+ int v;
+
+ for (v = 0; v < 32; v++)
+ if (n == (1 << v))
+ return v;
+ return -1;
+}
+
+static int
+zio_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf, off_t offset)
+{
+ int cpfunc = BP_GET_COMPRESS(bp);
+ size_t lsize = BP_GET_LSIZE(bp);
+ size_t psize = BP_GET_PSIZE(bp);
+ int rc;
+
+ /*printf("ZFS: reading %d bytes at 0x%llx to %p\n", psize, offset, buf);*/
+ if (cpfunc != ZIO_COMPRESS_OFF) {
+ rc = vdev->v_read(vdev, vdev->v_read_priv, offset, zfs_decomp_buf, psize);
+ if (rc)
+ return (rc);
+ if (zio_checksum_error(bp, zfs_decomp_buf))
+ return (EIO);
+ if (zio_decompress_data(cpfunc, zfs_decomp_buf, psize,
+ buf, lsize))
+ return (EIO);
+ } else {
+ rc = vdev->v_read(vdev, vdev->v_read_priv, offset, buf, psize);
+ if (rc)
+ return (rc);
+
+ if (zio_checksum_error(bp, buf))
+ return (EIO);
+ }
+ return (0);
+}
+
+static int
+zio_read(spa_t *spa, const blkptr_t *bp, void *buf)
+{
+ int i;
+
+ for (i = 0; i < SPA_DVAS_PER_BP; i++) {
+ const dva_t *dva = &bp->blk_dva[i];
+ vdev_t *vdev;
+ int vdevid;
+ off_t offset;
+
+ if (!dva->dva_word[0] && !dva->dva_word[1])
+ continue;
+
+ vdevid = DVA_GET_VDEV(dva);
+ offset = DVA_GET_OFFSET(dva) + VDEV_LABEL_START_SIZE;
+ STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink)
+ if (vdev->v_id == vdevid)
+ break;
+ if (!vdev || !vdev->v_read)
+ continue;
+ if (zio_read_phys(vdev, bp, buf, offset))
+ continue;
+
+ return (0);
+ }
+ printf("ZFS: i/o error - all block copies unavailable\n");
+
+ return (EIO);
+}
+
+static int
+dnode_read(spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf, size_t buflen)
+{
+ int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
+ int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+ int nlevels = dnode->dn_nlevels;
+ int i, rc;
+
+ /*
+ * We truncate the offset to 32bits, mainly so that I don't
+ * have to find a copy of __divdi3 to put into the bootstrap.
+ * I don't think the bootstrap needs to access anything bigger
+ * than 2G anyway. Note that block addresses are still 64bit
+ * so it doesn't affect the possible size of the media.
+ * We still use 64bit block numbers so that the bitshifts
+ * work correctly. Note: bsize may not be a power of two here.
+ */
+ while (buflen > 0) {
+ uint64_t bn = ((int) offset) / bsize;
+ int boff = ((int) offset) % bsize;
+ int ibn;
+ const blkptr_t *indbp;
+ blkptr_t bp;
+
+ if (bn > dnode->dn_maxblkid)
+ return (EIO);
+
+ if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
+ goto cached;
+
+ indbp = dnode->dn_blkptr;
+ for (i = 0; i < nlevels; i++) {
+ /*
+ * Copy the bp from the indirect array so that
+ * we can re-use the scratch buffer for multi-level
+ * objects.
+ */
+ ibn = bn >> ((nlevels - i - 1) * ibshift);
+ ibn &= ((1 << ibshift) - 1);
+ bp = indbp[ibn];
+ rc = zio_read(spa, &bp, dnode_cache_buf);
+ if (rc)
+ return (rc);
+ indbp = (const blkptr_t *) dnode_cache_buf;
+ }
+ dnode_cache_obj = dnode;
+ dnode_cache_bn = bn;
+ cached:
+
+ /*
+ * The buffer contains our data block. Copy what we
+ * need from it and loop.
+ */
+ i = bsize - boff;
+ if (i > buflen) i = buflen;
+ memcpy(buf, &dnode_cache_buf[boff], i);
+ buf = ((char*) buf) + i;
+ offset += i;
+ buflen -= i;
+ }
+
+ return (0);
+}
+
+/*
+ * Lookup a value in a microzap directory. Assumes that the zap
+ * scratch buffer contains the directory contents.
+ */
+static int
+mzap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
+{
+ const mzap_phys_t *mz;
+ const mzap_ent_phys_t *mze;
+ size_t size;
+ int chunks, i;
+
+ /*
+ * Microzap objects use exactly one block. Read the whole
+ * thing.
+ */
+ size = dnode->dn_datablkszsec * 512;
+
+ mz = (const mzap_phys_t *) zap_scratch;
+ chunks = size / MZAP_ENT_LEN - 1;
+
+ for (i = 0; i < chunks; i++) {
+ mze = &mz->mz_chunk[i];
+ if (!strcmp(mze->mze_name, name)) {
+ *value = mze->mze_value;
+ return (0);
+ }
+ }
+
+ return (ENOENT);
+}
+
+/*
+ * Compare a name with a zap leaf entry. Return non-zero if the name
+ * matches.
+ */
+static int
+fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *name)
+{
+ size_t namelen;
+ const zap_leaf_chunk_t *nc;
+ const char *p;
+
+ namelen = zc->l_entry.le_name_length;
+
+ nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
+ p = name;
+ while (namelen > 0) {
+ size_t len;
+ len = namelen;
+ if (len > ZAP_LEAF_ARRAY_BYTES)
+ len = ZAP_LEAF_ARRAY_BYTES;
+ if (memcmp(p, nc->l_array.la_array, len))
+ return (0);
+ p += len;
+ namelen -= len;
+ nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
+ }
+
+ return 1;
+}
+
+/*
+ * Extract a uint64_t value from a zap leaf entry.
+ */
+static uint64_t
+fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
+{
+ const zap_leaf_chunk_t *vc;
+ int i;
+ uint64_t value;
+ const uint8_t *p;
+
+ vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
+ for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) {
+ value = (value << 8) | p[i];
+ }
+
+ return value;
+}
+
+/*
+ * Lookup a value in a fatzap directory. Assumes that the zap scratch
+ * buffer contains the directory header.
+ */
+static int
+fzap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
+{
+ int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+ zap_phys_t zh = *(zap_phys_t *) zap_scratch;
+ fat_zap_t z;
+ uint64_t *ptrtbl;
+ uint64_t hash;
+ int rc;
+
+ if (zh.zap_magic != ZAP_MAGIC)
+ return (EIO);
+
+ z.zap_block_shift = ilog2(bsize);
+ z.zap_phys = (zap_phys_t *) zap_scratch;
+
+ /*
+ * Figure out where the pointer table is and read it in if necessary.
+ */
+ if (zh.zap_ptrtbl.zt_blk) {
+ rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize,
+ zap_scratch, bsize);
+ if (rc)
+ return (rc);
+ ptrtbl = (uint64_t *) zap_scratch;
+ } else {
+ ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0);
+ }
+
+ hash = zap_hash(zh.zap_salt, name);
+
+ zap_leaf_t zl;
+ zl.l_bs = z.zap_block_shift;
+
+ off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs;
+ zap_leaf_chunk_t *zc;
+
+ rc = dnode_read(spa, dnode, off, zap_scratch, bsize);
+ if (rc)
+ return (rc);
+
+ zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
+
+ /*
+ * Make sure this chunk matches our hash.
+ */
+ if (zl.l_phys->l_hdr.lh_prefix_len > 0
+ && zl.l_phys->l_hdr.lh_prefix
+ != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len))
+ return (ENOENT);
+
+ /*
+ * Hash within the chunk to find our entry.
+ */
+ int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len);
+ int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1);
+ h = zl.l_phys->l_hash[h];
+ if (h == 0xffff)
+ return (ENOENT);
+ zc = &ZAP_LEAF_CHUNK(&zl, h);
+ while (zc->l_entry.le_hash != hash) {
+ if (zc->l_entry.le_next == 0xffff) {
+ zc = 0;
+ break;
+ }
+ zc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_next);
+ }
+ if (fzap_name_equal(&zl, zc, name)) {
+ *value = fzap_leaf_value(&zl, zc);
+ return (0);
+ }
+
+ return (ENOENT);
+}
+
+/*
+ * Lookup a name in a zap object and return its value as a uint64_t.
+ */
+static int
+zap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
+{
+ int rc;
+ uint64_t zap_type;
+ size_t size = dnode->dn_datablkszsec * 512;
+
+ rc = dnode_read(spa, dnode, 0, zap_scratch, size);
+ if (rc)
+ return (rc);
+
+ zap_type = *(uint64_t *) zap_scratch;
+ if (zap_type == ZBT_MICRO)
+ return mzap_lookup(spa, dnode, name, value);
+ else
+ return fzap_lookup(spa, dnode, name, value);
+}
+
+#ifdef BOOT2
+
+/*
+ * List a microzap directory. Assumes that the zap scratch buffer contains
+ * the directory contents.
+ */
+static int
+mzap_list(spa_t *spa, const dnode_phys_t *dnode)
+{
+ const mzap_phys_t *mz;
+ const mzap_ent_phys_t *mze;
+ size_t size;
+ int chunks, i;
+
+ /*
+ * Microzap objects use exactly one block. Read the whole
+ * thing.
+ */
+ size = dnode->dn_datablkszsec * 512;
+ mz = (const mzap_phys_t *) zap_scratch;
+ chunks = size / MZAP_ENT_LEN - 1;
+
+ for (i = 0; i < chunks; i++) {
+ mze = &mz->mz_chunk[i];
+ if (mze->mze_name[0])
+ //printf("%-32s 0x%llx\n", mze->mze_name, mze->mze_value);
+ printf("%s\n", mze->mze_name);
+ }
+
+ return (0);
+}
+
+/*
+ * List a fatzap directory. Assumes that the zap scratch buffer contains
+ * the directory header.
+ */
+static int
+fzap_list(spa_t *spa, const dnode_phys_t *dnode)
+{
+ int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+ zap_phys_t zh = *(zap_phys_t *) zap_scratch;
+ fat_zap_t z;
+ int i, j;
+
+ if (zh.zap_magic != ZAP_MAGIC)
+ return (EIO);
+
+ z.zap_block_shift = ilog2(bsize);
+ z.zap_phys = (zap_phys_t *) zap_scratch;
+
+ /*
+ * This assumes that the leaf blocks start at block 1. The
+ * documentation isn't exactly clear on this.
+ */
+ zap_leaf_t zl;
+ zl.l_bs = z.zap_block_shift;
+ for (i = 0; i < zh.zap_num_leafs; i++) {
+ off_t off = (i + 1) << zl.l_bs;
+ char name[256], *p;
+ uint64_t value;
+
+ if (dnode_read(spa, dnode, off, zap_scratch, bsize))
+ return (EIO);
+
+ zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
+
+ for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
+ zap_leaf_chunk_t *zc, *nc;
+ int namelen;
+
+ zc = &ZAP_LEAF_CHUNK(&zl, j);
+ if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
+ continue;
+ namelen = zc->l_entry.le_name_length;
+ if (namelen > sizeof(name))
+ namelen = sizeof(name);
+
+ /*
+ * Paste the name back together.
+ */
+ nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
+ p = name;
+ while (namelen > 0) {
+ int len;
+ len = namelen;
+ if (len > ZAP_LEAF_ARRAY_BYTES)
+ len = ZAP_LEAF_ARRAY_BYTES;
+ memcpy(p, nc->l_array.la_array, len);
+ p += len;
+ namelen -= len;
+ nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
+ }
+
+ /*
+ * Assume the first eight bytes of the value are
+ * a uint64_t.
+ */
+ value = fzap_leaf_value(&zl, zc);
+
+ printf("%-32s 0x%llx\n", name, value);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * List a zap directory.
+ */
+static int
+zap_list(spa_t *spa, const dnode_phys_t *dnode)
+{
+ uint64_t zap_type;
+ size_t size = dnode->dn_datablkszsec * 512;
+
+ if (dnode_read(spa, dnode, 0, zap_scratch, size))
+ return (EIO);
+
+ zap_type = *(uint64_t *) zap_scratch;
+ if (zap_type == ZBT_MICRO)
+ return mzap_list(spa, dnode);
+ else
+ return fzap_list(spa, dnode);
+}
+
+#endif
+
+static int
+objset_get_dnode(spa_t *spa, const objset_phys_t *os, uint64_t objnum, dnode_phys_t *dnode)
+{
+ off_t offset;
+
+ offset = objnum * sizeof(dnode_phys_t);
+ return dnode_read(spa, &os->os_meta_dnode, offset,
+ dnode, sizeof(dnode_phys_t));
+}
+
+/*
+ * Find the object set given the object number of its dataset object
+ * and return its details in *objset
+ */
+static int
+zfs_mount_dataset(spa_t *spa, uint64_t objnum, objset_phys_t *objset)
+{
+ dnode_phys_t dataset;
+ dsl_dataset_phys_t *ds;
+
+ if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
+ printf("ZFS: can't find dataset %lld\n", objnum);
+ return (EIO);
+ }
+
+ ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
+ if (zio_read(spa, &ds->ds_bp, objset)) {
+ printf("ZFS: can't read object set for dataset %lld\n", objnum);
+ return (EIO);
+ }
+
+ return (0);
+}
+
+/*
+ * Find the object set pointed to by the BOOTFS property or the root
+ * dataset if there is none and return its details in *objset
+ */
+static int
+zfs_mount_root(spa_t *spa, objset_phys_t *objset)
+{
+ dnode_phys_t dir, propdir;
+ uint64_t props, bootfs, root;
+
+ /*
+ * Start with the MOS directory object.
+ */
+ if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) {
+ printf("ZFS: can't read MOS object directory\n");
+ return (EIO);
+ }
+
+ /*
+ * Lookup the pool_props and see if we can find a bootfs.
+ */
+ if (zap_lookup(spa, &dir, DMU_POOL_PROPS, &props) == 0
+ && objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0
+ && zap_lookup(spa, &propdir, "bootfs", &bootfs) == 0)
+ return zfs_mount_dataset(spa, bootfs, objset);
+
+ /*
+ * Lookup the root dataset directory
+ */
+ if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, &root)
+ || objset_get_dnode(spa, &spa->spa_mos, root, &dir)) {
+ printf("ZFS: can't find root dsl_dir\n");
+ return (EIO);
+ }
+
+ /*
+ * Use the information from the dataset directory's bonus buffer
+ * to find the dataset object and from that the object set itself.
+ */
+ dsl_dir_phys_t *dd = (dsl_dir_phys_t *) &dir.dn_bonus;
+ return zfs_mount_dataset(spa, dd->dd_head_dataset_obj, objset);
+}
+
+static int
+zfs_mount_pool(spa_t *spa)
+{
+ /*
+ * Find the MOS and work our way in from there.
+ */
+ if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) {
+ printf("ZFS: can't read MOS\n");
+ return (EIO);
+ }
+
+ /*
+ * Find the root object set
+ */
+ if (zfs_mount_root(spa, &spa->spa_root_objset)) {
+ printf("Can't find root filesystem - giving up\n");
+ return (EIO);
+ }
+
+ return (0);
+}
+
+/*
+ * Lookup a file and return its dnode.
+ */
+static int
+zfs_lookup(spa_t *spa, const char *upath, dnode_phys_t *dnode)
+{
+ int rc;
+ uint64_t objnum, rootnum, parentnum;
+ dnode_phys_t dn;
+ const znode_phys_t *zp = (const znode_phys_t *) dn.dn_bonus;
+ const char *p, *q;
+ char element[256];
+ char path[1024];
+ int symlinks_followed = 0;
+
+ if (spa->spa_root_objset.os_type != DMU_OST_ZFS) {
+ printf("ZFS: unexpected object set type %lld\n",
+ spa->spa_root_objset.os_type);
+ return (EIO);
+ }
+
+ /*
+ * Get the root directory dnode.
+ */
+ rc = objset_get_dnode(spa, &spa->spa_root_objset, MASTER_NODE_OBJ, &dn);
+ if (rc)
+ return (rc);
+
+ rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, &rootnum);
+ if (rc)
+ return (rc);
+
+ rc = objset_get_dnode(spa, &spa->spa_root_objset, rootnum, &dn);
+ if (rc)
+ return (rc);
+
+ objnum = rootnum;
+ p = upath;
+ while (p && *p) {
+ while (*p == '/')
+ p++;
+ if (!*p)
+ break;
+ q = strchr(p, '/');
+ if (q) {
+ memcpy(element, p, q - p);
+ element[q - p] = 0;
+ p = q;
+ } else {
+ strcpy(element, p);
+ p = 0;
+ }
+
+ if ((zp->zp_mode >> 12) != 0x4) {
+ return (ENOTDIR);
+ }
+
+ parentnum = objnum;
+ rc = zap_lookup(spa, &dn, element, &objnum);
+ if (rc)
+ return (rc);
+ objnum = ZFS_DIRENT_OBJ(objnum);
+
+ rc = objset_get_dnode(spa, &spa->spa_root_objset, objnum, &dn);
+ if (rc)
+ return (rc);
+
+ /*
+ * Check for symlink.
+ */
+ if ((zp->zp_mode >> 12) == 0xa) {
+ if (symlinks_followed > 10)
+ return (EMLINK);
+ symlinks_followed++;
+
+ /*
+ * Read the link value and copy the tail of our
+ * current path onto the end.
+ */
+ if (p)
+ strcpy(&path[zp->zp_size], p);
+ else
+ path[zp->zp_size] = 0;
+ if (zp->zp_size + sizeof(znode_phys_t) <= dn.dn_bonuslen) {
+ memcpy(path, &dn.dn_bonus[sizeof(znode_phys_t)],
+ zp->zp_size);
+ } else {
+ rc = dnode_read(spa, &dn, 0, path, zp->zp_size);
+ if (rc)
+ return (rc);
+ }
+
+ /*
+ * Restart with the new path, starting either at
+ * the root or at the parent depending whether or
+ * not the link is relative.
+ */
+ p = path;
+ if (*p == '/')
+ objnum = rootnum;
+ else
+ objnum = parentnum;
+ objset_get_dnode(spa, &spa->spa_root_objset, objnum, &dn);
+ }
+ }
+
+ *dnode = dn;
+ return (0);
+}
OpenPOWER on IntegriCloud