From 8dab5b050118631ed065f01515a1e2617f8e98de Mon Sep 17 00:00:00 2001
From: mav
Date: Thu, 24 Mar 2011 21:31:32 +0000
Subject: MFgraid/head: Add new RAID GEOM class, which is going to replace
 ataraid(4) in supporting various BIOS-based software RAIDs.

Unlike ataraid(4), this implementation does not depend on the legacy ata(4)
subsystem and can be used with any disk drivers, including the new CAM-based
ones (ahci(4), siis(4), mvs(4), ata(4) with `options ATA_CAM`).

To make the code more readable and extensible, this implementation follows
a modular design: a core part plus two sets of modules implementing support
for different metadata formats and RAID levels.

Support for the following popular metadata formats is now implemented:
Intel, JMicron, NVIDIA, Promise (also used by AMD/ATI) and SiliconImage.

The following RAID levels are now supported: RAID0, RAID1, RAID1E, RAID10,
SINGLE, CONCAT.

For all of these RAID levels and metadata formats this class supports the
full cycle of volume operations: reading, writing, creation, deletion, disk
removal and insertion, rebuilding, dirty shutdown detection and
resynchronization, bad sector recovery, faulty disk tracking, and hot-spare
disks.
For the Intel and Promise formats there is support for multiple volumes per
disk set.

See the graid(8) manual page for additional details.

Co-authored by: imp
Sponsored by: Cisco Systems, Inc. and iXsystems, Inc.
---
 etc/mtree/BSD.include.dist | 2 +
 include/Makefile | 2 +-
 sbin/geom/class/Makefile | 1 +
 sbin/geom/class/raid/Makefile | 10 +
 sbin/geom/class/raid/geom_raid.c | 91 ++
 sbin/geom/class/raid/graid.8 | 266 ++++
 sys/conf/NOTES | 1 +
 sys/conf/files | 13 +
 sys/conf/options | 1 +
 sys/geom/raid/g_raid.c | 2340 +++++++++++++++++++++++++++++++++++
 sys/geom/raid/g_raid.h | 403 ++++++
 sys/geom/raid/g_raid_ctl.c | 217 ++++
 sys/geom/raid/g_raid_md_if.m | 156 +++
 sys/geom/raid/g_raid_tr_if.m | 118 ++
 sys/geom/raid/md_intel.c | 2323 ++++++++++++++++++++++++++++++++++
 sys/geom/raid/md_jmicron.c | 1582 +++++++++++++++++++++++
 sys/geom/raid/md_nvidia.c | 1607 ++++++++++++++++++++++++
 sys/geom/raid/md_promise.c | 1940 +++++++++++++++++++++++++++++
 sys/geom/raid/md_sii.c | 1692 +++++++++++++++++++++++++
 sys/geom/raid/tr_concat.c | 343 +++++
 sys/geom/raid/tr_raid0.c | 326 +++++
 sys/geom/raid/tr_raid1.c | 993 +++++++++++++++
 sys/geom/raid/tr_raid1e.c | 1227 ++++++++++++++++++
 sys/modules/geom/Makefile | 1 +
 sys/modules/geom/geom_raid/Makefile | 19 +
 25 files changed, 15673 insertions(+), 1 deletion(-)
 create mode 100644 sbin/geom/class/raid/Makefile
 create mode 100644 sbin/geom/class/raid/geom_raid.c
 create mode 100644 sbin/geom/class/raid/graid.8
 create mode 100644 sys/geom/raid/g_raid.c
 create mode 100644 sys/geom/raid/g_raid.h
 create mode 100644 sys/geom/raid/g_raid_ctl.c
 create mode 100644 sys/geom/raid/g_raid_md_if.m
 create mode 100644 sys/geom/raid/g_raid_tr_if.m
 create mode 100644 sys/geom/raid/md_intel.c
 create mode 100644 sys/geom/raid/md_jmicron.c
 create mode 100644 sys/geom/raid/md_nvidia.c
 create mode 100644 sys/geom/raid/md_promise.c
 create mode 100644 sys/geom/raid/md_sii.c
 create mode 100644 sys/geom/raid/tr_concat.c
 create mode 100644 sys/geom/raid/tr_raid0.c
 create mode 100644 sys/geom/raid/tr_raid1.c
 create mode 100644 sys/geom/raid/tr_raid1e.c
 create mode 100644 sys/modules/geom/geom_raid/Makefile

diff --git a/etc/mtree/BSD.include.dist b/etc/mtree/BSD.include.dist
index b227bdb..a19eddc 100644
--- a/etc/mtree/BSD.include.dist
+++ b/etc/mtree/BSD.include.dist
@@ -190,6 +190,8 @@
         ..
         nop
         ..
+        raid
+        ..
         raid3
         ..
shsec diff --git a/include/Makefile b/include/Makefile index 249db95..9bcced5 100644 --- a/include/Makefile +++ b/include/Makefile @@ -47,7 +47,7 @@ LSUBDIRS= cam/ata cam/scsi \ ${_fs_nwfs} fs/portalfs fs/procfs fs/smbfs fs/udf fs/unionfs \ geom/cache geom/concat geom/eli geom/gate geom/journal geom/label \ geom/mirror geom/mountver geom/multipath geom/nop \ - geom/raid3 geom/shsec geom/stripe geom/virstor \ + geom/raid geom/raid3 geom/shsec geom/stripe geom/virstor \ netgraph/atm netgraph/netflow \ security/audit \ security/mac_biba security/mac_bsdextended security/mac_lomac \ diff --git a/sbin/geom/class/Makefile b/sbin/geom/class/Makefile index 0611cdd..912561f 100644 --- a/sbin/geom/class/Makefile +++ b/sbin/geom/class/Makefile @@ -14,6 +14,7 @@ SUBDIR+=mountver SUBDIR+=multipath SUBDIR+=nop SUBDIR+=part +SUBDIR+=raid SUBDIR+=raid3 SUBDIR+=sched SUBDIR+=shsec diff --git a/sbin/geom/class/raid/Makefile b/sbin/geom/class/raid/Makefile new file mode 100644 index 0000000..743f690 --- /dev/null +++ b/sbin/geom/class/raid/Makefile @@ -0,0 +1,10 @@ +# $FreeBSD$ + +.PATH: ${.CURDIR}/../../misc + +GEOM_CLASS= raid + +DPADD= ${LIBMD} +LDADD= -lmd + +.include diff --git a/sbin/geom/class/raid/geom_raid.c b/sbin/geom/class/raid/geom_raid.c new file mode 100644 index 0000000..2f16295 --- /dev/null +++ b/sbin/geom/class/raid/geom_raid.c @@ -0,0 +1,91 @@ +/*- + * Copyright (c) 2010 Alexander Motin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +uint32_t lib_version = G_LIB_VERSION; +uint32_t version = G_RAID_VERSION; + +struct g_command class_commands[] = { + { "label", G_FLAG_VERBOSE, NULL, + { + { 'f', "force", NULL, G_TYPE_BOOL }, + { 'S', "size", G_VAL_OPTIONAL, G_TYPE_NUMBER }, + { 's', "strip", G_VAL_OPTIONAL, G_TYPE_NUMBER }, + G_OPT_SENTINEL + }, + "[-fv] [-S size] [-s stripsize] format label level prov ..." 
+	},
+	{ "add", G_FLAG_VERBOSE, NULL,
+	    {
+		{ 'f', "force", NULL, G_TYPE_BOOL },
+		{ 'S', "size", G_VAL_OPTIONAL, G_TYPE_NUMBER },
+		{ 's', "strip", G_VAL_OPTIONAL, G_TYPE_NUMBER },
+		G_OPT_SENTINEL
+	    },
+	    "[-fv] [-S size] [-s stripsize] name label level"
+	},
+	{ "delete", G_FLAG_VERBOSE, NULL,
+	    {
+		{ 'f', "force", NULL, G_TYPE_BOOL },
+		G_OPT_SENTINEL
+	    },
+	    "[-fv] name [label|num]"
+	},
+	{ "insert", G_FLAG_VERBOSE, NULL, G_NULL_OPTS,
+	    "[-v] name prov ..."
+	},
+	{ "remove", G_FLAG_VERBOSE, NULL, G_NULL_OPTS,
+	    "[-v] name prov ..."
+	},
+	{ "fail", G_FLAG_VERBOSE, NULL, G_NULL_OPTS,
+	    "[-v] name prov ..."
+	},
+	{ "stop", G_FLAG_VERBOSE, NULL,
+	    {
+		{ 'f', "force", NULL, G_TYPE_BOOL },
+		G_OPT_SENTINEL
+	    },
+	    "[-fv] name"
+	},
+	G_CMD_SENTINEL
+};
+
diff --git a/sbin/geom/class/raid/graid.8 b/sbin/geom/class/raid/graid.8
new file mode 100644
index 0000000..d1c92a2
--- /dev/null
+++ b/sbin/geom/class/raid/graid.8
@@ -0,0 +1,266 @@
+.\" Copyright (c) 2010 Alexander Motin
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd March 22, 2011
+.Dt GRAID 8
+.Os
+.Sh NAME
+.Nm graid
+.Nd "control utility for software RAID devices"
+.Sh SYNOPSIS
+.Nm
+.Cm label
+.Op Fl f
+.Op Fl S Ar size
+.Op Fl s Ar strip
+.Ar format
+.Ar label
+.Ar level
+.Ar prov ...
+.Nm
+.Cm add
+.Op Fl f
+.Op Fl S Ar size
+.Op Fl s Ar strip
+.Ar name
+.Ar label
+.Ar level
+.Nm
+.Cm delete
+.Op Fl f
+.Ar name
+.Op Ar label | Ar num
+.Nm
+.Cm insert
+.Ar name
+.Ar prov ...
+.Nm
+.Cm remove
+.Ar name
+.Ar prov ...
+.Nm
+.Cm fail
+.Ar name
+.Ar prov ...
+.Nm
+.Cm stop
+.Op Fl fv
+.Ar name ...
+.Nm
+.Cm list
+.Nm
+.Cm status
+.Nm
+.Cm load
+.Nm
+.Cm unload
+.Sh DESCRIPTION
+The
+.Nm
+utility is used to manage software RAID configurations supported by the
+GEOM RAID class.
+The GEOM RAID class uses on-disk metadata to provide access to software-RAID
+volumes defined by different RAID BIOSes.
+Depending on the RAID BIOS type and its metadata format, different subsets of
+configurations and features are supported.
+To allow booting from a RAID volume, the metadata format should match the
+RAID BIOS type and its capabilities.
+To guarantee that these match, it is recommended to create volumes via the
+RAID BIOS interface; experienced users may instead create them with this
+utility.
+.Pp
+The first argument to
+.Nm
+indicates an action to be performed:
+.Bl -tag -width ".Cm destroy"
+.It Cm label
+Create an array with a single volume.
+The
+.Ar format
+argument specifies the on-disk metadata format to use for this array,
+such as "Intel".
+The
+.Ar label
+argument specifies the label of the created volume.
+The
+.Ar level
+argument specifies the RAID level of the created volume, such as:
+"RAID0", "RAID1", etc.
+The subsequent list enumerates providers to use as array components.
+The special name "NONE" can be used to reserve space for absent disks.
+The order of components can be important, depending on the specific RAID level
+and metadata format.
+For example, "graid label Intel data RAID1 ada0 ada1" creates a mirrored
+volume labeled "data" with Intel metadata on the two given disks.
+.Pp
+Additional options include:
+.Bl -tag -width ".Fl s Ar strip"
+.It Fl f
+Force creation of the specified configuration if it is officially
+unsupported but technically possible.
+.It Fl S Ar size
+Use
+.Ar size
+bytes on each component for this volume.
+Should be used if several volumes per array are planned, or if smaller
+components are going to be inserted later.
+Defaults to the size of the smallest component.
+.It Fl s Ar strip
+Specifies the strip size in bytes.
+Defaults to 131072.
+.El
+.It Cm add
+Create another volume on the existing array.
+The
+.Ar name
+argument is the name of the existing array, as reported by the label command.
+The rest of the arguments are the same as for the label command.
+.It Cm delete
+Delete volume(s) from the existing array.
+When the last volume is deleted, the array is also deleted and its metadata
+erased.
+The
+.Ar name
+argument is the name of the existing array.
+The optional
+.Ar label
+or
+.Ar num
+argument specifies the volume to delete.
+.Pp
+Additional options include:
+.Bl -tag -width ".Fl f"
+.It Fl f
+Delete the volume(s) even if they are still open.
+.El
+.It Cm insert
+Insert the specified provider(s) into the specified array in place of the
+first missing or failed components.
+If there are no such components, mark the disk(s) as spare.
+.It Cm remove
+Remove the specified provider(s) from the specified array and erase metadata.
+If there are spare disks present, the removed disk(s) will be replaced by
+spares.
+.It Cm fail
+Mark the given disk(s) as failed, removing them from active use unless
+absolutely necessary due to exhausted redundancy.
+If there are spare disks present, the failed disk(s) will be replaced with
+one of them.
+.It Cm stop
+Stop the given array.
+The metadata will not be erased.
+.Pp
+Additional options include:
+.Bl -tag -width ".Fl f"
+.It Fl f
+Stop the given array even if some of its volumes are open.
+.El
+.It Cm list
+See
+.Xr geom 8 .
+.It Cm status
+See
+.Xr geom 8 .
+.It Cm load
+See
+.Xr geom 8 .
+.It Cm unload
+See
+.Xr geom 8 .
+.El
+.Pp
+Additional options include:
+.Bl -tag -width ".Fl v"
+.It Fl v
+Be more verbose.
+.El
+.Sh SUPPORTED METADATA FORMATS
+The GEOM RAID class follows a modular design, allowing different metadata
+formats to be used.
+Support is currently implemented for the following formats:
+.Bl -tag -width "Intel"
+.It Intel
+The format used by Intel RAID BIOS.
+Supports up to two volumes per array.
+Supports configurations: RAID0 (2+ disks), RAID1 (2 disks),
+RAID5 (3+ disks), RAID10 (4 disks).
+Configurations not supported by Intel RAID BIOS, but enforceable at your own
+risk: RAID1 (3+ disks), RAID1E (3+ disks), RAID10 (6+ disks).
+.It JMicron
+The format used by JMicron RAID BIOS.
+Supports one volume per array.
+Supports configurations: RAID0 (2+ disks), RAID1 (2 disks),
+RAID10 (4 disks), CONCAT (2+ disks).
+Configurations not supported by JMicron RAID BIOS, but enforceable at your own
+risk: RAID1 (3+ disks), RAID1E (3+ disks), RAID10 (6+ disks), RAID5 (3+ disks).
+.It NVIDIA
+The format used by NVIDIA MediaShield RAID BIOS.
+Supports one volume per array.
+Supports configurations: RAID0 (2+ disks), RAID1 (2 disks),
+RAID5 (3+ disks), RAID10 (4+ disks), SINGLE (1 disk), CONCAT (2+ disks).
+Configurations not supported by NVIDIA MediaShield RAID BIOS, but enforceable
+at your own risk: RAID1 (3+ disks).
+.It Promise
+The format used by Promise and AMD/ATI RAID BIOSes and the FreeBSD ataraid(4)
+driver.
+Supports multiple volumes per array.
+Each disk can be split to be used by up to two arbitrary volumes.
+Supports configurations: RAID0 (2+ disks), RAID1 (2 disks),
+RAID5 (3+ disks), RAID10 (4 disks), SINGLE (1 disk), CONCAT (2+ disks).
+Configurations not supported by RAID BIOSes, but enforceable at your
+own risk: RAID1 (3+ disks), RAID10 (6+ disks).
+.It SiI
+The format used by SiliconImage RAID BIOS.
+Supports one volume per array.
+Supports configurations: RAID0 (2+ disks), RAID1 (2 disks),
+RAID5 (3+ disks), RAID10 (4 disks), SINGLE (1 disk), CONCAT (2+ disks).
+Configurations not supported by SiliconImage RAID BIOS, but enforceable at
+your own risk: RAID1 (3+ disks), RAID10 (6+ disks).
+.El
+.Sh SUPPORTED RAID LEVELS
+The GEOM RAID class follows a modular design, allowing different RAID levels
+to be used.
+Support for the following RAID levels is currently implemented: RAID0, RAID1,
+RAID1E, RAID10, SINGLE, CONCAT.
+.Sh RAID LEVEL MIGRATION
+The GEOM RAID class has no support for RAID level migration, which some
+metadata formats allow.
+If a migration was started from the BIOS or in some other way, make sure to
+complete it there.
+Do not run the GEOM RAID class on migrating volumes, on pain of possible data
+corruption!
+.Sh EXIT STATUS
+Exit status is 0 on success, and non-zero if the command fails.
+.Sh SEE ALSO
+.Xr geom 4 ,
+.Xr geom 8 ,
+.Xr vinum 8
+.Sh HISTORY
+The
+.Nm
+utility appeared in
+.Fx 9.0 .
+.Sh AUTHORS
+.An Alexander Motin Aq mav@FreeBSD.org
+.An M. Warner Losh Aq imp@FreeBSD.org
diff --git a/sys/conf/NOTES b/sys/conf/NOTES
index cf8064f..851b9b8 100644
--- a/sys/conf/NOTES
+++ b/sys/conf/NOTES
@@ -163,6 +163,7 @@ options 	GEOM_PART_MBR		# MBR partitioning
 options 	GEOM_PART_PC98		# PC-9800 disk partitioning
 options 	GEOM_PART_VTOC8		# SMI VTOC8 disk label
 options 	GEOM_PC98		# NEC PC9800 partitioning
+options 	GEOM_RAID		# Soft RAID functionality.
 options 	GEOM_RAID3		# RAID3 functionality.
 options 	GEOM_SHSEC		# Shared secret.
 options 	GEOM_STRIPE		# Disk striping.
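The sys/conf/files additions below wire in the modular split the commit message describes: a core (g_raid.c), two kobj(9) interfaces (g_raid_md_if.m for metadata modules, g_raid_tr_if.m for transformation modules), and pluggable md_*.c / tr_*.c implementations. As a reading aid, here is a minimal, hypothetical sketch of the registration pattern a transformation module follows. The lowercase method names mirror the G_RAID_TR_* calls visible in g_raid.c below; everything containing "example" is a placeholder, G_RAID_TR_TASTE_SUCCEED is assumed from g_raid.h, and the declaration macro that links a class into g_raid_tr_classes is omitted.

/*
 * Hypothetical sketch (not part of this commit): skeleton of a
 * transformation module.  The g_raid_tr_*_t typedefs are generated
 * from g_raid_tr_if.m at build time.
 */
static g_raid_tr_taste_t g_raid_tr_taste_example;
static g_raid_tr_iostart_t g_raid_tr_iostart_example;	/* body omitted */
static g_raid_tr_iodone_t g_raid_tr_iodone_example;	/* body omitted */

static int
g_raid_tr_taste_example(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
{

	/* Claim only the RAID level this module implements. */
	if (vol->v_raid_level != G_RAID_VOLUME_RL_SINGLE)
		return (G_RAID_TR_TASTE_FAIL);
	return (G_RAID_TR_TASTE_SUCCEED);	/* assumed success constant */
}

/* The kobj method table binds g_raid_tr_if.m methods to this module. */
static kobj_method_t g_raid_tr_example_methods[] = {
	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_example),
	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_example),
	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_example),
	KOBJMETHOD_END
};

static struct g_raid_tr_class g_raid_tr_example_class = {
	"EXAMPLE",
	g_raid_tr_example_methods,
	sizeof(struct g_raid_tr_object),
};

g_raid_start_volume() in g_raid.c walks g_raid_tr_classes, creates a kobj object for each class, and keeps the first one whose taste method does not return G_RAID_TR_TASTE_FAIL.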
diff --git a/sys/conf/files b/sys/conf/files index 8af90a4..bced838 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -2115,6 +2115,19 @@ geom/part/g_part_gpt.c optional geom_part_gpt geom/part/g_part_mbr.c optional geom_part_mbr geom/part/g_part_pc98.c optional geom_part_pc98 geom/part/g_part_vtoc8.c optional geom_part_vtoc8 +geom/raid/g_raid.c optional geom_raid +geom/raid/g_raid_ctl.c optional geom_raid +geom/raid/g_raid_md_if.m optional geom_raid +geom/raid/g_raid_tr_if.m optional geom_raid +geom/raid/md_intel.c optional geom_raid +geom/raid/md_jmicron.c optional geom_raid +geom/raid/md_nvidia.c optional geom_raid +geom/raid/md_promise.c optional geom_raid +geom/raid/md_sii.c optional geom_raid +geom/raid/tr_concat.c optional geom_raid +geom/raid/tr_raid0.c optional geom_raid +geom/raid/tr_raid1.c optional geom_raid +geom/raid/tr_raid1e.c optional geom_raid geom/raid3/g_raid3.c optional geom_raid3 geom/raid3/g_raid3_ctl.c optional geom_raid3 geom/shsec/g_shsec.c optional geom_shsec diff --git a/sys/conf/options b/sys/conf/options index b3642e9..a507d69 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -102,6 +102,7 @@ GEOM_PART_MBR opt_geom.h GEOM_PART_PC98 opt_geom.h GEOM_PART_VTOC8 opt_geom.h GEOM_PC98 opt_geom.h +GEOM_RAID opt_geom.h GEOM_RAID3 opt_geom.h GEOM_SHSEC opt_geom.h GEOM_STRIPE opt_geom.h diff --git a/sys/geom/raid/g_raid.c b/sys/geom/raid/g_raid.c new file mode 100644 index 0000000..eebb360 --- /dev/null +++ b/sys/geom/raid/g_raid.c @@ -0,0 +1,2340 @@ +/*- + * Copyright (c) 2010 Alexander Motin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "g_raid_md_if.h" +#include "g_raid_tr_if.h" + +static MALLOC_DEFINE(M_RAID, "raid_data", "GEOM_RAID Data"); + +SYSCTL_DECL(_kern_geom); +SYSCTL_NODE(_kern_geom, OID_AUTO, raid, CTLFLAG_RW, 0, "GEOM_RAID stuff"); +u_int g_raid_aggressive_spare = 0; +TUNABLE_INT("kern.geom.raid.aggressive_spare", &g_raid_aggressive_spare); +SYSCTL_UINT(_kern_geom_raid, OID_AUTO, aggressive_spare, CTLFLAG_RW, + &g_raid_aggressive_spare, 0, "Use disks without metadata as spare"); +u_int g_raid_debug = 2; +TUNABLE_INT("kern.geom.raid.debug", &g_raid_debug); +SYSCTL_UINT(_kern_geom_raid, OID_AUTO, debug, CTLFLAG_RW, &g_raid_debug, 0, + "Debug level"); +int g_raid_read_err_thresh = 10; +TUNABLE_INT("kern.geom.raid.read_err_thresh", &g_raid_read_err_thresh); +SYSCTL_UINT(_kern_geom_raid, OID_AUTO, read_err_thresh, CTLFLAG_RW, + &g_raid_read_err_thresh, 0, + "Number of read errors equated to disk failure"); +u_int g_raid_start_timeout = 30; +TUNABLE_INT("kern.geom.raid.start_timeout", &g_raid_start_timeout); +SYSCTL_UINT(_kern_geom_raid, OID_AUTO, start_timeout, CTLFLAG_RW, + &g_raid_start_timeout, 0, + "Time to wait for all array components"); +static u_int g_raid_clean_time = 5; +TUNABLE_INT("kern.geom.raid.clean_time", &g_raid_clean_time); +SYSCTL_UINT(_kern_geom_raid, OID_AUTO, clean_time, CTLFLAG_RW, + &g_raid_clean_time, 0, "Mark volume as clean when idling"); +static u_int g_raid_disconnect_on_failure = 1; +TUNABLE_INT("kern.geom.raid.disconnect_on_failure", + &g_raid_disconnect_on_failure); +SYSCTL_UINT(_kern_geom_raid, OID_AUTO, disconnect_on_failure, CTLFLAG_RW, + &g_raid_disconnect_on_failure, 0, "Disconnect component on I/O failure."); +static u_int g_raid_name_format = 0; +TUNABLE_INT("kern.geom.raid.name_format", &g_raid_name_format); +SYSCTL_UINT(_kern_geom_raid, OID_AUTO, name_format, CTLFLAG_RW, + &g_raid_name_format, 0, "Providers name format."); +static u_int g_raid_idle_threshold = 1000000; +TUNABLE_INT("kern.geom.raid.idle_threshold", &g_raid_idle_threshold); +SYSCTL_UINT(_kern_geom_raid, OID_AUTO, idle_threshold, CTLFLAG_RW, + &g_raid_idle_threshold, 1000000, + "Time in microseconds to consider a volume idle."); + +#define MSLEEP(rv, ident, mtx, priority, wmesg, timeout) do { \ + G_RAID_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \ + rv = msleep((ident), (mtx), (priority), (wmesg), (timeout)); \ + G_RAID_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \ +} while (0) + +LIST_HEAD(, g_raid_md_class) g_raid_md_classes = + LIST_HEAD_INITIALIZER(g_raid_md_classes); + +LIST_HEAD(, g_raid_tr_class) g_raid_tr_classes = + LIST_HEAD_INITIALIZER(g_raid_tr_classes); + +LIST_HEAD(, g_raid_volume) g_raid_volumes = + LIST_HEAD_INITIALIZER(g_raid_volumes); + +static eventhandler_tag g_raid_pre_sync = NULL; +static int g_raid_started = 0; + +static int g_raid_destroy_geom(struct gctl_req *req, struct g_class *mp, + struct g_geom *gp); +static g_taste_t g_raid_taste; +static void g_raid_init(struct g_class *mp); +static void g_raid_fini(struct g_class *mp); + +struct g_class g_raid_class = { + .name = G_RAID_CLASS_NAME, + .version = G_VERSION, + .ctlreq = g_raid_ctl, + .taste = g_raid_taste, + .destroy_geom = g_raid_destroy_geom, + .init = g_raid_init, + .fini = g_raid_fini +}; + +static void g_raid_destroy_provider(struct g_raid_volume *vol); +static int g_raid_update_disk(struct 
g_raid_disk *disk, u_int event); +static int g_raid_update_subdisk(struct g_raid_subdisk *subdisk, u_int event); +static int g_raid_update_volume(struct g_raid_volume *vol, u_int event); +static int g_raid_update_node(struct g_raid_softc *sc, u_int event); +static void g_raid_dumpconf(struct sbuf *sb, const char *indent, + struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); +static void g_raid_start(struct bio *bp); +static void g_raid_start_request(struct bio *bp); +static void g_raid_disk_done(struct bio *bp); +static void g_raid_poll(struct g_raid_softc *sc); + +static const char * +g_raid_node_event2str(int event) +{ + + switch (event) { + case G_RAID_NODE_E_WAKE: + return ("WAKE"); + case G_RAID_NODE_E_START: + return ("START"); + default: + return ("INVALID"); + } +} + +const char * +g_raid_disk_state2str(int state) +{ + + switch (state) { + case G_RAID_DISK_S_NONE: + return ("NONE"); + case G_RAID_DISK_S_OFFLINE: + return ("OFFLINE"); + case G_RAID_DISK_S_FAILED: + return ("FAILED"); + case G_RAID_DISK_S_STALE_FAILED: + return ("STALE_FAILED"); + case G_RAID_DISK_S_SPARE: + return ("SPARE"); + case G_RAID_DISK_S_STALE: + return ("STALE"); + case G_RAID_DISK_S_ACTIVE: + return ("ACTIVE"); + default: + return ("INVALID"); + } +} + +static const char * +g_raid_disk_event2str(int event) +{ + + switch (event) { + case G_RAID_DISK_E_DISCONNECTED: + return ("DISCONNECTED"); + default: + return ("INVALID"); + } +} + +const char * +g_raid_subdisk_state2str(int state) +{ + + switch (state) { + case G_RAID_SUBDISK_S_NONE: + return ("NONE"); + case G_RAID_SUBDISK_S_FAILED: + return ("FAILED"); + case G_RAID_SUBDISK_S_NEW: + return ("NEW"); + case G_RAID_SUBDISK_S_REBUILD: + return ("REBUILD"); + case G_RAID_SUBDISK_S_UNINITIALIZED: + return ("UNINITIALIZED"); + case G_RAID_SUBDISK_S_STALE: + return ("STALE"); + case G_RAID_SUBDISK_S_RESYNC: + return ("RESYNC"); + case G_RAID_SUBDISK_S_ACTIVE: + return ("ACTIVE"); + default: + return ("INVALID"); + } +} + +static const char * +g_raid_subdisk_event2str(int event) +{ + + switch (event) { + case G_RAID_SUBDISK_E_NEW: + return ("NEW"); + case G_RAID_SUBDISK_E_DISCONNECTED: + return ("DISCONNECTED"); + default: + return ("INVALID"); + } +} + +const char * +g_raid_volume_state2str(int state) +{ + + switch (state) { + case G_RAID_VOLUME_S_STARTING: + return ("STARTING"); + case G_RAID_VOLUME_S_BROKEN: + return ("BROKEN"); + case G_RAID_VOLUME_S_DEGRADED: + return ("DEGRADED"); + case G_RAID_VOLUME_S_SUBOPTIMAL: + return ("SUBOPTIMAL"); + case G_RAID_VOLUME_S_OPTIMAL: + return ("OPTIMAL"); + case G_RAID_VOLUME_S_UNSUPPORTED: + return ("UNSUPPORTED"); + case G_RAID_VOLUME_S_STOPPED: + return ("STOPPED"); + default: + return ("INVALID"); + } +} + +static const char * +g_raid_volume_event2str(int event) +{ + + switch (event) { + case G_RAID_VOLUME_E_UP: + return ("UP"); + case G_RAID_VOLUME_E_DOWN: + return ("DOWN"); + case G_RAID_VOLUME_E_START: + return ("START"); + case G_RAID_VOLUME_E_STARTMD: + return ("STARTMD"); + default: + return ("INVALID"); + } +} + +const char * +g_raid_volume_level2str(int level, int qual) +{ + + switch (level) { + case G_RAID_VOLUME_RL_RAID0: + return ("RAID0"); + case G_RAID_VOLUME_RL_RAID1: + return ("RAID1"); + case G_RAID_VOLUME_RL_RAID3: + return ("RAID3"); + case G_RAID_VOLUME_RL_RAID4: + return ("RAID4"); + case G_RAID_VOLUME_RL_RAID5: + return ("RAID5"); + case G_RAID_VOLUME_RL_RAID6: + return ("RAID6"); + case G_RAID_VOLUME_RL_RAID1E: + return ("RAID1E"); + case G_RAID_VOLUME_RL_SINGLE: + return 
("SINGLE"); + case G_RAID_VOLUME_RL_CONCAT: + return ("CONCAT"); + case G_RAID_VOLUME_RL_RAID5E: + return ("RAID5E"); + case G_RAID_VOLUME_RL_RAID5EE: + return ("RAID5EE"); + default: + return ("UNKNOWN"); + } +} + +int +g_raid_volume_str2level(const char *str, int *level, int *qual) +{ + + *level = G_RAID_VOLUME_RL_UNKNOWN; + *qual = G_RAID_VOLUME_RLQ_NONE; + if (strcasecmp(str, "RAID0") == 0) + *level = G_RAID_VOLUME_RL_RAID0; + else if (strcasecmp(str, "RAID1") == 0) + *level = G_RAID_VOLUME_RL_RAID1; + else if (strcasecmp(str, "RAID3") == 0) + *level = G_RAID_VOLUME_RL_RAID3; + else if (strcasecmp(str, "RAID4") == 0) + *level = G_RAID_VOLUME_RL_RAID4; + else if (strcasecmp(str, "RAID5") == 0) + *level = G_RAID_VOLUME_RL_RAID5; + else if (strcasecmp(str, "RAID6") == 0) + *level = G_RAID_VOLUME_RL_RAID6; + else if (strcasecmp(str, "RAID10") == 0 || + strcasecmp(str, "RAID1E") == 0) + *level = G_RAID_VOLUME_RL_RAID1E; + else if (strcasecmp(str, "SINGLE") == 0) + *level = G_RAID_VOLUME_RL_SINGLE; + else if (strcasecmp(str, "CONCAT") == 0) + *level = G_RAID_VOLUME_RL_CONCAT; + else if (strcasecmp(str, "RAID5E") == 0) + *level = G_RAID_VOLUME_RL_RAID5E; + else if (strcasecmp(str, "RAID5EE") == 0) + *level = G_RAID_VOLUME_RL_RAID5EE; + else + return (-1); + return (0); +} + +const char * +g_raid_get_diskname(struct g_raid_disk *disk) +{ + + if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL) + return ("[unknown]"); + return (disk->d_consumer->provider->name); +} + +void +g_raid_report_disk_state(struct g_raid_disk *disk) +{ + struct g_raid_subdisk *sd; + int len, state; + uint32_t s; + + if (disk->d_consumer == NULL) + return; + if (disk->d_state == G_RAID_DISK_S_FAILED || + disk->d_state == G_RAID_DISK_S_STALE_FAILED) { + s = G_STATE_FAILED; + } else { + state = G_RAID_SUBDISK_S_ACTIVE; + TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { + if (sd->sd_state < state) + state = sd->sd_state; + } + if (state == G_RAID_SUBDISK_S_FAILED) + s = G_STATE_FAILED; + else if (state == G_RAID_SUBDISK_S_NEW || + state == G_RAID_SUBDISK_S_REBUILD) + s = G_STATE_REBUILD; + else if (state == G_RAID_SUBDISK_S_STALE || + state == G_RAID_SUBDISK_S_RESYNC) + s = G_STATE_RESYNC; + else + s = G_STATE_ACTIVE; + } + len = sizeof(s); + g_io_getattr("GEOM::setstate", disk->d_consumer, &len, &s); + G_RAID_DEBUG1(2, disk->d_softc, "Disk %s state reported as %d.", + g_raid_get_diskname(disk), s); +} + +void +g_raid_change_disk_state(struct g_raid_disk *disk, int state) +{ + + G_RAID_DEBUG1(0, disk->d_softc, "Disk %s state changed from %s to %s.", + g_raid_get_diskname(disk), + g_raid_disk_state2str(disk->d_state), + g_raid_disk_state2str(state)); + disk->d_state = state; + g_raid_report_disk_state(disk); +} + +void +g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state) +{ + + G_RAID_DEBUG1(0, sd->sd_softc, + "Subdisk %s:%d-%s state changed from %s to %s.", + sd->sd_volume->v_name, sd->sd_pos, + sd->sd_disk ? 
g_raid_get_diskname(sd->sd_disk) : "[none]", + g_raid_subdisk_state2str(sd->sd_state), + g_raid_subdisk_state2str(state)); + sd->sd_state = state; + if (sd->sd_disk) + g_raid_report_disk_state(sd->sd_disk); +} + +void +g_raid_change_volume_state(struct g_raid_volume *vol, int state) +{ + + G_RAID_DEBUG1(0, vol->v_softc, + "Volume %s state changed from %s to %s.", + vol->v_name, + g_raid_volume_state2str(vol->v_state), + g_raid_volume_state2str(state)); + vol->v_state = state; +} + +/* + * --- Events handling functions --- + * Events in geom_raid are used to maintain subdisks and volumes status + * from one thread to simplify locking. + */ +static void +g_raid_event_free(struct g_raid_event *ep) +{ + + free(ep, M_RAID); +} + +int +g_raid_event_send(void *arg, int event, int flags) +{ + struct g_raid_softc *sc; + struct g_raid_event *ep; + int error; + + if ((flags & G_RAID_EVENT_VOLUME) != 0) { + sc = ((struct g_raid_volume *)arg)->v_softc; + } else if ((flags & G_RAID_EVENT_DISK) != 0) { + sc = ((struct g_raid_disk *)arg)->d_softc; + } else if ((flags & G_RAID_EVENT_SUBDISK) != 0) { + sc = ((struct g_raid_subdisk *)arg)->sd_softc; + } else { + sc = arg; + } + ep = malloc(sizeof(*ep), M_RAID, + sx_xlocked(&sc->sc_lock) ? M_WAITOK : M_NOWAIT); + if (ep == NULL) + return (ENOMEM); + ep->e_tgt = arg; + ep->e_event = event; + ep->e_flags = flags; + ep->e_error = 0; + G_RAID_DEBUG1(4, sc, "Sending event %p. Waking up %p.", ep, sc); + mtx_lock(&sc->sc_queue_mtx); + TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next); + mtx_unlock(&sc->sc_queue_mtx); + wakeup(sc); + + if ((flags & G_RAID_EVENT_WAIT) == 0) + return (0); + + sx_assert(&sc->sc_lock, SX_XLOCKED); + G_RAID_DEBUG1(4, sc, "Sleeping on %p.", ep); + sx_xunlock(&sc->sc_lock); + while ((ep->e_flags & G_RAID_EVENT_DONE) == 0) { + mtx_lock(&sc->sc_queue_mtx); + MSLEEP(error, ep, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:event", + hz * 5); + } + error = ep->e_error; + g_raid_event_free(ep); + sx_xlock(&sc->sc_lock); + return (error); +} + +static void +g_raid_event_cancel(struct g_raid_softc *sc, void *tgt) +{ + struct g_raid_event *ep, *tmpep; + + sx_assert(&sc->sc_lock, SX_XLOCKED); + + mtx_lock(&sc->sc_queue_mtx); + TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { + if (ep->e_tgt != tgt) + continue; + TAILQ_REMOVE(&sc->sc_events, ep, e_next); + if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) + g_raid_event_free(ep); + else { + ep->e_error = ECANCELED; + wakeup(ep); + } + } + mtx_unlock(&sc->sc_queue_mtx); +} + +static int +g_raid_event_check(struct g_raid_softc *sc, void *tgt) +{ + struct g_raid_event *ep; + int res = 0; + + sx_assert(&sc->sc_lock, SX_XLOCKED); + + mtx_lock(&sc->sc_queue_mtx); + TAILQ_FOREACH(ep, &sc->sc_events, e_next) { + if (ep->e_tgt != tgt) + continue; + res = 1; + break; + } + mtx_unlock(&sc->sc_queue_mtx); + return (res); +} + +/* + * Return the number of disks in given state. + * If state is equal to -1, count all connected disks. + */ +u_int +g_raid_ndisks(struct g_raid_softc *sc, int state) +{ + struct g_raid_disk *disk; + u_int n; + + sx_assert(&sc->sc_lock, SX_LOCKED); + + n = 0; + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_state == state || state == -1) + n++; + } + return (n); +} + +/* + * Return the number of subdisks in given state. + * If state is equal to -1, count all connected disks. 
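+ * For example, g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) returns the
+ * number of fully synchronized members, which a transformation module can
+ * consult when deciding whether a volume is OPTIMAL or DEGRADED.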
+ */ +u_int +g_raid_nsubdisks(struct g_raid_volume *vol, int state) +{ + struct g_raid_subdisk *subdisk; + struct g_raid_softc *sc; + u_int i, n ; + + sc = vol->v_softc; + sx_assert(&sc->sc_lock, SX_LOCKED); + + n = 0; + for (i = 0; i < vol->v_disks_count; i++) { + subdisk = &vol->v_subdisks[i]; + if ((state == -1 && + subdisk->sd_state != G_RAID_SUBDISK_S_NONE) || + subdisk->sd_state == state) + n++; + } + return (n); +} + +/* + * Return the first subdisk in given state. + * If state is equal to -1, then the first connected disks. + */ +struct g_raid_subdisk * +g_raid_get_subdisk(struct g_raid_volume *vol, int state) +{ + struct g_raid_subdisk *sd; + struct g_raid_softc *sc; + u_int i; + + sc = vol->v_softc; + sx_assert(&sc->sc_lock, SX_LOCKED); + + for (i = 0; i < vol->v_disks_count; i++) { + sd = &vol->v_subdisks[i]; + if ((state == -1 && + sd->sd_state != G_RAID_SUBDISK_S_NONE) || + sd->sd_state == state) + return (sd); + } + return (NULL); +} + +struct g_consumer * +g_raid_open_consumer(struct g_raid_softc *sc, const char *name) +{ + struct g_consumer *cp; + struct g_provider *pp; + + g_topology_assert(); + + if (strncmp(name, "/dev/", 5) == 0) + name += 5; + pp = g_provider_by_name(name); + if (pp == NULL) + return (NULL); + cp = g_new_consumer(sc->sc_geom); + if (g_attach(cp, pp) != 0) { + g_destroy_consumer(cp); + return (NULL); + } + if (g_access(cp, 1, 1, 1) != 0) { + g_detach(cp); + g_destroy_consumer(cp); + return (NULL); + } + return (cp); +} + +static u_int +g_raid_nrequests(struct g_raid_softc *sc, struct g_consumer *cp) +{ + struct bio *bp; + u_int nreqs = 0; + + mtx_lock(&sc->sc_queue_mtx); + TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { + if (bp->bio_from == cp) + nreqs++; + } + mtx_unlock(&sc->sc_queue_mtx); + return (nreqs); +} + +u_int +g_raid_nopens(struct g_raid_softc *sc) +{ + struct g_raid_volume *vol; + u_int opens; + + opens = 0; + TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { + if (vol->v_provider_open != 0) + opens++; + } + return (opens); +} + +static int +g_raid_consumer_is_busy(struct g_raid_softc *sc, struct g_consumer *cp) +{ + + if (cp->index > 0) { + G_RAID_DEBUG1(2, sc, + "I/O requests for %s exist, can't destroy it now.", + cp->provider->name); + return (1); + } + if (g_raid_nrequests(sc, cp) > 0) { + G_RAID_DEBUG1(2, sc, + "I/O requests for %s in queue, can't destroy it now.", + cp->provider->name); + return (1); + } + return (0); +} + +static void +g_raid_destroy_consumer(void *arg, int flags __unused) +{ + struct g_consumer *cp; + + g_topology_assert(); + + cp = arg; + G_RAID_DEBUG(1, "Consumer %s destroyed.", cp->provider->name); + g_detach(cp); + g_destroy_consumer(cp); +} + +void +g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp) +{ + struct g_provider *pp; + int retaste_wait; + + g_topology_assert_not(); + + g_topology_lock(); + cp->private = NULL; + if (g_raid_consumer_is_busy(sc, cp)) + goto out; + pp = cp->provider; + retaste_wait = 0; + if (cp->acw == 1) { + if ((pp->geom->flags & G_GEOM_WITHER) == 0) + retaste_wait = 1; + } + if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) + g_access(cp, -cp->acr, -cp->acw, -cp->ace); + if (retaste_wait) { + /* + * After retaste event was send (inside g_access()), we can send + * event to detach and destroy consumer. + * A class, which has consumer to the given provider connected + * will not receive retaste event for the provider. 
+ * This is the way how I ignore retaste events when I close + * consumers opened for write: I detach and destroy consumer + * after retaste event is sent. + */ + g_post_event(g_raid_destroy_consumer, cp, M_WAITOK, NULL); + goto out; + } + G_RAID_DEBUG(1, "Consumer %s destroyed.", pp->name); + g_detach(cp); + g_destroy_consumer(cp); +out: + g_topology_unlock(); +} + +static void +g_raid_orphan(struct g_consumer *cp) +{ + struct g_raid_disk *disk; + + g_topology_assert(); + + disk = cp->private; + if (disk == NULL) + return; + g_raid_event_send(disk, G_RAID_DISK_E_DISCONNECTED, + G_RAID_EVENT_DISK); +} + +static int +g_raid_clean(struct g_raid_volume *vol, int acw) +{ + struct g_raid_softc *sc; + int timeout; + + sc = vol->v_softc; + g_topology_assert_not(); + sx_assert(&sc->sc_lock, SX_XLOCKED); + +// if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0) +// return (0); + if (!vol->v_dirty) + return (0); + if (vol->v_writes > 0) + return (0); + if (acw > 0 || (acw == -1 && + vol->v_provider != NULL && vol->v_provider->acw > 0)) { + timeout = g_raid_clean_time - (time_uptime - vol->v_last_write); + if (timeout > 0) + return (timeout); + } + vol->v_dirty = 0; + G_RAID_DEBUG1(1, sc, "Volume %s marked as clean.", + vol->v_name); + g_raid_write_metadata(sc, vol, NULL, NULL); + return (0); +} + +static void +g_raid_dirty(struct g_raid_volume *vol) +{ + struct g_raid_softc *sc; + + sc = vol->v_softc; + g_topology_assert_not(); + sx_assert(&sc->sc_lock, SX_XLOCKED); + +// if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0) +// return; + vol->v_dirty = 1; + G_RAID_DEBUG1(1, sc, "Volume %s marked as dirty.", + vol->v_name); + g_raid_write_metadata(sc, vol, NULL, NULL); +} + +void +g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct bio_queue_head queue; + struct bio *cbp; + int i; + + vol = tr->tro_volume; + sc = vol->v_softc; + + /* + * Allocate all bios before sending any request, so we can return + * ENOMEM in nice and clean way. 
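+ * If any clone allocation fails, the failure path below frees the clones
+ * already allocated and completes the original request with ENOMEM rather
+ * than leaving it partially issued.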
+ */ + bioq_init(&queue); + for (i = 0; i < vol->v_disks_count; i++) { + sd = &vol->v_subdisks[i]; + if (sd->sd_state == G_RAID_SUBDISK_S_NONE || + sd->sd_state == G_RAID_SUBDISK_S_FAILED) + continue; + cbp = g_clone_bio(bp); + if (cbp == NULL) + goto failure; + cbp->bio_caller1 = sd; + bioq_insert_tail(&queue, cbp); + } + for (cbp = bioq_first(&queue); cbp != NULL; + cbp = bioq_first(&queue)) { + bioq_remove(&queue, cbp); + sd = cbp->bio_caller1; + cbp->bio_caller1 = NULL; + g_raid_subdisk_iostart(sd, cbp); + } + return; +failure: + for (cbp = bioq_first(&queue); cbp != NULL; + cbp = bioq_first(&queue)) { + bioq_remove(&queue, cbp); + g_destroy_bio(cbp); + } + if (bp->bio_error == 0) + bp->bio_error = ENOMEM; + g_raid_iodone(bp, bp->bio_error); +} + +static void +g_raid_tr_kerneldump_common_done(struct bio *bp) +{ + + bp->bio_flags |= BIO_DONE; +} + +int +g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr, + void *virtual, vm_offset_t physical, off_t offset, size_t length) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol; + struct bio bp; + + vol = tr->tro_volume; + sc = vol->v_softc; + + bzero(&bp, sizeof(bp)); + bp.bio_cmd = BIO_WRITE; + bp.bio_done = g_raid_tr_kerneldump_common_done; + bp.bio_attribute = NULL; + bp.bio_offset = offset; + bp.bio_length = length; + bp.bio_data = virtual; + bp.bio_to = vol->v_provider; + + g_raid_start(&bp); + while (!(bp.bio_flags & BIO_DONE)) { + G_RAID_DEBUG1(4, sc, "Poll..."); + g_raid_poll(sc); + DELAY(10); + } + + return (bp.bio_error != 0 ? EIO : 0); +} + +static int +g_raid_dump(void *arg, + void *virtual, vm_offset_t physical, off_t offset, size_t length) +{ + struct g_raid_volume *vol; + int error; + + vol = (struct g_raid_volume *)arg; + G_RAID_DEBUG1(3, vol->v_softc, "Dumping at off %llu len %llu.", + (long long unsigned)offset, (long long unsigned)length); + + error = G_RAID_TR_KERNELDUMP(vol->v_tr, + virtual, physical, offset, length); + return (error); +} + +static void +g_raid_kerneldump(struct g_raid_softc *sc, struct bio *bp) +{ + struct g_kerneldump *gkd; + struct g_provider *pp; + struct g_raid_volume *vol; + + gkd = (struct g_kerneldump*)bp->bio_data; + pp = bp->bio_to; + vol = pp->private; + g_trace(G_T_TOPOLOGY, "g_raid_kerneldump(%s, %jd, %jd)", + pp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length); + gkd->di.dumper = g_raid_dump; + gkd->di.priv = vol; + gkd->di.blocksize = vol->v_sectorsize; + gkd->di.maxiosize = DFLTPHYS; + gkd->di.mediaoffset = gkd->offset; + if ((gkd->offset + gkd->length) > vol->v_mediasize) + gkd->length = vol->v_mediasize - gkd->offset; + gkd->di.mediasize = gkd->length; + g_io_deliver(bp, 0); +} + +static void +g_raid_start(struct bio *bp) +{ + struct g_raid_softc *sc; + + sc = bp->bio_to->geom->softc; + /* + * If sc == NULL or there are no valid disks, provider's error + * should be set and g_raid_start() should not be called at all. 
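+ * Nothing is served from this context itself: supported commands are
+ * queued for the worker thread (or picked up by g_raid_poll() while
+ * dumping); only the GEOM::kerneldump attribute is answered in place.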
+ */ +// KASSERT(sc != NULL && sc->sc_state == G_RAID_VOLUME_S_RUNNING, +// ("Provider's error should be set (error=%d)(mirror=%s).", +// bp->bio_to->error, bp->bio_to->name)); + G_RAID_LOGREQ(3, bp, "Request received."); + + switch (bp->bio_cmd) { + case BIO_READ: + case BIO_WRITE: + case BIO_DELETE: + case BIO_FLUSH: + break; + case BIO_GETATTR: + if (!strcmp(bp->bio_attribute, "GEOM::kerneldump")) + g_raid_kerneldump(sc, bp); + else + g_io_deliver(bp, EOPNOTSUPP); + return; + default: + g_io_deliver(bp, EOPNOTSUPP); + return; + } + mtx_lock(&sc->sc_queue_mtx); + bioq_disksort(&sc->sc_queue, bp); + mtx_unlock(&sc->sc_queue_mtx); + if (!dumping) { + G_RAID_DEBUG1(4, sc, "Waking up %p.", sc); + wakeup(sc); + } +} + +static int +g_raid_bio_overlaps(const struct bio *bp, off_t lstart, off_t len) +{ + /* + * 5 cases: + * (1) bp entirely below NO + * (2) bp entirely above NO + * (3) bp start below, but end in range YES + * (4) bp entirely within YES + * (5) bp starts within, ends above YES + * + * lock range 10-19 (offset 10 length 10) + * (1) 1-5: first if kicks it out + * (2) 30-35: second if kicks it out + * (3) 5-15: passes both ifs + * (4) 12-14: passes both ifs + * (5) 19-20: passes both + */ + off_t lend = lstart + len - 1; + off_t bstart = bp->bio_offset; + off_t bend = bp->bio_offset + bp->bio_length - 1; + + if (bend < lstart) + return (0); + if (lend < bstart) + return (0); + return (1); +} + +static int +g_raid_is_in_locked_range(struct g_raid_volume *vol, const struct bio *bp) +{ + struct g_raid_lock *lp; + + sx_assert(&vol->v_softc->sc_lock, SX_LOCKED); + + LIST_FOREACH(lp, &vol->v_locks, l_next) { + if (g_raid_bio_overlaps(bp, lp->l_offset, lp->l_length)) + return (1); + } + return (0); +} + +static void +g_raid_start_request(struct bio *bp) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol; + + sc = bp->bio_to->geom->softc; + sx_assert(&sc->sc_lock, SX_LOCKED); + vol = bp->bio_to->private; + + /* + * Check to see if this item is in a locked range. If so, + * queue it to our locked queue and return. We'll requeue + * it when the range is unlocked. Internal I/O for the + * rebuild/rescan/recovery process is excluded from this + * check so we can actually do the recovery. + */ + if (!(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL) && + g_raid_is_in_locked_range(vol, bp)) { + G_RAID_LOGREQ(3, bp, "Defer request."); + bioq_insert_tail(&vol->v_locked, bp); + return; + } + + /* + * If we're actually going to do the write/delete, then + * update the idle stats for the volume. + */ + if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) { + if (!vol->v_dirty) + g_raid_dirty(vol); + vol->v_writes++; + } + + /* + * Put request onto inflight queue, so we can check if new + * synchronization requests don't collide with it. Then tell + * the transformation layer to start the I/O. 
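+ * A mirroring transformation, for example, may clone such a request once
+ * per active subdisk and submit the pieces through g_raid_subdisk_iostart().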
+ */ + bioq_insert_tail(&vol->v_inflight, bp); + G_RAID_LOGREQ(4, bp, "Request started"); + G_RAID_TR_IOSTART(vol->v_tr, bp); +} + +static void +g_raid_finish_with_locked_ranges(struct g_raid_volume *vol, struct bio *bp) +{ + off_t off, len; + struct bio *nbp; + struct g_raid_lock *lp; + + vol->v_pending_lock = 0; + LIST_FOREACH(lp, &vol->v_locks, l_next) { + if (lp->l_pending) { + off = lp->l_offset; + len = lp->l_length; + lp->l_pending = 0; + TAILQ_FOREACH(nbp, &vol->v_inflight.queue, bio_queue) { + if (g_raid_bio_overlaps(nbp, off, len)) + lp->l_pending++; + } + if (lp->l_pending) { + vol->v_pending_lock = 1; + G_RAID_DEBUG1(4, vol->v_softc, + "Deferred lock(%jd, %jd) has %d pending", + (intmax_t)off, (intmax_t)(off + len), + lp->l_pending); + continue; + } + G_RAID_DEBUG1(4, vol->v_softc, + "Deferred lock of %jd to %jd completed", + (intmax_t)off, (intmax_t)(off + len)); + G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg); + } + } +} + +void +g_raid_iodone(struct bio *bp, int error) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol; + + sc = bp->bio_to->geom->softc; + sx_assert(&sc->sc_lock, SX_LOCKED); + vol = bp->bio_to->private; + G_RAID_LOGREQ(3, bp, "Request done: %d.", error); + + /* Update stats if we done write/delete. */ + if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) { + vol->v_writes--; + vol->v_last_write = time_uptime; + } + + bioq_remove(&vol->v_inflight, bp); + if (vol->v_pending_lock && g_raid_is_in_locked_range(vol, bp)) + g_raid_finish_with_locked_ranges(vol, bp); + getmicrouptime(&vol->v_last_done); + g_io_deliver(bp, error); +} + +int +g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len, + struct bio *ignore, void *argp) +{ + struct g_raid_softc *sc; + struct g_raid_lock *lp; + struct bio *bp; + + sc = vol->v_softc; + lp = malloc(sizeof(*lp), M_RAID, M_WAITOK | M_ZERO); + LIST_INSERT_HEAD(&vol->v_locks, lp, l_next); + lp->l_offset = off; + lp->l_length = len; + lp->l_callback_arg = argp; + + lp->l_pending = 0; + TAILQ_FOREACH(bp, &vol->v_inflight.queue, bio_queue) { + if (bp != ignore && g_raid_bio_overlaps(bp, off, len)) + lp->l_pending++; + } + + /* + * If there are any writes that are pending, we return EBUSY. All + * callers will have to wait until all pending writes clear. + */ + if (lp->l_pending > 0) { + vol->v_pending_lock = 1; + G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd deferred %d pend", + (intmax_t)off, (intmax_t)(off+len), lp->l_pending); + return (EBUSY); + } + G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd", + (intmax_t)off, (intmax_t)(off+len)); + G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg); + return (0); +} + +int +g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len) +{ + struct g_raid_lock *lp; + struct g_raid_softc *sc; + struct bio *bp; + + sc = vol->v_softc; + LIST_FOREACH(lp, &vol->v_locks, l_next) { + if (lp->l_offset == off && lp->l_length == len) { + LIST_REMOVE(lp, l_next); + /* XXX + * Right now we just put them all back on the queue + * and hope for the best. We hope this because any + * locked ranges will go right back on this list + * when the worker thread runs. 
+ * XXX + */ + G_RAID_DEBUG1(4, sc, "Unlocked %jd to %jd", + (intmax_t)lp->l_offset, + (intmax_t)(lp->l_offset+lp->l_length)); + mtx_lock(&sc->sc_queue_mtx); + while ((bp = bioq_takefirst(&vol->v_locked)) != NULL) + bioq_disksort(&sc->sc_queue, bp); + mtx_unlock(&sc->sc_queue_mtx); + free(lp, M_RAID); + return (0); + } + } + return (EINVAL); +} + +void +g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp) +{ + struct g_consumer *cp; + struct g_raid_disk *disk, *tdisk; + + bp->bio_caller1 = sd; + + /* + * Make sure that the disk is present. Generally it is a task of + * transformation layers to not send requests to absent disks, but + * it is better to be safe and report situation then sorry. + */ + if (sd->sd_disk == NULL) { + G_RAID_LOGREQ(0, bp, "Warning! I/O request to an absent disk!"); +nodisk: + bp->bio_from = NULL; + bp->bio_to = NULL; + bp->bio_error = ENXIO; + g_raid_disk_done(bp); + return; + } + disk = sd->sd_disk; + if (disk->d_state != G_RAID_DISK_S_ACTIVE && + disk->d_state != G_RAID_DISK_S_FAILED) { + G_RAID_LOGREQ(0, bp, "Warning! I/O request to a disk in a " + "wrong state (%s)!", g_raid_disk_state2str(disk->d_state)); + goto nodisk; + } + + cp = disk->d_consumer; + bp->bio_from = cp; + bp->bio_to = cp->provider; + cp->index++; + + /* Update average disks load. */ + TAILQ_FOREACH(tdisk, &sd->sd_softc->sc_disks, d_next) { + if (tdisk->d_consumer == NULL) + tdisk->d_load = 0; + else + tdisk->d_load = (tdisk->d_consumer->index * + G_RAID_SUBDISK_LOAD_SCALE + tdisk->d_load * 7) / 8; + } + + disk->d_last_offset = bp->bio_offset + bp->bio_length; + if (dumping) { + G_RAID_LOGREQ(3, bp, "Sending dumping request."); + if (bp->bio_cmd == BIO_WRITE) { + bp->bio_error = g_raid_subdisk_kerneldump(sd, + bp->bio_data, 0, bp->bio_offset, bp->bio_length); + } else + bp->bio_error = EOPNOTSUPP; + g_raid_disk_done(bp); + } else { + bp->bio_done = g_raid_disk_done; + bp->bio_offset += sd->sd_offset; + G_RAID_LOGREQ(3, bp, "Sending request."); + g_io_request(bp, cp); + } +} + +int +g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd, + void *virtual, vm_offset_t physical, off_t offset, size_t length) +{ + + if (sd->sd_disk == NULL) + return (ENXIO); + if (sd->sd_disk->d_kd.di.dumper == NULL) + return (EOPNOTSUPP); + return (dump_write(&sd->sd_disk->d_kd.di, + virtual, physical, + sd->sd_disk->d_kd.di.mediaoffset + sd->sd_offset + offset, + length)); +} + +static void +g_raid_disk_done(struct bio *bp) +{ + struct g_raid_softc *sc; + struct g_raid_subdisk *sd; + + sd = bp->bio_caller1; + sc = sd->sd_softc; + mtx_lock(&sc->sc_queue_mtx); + bioq_disksort(&sc->sc_queue, bp); + mtx_unlock(&sc->sc_queue_mtx); + if (!dumping) + wakeup(sc); +} + +static void +g_raid_disk_done_request(struct bio *bp) +{ + struct g_raid_softc *sc; + struct g_raid_disk *disk; + struct g_raid_subdisk *sd; + struct g_raid_volume *vol; + + g_topology_assert_not(); + + G_RAID_LOGREQ(3, bp, "Disk request done: %d.", bp->bio_error); + sd = bp->bio_caller1; + sc = sd->sd_softc; + vol = sd->sd_volume; + if (bp->bio_from != NULL) { + bp->bio_from->index--; + disk = bp->bio_from->private; + if (disk == NULL) + g_raid_kill_consumer(sc, bp->bio_from); + } + bp->bio_offset -= sd->sd_offset; + + G_RAID_TR_IODONE(vol->v_tr, sd, bp); +} + +static void +g_raid_handle_event(struct g_raid_softc *sc, struct g_raid_event *ep) +{ + + if ((ep->e_flags & G_RAID_EVENT_VOLUME) != 0) + ep->e_error = g_raid_update_volume(ep->e_tgt, ep->e_event); + else if ((ep->e_flags & G_RAID_EVENT_DISK) != 0) + ep->e_error = 
g_raid_update_disk(ep->e_tgt, ep->e_event); + else if ((ep->e_flags & G_RAID_EVENT_SUBDISK) != 0) + ep->e_error = g_raid_update_subdisk(ep->e_tgt, ep->e_event); + else + ep->e_error = g_raid_update_node(ep->e_tgt, ep->e_event); + if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) { + KASSERT(ep->e_error == 0, + ("Error cannot be handled.")); + g_raid_event_free(ep); + } else { + ep->e_flags |= G_RAID_EVENT_DONE; + G_RAID_DEBUG1(4, sc, "Waking up %p.", ep); + mtx_lock(&sc->sc_queue_mtx); + wakeup(ep); + mtx_unlock(&sc->sc_queue_mtx); + } +} + +/* + * Worker thread. + */ +static void +g_raid_worker(void *arg) +{ + struct g_raid_softc *sc; + struct g_raid_event *ep; + struct g_raid_volume *vol; + struct bio *bp; + struct timeval now, t; + int timeout, rv; + + sc = arg; + thread_lock(curthread); + sched_prio(curthread, PRIBIO); + thread_unlock(curthread); + + sx_xlock(&sc->sc_lock); + for (;;) { + mtx_lock(&sc->sc_queue_mtx); + /* + * First take a look at events. + * This is important to handle events before any I/O requests. + */ + bp = NULL; + vol = NULL; + rv = 0; + ep = TAILQ_FIRST(&sc->sc_events); + if (ep != NULL) + TAILQ_REMOVE(&sc->sc_events, ep, e_next); + else if ((bp = bioq_takefirst(&sc->sc_queue)) != NULL) + ; + else { + getmicrouptime(&now); + t = now; + TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { + if (bioq_first(&vol->v_inflight) == NULL && + vol->v_tr && + timevalcmp(&vol->v_last_done, &t, < )) + t = vol->v_last_done; + } + timevalsub(&t, &now); + timeout = g_raid_idle_threshold + + t.tv_sec * 1000000 + t.tv_usec; + if (timeout > 0) { + /* + * Two steps to avoid overflows at HZ=1000 + * and idle timeouts > 2.1s. Some rounding + * errors can occur, but they are < 1tick, + * which is deemed to be close enough for + * this purpose. + */ + int micpertic = 1000000 / hz; + timeout = (timeout + micpertic - 1) / micpertic; + sx_xunlock(&sc->sc_lock); + MSLEEP(rv, sc, &sc->sc_queue_mtx, + PRIBIO | PDROP, "-", timeout); + sx_xlock(&sc->sc_lock); + goto process; + } else + rv = EWOULDBLOCK; + } + mtx_unlock(&sc->sc_queue_mtx); +process: + if (ep != NULL) { + g_raid_handle_event(sc, ep); + } else if (bp != NULL) { + if (bp->bio_to != NULL && + bp->bio_to->geom == sc->sc_geom) + g_raid_start_request(bp); + else + g_raid_disk_done_request(bp); + } else if (rv == EWOULDBLOCK) { + TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { + if (vol->v_writes == 0 && vol->v_dirty) + g_raid_clean(vol, -1); + if (bioq_first(&vol->v_inflight) == NULL && + vol->v_tr) { + t.tv_sec = g_raid_idle_threshold / 1000000; + t.tv_usec = g_raid_idle_threshold % 1000000; + timevaladd(&t, &vol->v_last_done); + getmicrouptime(&now); + if (timevalcmp(&t, &now, <= )) { + G_RAID_TR_IDLE(vol->v_tr); + vol->v_last_done = now; + } + } + } + } + if (sc->sc_stopping == G_RAID_DESTROY_HARD) + g_raid_destroy_node(sc, 1); /* May not return. */ + } +} + +static void +g_raid_poll(struct g_raid_softc *sc) +{ + struct g_raid_event *ep; + struct bio *bp; + + sx_xlock(&sc->sc_lock); + mtx_lock(&sc->sc_queue_mtx); + /* + * First take a look at events. + * This is important to handle events before any I/O requests. 
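+ * Unlike the worker loop above, this polled variant never sleeps and
+ * services at most one event or one queued request per call; it exists
+ * for the kernel dump path, where normal thread scheduling is unavailable.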
+ */ + ep = TAILQ_FIRST(&sc->sc_events); + if (ep != NULL) { + TAILQ_REMOVE(&sc->sc_events, ep, e_next); + mtx_unlock(&sc->sc_queue_mtx); + g_raid_handle_event(sc, ep); + goto out; + } + bp = bioq_takefirst(&sc->sc_queue); + if (bp != NULL) { + mtx_unlock(&sc->sc_queue_mtx); + if (bp->bio_from == NULL || + bp->bio_from->geom != sc->sc_geom) + g_raid_start_request(bp); + else + g_raid_disk_done_request(bp); + } +out: + sx_xunlock(&sc->sc_lock); +} + +static void +g_raid_launch_provider(struct g_raid_volume *vol) +{ + struct g_raid_disk *disk; + struct g_raid_softc *sc; + struct g_provider *pp; + char name[G_RAID_MAX_VOLUMENAME]; + off_t off; + + sc = vol->v_softc; + sx_assert(&sc->sc_lock, SX_LOCKED); + + g_topology_lock(); + /* Try to name provider with volume name. */ + snprintf(name, sizeof(name), "raid/%s", vol->v_name); + if (g_raid_name_format == 0 || vol->v_name[0] == 0 || + g_provider_by_name(name) != NULL) { + /* Otherwise use sequential volume number. */ + snprintf(name, sizeof(name), "raid/r%d", vol->v_global_id); + } + pp = g_new_providerf(sc->sc_geom, "%s", name); + pp->private = vol; + pp->mediasize = vol->v_mediasize; + pp->sectorsize = vol->v_sectorsize; + pp->stripesize = 0; + pp->stripeoffset = 0; + if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || + vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 || + vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE || + vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT) { + if ((disk = vol->v_subdisks[0].sd_disk) != NULL && + disk->d_consumer != NULL && + disk->d_consumer->provider != NULL) { + pp->stripesize = disk->d_consumer->provider->stripesize; + off = disk->d_consumer->provider->stripeoffset; + pp->stripeoffset = off + vol->v_subdisks[0].sd_offset; + if (off > 0) + pp->stripeoffset %= off; + } + if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3) { + pp->stripesize *= (vol->v_disks_count - 1); + pp->stripeoffset *= (vol->v_disks_count - 1); + } + } else + pp->stripesize = vol->v_strip_size; + vol->v_provider = pp; + g_error_provider(pp, 0); + g_topology_unlock(); + G_RAID_DEBUG1(0, sc, "Provider %s for volume %s created.", + pp->name, vol->v_name); +} + +static void +g_raid_destroy_provider(struct g_raid_volume *vol) +{ + struct g_raid_softc *sc; + struct g_provider *pp; + struct bio *bp, *tmp; + + g_topology_assert_not(); + sc = vol->v_softc; + pp = vol->v_provider; + KASSERT(pp != NULL, ("NULL provider (volume=%s).", vol->v_name)); + + g_topology_lock(); + g_error_provider(pp, ENXIO); + mtx_lock(&sc->sc_queue_mtx); + TAILQ_FOREACH_SAFE(bp, &sc->sc_queue.queue, bio_queue, tmp) { + if (bp->bio_to != pp) + continue; + bioq_remove(&sc->sc_queue, bp); + g_io_deliver(bp, ENXIO); + } + mtx_unlock(&sc->sc_queue_mtx); + G_RAID_DEBUG1(0, sc, "Provider %s for volume %s destroyed.", + pp->name, vol->v_name); + g_wither_provider(pp, ENXIO); + g_topology_unlock(); + vol->v_provider = NULL; +} + +/* + * Update device state. 
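+ * Volume events either create or destroy the volume's provider (UP/DOWN),
+ * start the transformation module (START), or are passed on to the
+ * metadata module.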
+ */ +static int +g_raid_update_volume(struct g_raid_volume *vol, u_int event) +{ + struct g_raid_softc *sc; + + sc = vol->v_softc; + sx_assert(&sc->sc_lock, SX_XLOCKED); + + G_RAID_DEBUG1(2, sc, "Event %s for volume %s.", + g_raid_volume_event2str(event), + vol->v_name); + switch (event) { + case G_RAID_VOLUME_E_DOWN: + if (vol->v_provider != NULL) + g_raid_destroy_provider(vol); + break; + case G_RAID_VOLUME_E_UP: + if (vol->v_provider == NULL) + g_raid_launch_provider(vol); + break; + case G_RAID_VOLUME_E_START: + if (vol->v_tr) + G_RAID_TR_START(vol->v_tr); + return (0); + default: + if (sc->sc_md) + G_RAID_MD_VOLUME_EVENT(sc->sc_md, vol, event); + return (0); + } + + /* Manage root mount release. */ + if (vol->v_starting) { + vol->v_starting = 0; + G_RAID_DEBUG1(1, sc, "root_mount_rel %p", vol->v_rootmount); + root_mount_rel(vol->v_rootmount); + vol->v_rootmount = NULL; + } + if (vol->v_stopping && vol->v_provider_open == 0) + g_raid_destroy_volume(vol); + return (0); +} + +/* + * Update subdisk state. + */ +static int +g_raid_update_subdisk(struct g_raid_subdisk *sd, u_int event) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol; + + sc = sd->sd_softc; + vol = sd->sd_volume; + sx_assert(&sc->sc_lock, SX_XLOCKED); + + G_RAID_DEBUG1(2, sc, "Event %s for subdisk %s:%d-%s.", + g_raid_subdisk_event2str(event), + vol->v_name, sd->sd_pos, + sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); + if (vol->v_tr) + G_RAID_TR_EVENT(vol->v_tr, sd, event); + + return (0); +} + +/* + * Update disk state. + */ +static int +g_raid_update_disk(struct g_raid_disk *disk, u_int event) +{ + struct g_raid_softc *sc; + + sc = disk->d_softc; + sx_assert(&sc->sc_lock, SX_XLOCKED); + + G_RAID_DEBUG1(2, sc, "Event %s for disk %s.", + g_raid_disk_event2str(event), + g_raid_get_diskname(disk)); + + if (sc->sc_md) + G_RAID_MD_EVENT(sc->sc_md, disk, event); + return (0); +} + +/* + * Node event. + */ +static int +g_raid_update_node(struct g_raid_softc *sc, u_int event) +{ + sx_assert(&sc->sc_lock, SX_XLOCKED); + + G_RAID_DEBUG1(2, sc, "Event %s for the array.", + g_raid_node_event2str(event)); + + if (event == G_RAID_NODE_E_WAKE) + return (0); + if (sc->sc_md) + G_RAID_MD_EVENT(sc->sc_md, NULL, event); + return (0); +} + +static int +g_raid_access(struct g_provider *pp, int acr, int acw, int ace) +{ + struct g_raid_volume *vol; + struct g_raid_softc *sc; + int dcr, dcw, dce, opens, error = 0; + + g_topology_assert(); + sc = pp->geom->softc; + vol = pp->private; + KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name)); + KASSERT(vol != NULL, ("NULL volume (provider=%s).", pp->name)); + + G_RAID_DEBUG1(2, sc, "Access request for %s: r%dw%de%d.", pp->name, + acr, acw, ace); + + dcr = pp->acr + acr; + dcw = pp->acw + acw; + dce = pp->ace + ace; + + g_topology_unlock(); + sx_xlock(&sc->sc_lock); + /* Deny new opens while dying. */ + if (sc->sc_stopping != 0 && (acr > 0 || acw > 0 || ace > 0)) { + error = ENXIO; + goto out; + } + if (dcw == 0 && vol->v_dirty) + g_raid_clean(vol, dcw); + vol->v_provider_open += acr + acw + ace; + /* Handle delayed node destruction. */ + if (sc->sc_stopping == G_RAID_DESTROY_DELAYED && + vol->v_provider_open == 0) { + /* Count open volumes. */ + opens = g_raid_nopens(sc); + if (opens == 0) { + sc->sc_stopping = G_RAID_DESTROY_HARD; + /* Wake up worker to make it selfdestruct. */ + g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0); + } + } + /* Handle open volume destruction. 
*/
+    if (vol->v_stopping && vol->v_provider_open == 0)
+        g_raid_destroy_volume(vol);
+out:
+    sx_xunlock(&sc->sc_lock);
+    g_topology_lock();
+    return (error);
+}
+
+struct g_raid_softc *
+g_raid_create_node(struct g_class *mp,
+    const char *name, struct g_raid_md_object *md)
+{
+    struct g_raid_softc *sc;
+    struct g_geom *gp;
+    int error;
+
+    g_topology_assert();
+    G_RAID_DEBUG(1, "Creating array %s.", name);
+
+    gp = g_new_geomf(mp, "%s", name);
+    sc = malloc(sizeof(*sc), M_RAID, M_WAITOK | M_ZERO);
+    gp->start = g_raid_start;
+    gp->orphan = g_raid_orphan;
+    gp->access = g_raid_access;
+    gp->dumpconf = g_raid_dumpconf;
+
+    sc->sc_md = md;
+    sc->sc_geom = gp;
+    sc->sc_flags = 0;
+    TAILQ_INIT(&sc->sc_volumes);
+    TAILQ_INIT(&sc->sc_disks);
+    sx_init(&sc->sc_lock, "graid:lock");
+    mtx_init(&sc->sc_queue_mtx, "graid:queue", NULL, MTX_DEF);
+    TAILQ_INIT(&sc->sc_events);
+    bioq_init(&sc->sc_queue);
+    gp->softc = sc;
+    error = kproc_create(g_raid_worker, sc, &sc->sc_worker, 0, 0,
+        "g_raid %s", name);
+    if (error != 0) {
+        G_RAID_DEBUG(0, "Cannot create kernel thread for %s.", name);
+        mtx_destroy(&sc->sc_queue_mtx);
+        sx_destroy(&sc->sc_lock);
+        g_destroy_geom(sc->sc_geom);
+        free(sc, M_RAID);
+        return (NULL);
+    }
+
+    G_RAID_DEBUG1(0, sc, "Array %s created.", name);
+    return (sc);
+}
+
+struct g_raid_volume *
+g_raid_create_volume(struct g_raid_softc *sc, const char *name, int id)
+{
+    struct g_raid_volume *vol, *vol1;
+    int i;
+
+    G_RAID_DEBUG1(1, sc, "Creating volume %s.", name);
+    vol = malloc(sizeof(*vol), M_RAID, M_WAITOK | M_ZERO);
+    vol->v_softc = sc;
+    strlcpy(vol->v_name, name, G_RAID_MAX_VOLUMENAME);
+    vol->v_state = G_RAID_VOLUME_S_STARTING;
+    vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN;
+    vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_UNKNOWN;
+    bioq_init(&vol->v_inflight);
+    bioq_init(&vol->v_locked);
+    LIST_INIT(&vol->v_locks);
+    for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) {
+        vol->v_subdisks[i].sd_softc = sc;
+        vol->v_subdisks[i].sd_volume = vol;
+        vol->v_subdisks[i].sd_pos = i;
+        vol->v_subdisks[i].sd_state = G_RAID_SUBDISK_S_NONE;
+    }
+
+    /* Find free ID for this volume. */
+    g_topology_lock();
+    vol1 = vol;
+    if (id >= 0) {
+        LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) {
+            if (vol1->v_global_id == id)
+                break;
+        }
+    }
+    if (vol1 != NULL) {
+        for (id = 0; ; id++) {
+            LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) {
+                if (vol1->v_global_id == id)
+                    break;
+            }
+            if (vol1 == NULL)
+                break;
+        }
+    }
+    vol->v_global_id = id;
+    LIST_INSERT_HEAD(&g_raid_volumes, vol, v_global_next);
+    g_topology_unlock();
+
+    /* Delay root mounting.
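(editor's note: root_mount_hold() keeps a root-on-RAID boot from racing volume assembly; g_raid_update_volume() releases the token once the volume starts or fails)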
*/ + vol->v_rootmount = root_mount_hold("GRAID"); + G_RAID_DEBUG1(1, sc, "root_mount_hold %p", vol->v_rootmount); + vol->v_starting = 1; + TAILQ_INSERT_TAIL(&sc->sc_volumes, vol, v_next); + return (vol); +} + +struct g_raid_disk * +g_raid_create_disk(struct g_raid_softc *sc) +{ + struct g_raid_disk *disk; + + G_RAID_DEBUG1(1, sc, "Creating disk."); + disk = malloc(sizeof(*disk), M_RAID, M_WAITOK | M_ZERO); + disk->d_softc = sc; + disk->d_state = G_RAID_DISK_S_NONE; + TAILQ_INIT(&disk->d_subdisks); + TAILQ_INSERT_TAIL(&sc->sc_disks, disk, d_next); + return (disk); +} + +int g_raid_start_volume(struct g_raid_volume *vol) +{ + struct g_raid_tr_class *class; + struct g_raid_tr_object *obj; + int status; + + G_RAID_DEBUG1(2, vol->v_softc, "Starting volume %s.", vol->v_name); + LIST_FOREACH(class, &g_raid_tr_classes, trc_list) { + G_RAID_DEBUG1(2, vol->v_softc, + "Tasting volume %s for %s transformation.", + vol->v_name, class->name); + obj = (void *)kobj_create((kobj_class_t)class, M_RAID, + M_WAITOK); + obj->tro_class = class; + obj->tro_volume = vol; + status = G_RAID_TR_TASTE(obj, vol); + if (status != G_RAID_TR_TASTE_FAIL) + break; + kobj_delete((kobj_t)obj, M_RAID); + } + if (class == NULL) { + G_RAID_DEBUG1(0, vol->v_softc, + "No transformation module found for %s.", + vol->v_name); + vol->v_tr = NULL; + g_raid_change_volume_state(vol, G_RAID_VOLUME_S_UNSUPPORTED); + g_raid_event_send(vol, G_RAID_VOLUME_E_DOWN, + G_RAID_EVENT_VOLUME); + return (-1); + } + G_RAID_DEBUG1(2, vol->v_softc, + "Transformation module %s chosen for %s.", + class->name, vol->v_name); + vol->v_tr = obj; + return (0); +} + +int +g_raid_destroy_node(struct g_raid_softc *sc, int worker) +{ + struct g_raid_volume *vol, *tmpv; + struct g_raid_disk *disk, *tmpd; + int error = 0; + + sc->sc_stopping = G_RAID_DESTROY_HARD; + TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tmpv) { + if (g_raid_destroy_volume(vol)) + error = EBUSY; + } + if (error) + return (error); + TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tmpd) { + if (g_raid_destroy_disk(disk)) + error = EBUSY; + } + if (error) + return (error); + if (sc->sc_md) { + G_RAID_MD_FREE(sc->sc_md); + kobj_delete((kobj_t)sc->sc_md, M_RAID); + sc->sc_md = NULL; + } + if (sc->sc_geom != NULL) { + G_RAID_DEBUG1(0, sc, "Array %s destroyed.", sc->sc_name); + g_topology_lock(); + sc->sc_geom->softc = NULL; + g_wither_geom(sc->sc_geom, ENXIO); + g_topology_unlock(); + sc->sc_geom = NULL; + } else + G_RAID_DEBUG(1, "Array destroyed."); + if (worker) { + g_raid_event_cancel(sc, sc); + mtx_destroy(&sc->sc_queue_mtx); + sx_xunlock(&sc->sc_lock); + sx_destroy(&sc->sc_lock); + wakeup(&sc->sc_stopping); + free(sc, M_RAID); + curthread->td_pflags &= ~TDP_GEOM; + G_RAID_DEBUG(1, "Thread exiting."); + kproc_exit(0); + } else { + /* Wake up worker to make it selfdestruct. 
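(editor's note: the worker, seeing sc_stopping == G_RAID_DESTROY_HARD, is expected to re-enter g_raid_destroy_node() with worker set and free the node)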
*/ + g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0); + } + return (0); +} + +int +g_raid_destroy_volume(struct g_raid_volume *vol) +{ + struct g_raid_softc *sc; + struct g_raid_disk *disk; + int i; + + sc = vol->v_softc; + G_RAID_DEBUG1(2, sc, "Destroying volume %s.", vol->v_name); + vol->v_stopping = 1; + if (vol->v_state != G_RAID_VOLUME_S_STOPPED) { + if (vol->v_tr) { + G_RAID_TR_STOP(vol->v_tr); + return (EBUSY); + } else + vol->v_state = G_RAID_VOLUME_S_STOPPED; + } + if (g_raid_event_check(sc, vol) != 0) + return (EBUSY); + if (vol->v_provider != NULL) + return (EBUSY); + if (vol->v_provider_open != 0) + return (EBUSY); + if (vol->v_tr) { + G_RAID_TR_FREE(vol->v_tr); + kobj_delete((kobj_t)vol->v_tr, M_RAID); + vol->v_tr = NULL; + } + if (vol->v_rootmount) + root_mount_rel(vol->v_rootmount); + g_topology_lock(); + LIST_REMOVE(vol, v_global_next); + g_topology_unlock(); + TAILQ_REMOVE(&sc->sc_volumes, vol, v_next); + for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) { + g_raid_event_cancel(sc, &vol->v_subdisks[i]); + disk = vol->v_subdisks[i].sd_disk; + if (disk == NULL) + continue; + TAILQ_REMOVE(&disk->d_subdisks, &vol->v_subdisks[i], sd_next); + } + G_RAID_DEBUG1(2, sc, "Volume %s destroyed.", vol->v_name); + if (sc->sc_md) + G_RAID_MD_FREE_VOLUME(sc->sc_md, vol); + g_raid_event_cancel(sc, vol); + free(vol, M_RAID); + if (sc->sc_stopping == G_RAID_DESTROY_HARD) { + /* Wake up worker to let it selfdestruct. */ + g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0); + } + return (0); +} + +int +g_raid_destroy_disk(struct g_raid_disk *disk) +{ + struct g_raid_softc *sc; + struct g_raid_subdisk *sd, *tmp; + + sc = disk->d_softc; + G_RAID_DEBUG1(2, sc, "Destroying disk."); + if (disk->d_consumer) { + g_raid_kill_consumer(sc, disk->d_consumer); + disk->d_consumer = NULL; + } + TAILQ_FOREACH_SAFE(sd, &disk->d_subdisks, sd_next, tmp) { + g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); + g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, + G_RAID_EVENT_SUBDISK); + TAILQ_REMOVE(&disk->d_subdisks, sd, sd_next); + sd->sd_disk = NULL; + } + TAILQ_REMOVE(&sc->sc_disks, disk, d_next); + if (sc->sc_md) + G_RAID_MD_FREE_DISK(sc->sc_md, disk); + g_raid_event_cancel(sc, disk); + free(disk, M_RAID); + return (0); +} + +int +g_raid_destroy(struct g_raid_softc *sc, int how) +{ + int opens; + + g_topology_assert_not(); + if (sc == NULL) + return (ENXIO); + sx_assert(&sc->sc_lock, SX_XLOCKED); + + /* Count open volumes. */ + opens = g_raid_nopens(sc); + + /* React on some opened volumes. */ + if (opens > 0) { + switch (how) { + case G_RAID_DESTROY_SOFT: + G_RAID_DEBUG1(1, sc, + "%d volumes are still open.", + opens); + return (EBUSY); + case G_RAID_DESTROY_DELAYED: + G_RAID_DEBUG1(1, sc, + "Array will be destroyed on last close."); + sc->sc_stopping = G_RAID_DESTROY_DELAYED; + return (EBUSY); + case G_RAID_DESTROY_HARD: + G_RAID_DEBUG1(1, sc, + "%d volumes are still open.", + opens); + } + } + + /* Mark node for destruction. */ + sc->sc_stopping = G_RAID_DESTROY_HARD; + /* Wake up worker to let it selfdestruct. */ + g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0); + /* Sleep until node destroyed. 
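(editor's note: woken by the wakeup(&sc->sc_stopping) issued in g_raid_destroy_node() above; PDROP releases sc_lock while sleeping so the worker can run)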
*/
+    sx_sleep(&sc->sc_stopping, &sc->sc_lock,
+        PRIBIO | PDROP, "r:destroy", 0);
+    return (0);
+}
+
+static void
+g_raid_taste_orphan(struct g_consumer *cp)
+{
+
+    KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
+        cp->provider->name));
+}
+
+static struct g_geom *
+g_raid_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
+{
+    struct g_consumer *cp;
+    struct g_geom *gp, *geom;
+    struct g_raid_md_class *class;
+    struct g_raid_md_object *obj;
+    int status;
+
+    g_topology_assert();
+    g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
+    G_RAID_DEBUG(2, "Tasting provider %s.", pp->name);
+
+    gp = g_new_geomf(mp, "raid:taste");
+    /*
+     * This orphan function should never be called.
+     */
+    gp->orphan = g_raid_taste_orphan;
+    cp = g_new_consumer(gp);
+    g_attach(cp, pp);
+
+    geom = NULL;
+    LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
+        G_RAID_DEBUG(2, "Tasting provider %s for %s metadata.",
+            pp->name, class->name);
+        obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
+            M_WAITOK);
+        obj->mdo_class = class;
+        status = G_RAID_MD_TASTE(obj, mp, cp, &geom);
+        if (status != G_RAID_MD_TASTE_NEW)
+            kobj_delete((kobj_t)obj, M_RAID);
+        if (status != G_RAID_MD_TASTE_FAIL)
+            break;
+    }
+
+    g_detach(cp);
+    g_destroy_consumer(cp);
+    g_destroy_geom(gp);
+    G_RAID_DEBUG(2, "Tasting provider %s done.", pp->name);
+    return (geom);
+}
+
+int
+g_raid_create_node_format(const char *format, struct g_geom **gp)
+{
+    struct g_raid_md_class *class;
+    struct g_raid_md_object *obj;
+    int status;
+
+    G_RAID_DEBUG(2, "Creating array for %s metadata.", format);
+    LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
+        if (strcasecmp(class->name, format) == 0)
+            break;
+    }
+    if (class == NULL) {
+        G_RAID_DEBUG(1, "No support for %s metadata.", format);
+        return (G_RAID_MD_TASTE_FAIL);
+    }
+    obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
+        M_WAITOK);
+    obj->mdo_class = class;
+    status = G_RAID_MD_CREATE(obj, &g_raid_class, gp);
+    if (status != G_RAID_MD_TASTE_NEW)
+        kobj_delete((kobj_t)obj, M_RAID);
+    return (status);
+}
+
+static int
+g_raid_destroy_geom(struct gctl_req *req __unused,
+    struct g_class *mp __unused, struct g_geom *gp)
+{
+    struct g_raid_softc *sc;
+    int error;
+
+    g_topology_unlock();
+    sc = gp->softc;
+    sx_xlock(&sc->sc_lock);
+    g_cancel_event(sc);
+    error = g_raid_destroy(gp->softc, G_RAID_DESTROY_SOFT);
+    if (error != 0)
+        sx_xunlock(&sc->sc_lock);
+    g_topology_lock();
+    return (error);
+}
+
+void
+g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol,
+    struct g_raid_subdisk *sd, struct g_raid_disk *disk)
+{
+
+    if (sc->sc_stopping == G_RAID_DESTROY_HARD)
+        return;
+    if (sc->sc_md)
+        G_RAID_MD_WRITE(sc->sc_md, vol, sd, disk);
+}
+
+void
+g_raid_fail_disk(struct g_raid_softc *sc,
+    struct g_raid_subdisk *sd, struct g_raid_disk *disk)
+{
+
+    if (disk == NULL)
+        disk = sd->sd_disk;
+    if (disk == NULL) {
+        G_RAID_DEBUG1(0, sc, "Warning! Fail request to an absent disk!");
+        return;
+    }
+    if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
+        G_RAID_DEBUG1(0, sc, "Warning! Fail request to a disk in a "
+            "wrong state (%s)!", g_raid_disk_state2str(disk->d_state));
+        return;
+    }
+    if (sc->sc_md)
+        G_RAID_MD_FAIL_DISK(sc->sc_md, sd, disk);
+}
+
+static void
+g_raid_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
+    struct g_consumer *cp, struct g_provider *pp)
+{
+    struct g_raid_softc *sc;
+    struct g_raid_volume *vol;
+    struct g_raid_subdisk *sd;
+    struct g_raid_disk *disk;
+    int i, s;
+
+    g_topology_assert();
+
+    sc = gp->softc;
+    if (sc == NULL)
+        return;
+    if (pp != NULL) {
+        vol = pp->private;
+        g_topology_unlock();
+        sx_xlock(&sc->sc_lock);
+        sbuf_printf(sb, "%s<Label>%s</Label>\n", indent,
+            vol->v_name);
+        sbuf_printf(sb, "%s<RAIDLevel>%s</RAIDLevel>\n", indent,
+            g_raid_volume_level2str(vol->v_raid_level,
+            vol->v_raid_level_qualifier));
+        sbuf_printf(sb,
+            "%s<Transformation>%s</Transformation>\n", indent,
+            vol->v_tr ? vol->v_tr->tro_class->name : "NONE");
+        sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
+            vol->v_disks_count);
+        sbuf_printf(sb, "%s<Strip>%u</Strip>\n", indent,
+            vol->v_strip_size);
+        sbuf_printf(sb, "%s<State>%s</State>\n", indent,
+            g_raid_volume_state2str(vol->v_state));
+        sbuf_printf(sb, "%s<Dirty>%s</Dirty>\n", indent,
+            vol->v_dirty ? "Yes" : "No");
+        sbuf_printf(sb, "%s<Subdisks>", indent);
+        for (i = 0; i < vol->v_disks_count; i++) {
+            sd = &vol->v_subdisks[i];
+            if (sd->sd_disk != NULL &&
+                sd->sd_disk->d_consumer != NULL) {
+                sbuf_printf(sb, "%s ",
+                    g_raid_get_diskname(sd->sd_disk));
+            } else {
+                sbuf_printf(sb, "NONE ");
+            }
+            sbuf_printf(sb, "(%s",
+                g_raid_subdisk_state2str(sd->sd_state));
+            if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
+                sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
+                sbuf_printf(sb, " %d%%",
+                    (int)(sd->sd_rebuild_pos * 100 /
+                    sd->sd_size));
+            }
+            sbuf_printf(sb, ")");
+            if (i + 1 < vol->v_disks_count)
+                sbuf_printf(sb, ", ");
+        }
+        sbuf_printf(sb, "</Subdisks>\n");
+        sx_xunlock(&sc->sc_lock);
+        g_topology_lock();
+    } else if (cp != NULL) {
+        disk = cp->private;
+        if (disk == NULL)
+            return;
+        g_topology_unlock();
+        sx_xlock(&sc->sc_lock);
+        sbuf_printf(sb, "%s<State>%s", indent,
+            g_raid_disk_state2str(disk->d_state));
+        if (!TAILQ_EMPTY(&disk->d_subdisks)) {
+            sbuf_printf(sb, " (");
+            TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
+                sbuf_printf(sb, "%s",
+                    g_raid_subdisk_state2str(sd->sd_state));
+                if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
+                    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
+                    sbuf_printf(sb, " %d%%",
+                        (int)(sd->sd_rebuild_pos * 100 /
+                        sd->sd_size));
+                }
+                if (TAILQ_NEXT(sd, sd_next))
+                    sbuf_printf(sb, ", ");
+            }
+            sbuf_printf(sb, ")");
+        }
+        sbuf_printf(sb, "</State>\n");
+        sbuf_printf(sb, "%s<Subdisks>", indent);
+        TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
+            sbuf_printf(sb, "r%d(%s):%d@%ju",
+                sd->sd_volume->v_global_id,
+                sd->sd_volume->v_name,
+                sd->sd_pos, sd->sd_offset);
+            if (TAILQ_NEXT(sd, sd_next))
+                sbuf_printf(sb, ", ");
+        }
+        sbuf_printf(sb, "</Subdisks>\n");
+        sbuf_printf(sb, "%s<ReadErrors>%d</ReadErrors>\n", indent,
+            disk->d_read_errs);
+        sx_xunlock(&sc->sc_lock);
+        g_topology_lock();
+    } else {
+        g_topology_unlock();
+        sx_xlock(&sc->sc_lock);
+        if (sc->sc_md) {
+            sbuf_printf(sb, "%s<Metadata>%s</Metadata>\n", indent,
+                sc->sc_md->mdo_class->name);
+        }
+        if (!TAILQ_EMPTY(&sc->sc_volumes)) {
+            s = 0xff;
+            TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
+                if (vol->v_state < s)
+                    s = vol->v_state;
+            }
+            sbuf_printf(sb, "%s<State>%s</State>\n", indent,
+                g_raid_volume_state2str(s));
+        }
+        sx_xunlock(&sc->sc_lock);
+        g_topology_lock();
+    }
+}
+
+static void
+g_raid_shutdown_pre_sync(void *arg, int howto)
+{
+    struct g_class *mp;
+    struct g_geom *gp, *gp2;
+    struct g_raid_softc *sc;
+    int error;
+
+    mp = arg;
+    DROP_GIANT();
+    g_topology_lock();
+    LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
+        if
((sc = gp->softc) == NULL) + continue; + g_topology_unlock(); + sx_xlock(&sc->sc_lock); + g_cancel_event(sc); + error = g_raid_destroy(sc, G_RAID_DESTROY_DELAYED); + if (error != 0) + sx_xunlock(&sc->sc_lock); + g_topology_lock(); + } + g_topology_unlock(); + PICKUP_GIANT(); +} + +static void +g_raid_init(struct g_class *mp) +{ + + g_raid_pre_sync = EVENTHANDLER_REGISTER(shutdown_pre_sync, + g_raid_shutdown_pre_sync, mp, SHUTDOWN_PRI_FIRST); + if (g_raid_pre_sync == NULL) + G_RAID_DEBUG(0, "Warning! Cannot register shutdown event."); + g_raid_started = 1; +} + +static void +g_raid_fini(struct g_class *mp) +{ + + if (g_raid_pre_sync != NULL) + EVENTHANDLER_DEREGISTER(shutdown_pre_sync, g_raid_pre_sync); + g_raid_started = 0; +} + +int +g_raid_md_modevent(module_t mod, int type, void *arg) +{ + struct g_raid_md_class *class, *c, *nc; + int error; + + error = 0; + class = arg; + switch (type) { + case MOD_LOAD: + c = LIST_FIRST(&g_raid_md_classes); + if (c == NULL || c->mdc_priority > class->mdc_priority) + LIST_INSERT_HEAD(&g_raid_md_classes, class, mdc_list); + else { + while ((nc = LIST_NEXT(c, mdc_list)) != NULL && + nc->mdc_priority < class->mdc_priority) + c = nc; + LIST_INSERT_AFTER(c, class, mdc_list); + } + if (g_raid_started) + g_retaste(&g_raid_class); + break; + case MOD_UNLOAD: + LIST_REMOVE(class, mdc_list); + break; + default: + error = EOPNOTSUPP; + break; + } + + return (error); +} + +int +g_raid_tr_modevent(module_t mod, int type, void *arg) +{ + struct g_raid_tr_class *class, *c, *nc; + int error; + + error = 0; + class = arg; + switch (type) { + case MOD_LOAD: + c = LIST_FIRST(&g_raid_tr_classes); + if (c == NULL || c->trc_priority > class->trc_priority) + LIST_INSERT_HEAD(&g_raid_tr_classes, class, trc_list); + else { + while ((nc = LIST_NEXT(c, trc_list)) != NULL && + nc->trc_priority < class->trc_priority) + c = nc; + LIST_INSERT_AFTER(c, class, trc_list); + } + break; + case MOD_UNLOAD: + LIST_REMOVE(class, trc_list); + break; + default: + error = EOPNOTSUPP; + break; + } + + return (error); +} + +/* + * Use local implementation of DECLARE_GEOM_CLASS(g_raid_class, g_raid) + * to reduce module priority, allowing submodules to register them first. + */ +static moduledata_t g_raid_mod = { + "g_raid", + g_modevent, + &g_raid_class +}; +DECLARE_MODULE(g_raid, g_raid_mod, SI_SUB_DRIVERS, SI_ORDER_THIRD); +MODULE_VERSION(geom_raid, 0); diff --git a/sys/geom/raid/g_raid.h b/sys/geom/raid/g_raid.h new file mode 100644 index 0000000..1c14ad6 --- /dev/null +++ b/sys/geom/raid/g_raid.h @@ -0,0 +1,403 @@ +/*- + * Copyright (c) 2010 Alexander Motin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _G_RAID_H_
+#define _G_RAID_H_
+
+#include <sys/param.h>
+#include <sys/kobj.h>
+#include <sys/bio.h>
+#include <sys/time.h>
+
+#define G_RAID_CLASS_NAME "RAID"
+
+#define G_RAID_MAGIC "GEOM::RAID"
+
+#define G_RAID_VERSION 0
+
+struct g_raid_md_object;
+struct g_raid_tr_object;
+
+#define G_RAID_DEVICE_FLAG_NOAUTOSYNC 0x0000000000000001ULL
+#define G_RAID_DEVICE_FLAG_NOFAILSYNC 0x0000000000000002ULL
+#define G_RAID_DEVICE_FLAG_MASK (G_RAID_DEVICE_FLAG_NOAUTOSYNC | \
+    G_RAID_DEVICE_FLAG_NOFAILSYNC)
+
+#ifdef _KERNEL
+extern u_int g_raid_aggressive_spare;
+extern u_int g_raid_debug;
+extern int g_raid_read_err_thresh;
+extern u_int g_raid_start_timeout;
+extern struct g_class g_raid_class;
+
+#define G_RAID_DEBUG(lvl, fmt, ...) do { \
+    if (g_raid_debug >= (lvl)) { \
+        if (g_raid_debug > 0) { \
+            printf("GEOM_RAID[%u]: " fmt "\n", \
+                lvl, ## __VA_ARGS__); \
+        } else { \
+            printf("GEOM_RAID: " fmt "\n", \
+                ## __VA_ARGS__); \
+        } \
+    } \
+} while (0)
+#define G_RAID_DEBUG1(lvl, sc, fmt, ...) do { \
+    if (g_raid_debug >= (lvl)) { \
+        if (g_raid_debug > 0) { \
+            printf("GEOM_RAID[%u]: %s: " fmt "\n", \
+                lvl, (sc)->sc_name, ## __VA_ARGS__); \
+        } else { \
+            printf("GEOM_RAID: %s: " fmt "\n", \
+                (sc)->sc_name, ## __VA_ARGS__); \
+        } \
+    } \
+} while (0)
+#define G_RAID_LOGREQ(lvl, bp, fmt, ...) do { \
+    if (g_raid_debug >= (lvl)) { \
+        if (g_raid_debug > 0) { \
+            printf("GEOM_RAID[%u]: " fmt " ", \
+                lvl, ## __VA_ARGS__); \
+        } else \
+            printf("GEOM_RAID: " fmt " ", ## __VA_ARGS__); \
+        g_print_bio(bp); \
+        printf("\n"); \
+    } \
+} while (0)
+
+/*
+ * Flags we use to distinguish I/O initiated by the TR layer to maintain
+ * the volume's characteristics, fix subdisks, extra copies of data, etc.
+ *
+ * G_RAID_BIO_FLAG_SYNC    I/O to update an extra copy of the data
+ *                         for RAID volumes that maintain extra data
+ *                         and need to rebuild that data.
+ * G_RAID_BIO_FLAG_REMAP   I/O done to try to provoke a subdisk into
+ *                         doing some desirable action such as bad
+ *                         block remapping after we detect a bad part
+ *                         of the disk.
+ * G_RAID_BIO_FLAG_LOCKED  I/O holds range lock that should be released.
+ *
+ * and the following meta item:
+ * G_RAID_BIO_FLAG_SPECIAL Any of the I/O flags that need to make it
+ *                         through the range locking which would
+ *                         otherwise defer the I/O until after that
+ *                         range is unlocked.
+ */
+#define G_RAID_BIO_FLAG_SYNC 0x01
+#define G_RAID_BIO_FLAG_REMAP 0x02
+#define G_RAID_BIO_FLAG_SPECIAL \
+    (G_RAID_BIO_FLAG_SYNC|G_RAID_BIO_FLAG_REMAP)
+#define G_RAID_BIO_FLAG_LOCKED 0x80
+
+struct g_raid_lock {
+    off_t l_offset;
+    off_t l_length;
+    void *l_callback_arg;
+    int l_pending;
+    LIST_ENTRY(g_raid_lock) l_next;
+};
+
+#define G_RAID_EVENT_WAIT 0x01
+#define G_RAID_EVENT_VOLUME 0x02
+#define G_RAID_EVENT_SUBDISK 0x04
+#define G_RAID_EVENT_DISK 0x08
+#define G_RAID_EVENT_DONE 0x10
+struct g_raid_event {
+    void *e_tgt;
+    int e_event;
+    int e_flags;
+    int e_error;
+    TAILQ_ENTRY(g_raid_event) e_next;
+};
+
+#define G_RAID_DISK_S_NONE 0x00 /* State is unknown.
*/ +#define G_RAID_DISK_S_OFFLINE 0x01 /* Missing disk placeholder. */ +#define G_RAID_DISK_S_FAILED 0x02 /* Failed. */ +#define G_RAID_DISK_S_STALE_FAILED 0x03 /* Old failed. */ +#define G_RAID_DISK_S_SPARE 0x04 /* Hot-spare. */ +#define G_RAID_DISK_S_STALE 0x05 /* Old disk, unused now. */ +#define G_RAID_DISK_S_ACTIVE 0x06 /* Operational. */ + +#define G_RAID_DISK_E_DISCONNECTED 0x01 + +struct g_raid_disk { + struct g_raid_softc *d_softc; /* Back-pointer to softc. */ + struct g_consumer *d_consumer; /* GEOM disk consumer. */ + void *d_md_data; /* Disk's metadata storage. */ + struct g_kerneldump d_kd; /* Kernel dumping method/args. */ + uint64_t d_flags; /* Additional flags. */ + u_int d_state; /* Disk state. */ + u_int d_load; /* Disk average load. */ + off_t d_last_offset; /* Last head offset. */ + int d_read_errs; /* Count of the read errors */ + TAILQ_HEAD(, g_raid_subdisk) d_subdisks; /* List of subdisks. */ + TAILQ_ENTRY(g_raid_disk) d_next; /* Next disk in the node. */ +}; + +#define G_RAID_SUBDISK_S_NONE 0x00 /* Absent. */ +#define G_RAID_SUBDISK_S_FAILED 0x01 /* Failed. */ +#define G_RAID_SUBDISK_S_NEW 0x02 /* Blank. */ +#define G_RAID_SUBDISK_S_REBUILD 0x03 /* Blank + rebuild. */ +#define G_RAID_SUBDISK_S_UNINITIALIZED 0x04 /* Disk of the new volume. */ +#define G_RAID_SUBDISK_S_STALE 0x05 /* Dirty. */ +#define G_RAID_SUBDISK_S_RESYNC 0x06 /* Dirty + check/repair. */ +#define G_RAID_SUBDISK_S_ACTIVE 0x07 /* Usable. */ + +#define G_RAID_SUBDISK_E_NEW 0x01 /* A new subdisk has arrived */ +#define G_RAID_SUBDISK_E_FAILED 0x02 /* A subdisk failed, but remains in volume */ +#define G_RAID_SUBDISK_E_DISCONNECTED 0x03 /* A subdisk removed from volume. */ +#define G_RAID_SUBDISK_E_FIRST_TR_PRIVATE 0x80 /* translation private events */ + +#define G_RAID_SUBDISK_POS(sd) \ + ((sd)->sd_disk ? ((sd)->sd_disk->d_last_offset - (sd)->sd_offset) : 0) +#define G_RAID_SUBDISK_TRACK_SIZE (1 * 1024 * 1024) +#define G_RAID_SUBDISK_LOAD(sd) \ + ((sd)->sd_disk ? ((sd)->sd_disk->d_load) : 0) +#define G_RAID_SUBDISK_LOAD_SCALE 256 + +struct g_raid_subdisk { + struct g_raid_softc *sd_softc; /* Back-pointer to softc. */ + struct g_raid_disk *sd_disk; /* Where this subdisk lives. */ + struct g_raid_volume *sd_volume; /* Volume, sd is a part of. */ + off_t sd_offset; /* Offset on the disk. */ + off_t sd_size; /* Size on the disk. */ + u_int sd_pos; /* Position in volume. */ + u_int sd_state; /* Subdisk state. */ + off_t sd_rebuild_pos; /* Rebuild position. */ + int sd_recovery; /* Count of recovery reqs. */ + TAILQ_ENTRY(g_raid_subdisk) sd_next; /* Next subdisk on disk. 
*/ +}; + +#define G_RAID_MAX_SUBDISKS 16 +#define G_RAID_MAX_VOLUMENAME 32 + +#define G_RAID_VOLUME_S_STARTING 0x00 +#define G_RAID_VOLUME_S_BROKEN 0x01 +#define G_RAID_VOLUME_S_DEGRADED 0x02 +#define G_RAID_VOLUME_S_SUBOPTIMAL 0x03 +#define G_RAID_VOLUME_S_OPTIMAL 0x04 +#define G_RAID_VOLUME_S_UNSUPPORTED 0x05 +#define G_RAID_VOLUME_S_STOPPED 0x06 + +#define G_RAID_VOLUME_S_ALIVE(s) \ + ((s) == G_RAID_VOLUME_S_DEGRADED || \ + (s) == G_RAID_VOLUME_S_SUBOPTIMAL || \ + (s) == G_RAID_VOLUME_S_OPTIMAL) + +#define G_RAID_VOLUME_E_DOWN 0x00 +#define G_RAID_VOLUME_E_UP 0x01 +#define G_RAID_VOLUME_E_START 0x10 +#define G_RAID_VOLUME_E_STARTMD 0x11 + +#define G_RAID_VOLUME_RL_RAID0 0x00 +#define G_RAID_VOLUME_RL_RAID1 0x01 +#define G_RAID_VOLUME_RL_RAID3 0x03 +#define G_RAID_VOLUME_RL_RAID4 0x04 +#define G_RAID_VOLUME_RL_RAID5 0x05 +#define G_RAID_VOLUME_RL_RAID6 0x06 +#define G_RAID_VOLUME_RL_RAID1E 0x11 +#define G_RAID_VOLUME_RL_SINGLE 0x0f +#define G_RAID_VOLUME_RL_CONCAT 0x1f +#define G_RAID_VOLUME_RL_RAID5E 0x15 +#define G_RAID_VOLUME_RL_RAID5EE 0x25 +#define G_RAID_VOLUME_RL_UNKNOWN 0xff + +#define G_RAID_VOLUME_RLQ_NONE 0x00 +#define G_RAID_VOLUME_RLQ_UNKNOWN 0xff + +struct g_raid_volume; + +struct g_raid_volume { + struct g_raid_softc *v_softc; /* Back-pointer to softc. */ + struct g_provider *v_provider; /* GEOM provider. */ + struct g_raid_subdisk v_subdisks[G_RAID_MAX_SUBDISKS]; + /* Subdisks of this volume. */ + void *v_md_data; /* Volume's metadata storage. */ + struct g_raid_tr_object *v_tr; /* Transformation object. */ + char v_name[G_RAID_MAX_VOLUMENAME]; + /* Volume name. */ + u_int v_state; /* Volume state. */ + u_int v_raid_level; /* Array RAID level. */ + u_int v_raid_level_qualifier; /* RAID level det. */ + u_int v_disks_count; /* Number of disks in array. */ + u_int v_strip_size; /* Array strip size. */ + u_int v_sectorsize; /* Volume sector size. */ + off_t v_mediasize; /* Volume media size. */ + struct bio_queue_head v_inflight; /* In-flight write requests. */ + struct bio_queue_head v_locked; /* Blocked I/O requests. */ + LIST_HEAD(, g_raid_lock) v_locks; /* List of locked regions. */ + int v_pending_lock; /* writes to locked region */ + int v_dirty; /* Volume is DIRTY. */ + struct timeval v_last_done; /* Time of the last I/O. */ + time_t v_last_write; /* Time of the last write. */ + u_int v_writes; /* Number of active writes. */ + struct root_hold_token *v_rootmount; /* Root mount delay token. */ + int v_starting; /* Volume is starting */ + int v_stopping; /* Volume is stopping */ + int v_provider_open; /* Number of opens. */ + int v_global_id; /* Global volume ID (rX). */ + TAILQ_ENTRY(g_raid_volume) v_next; /* List of volumes entry. */ + LIST_ENTRY(g_raid_volume) v_global_next; /* Global list entry. */ +}; + +#define G_RAID_NODE_E_WAKE 0x00 +#define G_RAID_NODE_E_START 0x01 + +struct g_raid_softc { + struct g_raid_md_object *sc_md; /* Metadata object. */ + struct g_geom *sc_geom; /* GEOM class instance. */ + uint64_t sc_flags; /* Additional flags. */ + TAILQ_HEAD(, g_raid_volume) sc_volumes; /* List of volumes. */ + TAILQ_HEAD(, g_raid_disk) sc_disks; /* List of disks. */ + struct sx sc_lock; /* Main node lock. */ + struct proc *sc_worker; /* Worker process. */ + struct mtx sc_queue_mtx; /* Worker queues lock. */ + TAILQ_HEAD(, g_raid_event) sc_events; /* Worker events queue. */ + struct bio_queue_head sc_queue; /* Worker I/O queue. */ + int sc_stopping; /* Node is stopping */ +}; +#define sc_name sc_geom->name + +/* + * KOBJ parent class of metadata processing modules. 
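+ *
+ * Editor's illustration (not in the original patch): a metadata module
+ * fills a kobj method table and declares itself roughly as follows,
+ * where "foo" is a hypothetical format name (compare md_intel.c below):
+ *
+ *    static kobj_method_t g_raid_md_foo_methods[] = {
+ *        KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_foo),
+ *        KOBJMETHOD(g_raid_md_write, g_raid_md_write_foo),
+ *        { 0, 0 }
+ *    };
+ *    static struct g_raid_md_class g_raid_md_foo_class = {
+ *        "Foo", g_raid_md_foo_methods,
+ *        sizeof(struct g_raid_md_foo_object),
+ *        .mdc_priority = 100
+ *    };
+ *    G_RAID_MD_DECLARE(g_raid_md_foo);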
+ */ +struct g_raid_md_class { + KOBJ_CLASS_FIELDS; + int mdc_priority; + LIST_ENTRY(g_raid_md_class) mdc_list; +}; + +/* + * KOBJ instance of metadata processing module. + */ +struct g_raid_md_object { + KOBJ_FIELDS; + struct g_raid_md_class *mdo_class; + struct g_raid_softc *mdo_softc; /* Back-pointer to softc. */ +}; + +int g_raid_md_modevent(module_t, int, void *); + +#define G_RAID_MD_DECLARE(name) \ + static moduledata_t name##_mod = { \ + #name, \ + g_raid_md_modevent, \ + &name##_class \ + }; \ + DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND); \ + MODULE_DEPEND(name, geom_raid, 0, 0, 0) + +/* + * KOBJ parent class of data transformation modules. + */ +struct g_raid_tr_class { + KOBJ_CLASS_FIELDS; + int trc_priority; + LIST_ENTRY(g_raid_tr_class) trc_list; +}; + +/* + * KOBJ instance of data transformation module. + */ +struct g_raid_tr_object { + KOBJ_FIELDS; + struct g_raid_tr_class *tro_class; + struct g_raid_volume *tro_volume; /* Back-pointer to volume. */ +}; + +int g_raid_tr_modevent(module_t, int, void *); + +#define G_RAID_TR_DECLARE(name) \ + static moduledata_t name##_mod = { \ + #name, \ + g_raid_tr_modevent, \ + &name##_class \ + }; \ + DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST); \ + MODULE_DEPEND(name, geom_raid, 0, 0, 0) + +const char * g_raid_volume_level2str(int level, int qual); +int g_raid_volume_str2level(const char *str, int *level, int *qual); +const char * g_raid_volume_state2str(int state); +const char * g_raid_subdisk_state2str(int state); +const char * g_raid_disk_state2str(int state); + +struct g_raid_softc * g_raid_create_node(struct g_class *mp, + const char *name, struct g_raid_md_object *md); +int g_raid_create_node_format(const char *format, struct g_geom **gp); +struct g_raid_volume * g_raid_create_volume(struct g_raid_softc *sc, + const char *name, int id); +struct g_raid_disk * g_raid_create_disk(struct g_raid_softc *sc); +const char * g_raid_get_diskname(struct g_raid_disk *disk); + +int g_raid_start_volume(struct g_raid_volume *vol); + +int g_raid_destroy_node(struct g_raid_softc *sc, int worker); +int g_raid_destroy_volume(struct g_raid_volume *vol); +int g_raid_destroy_disk(struct g_raid_disk *disk); + +void g_raid_iodone(struct bio *bp, int error); +void g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp); +int g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd, + void *virtual, vm_offset_t physical, off_t offset, size_t length); + +struct g_consumer *g_raid_open_consumer(struct g_raid_softc *sc, + const char *name); +void g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp); + +void g_raid_report_disk_state(struct g_raid_disk *disk); +void g_raid_change_disk_state(struct g_raid_disk *disk, int state); +void g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state); +void g_raid_change_volume_state(struct g_raid_volume *vol, int state); + +void g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol, + struct g_raid_subdisk *sd, struct g_raid_disk *disk); +void g_raid_fail_disk(struct g_raid_softc *sc, + struct g_raid_subdisk *sd, struct g_raid_disk *disk); + +void g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp); +int g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr, + void *virtual, vm_offset_t physical, off_t offset, size_t length); + +u_int g_raid_ndisks(struct g_raid_softc *sc, int state); +u_int g_raid_nsubdisks(struct g_raid_volume *vol, int state); +u_int g_raid_nopens(struct g_raid_softc *sc); +struct 
g_raid_subdisk * g_raid_get_subdisk(struct g_raid_volume *vol, + int state); +#define G_RAID_DESTROY_SOFT 0 +#define G_RAID_DESTROY_DELAYED 1 +#define G_RAID_DESTROY_HARD 2 +int g_raid_destroy(struct g_raid_softc *sc, int how); +int g_raid_event_send(void *arg, int event, int flags); +int g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len, + struct bio *ignore, void *argp); +int g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len); + +g_ctl_req_t g_raid_ctl; +#endif /* _KERNEL */ + +#endif /* !_G_RAID_H_ */ diff --git a/sys/geom/raid/g_raid_ctl.c b/sys/geom/raid/g_raid_ctl.c new file mode 100644 index 0000000..028aa94 --- /dev/null +++ b/sys/geom/raid/g_raid_ctl.c @@ -0,0 +1,217 @@ +/*- + * Copyright (c) 2010 Alexander Motin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "g_raid_md_if.h"
+
+
+static struct g_raid_softc *
+g_raid_find_node(struct g_class *mp, const char *name)
+{
+    struct g_raid_softc *sc;
+    struct g_geom *gp;
+
+    LIST_FOREACH(gp, &mp->geom, geom) {
+        sc = gp->softc;
+        if (sc == NULL)
+            continue;
+        if (sc->sc_stopping != 0)
+            continue;
+        if (strcasecmp(sc->sc_name, name) == 0)
+            return (sc);
+    }
+    return (NULL);
+}
+
+static void
+g_raid_ctl_label(struct gctl_req *req, struct g_class *mp)
+{
+    struct g_geom *geom;
+    struct g_raid_softc *sc;
+    const char *format;
+    int *nargs;
+    int crstatus, ctlstatus;
+    char buf[64];
+
+    nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+    if (nargs == NULL) {
+        gctl_error(req, "No '%s' argument.", "nargs");
+        return;
+    }
+    if (*nargs < 4) {
+        gctl_error(req, "Invalid number of arguments.");
+        return;
+    }
+    format = gctl_get_asciiparam(req, "arg0");
+    if (format == NULL) {
+        gctl_error(req, "No format received.");
+        return;
+    }
+    crstatus = g_raid_create_node_format(format, &geom);
+    if (crstatus == G_RAID_MD_TASTE_FAIL) {
+        gctl_error(req, "Failed to create array with format '%s'.",
+            format);
+        return;
+    }
+    sc = (struct g_raid_softc *)geom->softc;
+    g_topology_unlock();
+    sx_xlock(&sc->sc_lock);
+    ctlstatus = G_RAID_MD_CTL(sc->sc_md, req);
+    if (ctlstatus < 0) {
+        gctl_error(req, "Command failed: %d.", ctlstatus);
+        if (crstatus == G_RAID_MD_TASTE_NEW)
+            g_raid_destroy_node(sc, 0);
+    } else {
+        if (crstatus == G_RAID_MD_TASTE_NEW)
+            snprintf(buf, sizeof(buf), "%s created\n", sc->sc_name);
+        else
+            snprintf(buf, sizeof(buf), "%s reused\n", sc->sc_name);
+        gctl_set_param_err(req, "output", buf, strlen(buf) + 1);
+    }
+    sx_xunlock(&sc->sc_lock);
+    g_topology_lock();
+}
+
+static void
+g_raid_ctl_stop(struct gctl_req *req, struct g_class *mp)
+{
+    struct g_raid_softc *sc;
+    const char *nodename;
+    int *nargs, *force;
+    int error, how;
+
+    nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+    if (nargs == NULL) {
+        gctl_error(req, "No '%s' argument.", "nargs");
+        return;
+    }
+    if (*nargs != 1) {
+        gctl_error(req, "Invalid number of arguments.");
+        return;
+    }
+    nodename = gctl_get_asciiparam(req, "arg0");
+    if (nodename == NULL) {
+        gctl_error(req, "No array name received.");
+        return;
+    }
+    sc = g_raid_find_node(mp, nodename);
+    if (sc == NULL) {
+        gctl_error(req, "Array '%s' not found.", nodename);
+        return;
+    }
+    force = gctl_get_paraml(req, "force", sizeof(*force));
+    if (force != NULL && *force)
+        how = G_RAID_DESTROY_HARD;
+    else
+        how = G_RAID_DESTROY_SOFT;
+    g_topology_unlock();
+    sx_xlock(&sc->sc_lock);
+    error = g_raid_destroy(sc, how);
+    if (error != 0)
+        sx_xunlock(&sc->sc_lock);
+    g_topology_lock();
+}
+
+static void
+g_raid_ctl_other(struct gctl_req *req, struct g_class *mp)
+{
+    struct g_raid_softc *sc;
+    const char *nodename;
+    int *nargs;
+    int ctlstatus;
+
+    nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+    if (nargs == NULL) {
+        gctl_error(req, "No '%s' argument.", "nargs");
+        return;
+    }
+    if (*nargs < 1) {
+        gctl_error(req, "Invalid number of arguments.");
+        return;
+    }
+    nodename = gctl_get_asciiparam(req, "arg0");
+    if (nodename == NULL) {
+        gctl_error(req, "No array name received.");
+        return;
+    }
+    sc = g_raid_find_node(mp, nodename);
+    if (sc == NULL) {
+        gctl_error(req, "Array '%s' not found.", nodename);
+        return;
+    }
+    g_topology_unlock();
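+    /*
+     * Editor's note: as in the label/stop handlers above, the GEOM
+     * topology lock may not be held while acquiring the sleepable
+     * per-array sc_lock, so it is dropped first and re-taken on the
+     * way out.
+     */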
+    sx_xlock(&sc->sc_lock);
+    if (sc->sc_md != NULL) {
+        ctlstatus = G_RAID_MD_CTL(sc->sc_md, req);
+        if (ctlstatus < 0)
+            gctl_error(req, "Command failed: %d.", ctlstatus);
+    }
+    sx_xunlock(&sc->sc_lock);
+    g_topology_lock();
+}
+
+void
+g_raid_ctl(struct gctl_req *req, struct g_class *mp, const char *verb)
+{
+    uint32_t *version;
+
+    g_topology_assert();
+
+    version = gctl_get_paraml(req, "version", sizeof(*version));
+    if (version == NULL) {
+        gctl_error(req, "No '%s' argument.", "version");
+        return;
+    }
+    if (*version != G_RAID_VERSION) {
+        gctl_error(req, "Userland and kernel parts are out of sync.");
+        return;
+    }
+
+    if (strcmp(verb, "label") == 0)
+        g_raid_ctl_label(req, mp);
+    else if (strcmp(verb, "stop") == 0)
+        g_raid_ctl_stop(req, mp);
+    else
+        g_raid_ctl_other(req, mp);
+}
diff --git a/sys/geom/raid/g_raid_md_if.m b/sys/geom/raid/g_raid_md_if.m
new file mode 100644
index 0000000..05e9f66
--- /dev/null
+++ b/sys/geom/raid/g_raid_md_if.m
@@ -0,0 +1,156 @@
+#-
+# Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# $FreeBSD$
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+# The G_RAID metadata class interface.
+
+INTERFACE g_raid_md;
+
+HEADER {
+#define G_RAID_MD_TASTE_FAIL -1
+#define G_RAID_MD_TASTE_EXISTING 0
+#define G_RAID_MD_TASTE_NEW 1
+};
+
+# Default implementations of methods.
+CODE {
+    static int
+    g_raid_md_create_default(struct g_raid_md_object *md)
+    {
+
+        return (G_RAID_MD_TASTE_FAIL);
+    }
+
+    static int
+    g_raid_md_ctl_default(struct g_raid_md_object *md,
+        struct gctl_req *req)
+    {
+
+        return (-1);
+    }
+
+    static int
+    g_raid_md_volume_event_default(struct g_raid_md_object *md,
+        struct g_raid_volume *vol, u_int event)
+    {
+
+        return (-1);
+    }
+
+    static int
+    g_raid_md_free_disk_default(struct g_raid_md_object *md,
+        struct g_raid_disk *disk)
+    {
+
+        return (0);
+    }
+
+    static int
+    g_raid_md_free_volume_default(struct g_raid_md_object *md,
+        struct g_raid_volume *vol)
+    {
+
+        return (0);
+    }
+};
+
+# create() - create new node from scratch.
+METHOD int create {
+    struct g_raid_md_object *md;
+    struct g_class *mp;
+    struct g_geom **gp;
+} DEFAULT g_raid_md_create_default;
+
+# taste() - taste disk and, if needed, create new node.
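+# Editor's note: taste() returns one of the G_RAID_MD_TASTE_* codes from
+# the HEADER block above; g_raid_taste() keeps the kobj instance only on
+# G_RAID_MD_TASTE_NEW and goes on probing further classes only after
+# G_RAID_MD_TASTE_FAIL.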
+METHOD int taste { + struct g_raid_md_object *md; + struct g_class *mp; + struct g_consumer *cp; + struct g_geom **gp; +}; + +# ctl() - user-level control commands handling method. +METHOD int ctl { + struct g_raid_md_object *md; + struct gctl_req *req; +} DEFAULT g_raid_md_ctl_default; + +# event() - events handling method. +METHOD int event { + struct g_raid_md_object *md; + struct g_raid_disk *disk; + u_int event; +}; + +# volume_event() - events handling method. +METHOD int volume_event { + struct g_raid_md_object *md; + struct g_raid_volume *vol; + u_int event; +} DEFAULT g_raid_md_volume_event_default; + +# write() - metadata write method. +METHOD int write { + struct g_raid_md_object *md; + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; +}; + +# fail_disk() - mark disk as failed and remove it from use. +METHOD int fail_disk { + struct g_raid_md_object *md; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; +}; + +# free_disk() - disk destructor. +METHOD int free_disk { + struct g_raid_md_object *md; + struct g_raid_disk *disk; +} DEFAULT g_raid_md_free_disk_default; + +# free_volume() - volume destructor. +METHOD int free_volume { + struct g_raid_md_object *md; + struct g_raid_volume *vol; +} DEFAULT g_raid_md_free_volume_default; + +# free() - destructor. +METHOD int free { + struct g_raid_md_object *md; +}; diff --git a/sys/geom/raid/g_raid_tr_if.m b/sys/geom/raid/g_raid_tr_if.m new file mode 100644 index 0000000..193b429 --- /dev/null +++ b/sys/geom/raid/g_raid_tr_if.m @@ -0,0 +1,118 @@ +#- +# Copyright (c) 2010 Alexander Motin +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# $FreeBSD$ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +# The G_RAID transformation class interface. + +INTERFACE g_raid_tr; + +# Default implementations of methods. +CODE { + static int + g_raid_tr_locked_default(struct g_raid_tr_object *tr, void *argp) + { + + return (0); + } +}; + +HEADER { +#define G_RAID_TR_TASTE_FAIL -1 +#define G_RAID_TR_TASTE_SUCCEED 0 +}; + +# taste() - volume taste method. +METHOD int taste { + struct g_raid_tr_object *tr; + struct g_raid_volume *volume; +}; + +# event() - events handling method. 
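+# Editor's note: the events delivered here are the G_RAID_SUBDISK_E_*
+# codes from g_raid.h; values at or above
+# G_RAID_SUBDISK_E_FIRST_TR_PRIVATE are reserved for a transformation
+# module's private use.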
+METHOD int event {
+    struct g_raid_tr_object *tr;
+    struct g_raid_subdisk *sd;
+    u_int event;
+};
+
+# start() - begin operation.
+METHOD int start {
+    struct g_raid_tr_object *tr;
+};
+
+# stop() - stop operation.
+METHOD int stop {
+    struct g_raid_tr_object *tr;
+};
+
+# iostart() - manages forward transformation and generates requests to disks.
+METHOD void iostart {
+    struct g_raid_tr_object *tr;
+    struct bio *bp;
+};
+
+# iodone() - manages backward transformation and reports completion status.
+METHOD void iodone {
+    struct g_raid_tr_object *tr;
+    struct g_raid_subdisk *sd;
+    struct bio *bp;
+};
+
+# kerneldump() - optimized for robustness (simplified) kernel dumping routine.
+METHOD int kerneldump {
+    struct g_raid_tr_object *tr;
+    void *virtual;
+    vm_offset_t physical;
+    off_t offset;
+    size_t length;
+} DEFAULT g_raid_tr_kerneldump_common;
+
+# locked() - callback method for lock().
+METHOD int locked {
+    struct g_raid_tr_object *tr;
+    void *argp;
+} DEFAULT g_raid_tr_locked_default;
+
+# free() - destructor.
+METHOD int free {
+    struct g_raid_tr_object *tr;
+};
+
+# idle() - callback when the volume is idle for a while and the TR wants
+# to schedule some work for that idle period.
+METHOD int idle {
+    struct g_raid_tr_object *tr;
+};
diff --git a/sys/geom/raid/md_intel.c b/sys/geom/raid/md_intel.c
new file mode 100644
index 0000000..32dc8f0
--- /dev/null
+++ b/sys/geom/raid/md_intel.c
@@ -0,0 +1,2323 @@
+/*-
+ * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "geom/raid/g_raid.h" +#include "g_raid_md_if.h" + +static MALLOC_DEFINE(M_MD_INTEL, "md_intel_data", "GEOM_RAID Intel metadata"); + +struct intel_raid_map { + uint32_t offset; + uint32_t disk_sectors; + uint32_t stripe_count; + uint16_t strip_sectors; + uint8_t status; +#define INTEL_S_READY 0x00 +#define INTEL_S_UNINITIALIZED 0x01 +#define INTEL_S_DEGRADED 0x02 +#define INTEL_S_FAILURE 0x03 + + uint8_t type; +#define INTEL_T_RAID0 0x00 +#define INTEL_T_RAID1 0x01 +#define INTEL_T_RAID5 0x05 + + uint8_t total_disks; + uint8_t total_domains; + uint8_t failed_disk_num; + uint8_t ddf; + uint32_t filler_2[7]; + uint32_t disk_idx[1]; /* total_disks entries. */ +#define INTEL_DI_IDX 0x00ffffff +#define INTEL_DI_RBLD 0x01000000 +} __packed; + +struct intel_raid_vol { + uint8_t name[16]; + u_int64_t total_sectors __packed; + uint32_t state; +#define INTEL_ST_BOOTABLE 0x00000001 +#define INTEL_ST_BOOT_DEVICE 0x00000002 +#define INTEL_ST_READ_COALESCING 0x00000004 +#define INTEL_ST_WRITE_COALESCING 0x00000008 +#define INTEL_ST_LAST_SHUTDOWN_DIRTY 0x00000010 +#define INTEL_ST_HIDDEN_AT_BOOT 0x00000020 +#define INTEL_ST_CURRENTLY_HIDDEN 0x00000040 +#define INTEL_ST_VERIFY_AND_FIX 0x00000080 +#define INTEL_ST_MAP_STATE_UNINIT 0x00000100 +#define INTEL_ST_NO_AUTO_RECOVERY 0x00000200 +#define INTEL_ST_CLONE_N_GO 0x00000400 +#define INTEL_ST_CLONE_MAN_SYNC 0x00000800 +#define INTEL_ST_CNG_MASTER_DISK_NUM 0x00001000 + uint32_t reserved; + uint8_t migr_priority; + uint8_t num_sub_vols; + uint8_t tid; + uint8_t cng_master_disk; + uint16_t cache_policy; + uint8_t cng_state; + uint8_t cng_sub_state; + uint32_t filler_0[10]; + + uint32_t curr_migr_unit; + uint32_t checkpoint_id; + uint8_t migr_state; + uint8_t migr_type; +#define INTEL_MT_INIT 0 +#define INTEL_MT_REBUILD 1 +#define INTEL_MT_VERIFY 2 +#define INTEL_MT_GEN_MIGR 3 +#define INTEL_MT_STATE_CHANGE 4 +#define INTEL_MT_REPAIR 5 + uint8_t dirty; + uint8_t fs_state; + uint16_t verify_errors; + uint16_t bad_blocks; + uint32_t filler_1[4]; + struct intel_raid_map map[1]; /* 2 entries if migr_state != 0. */ +} __packed; + +struct intel_raid_disk { +#define INTEL_SERIAL_LEN 16 + uint8_t serial[INTEL_SERIAL_LEN]; + uint32_t sectors; + uint32_t id; + uint32_t flags; +#define INTEL_F_SPARE 0x01 +#define INTEL_F_ASSIGNED 0x02 +#define INTEL_F_FAILED 0x04 +#define INTEL_F_ONLINE 0x08 + + uint32_t filler[5]; +} __packed; + +struct intel_raid_conf { + uint8_t intel_id[24]; +#define INTEL_MAGIC "Intel Raid ISM Cfg Sig. 
" + + uint8_t version[6]; +#define INTEL_VERSION_1000 "1.0.00" /* RAID0 */ +#define INTEL_VERSION_1100 "1.1.00" /* RAID1 */ +#define INTEL_VERSION_1200 "1.2.00" /* Many volumes */ +#define INTEL_VERSION_1201 "1.2.01" /* 3 or 4 disks */ +#define INTEL_VERSION_1202 "1.2.02" /* RAID5 */ +#define INTEL_VERSION_1204 "1.2.04" /* 5 or 6 disks */ +#define INTEL_VERSION_1206 "1.2.06" /* CNG */ +#define INTEL_VERSION_1300 "1.3.00" /* Attributes */ + + uint8_t dummy_0[2]; + uint32_t checksum; + uint32_t config_size; + uint32_t config_id; + uint32_t generation; + uint32_t error_log_size; + uint32_t attributes; +#define INTEL_ATTR_RAID0 0x00000001 +#define INTEL_ATTR_RAID1 0x00000002 +#define INTEL_ATTR_RAID10 0x00000004 +#define INTEL_ATTR_RAID1E 0x00000008 +#define INTEL_ATTR_RAID5 0x00000010 +#define INTEL_ATTR_RAIDCNG 0x00000020 +#define INTEL_ATTR_2TB 0x20000000 +#define INTEL_ATTR_PM 0x40000000 +#define INTEL_ATTR_CHECKSUM 0x80000000 + + uint8_t total_disks; + uint8_t total_volumes; + uint8_t dummy_2[2]; + uint32_t filler_0[39]; + struct intel_raid_disk disk[1]; /* total_disks entries. */ + /* Here goes total_volumes of struct intel_raid_vol. */ +} __packed; + +#define INTEL_MAX_MD_SIZE(ndisks) \ + (sizeof(struct intel_raid_conf) + \ + sizeof(struct intel_raid_disk) * (ndisks - 1) + \ + sizeof(struct intel_raid_vol) * 2 + \ + sizeof(struct intel_raid_map) * 2 + \ + sizeof(uint32_t) * (ndisks - 1) * 4) + +struct g_raid_md_intel_perdisk { + struct intel_raid_conf *pd_meta; + int pd_disk_pos; + struct intel_raid_disk pd_disk_meta; +}; + +struct g_raid_md_intel_object { + struct g_raid_md_object mdio_base; + uint32_t mdio_config_id; + uint32_t mdio_generation; + struct intel_raid_conf *mdio_meta; + struct callout mdio_start_co; /* STARTING state timer. */ + int mdio_disks_present; + int mdio_started; + int mdio_incomplete; + struct root_hold_token *mdio_rootmount; /* Root mount delay token. */ +}; + +static g_raid_md_create_t g_raid_md_create_intel; +static g_raid_md_taste_t g_raid_md_taste_intel; +static g_raid_md_event_t g_raid_md_event_intel; +static g_raid_md_ctl_t g_raid_md_ctl_intel; +static g_raid_md_write_t g_raid_md_write_intel; +static g_raid_md_fail_disk_t g_raid_md_fail_disk_intel; +static g_raid_md_free_disk_t g_raid_md_free_disk_intel; +static g_raid_md_free_t g_raid_md_free_intel; + +static kobj_method_t g_raid_md_intel_methods[] = { + KOBJMETHOD(g_raid_md_create, g_raid_md_create_intel), + KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_intel), + KOBJMETHOD(g_raid_md_event, g_raid_md_event_intel), + KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_intel), + KOBJMETHOD(g_raid_md_write, g_raid_md_write_intel), + KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_intel), + KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_intel), + KOBJMETHOD(g_raid_md_free, g_raid_md_free_intel), + { 0, 0 } +}; + +static struct g_raid_md_class g_raid_md_intel_class = { + "Intel", + g_raid_md_intel_methods, + sizeof(struct g_raid_md_intel_object), + .mdc_priority = 100 +}; + + +static struct intel_raid_map * +intel_get_map(struct intel_raid_vol *mvol, int i) +{ + struct intel_raid_map *mmap; + + if (i > (mvol->migr_state ? 
1 : 0)) + return (NULL); + mmap = &mvol->map[0]; + for (; i > 0; i--) { + mmap = (struct intel_raid_map *) + &mmap->disk_idx[mmap->total_disks]; + } + return ((struct intel_raid_map *)mmap); +} + +static struct intel_raid_vol * +intel_get_volume(struct intel_raid_conf *meta, int i) +{ + struct intel_raid_vol *mvol; + struct intel_raid_map *mmap; + + if (i > 1) + return (NULL); + mvol = (struct intel_raid_vol *)&meta->disk[meta->total_disks]; + for (; i > 0; i--) { + mmap = intel_get_map(mvol, mvol->migr_state ? 1 : 0); + mvol = (struct intel_raid_vol *) + &mmap->disk_idx[mmap->total_disks]; + } + return (mvol); +} + +static void +g_raid_md_intel_print(struct intel_raid_conf *meta) +{ + struct intel_raid_vol *mvol; + struct intel_raid_map *mmap; + int i, j, k; + + if (g_raid_debug < 1) + return; + + printf("********* ATA Intel MatrixRAID Metadata *********\n"); + printf("intel_id <%.24s>\n", meta->intel_id); + printf("version <%.6s>\n", meta->version); + printf("checksum 0x%08x\n", meta->checksum); + printf("config_size 0x%08x\n", meta->config_size); + printf("config_id 0x%08x\n", meta->config_id); + printf("generation 0x%08x\n", meta->generation); + printf("attributes 0x%08x\n", meta->attributes); + printf("total_disks %u\n", meta->total_disks); + printf("total_volumes %u\n", meta->total_volumes); + printf("DISK# serial disk_sectors disk_id flags\n"); + for (i = 0; i < meta->total_disks; i++ ) { + printf(" %d <%.16s> %u 0x%08x 0x%08x\n", i, + meta->disk[i].serial, meta->disk[i].sectors, + meta->disk[i].id, meta->disk[i].flags); + } + for (i = 0; i < meta->total_volumes; i++) { + mvol = intel_get_volume(meta, i); + printf(" ****** Volume %d ******\n", i); + printf(" name %.16s\n", mvol->name); + printf(" total_sectors %ju\n", mvol->total_sectors); + printf(" state %u\n", mvol->state); + printf(" reserved %u\n", mvol->reserved); + printf(" curr_migr_unit %u\n", mvol->curr_migr_unit); + printf(" checkpoint_id %u\n", mvol->checkpoint_id); + printf(" migr_state %u\n", mvol->migr_state); + printf(" migr_type %u\n", mvol->migr_type); + printf(" dirty %u\n", mvol->dirty); + + for (j = 0; j < (mvol->migr_state ? 
2 : 1); j++) { + printf(" *** Map %d ***\n", j); + mmap = intel_get_map(mvol, j); + printf(" offset %u\n", mmap->offset); + printf(" disk_sectors %u\n", mmap->disk_sectors); + printf(" stripe_count %u\n", mmap->stripe_count); + printf(" strip_sectors %u\n", mmap->strip_sectors); + printf(" status %u\n", mmap->status); + printf(" type %u\n", mmap->type); + printf(" total_disks %u\n", mmap->total_disks); + printf(" total_domains %u\n", mmap->total_domains); + printf(" failed_disk_num %u\n", mmap->failed_disk_num); + printf(" ddf %u\n", mmap->ddf); + printf(" disk_idx "); + for (k = 0; k < mmap->total_disks; k++) + printf(" 0x%08x", mmap->disk_idx[k]); + printf("\n"); + } + } + printf("=================================================\n"); +} + +static struct intel_raid_conf * +intel_meta_copy(struct intel_raid_conf *meta) +{ + struct intel_raid_conf *nmeta; + + nmeta = malloc(meta->config_size, M_MD_INTEL, M_WAITOK); + memcpy(nmeta, meta, meta->config_size); + return (nmeta); +} + +static int +intel_meta_find_disk(struct intel_raid_conf *meta, char *serial) +{ + int pos; + + for (pos = 0; pos < meta->total_disks; pos++) { + if (strncmp(meta->disk[pos].serial, + serial, INTEL_SERIAL_LEN) == 0) + return (pos); + } + return (-1); +} + +static struct intel_raid_conf * +intel_meta_read(struct g_consumer *cp) +{ + struct g_provider *pp; + struct intel_raid_conf *meta; + struct intel_raid_vol *mvol; + struct intel_raid_map *mmap; + char *buf; + int error, i, j, k, left, size; + uint32_t checksum, *ptr; + + pp = cp->provider; + + /* Read the anchor sector. */ + buf = g_read_data(cp, + pp->mediasize - pp->sectorsize * 2, pp->sectorsize, &error); + if (buf == NULL) { + G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", + pp->name, error); + return (NULL); + } + meta = (struct intel_raid_conf *)buf; + + /* Check if this is an Intel RAID struct */ + if (strncmp(meta->intel_id, INTEL_MAGIC, strlen(INTEL_MAGIC))) { + G_RAID_DEBUG(1, "Intel signature check failed on %s", pp->name); + g_free(buf); + return (NULL); + } + if (meta->config_size > 65536 || + meta->config_size < sizeof(struct intel_raid_conf)) { + G_RAID_DEBUG(1, "Intel metadata size looks wrong: %d", + meta->config_size); + g_free(buf); + return (NULL); + } + size = meta->config_size; + meta = malloc(size, M_MD_INTEL, M_WAITOK); + memcpy(meta, buf, min(size, pp->sectorsize)); + g_free(buf); + + /* Read all the rest, if needed. */ + if (meta->config_size > pp->sectorsize) { + left = (meta->config_size - 1) / pp->sectorsize; + buf = g_read_data(cp, + pp->mediasize - pp->sectorsize * (2 + left), + pp->sectorsize * left, &error); + if (buf == NULL) { + G_RAID_DEBUG(1, "Cannot read remaining metadata" + " part from %s (error=%d).", + pp->name, error); + free(meta, M_MD_INTEL); + return (NULL); + } + memcpy(((char *)meta) + pp->sectorsize, buf, + pp->sectorsize * left); + g_free(buf); + } + + /* Check metadata checksum. */ + for (checksum = 0, ptr = (uint32_t *)meta, i = 0; + i < (meta->config_size / sizeof(uint32_t)); i++) { + checksum += *ptr++; + } + checksum -= meta->checksum; + if (checksum != meta->checksum) { + G_RAID_DEBUG(1, "Intel checksum check failed on %s", pp->name); + free(meta, M_MD_INTEL); + return (NULL); + } + + /* Validate metadata size. 
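(editor's note: the expected size is grown volume by volume and map by map below, so every variable-length field is bounds-checked against config_size before it is walked)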
*/ + size = sizeof(struct intel_raid_conf) + + sizeof(struct intel_raid_disk) * (meta->total_disks - 1) + + sizeof(struct intel_raid_vol) * meta->total_volumes; + if (size > meta->config_size) { +badsize: + G_RAID_DEBUG(1, "Intel metadata size incorrect %d < %d", + meta->config_size, size); + free(meta, M_MD_INTEL); + return (NULL); + } + for (i = 0; i < meta->total_volumes; i++) { + mvol = intel_get_volume(meta, i); + mmap = intel_get_map(mvol, 0); + size += 4 * (mmap->total_disks - 1); + if (size > meta->config_size) + goto badsize; + if (mvol->migr_state) { + size += sizeof(struct intel_raid_map); + if (size > meta->config_size) + goto badsize; + mmap = intel_get_map(mvol, 1); + size += 4 * (mmap->total_disks - 1); + if (size > meta->config_size) + goto badsize; + } + } + + /* Validate disk indexes. */ + for (i = 0; i < meta->total_volumes; i++) { + mvol = intel_get_volume(meta, i); + for (j = 0; j < (mvol->migr_state ? 2 : 1); j++) { + mmap = intel_get_map(mvol, j); + for (k = 0; k < mmap->total_disks; k++) { + if ((mmap->disk_idx[k] & INTEL_DI_IDX) > + meta->total_disks) { + G_RAID_DEBUG(1, "Intel metadata disk" + " index %d too big (>%d)", + mmap->disk_idx[k] & INTEL_DI_IDX, + meta->total_disks); + free(meta, M_MD_INTEL); + return (NULL); + } + } + } + } + + /* Validate migration types. */ + for (i = 0; i < meta->total_volumes; i++) { + mvol = intel_get_volume(meta, i); + if (mvol->migr_state && + mvol->migr_type != INTEL_MT_INIT && + mvol->migr_type != INTEL_MT_REBUILD && + mvol->migr_type != INTEL_MT_VERIFY && + mvol->migr_type != INTEL_MT_REPAIR) { + G_RAID_DEBUG(1, "Intel metadata has unsupported" + " migration type %d", mvol->migr_type); + free(meta, M_MD_INTEL); + return (NULL); + } + } + + return (meta); +} + +static int +intel_meta_write(struct g_consumer *cp, struct intel_raid_conf *meta) +{ + struct g_provider *pp; + char *buf; + int error, i, sectors; + uint32_t checksum, *ptr; + + pp = cp->provider; + + /* Recalculate checksum for case if metadata were changed. */ + meta->checksum = 0; + for (checksum = 0, ptr = (uint32_t *)meta, i = 0; + i < (meta->config_size / sizeof(uint32_t)); i++) { + checksum += *ptr++; + } + meta->checksum = checksum; + + /* Create and fill buffer. */ + sectors = (meta->config_size + pp->sectorsize - 1) / pp->sectorsize; + buf = malloc(sectors * pp->sectorsize, M_MD_INTEL, M_WAITOK | M_ZERO); + if (sectors > 1) { + memcpy(buf, ((char *)meta) + pp->sectorsize, + (sectors - 1) * pp->sectorsize); + } + memcpy(buf + (sectors - 1) * pp->sectorsize, meta, pp->sectorsize); + + error = g_write_data(cp, + pp->mediasize - pp->sectorsize * (1 + sectors), + buf, pp->sectorsize * sectors); + if (error != 0) { + G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", + pp->name, error); + } + + free(buf, M_MD_INTEL); + return (error); +} + +static int +intel_meta_erase(struct g_consumer *cp) +{ + struct g_provider *pp; + char *buf; + int error; + + pp = cp->provider; + buf = malloc(pp->sectorsize, M_MD_INTEL, M_WAITOK | M_ZERO); + error = g_write_data(cp, + pp->mediasize - 2 * pp->sectorsize, + buf, pp->sectorsize); + if (error != 0) { + G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", + pp->name, error); + } + free(buf, M_MD_INTEL); + return (error); +} + +static int +intel_meta_write_spare(struct g_consumer *cp, struct intel_raid_disk *d) +{ + struct intel_raid_conf *meta; + int error; + + /* Fill anchor and single disk. 
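
Both intel_meta_read() earlier and intel_meta_write() above use the same checksum convention: the writer zeroes the checksum field and stores the 32-bit sum of all config words, so the reader subtracts the stored value once before comparing. An illustrative restatement, not part of the patch:

static uint32_t
intel_meta_checksum_sketch(struct intel_raid_conf *meta)
{
    uint32_t *ptr, sum;
    u_int i;

    sum = 0;
    ptr = (uint32_t *)meta;
    /* Sum every 32-bit word of the config, checksum field included. */
    for (i = 0; i < meta->config_size / sizeof(uint32_t); i++)
        sum += ptr[i];
    /* The writer summed with the field zeroed; take it back out. */
    return (sum - meta->checksum);
}
/* The metadata is valid when this sketch returns meta->checksum. */
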
*/ + meta = malloc(INTEL_MAX_MD_SIZE(1), M_MD_INTEL, M_WAITOK | M_ZERO); + memcpy(&meta->intel_id[0], INTEL_MAGIC, sizeof(INTEL_MAGIC)); + memcpy(&meta->version[0], INTEL_VERSION_1000, + sizeof(INTEL_VERSION_1000)); + meta->config_size = INTEL_MAX_MD_SIZE(1); + meta->config_id = arc4random(); + meta->generation = 1; + meta->total_disks = 1; + meta->disk[0] = *d; + error = intel_meta_write(cp, meta); + free(meta, M_MD_INTEL); + return (error); +} + +static struct g_raid_disk * +g_raid_md_intel_get_disk(struct g_raid_softc *sc, int id) +{ + struct g_raid_disk *disk; + struct g_raid_md_intel_perdisk *pd; + + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; + if (pd->pd_disk_pos == id) + break; + } + return (disk); +} + +static int +g_raid_md_intel_supported(int level, int qual, int disks, int force) +{ + + switch (level) { + case G_RAID_VOLUME_RL_RAID0: + if (disks < 1) + return (0); + if (!force && (disks < 2 || disks > 6)) + return (0); + break; + case G_RAID_VOLUME_RL_RAID1: + if (disks < 1) + return (0); + if (!force && (disks != 2)) + return (0); + break; + case G_RAID_VOLUME_RL_RAID1E: + if (disks < 2) + return (0); + if (!force && (disks != 4)) + return (0); + break; + case G_RAID_VOLUME_RL_RAID5: + if (disks < 3) + return (0); + if (!force && disks > 6) + return (0); + break; + default: + return (0); + } + if (qual != G_RAID_VOLUME_RLQ_NONE) + return (0); + return (1); +} + +static struct g_raid_volume * +g_raid_md_intel_get_volume(struct g_raid_softc *sc, int id) +{ + struct g_raid_volume *mvol; + + TAILQ_FOREACH(mvol, &sc->sc_volumes, v_next) { + if ((intptr_t)(mvol->v_md_data) == id) + break; + } + return (mvol); +} + +static int +g_raid_md_intel_start_disk(struct g_raid_disk *disk) +{ + struct g_raid_softc *sc; + struct g_raid_subdisk *sd, *tmpsd; + struct g_raid_disk *olddisk, *tmpdisk; + struct g_raid_md_object *md; + struct g_raid_md_intel_object *mdi; + struct g_raid_md_intel_perdisk *pd, *oldpd; + struct intel_raid_conf *meta; + struct intel_raid_vol *mvol; + struct intel_raid_map *mmap0, *mmap1; + int disk_pos, resurrection = 0; + + sc = disk->d_softc; + md = sc->sc_md; + mdi = (struct g_raid_md_intel_object *)md; + meta = mdi->mdio_meta; + pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; + olddisk = NULL; + + /* Find disk position in metadata by it's serial. */ + disk_pos = intel_meta_find_disk(meta, pd->pd_disk_meta.serial); + if (disk_pos < 0) { + G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk"); + /* Failed stale disk is useless for us. */ + if (pd->pd_disk_meta.flags & INTEL_F_FAILED) { + g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED); + return (0); + } + /* If we are in the start process, that's all for now. */ + if (!mdi->mdio_started) + goto nofit; + /* + * If we have already started - try to get use of the disk. + * Try to replace OFFLINE disks first, then FAILED. + */ + TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) { + if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE && + tmpdisk->d_state != G_RAID_DISK_S_FAILED) + continue; + /* Make sure this disk is big enough. 
*/ + TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) { + if (sd->sd_offset + sd->sd_size + 4096 > + (off_t)pd->pd_disk_meta.sectors * 512) { + G_RAID_DEBUG1(1, sc, + "Disk too small (%llu < %llu)", + ((unsigned long long) + pd->pd_disk_meta.sectors) * 512, + (unsigned long long) + sd->sd_offset + sd->sd_size + 4096); + break; + } + } + if (sd != NULL) + continue; + if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) { + olddisk = tmpdisk; + break; + } else if (olddisk == NULL) + olddisk = tmpdisk; + } + if (olddisk == NULL) { +nofit: + if (pd->pd_disk_meta.flags & INTEL_F_SPARE) { + g_raid_change_disk_state(disk, + G_RAID_DISK_S_SPARE); + return (1); + } else { + g_raid_change_disk_state(disk, + G_RAID_DISK_S_STALE); + return (0); + } + } + oldpd = (struct g_raid_md_intel_perdisk *)olddisk->d_md_data; + disk_pos = oldpd->pd_disk_pos; + resurrection = 1; + } + + if (olddisk == NULL) { + /* Find placeholder by position. */ + olddisk = g_raid_md_intel_get_disk(sc, disk_pos); + if (olddisk == NULL) + panic("No disk at position %d!", disk_pos); + if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) { + G_RAID_DEBUG1(1, sc, "More then one disk for pos %d", + disk_pos); + g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); + return (0); + } + oldpd = (struct g_raid_md_intel_perdisk *)olddisk->d_md_data; + } + + /* Replace failed disk or placeholder with new disk. */ + TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) { + TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next); + TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); + sd->sd_disk = disk; + } + oldpd->pd_disk_pos = -2; + pd->pd_disk_pos = disk_pos; + + /* If it was placeholder -- destroy it. */ + if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) { + g_raid_destroy_disk(olddisk); + } else { + /* Otherwise, make it STALE_FAILED. */ + g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED); + /* Update global metadata just in case. */ + memcpy(&meta->disk[disk_pos], &pd->pd_disk_meta, + sizeof(struct intel_raid_disk)); + } + + /* Welcome the new disk. */ + if (resurrection) + g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); + else if (meta->disk[disk_pos].flags & INTEL_F_FAILED) + g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); + else if (meta->disk[disk_pos].flags & INTEL_F_SPARE) + g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); + else + g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); + TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { + mvol = intel_get_volume(meta, + (uintptr_t)(sd->sd_volume->v_md_data)); + mmap0 = intel_get_map(mvol, 0); + if (mvol->migr_state) + mmap1 = intel_get_map(mvol, 1); + else + mmap1 = mmap0; + + if (resurrection) { + /* Stale disk, almost same as new. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NEW); + } else if (meta->disk[disk_pos].flags & INTEL_F_FAILED) { + /* Failed disk, almost useless. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_FAILED); + } else if (mvol->migr_state == 0) { + if (mmap0->status == INTEL_S_UNINITIALIZED) { + /* Freshly created uninitialized volume. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_UNINITIALIZED); + } else if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) { + /* Freshly inserted disk. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NEW); + } else if (mvol->dirty) { + /* Dirty volume (unclean shutdown). */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_STALE); + } else { + /* Up to date disk. 
*/
+ g_raid_change_subdisk_state(sd,
+ G_RAID_SUBDISK_S_ACTIVE);
+ }
+ } else if (mvol->migr_type == INTEL_MT_INIT ||
+ mvol->migr_type == INTEL_MT_REBUILD) {
+ if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) {
+ /* Freshly inserted disk. */
+ g_raid_change_subdisk_state(sd,
+ G_RAID_SUBDISK_S_NEW);
+ } else if (mmap1->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) {
+ /* Rebuilding disk. */
+ g_raid_change_subdisk_state(sd,
+ G_RAID_SUBDISK_S_REBUILD);
+ if (mvol->dirty) {
+ sd->sd_rebuild_pos = 0;
+ } else {
+ sd->sd_rebuild_pos =
+ (off_t)mvol->curr_migr_unit *
+ sd->sd_volume->v_strip_size *
+ mmap0->total_domains;
+ }
+ } else if (mvol->dirty) {
+ /* Dirty volume (unclean shutdown). */
+ g_raid_change_subdisk_state(sd,
+ G_RAID_SUBDISK_S_STALE);
+ } else {
+ /* Up to date disk. */
+ g_raid_change_subdisk_state(sd,
+ G_RAID_SUBDISK_S_ACTIVE);
+ }
+ } else if (mvol->migr_type == INTEL_MT_VERIFY ||
+ mvol->migr_type == INTEL_MT_REPAIR) {
+ if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) {
+ /* Freshly inserted disk. */
+ g_raid_change_subdisk_state(sd,
+ G_RAID_SUBDISK_S_NEW);
+ } else if (mmap1->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) {
+ /* Resyncing disk. */
+ g_raid_change_subdisk_state(sd,
+ G_RAID_SUBDISK_S_RESYNC);
+ if (mvol->dirty) {
+ sd->sd_rebuild_pos = 0;
+ } else {
+ sd->sd_rebuild_pos =
+ (off_t)mvol->curr_migr_unit *
+ sd->sd_volume->v_strip_size *
+ mmap0->total_domains;
+ }
+ } else if (mvol->dirty) {
+ /* Dirty volume (unclean shutdown). */
+ g_raid_change_subdisk_state(sd,
+ G_RAID_SUBDISK_S_STALE);
+ } else {
+ /* Up to date disk. */
+ g_raid_change_subdisk_state(sd,
+ G_RAID_SUBDISK_S_ACTIVE);
+ }
+ }
+ g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
+ G_RAID_EVENT_SUBDISK);
+ }
+
+ /* Update status of our need for spare. */
+ if (mdi->mdio_started) {
+ mdi->mdio_incomplete =
+ (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) <
+ meta->total_disks);
+ }
+
+ return (resurrection);
+}
+
+static void
+g_disk_md_intel_retaste(void *arg, int pending)
+{
+
+ G_RAID_DEBUG(1, "Array is not complete, trying to retaste.");
+ g_retaste(&g_raid_class);
+ free(arg, M_MD_INTEL);
+}
+
+static void
+g_raid_md_intel_refill(struct g_raid_softc *sc)
+{
+ struct g_raid_md_object *md;
+ struct g_raid_md_intel_object *mdi;
+ struct intel_raid_conf *meta;
+ struct g_raid_disk *disk;
+ struct task *task;
+ int update, na;
+
+ md = sc->sc_md;
+ mdi = (struct g_raid_md_intel_object *)md;
+ meta = mdi->mdio_meta;
+ update = 0;
+ do {
+ /* Make sure we don't miss anything. */
+ na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE);
+ if (na == meta->total_disks)
+ break;
+
+ G_RAID_DEBUG1(1, md->mdo_softc,
+ "Array is not complete (%d of %d), "
+ "trying to refill.", na, meta->total_disks);
+
+ /* Try to make use of some of the STALE disks. */
+ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
+ if (disk->d_state == G_RAID_DISK_S_STALE) {
+ update += g_raid_md_intel_start_disk(disk);
+ if (disk->d_state == G_RAID_DISK_S_ACTIVE)
+ break;
+ }
+ }
+ if (disk != NULL)
+ continue;
+
+ /* Try to make use of some of the SPARE disks. */
+ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
+ if (disk->d_state == G_RAID_DISK_S_SPARE) {
+ update += g_raid_md_intel_start_disk(disk);
+ if (disk->d_state == G_RAID_DISK_S_ACTIVE)
+ break;
+ }
+ }
+ } while (disk != NULL);
+
+ /* Write new metadata if we changed something. */
+ if (update) {
+ g_raid_md_write_intel(md, NULL, NULL, NULL);
+ meta = mdi->mdio_meta;
+ }
+
+ /* Update status of our need for spare.
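
The sd_rebuild_pos expressions above expand the metadata's coarse checkpoint into a byte offset: curr_migr_unit counts groups of strips, so one unit covers strip_size * total_domains bytes of each subdisk. A worked example with hypothetical values, illustration only:

static off_t
intel_rebuild_pos_sketch(void)
{
    /* Hypothetical values, for illustration only. */
    off_t strip_size = 131072;       /* 128 KiB strip */
    u_int total_domains = 2;         /* e.g. a two-domain RAID1E map */
    uint32_t curr_migr_unit = 1000;  /* checkpoint read from metadata */

    /* 1000 * 131072 * 2 = 262144000 bytes: the rebuild resumes
       exactly 250 MiB into each subdisk. */
    return ((off_t)curr_migr_unit * strip_size * total_domains);
}
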
*/ + mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < + meta->total_disks); + + /* Request retaste hoping to find spare. */ + if (mdi->mdio_incomplete) { + task = malloc(sizeof(struct task), + M_MD_INTEL, M_WAITOK | M_ZERO); + TASK_INIT(task, 0, g_disk_md_intel_retaste, task); + taskqueue_enqueue(taskqueue_swi, task); + } +} + +static void +g_raid_md_intel_start(struct g_raid_softc *sc) +{ + struct g_raid_md_object *md; + struct g_raid_md_intel_object *mdi; + struct g_raid_md_intel_perdisk *pd; + struct intel_raid_conf *meta; + struct intel_raid_vol *mvol; + struct intel_raid_map *mmap; + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; + int i, j, disk_pos; + + md = sc->sc_md; + mdi = (struct g_raid_md_intel_object *)md; + meta = mdi->mdio_meta; + + /* Create volumes and subdisks. */ + for (i = 0; i < meta->total_volumes; i++) { + mvol = intel_get_volume(meta, i); + mmap = intel_get_map(mvol, 0); + vol = g_raid_create_volume(sc, mvol->name, -1); + vol->v_md_data = (void *)(intptr_t)i; + if (mmap->type == INTEL_T_RAID0) + vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; + else if (mmap->type == INTEL_T_RAID1 && + mmap->total_domains >= 2 && + mmap->total_domains <= mmap->total_disks) { + /* Assume total_domains is correct. */ + if (mmap->total_domains == mmap->total_disks) + vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; + else + vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; + } else if (mmap->type == INTEL_T_RAID1) { + /* total_domains looks wrong. */ + if (mmap->total_disks <= 2) + vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; + else + vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; + } else if (mmap->type == INTEL_T_RAID5) + vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; + else + vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; + vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; + vol->v_strip_size = (u_int)mmap->strip_sectors * 512; //ZZZ + vol->v_disks_count = mmap->total_disks; + vol->v_mediasize = (off_t)mvol->total_sectors * 512; //ZZZ + vol->v_sectorsize = 512; //ZZZ + for (j = 0; j < vol->v_disks_count; j++) { + sd = &vol->v_subdisks[j]; + sd->sd_offset = (off_t)mmap->offset * 512; //ZZZ + sd->sd_size = (off_t)mmap->disk_sectors * 512; //ZZZ + } + g_raid_start_volume(vol); + } + + /* Create disk placeholders to store data for later writing. */ + for (disk_pos = 0; disk_pos < meta->total_disks; disk_pos++) { + pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); + pd->pd_disk_pos = disk_pos; + pd->pd_disk_meta = meta->disk[disk_pos]; + disk = g_raid_create_disk(sc); + disk->d_md_data = (void *)pd; + disk->d_state = G_RAID_DISK_S_OFFLINE; + for (i = 0; i < meta->total_volumes; i++) { + mvol = intel_get_volume(meta, i); + mmap = intel_get_map(mvol, 0); + for (j = 0; j < mmap->total_disks; j++) { + if ((mmap->disk_idx[j] & INTEL_DI_IDX) == disk_pos) + break; + } + if (j == mmap->total_disks) + continue; + vol = g_raid_md_intel_get_volume(sc, i); + sd = &vol->v_subdisks[j]; + sd->sd_disk = disk; + TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); + } + } + + /* Make all disks found till the moment take their places. */ + do { + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_state == G_RAID_DISK_S_NONE) { + g_raid_md_intel_start_disk(disk); + break; + } + } + } while (disk != NULL); + + mdi->mdio_started = 1; + G_RAID_DEBUG1(0, sc, "Array started."); + g_raid_md_write_intel(md, NULL, NULL, NULL); + + /* Pickup any STALE/SPARE disks to refill array if needed. 
*/
+ g_raid_md_intel_refill(sc);
+
+ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
+ g_raid_event_send(vol, G_RAID_VOLUME_E_START,
+ G_RAID_EVENT_VOLUME);
+ }
+
+ callout_stop(&mdi->mdio_start_co);
+ G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount);
+ root_mount_rel(mdi->mdio_rootmount);
+ mdi->mdio_rootmount = NULL;
+}
+
+static void
+g_raid_md_intel_new_disk(struct g_raid_disk *disk)
+{
+ struct g_raid_softc *sc;
+ struct g_raid_md_object *md;
+ struct g_raid_md_intel_object *mdi;
+ struct intel_raid_conf *pdmeta;
+ struct g_raid_md_intel_perdisk *pd;
+
+ sc = disk->d_softc;
+ md = sc->sc_md;
+ mdi = (struct g_raid_md_intel_object *)md;
+ pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
+ pdmeta = pd->pd_meta;
+
+ if (mdi->mdio_started) {
+ if (g_raid_md_intel_start_disk(disk))
+ g_raid_md_write_intel(md, NULL, NULL, NULL);
+ } else {
+ /* If we haven't started yet - check metadata freshness. */
+ if (mdi->mdio_meta == NULL ||
+ ((int32_t)(pdmeta->generation - mdi->mdio_generation)) > 0) {
+ G_RAID_DEBUG1(1, sc, "Newer disk");
+ if (mdi->mdio_meta != NULL)
+ free(mdi->mdio_meta, M_MD_INTEL);
+ mdi->mdio_meta = intel_meta_copy(pdmeta);
+ mdi->mdio_generation = mdi->mdio_meta->generation;
+ mdi->mdio_disks_present = 1;
+ } else if (pdmeta->generation == mdi->mdio_generation) {
+ mdi->mdio_disks_present++;
+ G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)",
+ mdi->mdio_disks_present,
+ mdi->mdio_meta->total_disks);
+ } else {
+ G_RAID_DEBUG1(1, sc, "Older disk");
+ }
+ /* If we have collected all the needed disks - start the array. */
+ if (mdi->mdio_disks_present == mdi->mdio_meta->total_disks)
+ g_raid_md_intel_start(sc);
+ }
+}
+
+static void
+g_raid_intel_go(void *arg)
+{
+ struct g_raid_softc *sc;
+ struct g_raid_md_object *md;
+ struct g_raid_md_intel_object *mdi;
+
+ sc = arg;
+ md = sc->sc_md;
+ mdi = (struct g_raid_md_intel_object *)md;
+ if (!mdi->mdio_started) {
+ G_RAID_DEBUG1(0, sc, "Force array start due to timeout.");
+ g_raid_event_send(sc, G_RAID_NODE_E_START, 0);
+ }
+}
+
+static int
+g_raid_md_create_intel(struct g_raid_md_object *md, struct g_class *mp,
+ struct g_geom **gp)
+{
+ struct g_raid_softc *sc;
+ struct g_raid_md_intel_object *mdi;
+ char name[16];
+
+ mdi = (struct g_raid_md_intel_object *)md;
+ mdi->mdio_config_id = arc4random();
+ mdi->mdio_generation = 0;
+ snprintf(name, sizeof(name), "Intel-%08x", mdi->mdio_config_id);
+ sc = g_raid_create_node(mp, name, md);
+ if (sc == NULL)
+ return (G_RAID_MD_TASTE_FAIL);
+ md->mdo_softc = sc;
+ *gp = sc->sc_geom;
+ return (G_RAID_MD_TASTE_NEW);
+}
+
+/*
+ * Return the last N characters of the serial label. The Linux and
+ * ataraid(7) code always uses the last 16 characters of the label to
+ * store into the Intel meta format. Generalize this to N characters
+ * since that's easy. Labels can be up to 20 characters for SATA drives
+ * and up to 251 characters for SAS drives. Since Intel controllers don't
+ * support SAS drives, just stick with the SATA limits for stack friendliness.
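
A worked example of the truncation described above and implemented in g_raid_md_get_label() below, with a hypothetical 20-character SATA ident and serlen = INTEL_SERIAL_LEN (16):

    /* Hypothetical GEOM::ident value, 20 characters long. */
    const char *ident = "3NF0ABCD2345EFGH6789";

    /* len = 20 > 16, so the copy starts at offset 20 - 16 = 4 and
       the serial stored in the metadata is "ABCD2345EFGH6789". */
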
+ */ +static int +g_raid_md_get_label(struct g_consumer *cp, char *serial, int serlen) +{ + char serial_buffer[24]; + int len, error; + + len = sizeof(serial_buffer); + error = g_io_getattr("GEOM::ident", cp, &len, serial_buffer); + if (error != 0) + return (error); + len = strlen(serial_buffer); + if (len > serlen) + len -= serlen; + else + len = 0; + strncpy(serial, serial_buffer + len, serlen); + return (0); +} + +static int +g_raid_md_taste_intel(struct g_raid_md_object *md, struct g_class *mp, + struct g_consumer *cp, struct g_geom **gp) +{ + struct g_consumer *rcp; + struct g_provider *pp; + struct g_raid_md_intel_object *mdi, *mdi1; + struct g_raid_softc *sc; + struct g_raid_disk *disk; + struct intel_raid_conf *meta; + struct g_raid_md_intel_perdisk *pd; + struct g_geom *geom; + int error, disk_pos, result, spare, len; + char serial[INTEL_SERIAL_LEN]; + char name[16]; + uint16_t vendor; + + G_RAID_DEBUG(1, "Tasting Intel on %s", cp->provider->name); + mdi = (struct g_raid_md_intel_object *)md; + pp = cp->provider; + + /* Read metadata from device. */ + meta = NULL; + spare = 0; + vendor = 0xffff; + disk_pos = 0; + if (g_access(cp, 1, 0, 0) != 0) + return (G_RAID_MD_TASTE_FAIL); + g_topology_unlock(); + error = g_raid_md_get_label(cp, serial, sizeof(serial)); + if (error != 0) { + G_RAID_DEBUG(1, "Cannot get serial number from %s (error=%d).", + pp->name, error); + goto fail2; + } + len = 2; + if (pp->geom->rank == 1) + g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); + meta = intel_meta_read(cp); + g_topology_lock(); + g_access(cp, -1, 0, 0); + if (meta == NULL) { + if (g_raid_aggressive_spare) { + if (vendor == 0x8086) { + G_RAID_DEBUG(1, + "No Intel metadata, forcing spare."); + spare = 2; + goto search; + } else { + G_RAID_DEBUG(1, + "Intel vendor mismatch 0x%04x != 0x8086", + vendor); + } + } + return (G_RAID_MD_TASTE_FAIL); + } + + /* Check this disk position in obtained metadata. */ + disk_pos = intel_meta_find_disk(meta, serial); + if (disk_pos < 0) { + G_RAID_DEBUG(1, "Intel serial '%s' not found", serial); + goto fail1; + } + if (meta->disk[disk_pos].sectors != + (pp->mediasize / pp->sectorsize)) { + G_RAID_DEBUG(1, "Intel size mismatch %u != %u", + meta->disk[disk_pos].sectors, + (u_int)(pp->mediasize / pp->sectorsize)); + goto fail1; + } + + /* Metadata valid. Print it. */ + g_raid_md_intel_print(meta); + G_RAID_DEBUG(1, "Intel disk position %d", disk_pos); + spare = meta->disk[disk_pos].flags & INTEL_F_SPARE; + +search: + /* Search for matching node. */ + sc = NULL; + mdi1 = NULL; + LIST_FOREACH(geom, &mp->geom, geom) { + sc = geom->softc; + if (sc == NULL) + continue; + if (sc->sc_stopping != 0) + continue; + if (sc->sc_md->mdo_class != md->mdo_class) + continue; + mdi1 = (struct g_raid_md_intel_object *)sc->sc_md; + if (spare) { + if (mdi1->mdio_incomplete) + break; + } else { + if (mdi1->mdio_config_id == meta->config_id) + break; + } + } + + /* Found matching node. */ + if (geom != NULL) { + G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); + result = G_RAID_MD_TASTE_EXISTING; + + } else if (spare) { /* Not found needy node -- left for later. */ + G_RAID_DEBUG(1, "Spare is not needed at this time"); + goto fail1; + + } else { /* Not found matching node -- create one. 
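
One detail of g_raid_md_intel_new_disk() above worth calling out: the freshness test casts the unsigned difference of generation counters to int32_t, which is serial-number arithmetic and stays correct even if the counter ever wraps around. A minimal restatement, not part of the patch:

static int
intel_generation_newer_sketch(uint32_t a, uint32_t b)
{
    /* True when a is ahead of b, modulo 2^32. */
    return ((int32_t)(a - b) > 0);
}
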
*/ + result = G_RAID_MD_TASTE_NEW; + mdi->mdio_config_id = meta->config_id; + snprintf(name, sizeof(name), "Intel-%08x", meta->config_id); + sc = g_raid_create_node(mp, name, md); + md->mdo_softc = sc; + geom = sc->sc_geom; + callout_init(&mdi->mdio_start_co, 1); + callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, + g_raid_intel_go, sc); + mdi->mdio_rootmount = root_mount_hold("GRAID-Intel"); + G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount); + } + + rcp = g_new_consumer(geom); + g_attach(rcp, pp); + if (g_access(rcp, 1, 1, 1) != 0) + ; //goto fail1; + + g_topology_unlock(); + sx_xlock(&sc->sc_lock); + + pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); + pd->pd_meta = meta; + pd->pd_disk_pos = -1; + if (spare == 2) { + memcpy(&pd->pd_disk_meta.serial[0], serial, INTEL_SERIAL_LEN); + pd->pd_disk_meta.sectors = pp->mediasize / pp->sectorsize; + pd->pd_disk_meta.id = 0; + pd->pd_disk_meta.flags = INTEL_F_SPARE; + } else { + pd->pd_disk_meta = meta->disk[disk_pos]; + } + disk = g_raid_create_disk(sc); + disk->d_md_data = (void *)pd; + disk->d_consumer = rcp; + rcp->private = disk; + + /* Read kernel dumping information. */ + disk->d_kd.offset = 0; + disk->d_kd.length = OFF_MAX; + len = sizeof(disk->d_kd); + error = g_io_getattr("GEOM::kerneldump", rcp, &len, &disk->d_kd); + if (disk->d_kd.di.dumper == NULL) + G_RAID_DEBUG1(2, sc, "Dumping not supported by %s: %d.", + rcp->provider->name, error); + + g_raid_md_intel_new_disk(disk); + + sx_xunlock(&sc->sc_lock); + g_topology_lock(); + *gp = geom; + return (result); +fail2: + g_topology_lock(); + g_access(cp, -1, 0, 0); +fail1: + free(meta, M_MD_INTEL); + return (G_RAID_MD_TASTE_FAIL); +} + +static int +g_raid_md_event_intel(struct g_raid_md_object *md, + struct g_raid_disk *disk, u_int event) +{ + struct g_raid_softc *sc; + struct g_raid_subdisk *sd; + struct g_raid_md_intel_object *mdi; + struct g_raid_md_intel_perdisk *pd; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_intel_object *)md; + if (disk == NULL) { + switch (event) { + case G_RAID_NODE_E_START: + if (!mdi->mdio_started) + g_raid_md_intel_start(sc); + return (0); + } + return (-1); + } + pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; + switch (event) { + case G_RAID_DISK_E_DISCONNECTED: + /* If disk was assigned, just update statuses. */ + if (pd->pd_disk_pos >= 0) { + g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); + if (disk->d_consumer) { + g_raid_kill_consumer(sc, disk->d_consumer); + disk->d_consumer = NULL; + } + TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NONE); + g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, + G_RAID_EVENT_SUBDISK); + } + } else { + /* Otherwise -- delete. */ + g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); + g_raid_destroy_disk(disk); + } + + /* Write updated metadata to all disks. */ + g_raid_md_write_intel(md, NULL, NULL, NULL); + + /* Check if anything left except placeholders. 
*/ + if (g_raid_ndisks(sc, -1) == + g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) + g_raid_destroy_node(sc, 0); + else + g_raid_md_intel_refill(sc); + return (0); + } + return (-2); +} + +static int +g_raid_md_ctl_intel(struct g_raid_md_object *md, + struct gctl_req *req) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol, *vol1; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; + struct g_raid_md_intel_object *mdi; + struct g_raid_md_intel_perdisk *pd; + struct g_consumer *cp; + struct g_provider *pp; + char arg[16], serial[INTEL_SERIAL_LEN]; + const char *verb, *volname, *levelname, *diskname; + char *tmp; + int *nargs, *force; + off_t off, size, sectorsize, strip; + intmax_t *sizearg, *striparg; + int numdisks, i, len, level, qual, update; + int error; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_intel_object *)md; + verb = gctl_get_param(req, "verb", NULL); + nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); + error = 0; + if (strcmp(verb, "label") == 0) { + + if (*nargs < 4) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + volname = gctl_get_asciiparam(req, "arg1"); + if (volname == NULL) { + gctl_error(req, "No volume name."); + return (-2); + } + levelname = gctl_get_asciiparam(req, "arg2"); + if (levelname == NULL) { + gctl_error(req, "No RAID level."); + return (-3); + } + if (g_raid_volume_str2level(levelname, &level, &qual)) { + gctl_error(req, "Unknown RAID level '%s'.", levelname); + return (-4); + } + numdisks = *nargs - 3; + force = gctl_get_paraml(req, "force", sizeof(*force)); + if (!g_raid_md_intel_supported(level, qual, numdisks, + force ? *force : 0)) { + gctl_error(req, "Unsupported RAID level " + "(0x%02x/0x%02x), or number of disks (%d).", + level, qual, numdisks); + return (-5); + } + + /* Search for disks, connect them and probe. */ + size = 0x7fffffffffffffffllu; + sectorsize = 0; + for (i = 0; i < numdisks; i++) { + snprintf(arg, sizeof(arg), "arg%d", i + 3); + diskname = gctl_get_asciiparam(req, arg); + if (diskname == NULL) { + gctl_error(req, "No disk name (%s).", arg); + error = -6; + break; + } + if (strcmp(diskname, "NONE") == 0) { + cp = NULL; + pp = NULL; + } else { + g_topology_lock(); + cp = g_raid_open_consumer(sc, diskname); + if (cp == NULL) { + gctl_error(req, "Can't open disk '%s'.", + diskname); + g_topology_unlock(); + error = -4; + break; + } + pp = cp->provider; + } + pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); + pd->pd_disk_pos = i; + disk = g_raid_create_disk(sc); + disk->d_md_data = (void *)pd; + disk->d_consumer = cp; + if (cp == NULL) { + strcpy(&pd->pd_disk_meta.serial[0], "NONE"); + pd->pd_disk_meta.id = 0xffffffff; + pd->pd_disk_meta.flags = INTEL_F_ASSIGNED; + continue; + } + cp->private = disk; + g_topology_unlock(); + + error = g_raid_md_get_label(cp, + &pd->pd_disk_meta.serial[0], INTEL_SERIAL_LEN); + if (error != 0) { + gctl_error(req, + "Can't get serial for provider '%s'.", + diskname); + error = -8; + break; + } + + /* Read kernel dumping information. 
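
The "label" verb above consumes a gctl request in which arg1 is the volume name, arg2 the RAID level, and arg3 onward the providers, with optional "size" and "strip" parameters handled further down. A hypothetical invocation from userland (device names illustrative) would be:

    graid label Intel data RAID1 ada0 ada1

which arrives here with nargs = 5, volname = "data", levelname = "RAID1" and numdisks = nargs - 3 = 2.
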
*/ + disk->d_kd.offset = 0; + disk->d_kd.length = OFF_MAX; + len = sizeof(disk->d_kd); + g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); + if (disk->d_kd.di.dumper == NULL) + G_RAID_DEBUG1(2, sc, + "Dumping not supported by %s.", + cp->provider->name); + + pd->pd_disk_meta.sectors = pp->mediasize / pp->sectorsize; + if (size > pp->mediasize) + size = pp->mediasize; + if (sectorsize < pp->sectorsize) + sectorsize = pp->sectorsize; + pd->pd_disk_meta.id = 0; + pd->pd_disk_meta.flags = INTEL_F_ASSIGNED | INTEL_F_ONLINE; + } + if (error != 0) + return (error); + + /* Reserve some space for metadata. */ + size -= ((4096 + sectorsize - 1) / sectorsize) * sectorsize; + + /* Handle size argument. */ + len = sizeof(*sizearg); + sizearg = gctl_get_param(req, "size", &len); + if (sizearg != NULL && len == sizeof(*sizearg) && + *sizearg > 0) { + if (*sizearg > size) { + gctl_error(req, "Size too big %lld > %lld.", + (long long)*sizearg, (long long)size); + return (-9); + } + size = *sizearg; + } + + /* Handle strip argument. */ + strip = 131072; + len = sizeof(*striparg); + striparg = gctl_get_param(req, "strip", &len); + if (striparg != NULL && len == sizeof(*striparg) && + *striparg > 0) { + if (*striparg < sectorsize) { + gctl_error(req, "Strip size too small."); + return (-10); + } + if (*striparg % sectorsize != 0) { + gctl_error(req, "Incorrect strip size."); + return (-11); + } + if (strip > 65535 * sectorsize) { + gctl_error(req, "Strip size too big."); + return (-12); + } + strip = *striparg; + } + + /* Round size down to strip or sector. */ + if (level == G_RAID_VOLUME_RL_RAID1) + size -= (size % sectorsize); + else if (level == G_RAID_VOLUME_RL_RAID1E && + (numdisks & 1) != 0) + size -= (size % (2 * strip)); + else + size -= (size % strip); + if (size <= 0) { + gctl_error(req, "Size too small."); + return (-13); + } + if (size > 0xffffffffllu * sectorsize) { + gctl_error(req, "Size too big."); + return (-14); + } + + /* We have all we need, create things: volume, ... */ + mdi->mdio_started = 1; + vol = g_raid_create_volume(sc, volname, -1); + vol->v_md_data = (void *)(intptr_t)0; + vol->v_raid_level = level; + vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; + vol->v_strip_size = strip; + vol->v_disks_count = numdisks; + if (level == G_RAID_VOLUME_RL_RAID0) + vol->v_mediasize = size * numdisks; + else if (level == G_RAID_VOLUME_RL_RAID1) + vol->v_mediasize = size; + else if (level == G_RAID_VOLUME_RL_RAID5) + vol->v_mediasize = size * (numdisks - 1); + else { /* RAID1E */ + vol->v_mediasize = ((size * numdisks) / strip / 2) * + strip; + } + vol->v_sectorsize = sectorsize; + g_raid_start_volume(vol); + + /* , and subdisks. */ + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; + sd = &vol->v_subdisks[pd->pd_disk_pos]; + sd->sd_disk = disk; + sd->sd_offset = 0; + sd->sd_size = size; + TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); + if (sd->sd_disk->d_consumer != NULL) { + g_raid_change_disk_state(disk, + G_RAID_DISK_S_ACTIVE); + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_ACTIVE); + g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, + G_RAID_EVENT_SUBDISK); + } else { + g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); + } + } + + /* Write metadata based on created entities. */ + G_RAID_DEBUG1(0, sc, "Array started."); + g_raid_md_write_intel(md, NULL, NULL, NULL); + + /* Pickup any STALE/SPARE disks to refill array if needed. 
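
To make the mediasize arithmetic above concrete, take hypothetical numbers: four disks with 500 GiB usable each after the metadata reservation, and a 128 KiB strip. RAID0 then exports size * numdisks = 2000 GiB, RAID5 size * (numdisks - 1) = 1500 GiB, RAID1 just size, and RAID1E ((size * numdisks) / strip / 2) * strip, i.e. half of the raw capacity rounded down to a whole strip, here 1000 GiB. The rounding performed just before guarantees these divisions come out even in the common cases.
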
*/ + g_raid_md_intel_refill(sc); + + g_raid_event_send(vol, G_RAID_VOLUME_E_START, + G_RAID_EVENT_VOLUME); + return (0); + } + if (strcmp(verb, "add") == 0) { + + if (*nargs != 3) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + volname = gctl_get_asciiparam(req, "arg1"); + if (volname == NULL) { + gctl_error(req, "No volume name."); + return (-2); + } + levelname = gctl_get_asciiparam(req, "arg2"); + if (levelname == NULL) { + gctl_error(req, "No RAID level."); + return (-3); + } + if (g_raid_volume_str2level(levelname, &level, &qual)) { + gctl_error(req, "Unknown RAID level '%s'.", levelname); + return (-4); + } + + /* Look for existing volumes. */ + i = 0; + vol1 = NULL; + TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { + vol1 = vol; + i++; + } + if (i > 1) { + gctl_error(req, "Maximum two volumes supported."); + return (-6); + } + if (vol1 == NULL) { + gctl_error(req, "At least one volume must exist."); + return (-7); + } + + numdisks = vol1->v_disks_count; + force = gctl_get_paraml(req, "force", sizeof(*force)); + if (!g_raid_md_intel_supported(level, qual, numdisks, + force ? *force : 0)) { + gctl_error(req, "Unsupported RAID level " + "(0x%02x/0x%02x), or number of disks (%d).", + level, qual, numdisks); + return (-5); + } + + /* Collect info about present disks. */ + size = 0x7fffffffffffffffllu; + sectorsize = 512; + for (i = 0; i < numdisks; i++) { + disk = vol1->v_subdisks[i].sd_disk; + pd = (struct g_raid_md_intel_perdisk *) + disk->d_md_data; + if ((off_t)pd->pd_disk_meta.sectors * 512 < size) + size = (off_t)pd->pd_disk_meta.sectors * 512; + if (disk->d_consumer != NULL && + disk->d_consumer->provider != NULL && + disk->d_consumer->provider->sectorsize > + sectorsize) { + sectorsize = + disk->d_consumer->provider->sectorsize; + } + } + + /* Reserve some space for metadata. */ + size -= ((4096 + sectorsize - 1) / sectorsize) * sectorsize; + + /* Decide insert before or after. */ + sd = &vol1->v_subdisks[0]; + if (sd->sd_offset > + size - (sd->sd_offset + sd->sd_size)) { + off = 0; + size = sd->sd_offset; + } else { + off = sd->sd_offset + sd->sd_size; + size = size - (sd->sd_offset + sd->sd_size); + } + + /* Handle strip argument. */ + strip = 131072; + len = sizeof(*striparg); + striparg = gctl_get_param(req, "strip", &len); + if (striparg != NULL && len == sizeof(*striparg) && + *striparg > 0) { + if (*striparg < sectorsize) { + gctl_error(req, "Strip size too small."); + return (-10); + } + if (*striparg % sectorsize != 0) { + gctl_error(req, "Incorrect strip size."); + return (-11); + } + if (strip > 65535 * sectorsize) { + gctl_error(req, "Strip size too big."); + return (-12); + } + strip = *striparg; + } + + /* Round offset up to strip. */ + if (off % strip != 0) { + size -= strip - off % strip; + off += strip - off % strip; + } + + /* Handle size argument. */ + len = sizeof(*sizearg); + sizearg = gctl_get_param(req, "size", &len); + if (sizearg != NULL && len == sizeof(*sizearg) && + *sizearg > 0) { + if (*sizearg > size) { + gctl_error(req, "Size too big %lld > %lld.", + (long long)*sizearg, (long long)size); + return (-9); + } + size = *sizearg; + } + + /* Round size down to strip or sector. */ + if (level == G_RAID_VOLUME_RL_RAID1) + size -= (size % sectorsize); + else + size -= (size % strip); + if (size <= 0) { + gctl_error(req, "Size too small."); + return (-13); + } + if (size > 0xffffffffllu * sectorsize) { + gctl_error(req, "Size too big."); + return (-14); + } + + /* We have all we need, create things: volume, ... 
*/ + vol = g_raid_create_volume(sc, volname, -1); + vol->v_md_data = (void *)(intptr_t)i; + vol->v_raid_level = level; + vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; + vol->v_strip_size = strip; + vol->v_disks_count = numdisks; + if (level == G_RAID_VOLUME_RL_RAID0) + vol->v_mediasize = size * numdisks; + else if (level == G_RAID_VOLUME_RL_RAID1) + vol->v_mediasize = size; + else if (level == G_RAID_VOLUME_RL_RAID5) + vol->v_mediasize = size * (numdisks - 1); + else { /* RAID1E */ + vol->v_mediasize = ((size * numdisks) / strip / 2) * + strip; + } + vol->v_sectorsize = sectorsize; + g_raid_start_volume(vol); + + /* , and subdisks. */ + for (i = 0; i < numdisks; i++) { + disk = vol1->v_subdisks[i].sd_disk; + sd = &vol->v_subdisks[i]; + sd->sd_disk = disk; + sd->sd_offset = off; + sd->sd_size = size; + TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); + if (disk->d_state == G_RAID_DISK_S_ACTIVE) { + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_ACTIVE); + g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, + G_RAID_EVENT_SUBDISK); + } + } + + /* Write metadata based on created entities. */ + g_raid_md_write_intel(md, NULL, NULL, NULL); + + g_raid_event_send(vol, G_RAID_VOLUME_E_START, + G_RAID_EVENT_VOLUME); + return (0); + } + if (strcmp(verb, "delete") == 0) { + + /* Full node destruction. */ + if (*nargs == 1) { + /* Check if some volume is still open. */ + force = gctl_get_paraml(req, "force", sizeof(*force)); + if (force != NULL && *force == 0 && + g_raid_nopens(sc) != 0) { + gctl_error(req, "Some volume is still open."); + return (-4); + } + + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_consumer) + intel_meta_erase(disk->d_consumer); + } + g_raid_destroy_node(sc, 0); + return (0); + } + + /* Destroy specified volume. If it was last - all node. */ + if (*nargs != 2) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + volname = gctl_get_asciiparam(req, "arg1"); + if (volname == NULL) { + gctl_error(req, "No volume name."); + return (-2); + } + + /* Search for volume. */ + TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { + if (strcmp(vol->v_name, volname) == 0) + break; + } + if (vol == NULL) { + i = strtol(volname, &tmp, 10); + if (verb != volname && tmp[0] == 0) { + TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { + if (vol->v_global_id == i) + break; + } + } + } + if (vol == NULL) { + gctl_error(req, "Volume '%s' not found.", volname); + return (-3); + } + + /* Check if volume is still open. */ + force = gctl_get_paraml(req, "force", sizeof(*force)); + if (force != NULL && *force == 0 && + vol->v_provider_open != 0) { + gctl_error(req, "Volume is still open."); + return (-4); + } + + /* Destroy volume and potentially node. 
*/ + i = 0; + TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next) + i++; + if (i >= 2) { + g_raid_destroy_volume(vol); + g_raid_md_write_intel(md, NULL, NULL, NULL); + } else { + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_consumer) + intel_meta_erase(disk->d_consumer); + } + g_raid_destroy_node(sc, 0); + } + return (0); + } + if (strcmp(verb, "remove") == 0 || + strcmp(verb, "fail") == 0) { + if (*nargs < 2) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + for (i = 1; i < *nargs; i++) { + snprintf(arg, sizeof(arg), "arg%d", i); + diskname = gctl_get_asciiparam(req, arg); + if (diskname == NULL) { + gctl_error(req, "No disk name (%s).", arg); + error = -2; + break; + } + if (strncmp(diskname, "/dev/", 5) == 0) + diskname += 5; + + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_consumer != NULL && + disk->d_consumer->provider != NULL && + strcmp(disk->d_consumer->provider->name, + diskname) == 0) + break; + } + if (disk == NULL) { + gctl_error(req, "Disk '%s' not found.", + diskname); + error = -3; + break; + } + + if (strcmp(verb, "fail") == 0) { + g_raid_md_fail_disk_intel(md, NULL, disk); + continue; + } + + pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; + + /* Erase metadata on deleting disk. */ + intel_meta_erase(disk->d_consumer); + + /* If disk was assigned, just update statuses. */ + if (pd->pd_disk_pos >= 0) { + g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); + if (disk->d_consumer) { + g_raid_kill_consumer(sc, disk->d_consumer); + disk->d_consumer = NULL; + } + TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NONE); + g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, + G_RAID_EVENT_SUBDISK); + } + } else { + /* Otherwise -- delete. */ + g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); + g_raid_destroy_disk(disk); + } + } + + /* Write updated metadata to remaining disks. */ + g_raid_md_write_intel(md, NULL, NULL, NULL); + + /* Check if anything left except placeholders. */ + if (g_raid_ndisks(sc, -1) == + g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) + g_raid_destroy_node(sc, 0); + else + g_raid_md_intel_refill(sc); + return (error); + } + if (strcmp(verb, "insert") == 0) { + if (*nargs < 2) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + update = 0; + for (i = 1; i < *nargs; i++) { + /* Get disk name. */ + snprintf(arg, sizeof(arg), "arg%d", i); + diskname = gctl_get_asciiparam(req, arg); + if (diskname == NULL) { + gctl_error(req, "No disk name (%s).", arg); + error = -3; + break; + } + + /* Try to find provider with specified name. */ + g_topology_lock(); + cp = g_raid_open_consumer(sc, diskname); + if (cp == NULL) { + gctl_error(req, "Can't open disk '%s'.", + diskname); + g_topology_unlock(); + error = -4; + break; + } + pp = cp->provider; + g_topology_unlock(); + + /* Read disk serial. */ + error = g_raid_md_get_label(cp, + &serial[0], INTEL_SERIAL_LEN); + if (error != 0) { + gctl_error(req, + "Can't get serial for provider '%s'.", + diskname); + g_raid_kill_consumer(sc, cp); + error = -7; + break; + } + + pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); + pd->pd_disk_pos = -1; + + disk = g_raid_create_disk(sc); + disk->d_consumer = cp; + disk->d_consumer->private = disk; + disk->d_md_data = (void *)pd; + cp->private = disk; + + /* Read kernel dumping information. 
*/ + disk->d_kd.offset = 0; + disk->d_kd.length = OFF_MAX; + len = sizeof(disk->d_kd); + g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); + if (disk->d_kd.di.dumper == NULL) + G_RAID_DEBUG1(2, sc, + "Dumping not supported by %s.", + cp->provider->name); + + memcpy(&pd->pd_disk_meta.serial[0], &serial[0], + INTEL_SERIAL_LEN); + pd->pd_disk_meta.sectors = pp->mediasize / pp->sectorsize; + pd->pd_disk_meta.id = 0; + pd->pd_disk_meta.flags = INTEL_F_SPARE; + + /* Welcome the "new" disk. */ + update += g_raid_md_intel_start_disk(disk); + if (disk->d_state == G_RAID_DISK_S_SPARE) { + intel_meta_write_spare(cp, &pd->pd_disk_meta); + g_raid_destroy_disk(disk); + } else if (disk->d_state != G_RAID_DISK_S_ACTIVE) { + gctl_error(req, "Disk '%s' doesn't fit.", + diskname); + g_raid_destroy_disk(disk); + error = -8; + break; + } + } + + /* Write new metadata if we changed something. */ + if (update) + g_raid_md_write_intel(md, NULL, NULL, NULL); + return (error); + } + return (-100); +} + +static int +g_raid_md_write_intel(struct g_raid_md_object *md, struct g_raid_volume *tvol, + struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; + struct g_raid_md_intel_object *mdi; + struct g_raid_md_intel_perdisk *pd; + struct intel_raid_conf *meta; + struct intel_raid_vol *mvol; + struct intel_raid_map *mmap0, *mmap1; + off_t sectorsize = 512, pos; + const char *version, *cv; + int vi, sdi, numdisks, len, state, stale; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_intel_object *)md; + + if (sc->sc_stopping == G_RAID_DESTROY_HARD) + return (0); + + /* Bump generation. Newly written metadata may differ from previous. */ + mdi->mdio_generation++; + + /* Count number of disks. */ + numdisks = 0; + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; + if (pd->pd_disk_pos < 0) + continue; + numdisks++; + if (disk->d_state == G_RAID_DISK_S_ACTIVE) { + pd->pd_disk_meta.flags = + INTEL_F_ONLINE | INTEL_F_ASSIGNED; + } else if (disk->d_state == G_RAID_DISK_S_FAILED) { + pd->pd_disk_meta.flags = INTEL_F_FAILED | INTEL_F_ASSIGNED; + } else { + pd->pd_disk_meta.flags = INTEL_F_ASSIGNED; + if (pd->pd_disk_meta.id != 0xffffffff) { + pd->pd_disk_meta.id = 0xffffffff; + len = strlen(pd->pd_disk_meta.serial); + len = min(len, INTEL_SERIAL_LEN - 3); + strcpy(pd->pd_disk_meta.serial + len, ":0"); + } + } + } + + /* Fill anchor and disks. */ + meta = malloc(INTEL_MAX_MD_SIZE(numdisks), + M_MD_INTEL, M_WAITOK | M_ZERO); + memcpy(&meta->intel_id[0], INTEL_MAGIC, sizeof(INTEL_MAGIC)); + meta->config_size = INTEL_MAX_MD_SIZE(numdisks); + meta->config_id = mdi->mdio_config_id; + meta->generation = mdi->mdio_generation; + meta->attributes = INTEL_ATTR_CHECKSUM; + meta->total_disks = numdisks; + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; + if (pd->pd_disk_pos < 0) + continue; + meta->disk[pd->pd_disk_pos] = pd->pd_disk_meta; + } + + /* Fill volumes and maps. */ + vi = 0; + version = INTEL_VERSION_1000; + TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { + if (vol->v_stopping) + continue; + mvol = intel_get_volume(meta, vi); + + /* New metadata may have different volumes order. 
*/ + vol->v_md_data = (void *)(intptr_t)vi; + + for (sdi = 0; sdi < vol->v_disks_count; sdi++) { + sd = &vol->v_subdisks[sdi]; + if (sd->sd_disk != NULL) + break; + } + if (sdi >= vol->v_disks_count) + panic("No any filled subdisk in volume"); + if (vol->v_mediasize >= 0x20000000000llu) + meta->attributes |= INTEL_ATTR_2TB; + if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) + meta->attributes |= INTEL_ATTR_RAID0; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) + meta->attributes |= INTEL_ATTR_RAID1; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) + meta->attributes |= INTEL_ATTR_RAID5; + else + meta->attributes |= INTEL_ATTR_RAID10; + + if (meta->attributes & INTEL_ATTR_2TB) + cv = INTEL_VERSION_1300; +// else if (dev->status == DEV_CLONE_N_GO) +// cv = INTEL_VERSION_1206; + else if (vol->v_disks_count > 4) + cv = INTEL_VERSION_1204; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) + cv = INTEL_VERSION_1202; + else if (vol->v_disks_count > 2) + cv = INTEL_VERSION_1201; + else if (vi > 0) + cv = INTEL_VERSION_1200; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) + cv = INTEL_VERSION_1100; + else + cv = INTEL_VERSION_1000; + if (strcmp(cv, version) > 0) + version = cv; + + strlcpy(&mvol->name[0], vol->v_name, sizeof(mvol->name)); + mvol->total_sectors = vol->v_mediasize / sectorsize; + + /* Check for any recovery in progress. */ + state = G_RAID_SUBDISK_S_ACTIVE; + pos = 0x7fffffffffffffffllu; + stale = 0; + for (sdi = 0; sdi < vol->v_disks_count; sdi++) { + sd = &vol->v_subdisks[sdi]; + if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) + state = G_RAID_SUBDISK_S_REBUILD; + else if (sd->sd_state == G_RAID_SUBDISK_S_RESYNC && + state != G_RAID_SUBDISK_S_REBUILD) + state = G_RAID_SUBDISK_S_RESYNC; + else if (sd->sd_state == G_RAID_SUBDISK_S_STALE) + stale = 1; + if ((sd->sd_state == G_RAID_SUBDISK_S_REBUILD || + sd->sd_state == G_RAID_SUBDISK_S_RESYNC) && + sd->sd_rebuild_pos < pos) + pos = sd->sd_rebuild_pos; + } + if (state == G_RAID_SUBDISK_S_REBUILD) { + mvol->migr_state = 1; + mvol->migr_type = INTEL_MT_REBUILD; + } else if (state == G_RAID_SUBDISK_S_RESYNC) { + mvol->migr_state = 1; + /* mvol->migr_type = INTEL_MT_REPAIR; */ + mvol->migr_type = INTEL_MT_VERIFY; + mvol->state |= INTEL_ST_VERIFY_AND_FIX; + } else + mvol->migr_state = 0; + mvol->dirty = (vol->v_dirty || stale); + + mmap0 = intel_get_map(mvol, 0); + + /* Write map / common part of two maps. */ + mmap0->offset = sd->sd_offset / sectorsize; + mmap0->disk_sectors = sd->sd_size / sectorsize; + mmap0->strip_sectors = vol->v_strip_size / sectorsize; + if (vol->v_state == G_RAID_VOLUME_S_BROKEN) + mmap0->status = INTEL_S_FAILURE; + else if (vol->v_state == G_RAID_VOLUME_S_DEGRADED) + mmap0->status = INTEL_S_DEGRADED; + else + mmap0->status = INTEL_S_READY; + if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) + mmap0->type = INTEL_T_RAID0; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || + vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) + mmap0->type = INTEL_T_RAID1; + else + mmap0->type = INTEL_T_RAID5; + mmap0->total_disks = vol->v_disks_count; + if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) + mmap0->total_domains = vol->v_disks_count; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) + mmap0->total_domains = 2; + else + mmap0->total_domains = 1; + mmap0->stripe_count = sd->sd_size / vol->v_strip_size / + mmap0->total_domains; + mmap0->failed_disk_num = 0xff; + mmap0->ddf = 1; + + /* If there are two maps - copy common and update. 
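
The dual-map scheme serialized next: while a volume is migrating, map 0 is copied to map 1 and then forced to INTEL_S_READY, so map 0 describes the target state while map 1 keeps the in-migration status and rebuild flags. The checkpoint is stored coarsely as curr_migr_unit = pos / v_strip_size / total_domains, where pos is the minimum rebuild position across the recovering subdisks; this is the exact inverse of the sd_rebuild_pos expansion used when the array is started, so a restart may repeat a little work but never skips any.
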
*/
+ if (mvol->migr_state) {
+ mvol->curr_migr_unit = pos /
+ vol->v_strip_size / mmap0->total_domains;
+ mmap1 = intel_get_map(mvol, 1);
+ memcpy(mmap1, mmap0, sizeof(struct intel_raid_map));
+ mmap0->status = INTEL_S_READY;
+ } else
+ mmap1 = NULL;
+
+ /* Write disk indexes and put rebuild flags. */
+ for (sdi = 0; sdi < vol->v_disks_count; sdi++) {
+ sd = &vol->v_subdisks[sdi];
+ pd = (struct g_raid_md_intel_perdisk *)
+ sd->sd_disk->d_md_data;
+ mmap0->disk_idx[sdi] = pd->pd_disk_pos;
+ if (mvol->migr_state)
+ mmap1->disk_idx[sdi] = pd->pd_disk_pos;
+ if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
+ sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
+ mmap1->disk_idx[sdi] |= INTEL_DI_RBLD;
+ } else if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE &&
+ sd->sd_state != G_RAID_SUBDISK_S_STALE) {
+ mmap0->disk_idx[sdi] |= INTEL_DI_RBLD;
+ if (mvol->migr_state)
+ mmap1->disk_idx[sdi] |= INTEL_DI_RBLD;
+ }
+ if ((sd->sd_state == G_RAID_SUBDISK_S_NONE ||
+ sd->sd_state == G_RAID_SUBDISK_S_FAILED) &&
+ mmap0->failed_disk_num == 0xff) {
+ mmap0->failed_disk_num = sdi;
+ if (mvol->migr_state)
+ mmap1->failed_disk_num = sdi;
+ }
+ }
+ vi++;
+ }
+ meta->total_volumes = vi;
+ if (strcmp(version, INTEL_VERSION_1300) != 0)
+ meta->attributes &= INTEL_ATTR_CHECKSUM;
+ memcpy(&meta->version[0], version, sizeof(INTEL_VERSION_1000));
+
+ /* We are done. Print the metadata and store it to the disks. */
+ g_raid_md_intel_print(meta);
+ if (mdi->mdio_meta != NULL)
+ free(mdi->mdio_meta, M_MD_INTEL);
+ mdi->mdio_meta = meta;
+ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
+ pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
+ if (disk->d_state != G_RAID_DISK_S_ACTIVE)
+ continue;
+ if (pd->pd_meta != NULL) {
+ free(pd->pd_meta, M_MD_INTEL);
+ pd->pd_meta = NULL;
+ }
+ pd->pd_meta = intel_meta_copy(meta);
+ intel_meta_write(disk->d_consumer, meta);
+ }
+ return (0);
+}
+
+static int
+g_raid_md_fail_disk_intel(struct g_raid_md_object *md,
+ struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
+{
+ struct g_raid_softc *sc;
+ struct g_raid_md_intel_object *mdi;
+ struct g_raid_md_intel_perdisk *pd;
+ struct g_raid_subdisk *sd;
+
+ sc = md->mdo_softc;
+ mdi = (struct g_raid_md_intel_object *)md;
+ pd = (struct g_raid_md_intel_perdisk *)tdisk->d_md_data;
+
+ /* We can't fail a disk that is not part of the array now. */
+ if (pd->pd_disk_pos < 0)
+ return (-1);
+
+ /*
+ * Mark disk as failed in metadata and try to write that metadata
+ * to the disk itself to prevent its later resurrection as STALE.
+ */
+ mdi->mdio_meta->disk[pd->pd_disk_pos].flags = INTEL_F_FAILED;
+ pd->pd_disk_meta.flags = INTEL_F_FAILED;
+ g_raid_md_intel_print(mdi->mdio_meta);
+ if (tdisk->d_consumer)
+ intel_meta_write(tdisk->d_consumer, mdi->mdio_meta);
+
+ /* Change states. */
+ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED);
+ TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) {
+ g_raid_change_subdisk_state(sd,
+ G_RAID_SUBDISK_S_FAILED);
+ g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED,
+ G_RAID_EVENT_SUBDISK);
+ }
+
+ /* Write updated metadata to remaining disks. */
+ g_raid_md_write_intel(md, NULL, NULL, tdisk);
+
+ /* Check if anything left except placeholders.
*/ + if (g_raid_ndisks(sc, -1) == + g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) + g_raid_destroy_node(sc, 0); + else + g_raid_md_intel_refill(sc); + return (0); +} + +static int +g_raid_md_free_disk_intel(struct g_raid_md_object *md, + struct g_raid_disk *disk) +{ + struct g_raid_md_intel_perdisk *pd; + + pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; + if (pd->pd_meta != NULL) { + free(pd->pd_meta, M_MD_INTEL); + pd->pd_meta = NULL; + } + free(pd, M_MD_INTEL); + disk->d_md_data = NULL; + return (0); +} + +static int +g_raid_md_free_intel(struct g_raid_md_object *md) +{ + struct g_raid_md_intel_object *mdi; + + mdi = (struct g_raid_md_intel_object *)md; + if (!mdi->mdio_started) { + mdi->mdio_started = 0; + callout_stop(&mdi->mdio_start_co); + G_RAID_DEBUG1(1, md->mdo_softc, + "root_mount_rel %p", mdi->mdio_rootmount); + root_mount_rel(mdi->mdio_rootmount); + mdi->mdio_rootmount = NULL; + } + if (mdi->mdio_meta != NULL) { + free(mdi->mdio_meta, M_MD_INTEL); + mdi->mdio_meta = NULL; + } + return (0); +} + +G_RAID_MD_DECLARE(g_raid_md_intel); diff --git a/sys/geom/raid/md_jmicron.c b/sys/geom/raid/md_jmicron.c new file mode 100644 index 0000000..a56c543 --- /dev/null +++ b/sys/geom/raid/md_jmicron.c @@ -0,0 +1,1582 @@ +/*- + * Copyright (c) 2010 Alexander Motin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "geom/raid/g_raid.h" +#include "g_raid_md_if.h" + +static MALLOC_DEFINE(M_MD_JMICRON, "md_jmicron_data", "GEOM_RAID JMicron metadata"); + +#define JMICRON_MAX_DISKS 8 +#define JMICRON_MAX_SPARE 2 + +struct jmicron_raid_conf { + u_int8_t signature[2]; +#define JMICRON_MAGIC "JM" + + u_int16_t version; +#define JMICRON_VERSION 0x0001 + + u_int16_t checksum; + u_int8_t filler_1[10]; + u_int32_t disk_id; + u_int32_t offset; + u_int32_t disk_sectors_high; + u_int16_t disk_sectors_low; + u_int8_t filler_2[2]; + u_int8_t name[16]; + u_int8_t type; +#define JMICRON_T_RAID0 0 +#define JMICRON_T_RAID1 1 +#define JMICRON_T_RAID01 2 +#define JMICRON_T_CONCAT 3 +#define JMICRON_T_RAID5 5 + + u_int8_t stripe_shift; + u_int16_t flags; +#define JMICRON_F_READY 0x0001 +#define JMICRON_F_BOOTABLE 0x0002 +#define JMICRON_F_BADSEC 0x0004 +#define JMICRON_F_ACTIVE 0x0010 +#define JMICRON_F_UNSYNC 0x0020 +#define JMICRON_F_NEWEST 0x0040 + + u_int8_t filler_3[4]; + u_int32_t spare[JMICRON_MAX_SPARE]; + u_int32_t disks[JMICRON_MAX_DISKS]; +#define JMICRON_DISK_MASK 0xFFFFFFF0 +#define JMICRON_SEG_MASK 0x0000000F + u_int8_t filler_4[32]; + u_int8_t filler_5[384]; +}; + +struct g_raid_md_jmicron_perdisk { + struct jmicron_raid_conf *pd_meta; + int pd_disk_pos; + int pd_disk_id; + off_t pd_disk_size; +}; + +struct g_raid_md_jmicron_object { + struct g_raid_md_object mdio_base; + uint32_t mdio_config_id; + struct jmicron_raid_conf *mdio_meta; + struct callout mdio_start_co; /* STARTING state timer. */ + int mdio_total_disks; + int mdio_disks_present; + int mdio_started; + int mdio_incomplete; + struct root_hold_token *mdio_rootmount; /* Root mount delay token. 
*/ +}; + +static g_raid_md_create_t g_raid_md_create_jmicron; +static g_raid_md_taste_t g_raid_md_taste_jmicron; +static g_raid_md_event_t g_raid_md_event_jmicron; +static g_raid_md_ctl_t g_raid_md_ctl_jmicron; +static g_raid_md_write_t g_raid_md_write_jmicron; +static g_raid_md_fail_disk_t g_raid_md_fail_disk_jmicron; +static g_raid_md_free_disk_t g_raid_md_free_disk_jmicron; +static g_raid_md_free_t g_raid_md_free_jmicron; + +static kobj_method_t g_raid_md_jmicron_methods[] = { + KOBJMETHOD(g_raid_md_create, g_raid_md_create_jmicron), + KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_jmicron), + KOBJMETHOD(g_raid_md_event, g_raid_md_event_jmicron), + KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_jmicron), + KOBJMETHOD(g_raid_md_write, g_raid_md_write_jmicron), + KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_jmicron), + KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_jmicron), + KOBJMETHOD(g_raid_md_free, g_raid_md_free_jmicron), + { 0, 0 } +}; + +static struct g_raid_md_class g_raid_md_jmicron_class = { + "JMicron", + g_raid_md_jmicron_methods, + sizeof(struct g_raid_md_jmicron_object), + .mdc_priority = 100 +}; + +static void +g_raid_md_jmicron_print(struct jmicron_raid_conf *meta) +{ + int k; + + if (g_raid_debug < 1) + return; + + printf("********* ATA JMicron RAID Metadata *********\n"); + printf("signature <%c%c>\n", meta->signature[0], meta->signature[1]); + printf("version %04x\n", meta->version); + printf("checksum 0x%04x\n", meta->checksum); + printf("disk_id 0x%08x\n", meta->disk_id); + printf("offset 0x%08x\n", meta->offset); + printf("disk_sectors_high 0x%08x\n", meta->disk_sectors_high); + printf("disk_sectors_low 0x%04x\n", meta->disk_sectors_low); + printf("name <%.16s>\n", meta->name); + printf("type %d\n", meta->type); + printf("stripe_shift %d\n", meta->stripe_shift); + printf("flags %04x\n", meta->flags); + printf("spare "); + for (k = 0; k < JMICRON_MAX_SPARE; k++) + printf(" 0x%08x", meta->spare[k]); + printf("\n"); + printf("disks "); + for (k = 0; k < JMICRON_MAX_DISKS; k++) + printf(" 0x%08x", meta->disks[k]); + printf("\n"); + printf("=================================================\n"); +} + +static struct jmicron_raid_conf * +jmicron_meta_copy(struct jmicron_raid_conf *meta) +{ + struct jmicron_raid_conf *nmeta; + + nmeta = malloc(sizeof(*meta), M_MD_JMICRON, M_WAITOK); + memcpy(nmeta, meta, sizeof(*meta)); + return (nmeta); +} + +static int +jmicron_meta_total_disks(struct jmicron_raid_conf *meta) +{ + int pos; + + for (pos = 0; pos < JMICRON_MAX_DISKS; pos++) { + if (meta->disks[pos] == 0) + break; + } + return (pos); +} + +static int +jmicron_meta_total_spare(struct jmicron_raid_conf *meta) +{ + int pos, n; + + n = 0; + for (pos = 0; pos < JMICRON_MAX_SPARE; pos++) { + if (meta->spare[pos] != 0) + n++; + } + return (n); +} + +/* + * Generate fake Configuration ID based on disk IDs. + * Note: it will change after each disk set change. 
+ */ +static uint32_t +jmicron_meta_config_id(struct jmicron_raid_conf *meta) +{ + int pos; + uint32_t config_id; + + config_id = 0; + for (pos = 0; pos < JMICRON_MAX_DISKS; pos++) + config_id += meta->disks[pos] << pos; + return (config_id); +} + +static void +jmicron_meta_get_name(struct jmicron_raid_conf *meta, char *buf) +{ + int i; + + strncpy(buf, meta->name, 16); + buf[16] = 0; + for (i = 15; i >= 0; i--) { + if (buf[i] > 0x20) + break; + buf[i] = 0; + } +} + +static void +jmicron_meta_put_name(struct jmicron_raid_conf *meta, char *buf) +{ + + memset(meta->name, 0x20, 16); + memcpy(meta->name, buf, MIN(strlen(buf), 16)); +} + +static int +jmicron_meta_find_disk(struct jmicron_raid_conf *meta, uint32_t id) +{ + int pos; + + id &= JMICRON_DISK_MASK; + for (pos = 0; pos < JMICRON_MAX_DISKS; pos++) { + if ((meta->disks[pos] & JMICRON_DISK_MASK) == id) + return (pos); + } + for (pos = 0; pos < JMICRON_MAX_SPARE; pos++) { + if ((meta->spare[pos] & JMICRON_DISK_MASK) == id) + return (-3); + } + return (-1); +} + +static struct jmicron_raid_conf * +jmicron_meta_read(struct g_consumer *cp) +{ + struct g_provider *pp; + struct jmicron_raid_conf *meta; + char *buf; + int error, i; + uint16_t checksum, *ptr; + + pp = cp->provider; + + /* Read the anchor sector. */ + buf = g_read_data(cp, + pp->mediasize - pp->sectorsize, pp->sectorsize, &error); + if (buf == NULL) { + G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", + pp->name, error); + return (NULL); + } + meta = (struct jmicron_raid_conf *)buf; + + /* Check if this is an JMicron RAID struct */ + if (strncmp(meta->signature, JMICRON_MAGIC, strlen(JMICRON_MAGIC))) { + G_RAID_DEBUG(1, "JMicron signature check failed on %s", pp->name); + g_free(buf); + return (NULL); + } + meta = malloc(sizeof(*meta), M_MD_JMICRON, M_WAITOK); + memcpy(meta, buf, min(sizeof(*meta), pp->sectorsize)); + g_free(buf); + + /* Check metadata checksum. */ + for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i < 64; i++) + checksum += *ptr++; + if (checksum != 0) { + G_RAID_DEBUG(1, "JMicron checksum check failed on %s", pp->name); + free(meta, M_MD_JMICRON); + return (NULL); + } + + return (meta); +} + +static int +jmicron_meta_write(struct g_consumer *cp, struct jmicron_raid_conf *meta) +{ + struct g_provider *pp; + char *buf; + int error, i; + uint16_t checksum, *ptr; + + pp = cp->provider; + + /* Recalculate checksum for case if metadata were changed. */ + meta->checksum = 0; + for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i < 64; i++) + checksum += *ptr++; + meta->checksum -= checksum; + + /* Create and fill buffer. 
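+ * The buffer is zero-padded to a full sector, so the write
+ * below completely overwrites the anchor sector at the very
+ * end of the provider (mediasize - sectorsize).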
*/ + buf = malloc(pp->sectorsize, M_MD_JMICRON, M_WAITOK | M_ZERO); + memcpy(buf, meta, sizeof(*meta)); + + error = g_write_data(cp, + pp->mediasize - pp->sectorsize, buf, pp->sectorsize); + if (error != 0) { + G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", + pp->name, error); + } + + free(buf, M_MD_JMICRON); + return (error); +} + +static int +jmicron_meta_erase(struct g_consumer *cp) +{ + struct g_provider *pp; + char *buf; + int error; + + pp = cp->provider; + buf = malloc(pp->sectorsize, M_MD_JMICRON, M_WAITOK | M_ZERO); + error = g_write_data(cp, + pp->mediasize - pp->sectorsize, buf, pp->sectorsize); + if (error != 0) { + G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", + pp->name, error); + } + free(buf, M_MD_JMICRON); + return (error); +} + +static struct g_raid_disk * +g_raid_md_jmicron_get_disk(struct g_raid_softc *sc, int id) +{ + struct g_raid_disk *disk; + struct g_raid_md_jmicron_perdisk *pd; + + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; + if (pd->pd_disk_pos == id) + break; + } + return (disk); +} + +static int +g_raid_md_jmicron_supported(int level, int qual, int disks, int force) +{ + + if (disks > 8) + return (0); + switch (level) { + case G_RAID_VOLUME_RL_RAID0: + if (disks < 1) + return (0); + if (!force && (disks < 2 || disks > 6)) + return (0); + break; + case G_RAID_VOLUME_RL_RAID1: + if (disks < 1) + return (0); + if (!force && (disks != 2)) + return (0); + break; + case G_RAID_VOLUME_RL_RAID1E: + if (disks < 2) + return (0); + if (!force && (disks != 4)) + return (0); + break; + case G_RAID_VOLUME_RL_SINGLE: + if (disks != 1) + return (0); + if (!force) + return (0); + break; + case G_RAID_VOLUME_RL_CONCAT: + if (disks < 2) + return (0); + break; + case G_RAID_VOLUME_RL_RAID5: + if (disks < 3) + return (0); + if (!force) + return (0); + break; + default: + return (0); + } + if (qual != G_RAID_VOLUME_RLQ_NONE) + return (0); + return (1); +} + +static int +g_raid_md_jmicron_start_disk(struct g_raid_disk *disk) +{ + struct g_raid_softc *sc; + struct g_raid_subdisk *sd, *tmpsd; + struct g_raid_disk *olddisk, *tmpdisk; + struct g_raid_md_object *md; + struct g_raid_md_jmicron_object *mdi; + struct g_raid_md_jmicron_perdisk *pd, *oldpd; + struct jmicron_raid_conf *meta; + int disk_pos, resurrection = 0; + + sc = disk->d_softc; + md = sc->sc_md; + mdi = (struct g_raid_md_jmicron_object *)md; + meta = mdi->mdio_meta; + pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; + olddisk = NULL; + + /* Find disk position in metadata by it's serial. */ + disk_pos = jmicron_meta_find_disk(meta, pd->pd_disk_id); + if (disk_pos < 0) { + G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk"); + /* If we are in the start process, that's all for now. */ + if (!mdi->mdio_started) + goto nofit; + /* + * If we have already started - try to get use of the disk. + * Try to replace OFFLINE disks first, then FAILED. + */ + TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) { + if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE && + tmpdisk->d_state != G_RAID_DISK_S_FAILED) + continue; + /* Make sure this disk is big enough. 
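+ * Every subdisk, together with the trailing metadata
+ * sector, must fit into the new disk:
+ * sd_offset + sd_size + 512 <= pd_disk_size.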
*/ + TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) { + if (sd->sd_offset + sd->sd_size + 512 > + pd->pd_disk_size) { + G_RAID_DEBUG1(1, sc, + "Disk too small (%ju < %ju)", + pd->pd_disk_size, + sd->sd_offset + sd->sd_size + 512); + break; + } + } + if (sd != NULL) + continue; + if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) { + olddisk = tmpdisk; + break; + } else if (olddisk == NULL) + olddisk = tmpdisk; + } + if (olddisk == NULL) { +nofit: + if (disk_pos == -3 || pd->pd_disk_pos == -3) { + g_raid_change_disk_state(disk, + G_RAID_DISK_S_SPARE); + return (1); + } else { + g_raid_change_disk_state(disk, + G_RAID_DISK_S_STALE); + return (0); + } + } + oldpd = (struct g_raid_md_jmicron_perdisk *)olddisk->d_md_data; + disk_pos = oldpd->pd_disk_pos; + resurrection = 1; + } + + if (olddisk == NULL) { + /* Find placeholder by position. */ + olddisk = g_raid_md_jmicron_get_disk(sc, disk_pos); + if (olddisk == NULL) + panic("No disk at position %d!", disk_pos); + if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) { + G_RAID_DEBUG1(1, sc, "More then one disk for pos %d", + disk_pos); + g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); + return (0); + } + oldpd = (struct g_raid_md_jmicron_perdisk *)olddisk->d_md_data; + } + + /* Replace failed disk or placeholder with new disk. */ + TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) { + TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next); + TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); + sd->sd_disk = disk; + } + oldpd->pd_disk_pos = -2; + pd->pd_disk_pos = disk_pos; + /* Update global metadata just in case. */ + meta->disks[disk_pos] = pd->pd_disk_id; + + /* If it was placeholder -- destroy it. */ + if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) { + g_raid_destroy_disk(olddisk); + } else { + /* Otherwise, make it STALE_FAILED. */ + g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED); + } + + /* Welcome the new disk. */ + g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); + TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { + + /* + * Different disks may have different sizes/offsets, + * especially in concat mode. Update. + */ + if (pd->pd_meta != NULL && !resurrection) { + sd->sd_offset = + (off_t)pd->pd_meta->offset * 16 * 512; //ZZZ + sd->sd_size = + (((off_t)pd->pd_meta->disk_sectors_high << 16) + + pd->pd_meta->disk_sectors_low) * 512; + } + + if (resurrection) { + /* Stale disk, almost same as new. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NEW); + } else if ((meta->flags & JMICRON_F_BADSEC) != 0 && + (pd->pd_meta->flags & JMICRON_F_BADSEC) == 0) { + /* Cold-inserted or rebuilding disk. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NEW); + } else if (pd->pd_meta->flags & JMICRON_F_UNSYNC) { + /* Dirty or resyncing disk.. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_STALE); + } else { + /* Up to date disk. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_ACTIVE); + } + g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, + G_RAID_EVENT_SUBDISK); + } + + /* Update status of our need for spare. 
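+ * The array counts as incomplete while fewer than
+ * mdio_total_disks disks are ACTIVE; the refill code uses
+ * this flag to request a retaste in hope to find a spare.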
*/ + if (mdi->mdio_started) { + mdi->mdio_incomplete = + (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < + mdi->mdio_total_disks); + } + + return (resurrection); +} + +static void +g_disk_md_jmicron_retaste(void *arg, int pending) +{ + + G_RAID_DEBUG(1, "Array is not complete, trying to retaste."); + g_retaste(&g_raid_class); + free(arg, M_MD_JMICRON); +} + +static void +g_raid_md_jmicron_refill(struct g_raid_softc *sc) +{ + struct g_raid_md_object *md; + struct g_raid_md_jmicron_object *mdi; + struct jmicron_raid_conf *meta; + struct g_raid_disk *disk; + struct task *task; + int update, na; + + md = sc->sc_md; + mdi = (struct g_raid_md_jmicron_object *)md; + meta = mdi->mdio_meta; + update = 0; + do { + /* Make sure we miss anything. */ + na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE); + if (na == mdi->mdio_total_disks) + break; + + G_RAID_DEBUG1(1, md->mdo_softc, + "Array is not complete (%d of %d), " + "trying to refill.", na, mdi->mdio_total_disks); + + /* Try to get use some of STALE disks. */ + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_state == G_RAID_DISK_S_STALE) { + update += g_raid_md_jmicron_start_disk(disk); + if (disk->d_state == G_RAID_DISK_S_ACTIVE) + break; + } + } + if (disk != NULL) + continue; + + /* Try to get use some of SPARE disks. */ + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_state == G_RAID_DISK_S_SPARE) { + update += g_raid_md_jmicron_start_disk(disk); + if (disk->d_state == G_RAID_DISK_S_ACTIVE) + break; + } + } + } while (disk != NULL); + + /* Write new metadata if we changed something. */ + if (update) { + g_raid_md_write_jmicron(md, NULL, NULL, NULL); + meta = mdi->mdio_meta; + } + + /* Update status of our need for spare. */ + mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < + mdi->mdio_total_disks); + + /* Request retaste hoping to find spare. */ + if (mdi->mdio_incomplete) { + task = malloc(sizeof(struct task), + M_MD_JMICRON, M_WAITOK | M_ZERO); + TASK_INIT(task, 0, g_disk_md_jmicron_retaste, task); + taskqueue_enqueue(taskqueue_swi, task); + } +} + +static void +g_raid_md_jmicron_start(struct g_raid_softc *sc) +{ + struct g_raid_md_object *md; + struct g_raid_md_jmicron_object *mdi; + struct g_raid_md_jmicron_perdisk *pd; + struct jmicron_raid_conf *meta; + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; + off_t size; + int j, disk_pos; + char buf[17]; + + md = sc->sc_md; + mdi = (struct g_raid_md_jmicron_object *)md; + meta = mdi->mdio_meta; + + /* Create volumes and subdisks. 
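+ * The per-disk size is kept in the metadata as a 48-bit
+ * sector count split into 32 high and 16 low bits:
+ *
+ *	size = (((off_t)high << 16) + low) * 512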
*/ + jmicron_meta_get_name(meta, buf); + vol = g_raid_create_volume(sc, buf, -1); + size = ((off_t)meta->disk_sectors_high << 16) + meta->disk_sectors_low; + size *= 512; //ZZZ + if (meta->type == JMICRON_T_RAID0) { + vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; + vol->v_mediasize = size * mdi->mdio_total_disks; + } else if (meta->type == JMICRON_T_RAID1) { + vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; + vol->v_mediasize = size; + } else if (meta->type == JMICRON_T_RAID01) { + vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; + vol->v_mediasize = size * mdi->mdio_total_disks / 2; + } else if (meta->type == JMICRON_T_CONCAT) { + if (mdi->mdio_total_disks == 1) + vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; + else + vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; + vol->v_mediasize = 0; + } else if (meta->type == JMICRON_T_RAID5) { + vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; + vol->v_mediasize = size * (mdi->mdio_total_disks - 1); + } else { + vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; + vol->v_mediasize = 0; + } + vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; + vol->v_strip_size = 1024 << meta->stripe_shift; //ZZZ + vol->v_disks_count = mdi->mdio_total_disks; + vol->v_sectorsize = 512; //ZZZ + for (j = 0; j < vol->v_disks_count; j++) { + sd = &vol->v_subdisks[j]; + sd->sd_offset = (off_t)meta->offset * 16 * 512; //ZZZ + sd->sd_size = size; + } + g_raid_start_volume(vol); + + /* Create disk placeholders to store data for later writing. */ + for (disk_pos = 0; disk_pos < mdi->mdio_total_disks; disk_pos++) { + pd = malloc(sizeof(*pd), M_MD_JMICRON, M_WAITOK | M_ZERO); + pd->pd_disk_pos = disk_pos; + pd->pd_disk_id = meta->disks[disk_pos]; + disk = g_raid_create_disk(sc); + disk->d_md_data = (void *)pd; + disk->d_state = G_RAID_DISK_S_OFFLINE; + sd = &vol->v_subdisks[disk_pos]; + sd->sd_disk = disk; + TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); + } + + /* Make all disks found till the moment take their places. */ + do { + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_state == G_RAID_DISK_S_NONE) { + g_raid_md_jmicron_start_disk(disk); + break; + } + } + } while (disk != NULL); + + mdi->mdio_started = 1; + G_RAID_DEBUG1(0, sc, "Array started."); + g_raid_md_write_jmicron(md, NULL, NULL, NULL); + + /* Pickup any STALE/SPARE disks to refill array if needed. */ + g_raid_md_jmicron_refill(sc); + + g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); + + callout_stop(&mdi->mdio_start_co); + G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount); + root_mount_rel(mdi->mdio_rootmount); + mdi->mdio_rootmount = NULL; +} + +static void +g_raid_md_jmicron_new_disk(struct g_raid_disk *disk) +{ + struct g_raid_softc *sc; + struct g_raid_md_object *md; + struct g_raid_md_jmicron_object *mdi; + struct jmicron_raid_conf *pdmeta; + struct g_raid_md_jmicron_perdisk *pd; + + sc = disk->d_softc; + md = sc->sc_md; + mdi = (struct g_raid_md_jmicron_object *)md; + pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; + pdmeta = pd->pd_meta; + + if (mdi->mdio_started) { + if (g_raid_md_jmicron_start_disk(disk)) + g_raid_md_write_jmicron(md, NULL, NULL, NULL); + } else { + /* + * If we haven't started yet - update common metadata + * to get subdisks details, avoiding data from spare disks. 
+ */ + if (mdi->mdio_meta == NULL || + jmicron_meta_find_disk(mdi->mdio_meta, + mdi->mdio_meta->disk_id) == -3) { + if (mdi->mdio_meta != NULL) + free(mdi->mdio_meta, M_MD_JMICRON); + mdi->mdio_meta = jmicron_meta_copy(pdmeta); + mdi->mdio_total_disks = jmicron_meta_total_disks(pdmeta); + } + mdi->mdio_meta->flags |= pdmeta->flags & JMICRON_F_BADSEC; + + mdi->mdio_disks_present++; + G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d+%d up)", + mdi->mdio_disks_present, + mdi->mdio_total_disks, + jmicron_meta_total_spare(mdi->mdio_meta)); + + /* If we collected all needed disks - start array. */ + if (mdi->mdio_disks_present == mdi->mdio_total_disks + + jmicron_meta_total_spare(mdi->mdio_meta)) + g_raid_md_jmicron_start(sc); + } +} + +static void +g_raid_jmicron_go(void *arg) +{ + struct g_raid_softc *sc; + struct g_raid_md_object *md; + struct g_raid_md_jmicron_object *mdi; + + sc = arg; + md = sc->sc_md; + mdi = (struct g_raid_md_jmicron_object *)md; + if (!mdi->mdio_started) { + G_RAID_DEBUG1(0, sc, "Force array start due to timeout."); + g_raid_event_send(sc, G_RAID_NODE_E_START, 0); + } +} + +static int +g_raid_md_create_jmicron(struct g_raid_md_object *md, struct g_class *mp, + struct g_geom **gp) +{ + struct g_raid_softc *sc; + struct g_raid_md_jmicron_object *mdi; + char name[16]; + + mdi = (struct g_raid_md_jmicron_object *)md; + mdi->mdio_config_id = arc4random(); + snprintf(name, sizeof(name), "JMicron-%08x", mdi->mdio_config_id); + sc = g_raid_create_node(mp, name, md); + if (sc == NULL) + return (G_RAID_MD_TASTE_FAIL); + md->mdo_softc = sc; + *gp = sc->sc_geom; + return (G_RAID_MD_TASTE_NEW); +} + +static int +g_raid_md_taste_jmicron(struct g_raid_md_object *md, struct g_class *mp, + struct g_consumer *cp, struct g_geom **gp) +{ + struct g_consumer *rcp; + struct g_provider *pp; + struct g_raid_md_jmicron_object *mdi, *mdi1; + struct g_raid_softc *sc; + struct g_raid_disk *disk; + struct jmicron_raid_conf *meta; + struct g_raid_md_jmicron_perdisk *pd; + struct g_geom *geom; + int error, disk_pos, result, spare, len; + char name[16]; + uint16_t vendor; + + G_RAID_DEBUG(1, "Tasting JMicron on %s", cp->provider->name); + mdi = (struct g_raid_md_jmicron_object *)md; + pp = cp->provider; + + /* Read metadata from device. */ + meta = NULL; + spare = 0; + vendor = 0xffff; + disk_pos = 0; + if (g_access(cp, 1, 0, 0) != 0) + return (G_RAID_MD_TASTE_FAIL); + g_topology_unlock(); + len = 2; + if (pp->geom->rank == 1) + g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); + meta = jmicron_meta_read(cp); + g_topology_lock(); + g_access(cp, -1, 0, 0); + if (meta == NULL) { + if (g_raid_aggressive_spare) { + if (vendor == 0x197b) { + G_RAID_DEBUG(1, + "No JMicron metadata, forcing spare."); + spare = 2; + goto search; + } else { + G_RAID_DEBUG(1, + "JMicron vendor mismatch 0x%04x != 0x197b", + vendor); + } + } + return (G_RAID_MD_TASTE_FAIL); + } + + /* Check this disk position in obtained metadata. */ + disk_pos = jmicron_meta_find_disk(meta, meta->disk_id); + if (disk_pos == -1) { + G_RAID_DEBUG(1, "JMicron disk_id %08x not found", + meta->disk_id); + goto fail1; + } + + /* Metadata valid. Print it. */ + g_raid_md_jmicron_print(meta); + G_RAID_DEBUG(1, "JMicron disk position %d", disk_pos); + spare = (disk_pos == -2) ? 1 : 0; + +search: + /* Search for matching node. 
*/ + sc = NULL; + mdi1 = NULL; + LIST_FOREACH(geom, &mp->geom, geom) { + sc = geom->softc; + if (sc == NULL) + continue; + if (sc->sc_stopping != 0) + continue; + if (sc->sc_md->mdo_class != md->mdo_class) + continue; + mdi1 = (struct g_raid_md_jmicron_object *)sc->sc_md; + if (spare == 2) { + if (mdi1->mdio_incomplete) + break; + } else { + if (mdi1->mdio_config_id == + jmicron_meta_config_id(meta)) + break; + } + } + + /* Found matching node. */ + if (geom != NULL) { + G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); + result = G_RAID_MD_TASTE_EXISTING; + + } else if (spare) { /* Not found needy node -- left for later. */ + G_RAID_DEBUG(1, "Spare is not needed at this time"); + goto fail1; + + } else { /* Not found matching node -- create one. */ + result = G_RAID_MD_TASTE_NEW; + mdi->mdio_config_id = jmicron_meta_config_id(meta); + snprintf(name, sizeof(name), "JMicron-%08x", + mdi->mdio_config_id); + sc = g_raid_create_node(mp, name, md); + md->mdo_softc = sc; + geom = sc->sc_geom; + callout_init(&mdi->mdio_start_co, 1); + callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, + g_raid_jmicron_go, sc); + mdi->mdio_rootmount = root_mount_hold("GRAID-JMicron"); + G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount); + } + + rcp = g_new_consumer(geom); + g_attach(rcp, pp); + if (g_access(rcp, 1, 1, 1) != 0) + ; //goto fail1; + + g_topology_unlock(); + sx_xlock(&sc->sc_lock); + + pd = malloc(sizeof(*pd), M_MD_JMICRON, M_WAITOK | M_ZERO); + pd->pd_meta = meta; + if (spare == 2) { + pd->pd_disk_pos = -3; + pd->pd_disk_id = arc4random() & JMICRON_DISK_MASK; + } else { + pd->pd_disk_pos = -1; + pd->pd_disk_id = meta->disk_id; + } + pd->pd_disk_size = pp->mediasize; + disk = g_raid_create_disk(sc); + disk->d_md_data = (void *)pd; + disk->d_consumer = rcp; + rcp->private = disk; + + /* Read kernel dumping information. */ + disk->d_kd.offset = 0; + disk->d_kd.length = OFF_MAX; + len = sizeof(disk->d_kd); + error = g_io_getattr("GEOM::kerneldump", rcp, &len, &disk->d_kd); + if (disk->d_kd.di.dumper == NULL) + G_RAID_DEBUG1(2, sc, "Dumping not supported by %s: %d.", + rcp->provider->name, error); + + g_raid_md_jmicron_new_disk(disk); + + sx_xunlock(&sc->sc_lock); + g_topology_lock(); + *gp = geom; + return (result); +fail1: + free(meta, M_MD_JMICRON); + return (G_RAID_MD_TASTE_FAIL); +} + +static int +g_raid_md_event_jmicron(struct g_raid_md_object *md, + struct g_raid_disk *disk, u_int event) +{ + struct g_raid_softc *sc; + struct g_raid_subdisk *sd; + struct g_raid_md_jmicron_object *mdi; + struct g_raid_md_jmicron_perdisk *pd; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_jmicron_object *)md; + if (disk == NULL) { + switch (event) { + case G_RAID_NODE_E_START: + if (!mdi->mdio_started) + g_raid_md_jmicron_start(sc); + return (0); + } + return (-1); + } + pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; + switch (event) { + case G_RAID_DISK_E_DISCONNECTED: + /* If disk was assigned, just update statuses. */ + if (pd->pd_disk_pos >= 0) { + g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); + if (disk->d_consumer) { + g_raid_kill_consumer(sc, disk->d_consumer); + disk->d_consumer = NULL; + } + TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NONE); + g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, + G_RAID_EVENT_SUBDISK); + } + } else { + /* Otherwise -- delete. */ + g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); + g_raid_destroy_disk(disk); + } + + /* Write updated metadata to all disks. 
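+ * After that either destroy the node, if nothing but
+ * OFFLINE placeholders is left, or try to refill the array
+ * from STALE/SPARE disks.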
*/ + g_raid_md_write_jmicron(md, NULL, NULL, NULL); + + /* Check if anything left except placeholders. */ + if (g_raid_ndisks(sc, -1) == + g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) + g_raid_destroy_node(sc, 0); + else + g_raid_md_jmicron_refill(sc); + return (0); + } + return (-2); +} + +static int +g_raid_md_ctl_jmicron(struct g_raid_md_object *md, + struct gctl_req *req) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; + struct g_raid_md_jmicron_object *mdi; + struct g_raid_md_jmicron_perdisk *pd; + struct g_consumer *cp; + struct g_provider *pp; + char arg[16]; + const char *verb, *volname, *levelname, *diskname; + int *nargs, *force; + off_t size, sectorsize, strip; + intmax_t *sizearg, *striparg; + int numdisks, i, len, level, qual, update; + int error; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_jmicron_object *)md; + verb = gctl_get_param(req, "verb", NULL); + nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); + error = 0; + if (strcmp(verb, "label") == 0) { + + if (*nargs < 4) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + volname = gctl_get_asciiparam(req, "arg1"); + if (volname == NULL) { + gctl_error(req, "No volume name."); + return (-2); + } + levelname = gctl_get_asciiparam(req, "arg2"); + if (levelname == NULL) { + gctl_error(req, "No RAID level."); + return (-3); + } + if (g_raid_volume_str2level(levelname, &level, &qual)) { + gctl_error(req, "Unknown RAID level '%s'.", levelname); + return (-4); + } + numdisks = *nargs - 3; + force = gctl_get_paraml(req, "force", sizeof(*force)); + if (!g_raid_md_jmicron_supported(level, qual, numdisks, + force ? *force : 0)) { + gctl_error(req, "Unsupported RAID level " + "(0x%02x/0x%02x), or number of disks (%d).", + level, qual, numdisks); + return (-5); + } + + /* Search for disks, connect them and probe. */ + size = 0x7fffffffffffffffllu; + sectorsize = 0; + for (i = 0; i < numdisks; i++) { + snprintf(arg, sizeof(arg), "arg%d", i + 3); + diskname = gctl_get_asciiparam(req, arg); + if (diskname == NULL) { + gctl_error(req, "No disk name (%s).", arg); + error = -6; + break; + } + if (strcmp(diskname, "NONE") == 0) { + cp = NULL; + pp = NULL; + } else { + g_topology_lock(); + cp = g_raid_open_consumer(sc, diskname); + if (cp == NULL) { + gctl_error(req, "Can't open '%s'.", + diskname); + g_topology_unlock(); + error = -7; + break; + } + pp = cp->provider; + } + pd = malloc(sizeof(*pd), M_MD_JMICRON, M_WAITOK | M_ZERO); + pd->pd_disk_pos = i; + pd->pd_disk_id = arc4random() & JMICRON_DISK_MASK; + disk = g_raid_create_disk(sc); + disk->d_md_data = (void *)pd; + disk->d_consumer = cp; + if (cp == NULL) + continue; + cp->private = disk; + g_topology_unlock(); + + /* Read kernel dumping information. */ + disk->d_kd.offset = 0; + disk->d_kd.length = OFF_MAX; + len = sizeof(disk->d_kd); + g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); + if (disk->d_kd.di.dumper == NULL) + G_RAID_DEBUG1(2, sc, + "Dumping not supported by %s.", + cp->provider->name); + + pd->pd_disk_size = pp->mediasize; + if (size > pp->mediasize) + size = pp->mediasize; + if (sectorsize < pp->sectorsize) + sectorsize = pp->sectorsize; + } + if (error != 0) + return (error); + + /* Reserve space for metadata. */ + size -= sectorsize; + + /* Handle size argument. 
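+ * An explicit -S size may only shrink the automatically
+ * computed common size (the smallest disk minus the
+ * reserved metadata sector), never grow it.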
*/ + len = sizeof(*sizearg); + sizearg = gctl_get_param(req, "size", &len); + if (sizearg != NULL && len == sizeof(*sizearg) && + *sizearg > 0) { + if (*sizearg > size) { + gctl_error(req, "Size too big %lld > %lld.", + (long long)*sizearg, (long long)size); + return (-9); + } + size = *sizearg; + } + + /* Handle strip argument. */ + strip = 131072; + len = sizeof(*striparg); + striparg = gctl_get_param(req, "strip", &len); + if (striparg != NULL && len == sizeof(*striparg) && + *striparg > 0) { + if (*striparg < sectorsize) { + gctl_error(req, "Strip size too small."); + return (-10); + } + if (*striparg % sectorsize != 0) { + gctl_error(req, "Incorrect strip size."); + return (-11); + } + if (strip > 65535 * sectorsize) { + gctl_error(req, "Strip size too big."); + return (-12); + } + strip = *striparg; + } + + /* Round size down to strip or sector. */ + if (level == G_RAID_VOLUME_RL_RAID1) + size -= (size % sectorsize); + else if (level == G_RAID_VOLUME_RL_RAID1E && + (numdisks & 1) != 0) + size -= (size % (2 * strip)); + else + size -= (size % strip); + if (size <= 0) { + gctl_error(req, "Size too small."); + return (-13); + } + if (size > 0xffffffffffffllu * sectorsize) { + gctl_error(req, "Size too big."); + return (-14); + } + + /* We have all we need, create things: volume, ... */ + mdi->mdio_total_disks = numdisks; + mdi->mdio_started = 1; + vol = g_raid_create_volume(sc, volname, -1); + vol->v_md_data = (void *)(intptr_t)0; + vol->v_raid_level = level; + vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; + vol->v_strip_size = strip; + vol->v_disks_count = numdisks; + if (level == G_RAID_VOLUME_RL_RAID0 || + level == G_RAID_VOLUME_RL_CONCAT || + level == G_RAID_VOLUME_RL_SINGLE) + vol->v_mediasize = size * numdisks; + else if (level == G_RAID_VOLUME_RL_RAID1) + vol->v_mediasize = size; + else if (level == G_RAID_VOLUME_RL_RAID5) + vol->v_mediasize = size * (numdisks - 1); + else { /* RAID1E */ + vol->v_mediasize = ((size * numdisks) / strip / 2) * + strip; + } + vol->v_sectorsize = sectorsize; + g_raid_start_volume(vol); + + /* , and subdisks. */ + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; + sd = &vol->v_subdisks[pd->pd_disk_pos]; + sd->sd_disk = disk; + sd->sd_offset = 0; + sd->sd_size = size; + TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); + if (sd->sd_disk->d_consumer != NULL) { + g_raid_change_disk_state(disk, + G_RAID_DISK_S_ACTIVE); + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_ACTIVE); + g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, + G_RAID_EVENT_SUBDISK); + } else { + g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); + } + } + + /* Write metadata based on created entities. */ + G_RAID_DEBUG1(0, sc, "Array started."); + g_raid_md_write_jmicron(md, NULL, NULL, NULL); + + /* Pickup any STALE/SPARE disks to refill array if needed. */ + g_raid_md_jmicron_refill(sc); + + g_raid_event_send(vol, G_RAID_VOLUME_E_START, + G_RAID_EVENT_VOLUME); + return (0); + } + if (strcmp(verb, "delete") == 0) { + + /* Check if some volume is still open. 
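+ * Deletion of an open volume is refused unless force (-f)
+ * was given; otherwise metadata is erased on all disks and
+ * the node is destroyed.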
*/ + force = gctl_get_paraml(req, "force", sizeof(*force)); + if (force != NULL && *force == 0 && + g_raid_nopens(sc) != 0) { + gctl_error(req, "Some volume is still open."); + return (-4); + } + + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_consumer) + jmicron_meta_erase(disk->d_consumer); + } + g_raid_destroy_node(sc, 0); + return (0); + } + if (strcmp(verb, "remove") == 0 || + strcmp(verb, "fail") == 0) { + if (*nargs < 2) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + for (i = 1; i < *nargs; i++) { + snprintf(arg, sizeof(arg), "arg%d", i); + diskname = gctl_get_asciiparam(req, arg); + if (diskname == NULL) { + gctl_error(req, "No disk name (%s).", arg); + error = -2; + break; + } + if (strncmp(diskname, "/dev/", 5) == 0) + diskname += 5; + + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_consumer != NULL && + disk->d_consumer->provider != NULL && + strcmp(disk->d_consumer->provider->name, + diskname) == 0) + break; + } + if (disk == NULL) { + gctl_error(req, "Disk '%s' not found.", + diskname); + error = -3; + break; + } + + if (strcmp(verb, "fail") == 0) { + g_raid_md_fail_disk_jmicron(md, NULL, disk); + continue; + } + + pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; + + /* Erase metadata on deleting disk. */ + jmicron_meta_erase(disk->d_consumer); + + /* If disk was assigned, just update statuses. */ + if (pd->pd_disk_pos >= 0) { + g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); + if (disk->d_consumer) { + g_raid_kill_consumer(sc, disk->d_consumer); + disk->d_consumer = NULL; + } + TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NONE); + g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, + G_RAID_EVENT_SUBDISK); + } + } else { + /* Otherwise -- delete. */ + g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); + g_raid_destroy_disk(disk); + } + } + + /* Write updated metadata to remaining disks. */ + g_raid_md_write_jmicron(md, NULL, NULL, NULL); + + /* Check if anything left except placeholders. */ + if (g_raid_ndisks(sc, -1) == + g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) + g_raid_destroy_node(sc, 0); + else + g_raid_md_jmicron_refill(sc); + return (error); + } + if (strcmp(verb, "insert") == 0) { + if (*nargs < 2) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + update = 0; + for (i = 1; i < *nargs; i++) { + /* Get disk name. */ + snprintf(arg, sizeof(arg), "arg%d", i); + diskname = gctl_get_asciiparam(req, arg); + if (diskname == NULL) { + gctl_error(req, "No disk name (%s).", arg); + error = -3; + break; + } + + /* Try to find provider with specified name. */ + g_topology_lock(); + cp = g_raid_open_consumer(sc, diskname); + if (cp == NULL) { + gctl_error(req, "Can't open disk '%s'.", + diskname); + g_topology_unlock(); + error = -4; + break; + } + pp = cp->provider; + + pd = malloc(sizeof(*pd), M_MD_JMICRON, M_WAITOK | M_ZERO); + pd->pd_disk_pos = -3; + pd->pd_disk_id = arc4random() & JMICRON_DISK_MASK; + pd->pd_disk_size = pp->mediasize; + + disk = g_raid_create_disk(sc); + disk->d_consumer = cp; + disk->d_consumer->private = disk; + disk->d_md_data = (void *)pd; + cp->private = disk; + g_topology_unlock(); + + /* Read kernel dumping information. 
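+ * The GEOM::kerneldump attribute reports whether the disk
+ * can take a kernel crash dump; d_kd.di.dumper stays NULL
+ * if it can not.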
*/ + disk->d_kd.offset = 0; + disk->d_kd.length = OFF_MAX; + len = sizeof(disk->d_kd); + g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); + if (disk->d_kd.di.dumper == NULL) + G_RAID_DEBUG1(2, sc, + "Dumping not supported by %s.", + cp->provider->name); + + /* Welcome the "new" disk. */ + update += g_raid_md_jmicron_start_disk(disk); + if (disk->d_state != G_RAID_DISK_S_ACTIVE && + disk->d_state != G_RAID_DISK_S_SPARE) { + gctl_error(req, "Disk '%s' doesn't fit.", + diskname); + g_raid_destroy_disk(disk); + error = -8; + break; + } + } + + /* Write new metadata if we changed something. */ + if (update) + g_raid_md_write_jmicron(md, NULL, NULL, NULL); + return (error); + } + gctl_error(req, "Command '%s' is not supported.", verb); + return (-100); +} + +static int +g_raid_md_write_jmicron(struct g_raid_md_object *md, struct g_raid_volume *tvol, + struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; + struct g_raid_md_jmicron_object *mdi; + struct g_raid_md_jmicron_perdisk *pd; + struct jmicron_raid_conf *meta; + int i, spares; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_jmicron_object *)md; + + if (sc->sc_stopping == G_RAID_DESTROY_HARD) + return (0); + + /* There is only one volume. */ + vol = TAILQ_FIRST(&sc->sc_volumes); + + /* Fill global fields. */ + meta = malloc(sizeof(*meta), M_MD_JMICRON, M_WAITOK | M_ZERO); + strncpy(meta->signature, JMICRON_MAGIC, 2); + meta->version = JMICRON_VERSION; + jmicron_meta_put_name(meta, vol->v_name); + if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) + meta->type = JMICRON_T_RAID0; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) + meta->type = JMICRON_T_RAID1; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) + meta->type = JMICRON_T_RAID01; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT || + vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) + meta->type = JMICRON_T_CONCAT; + else + meta->type = JMICRON_T_RAID5; + meta->stripe_shift = fls(vol->v_strip_size / 2048); + meta->flags = JMICRON_F_READY | JMICRON_F_BOOTABLE; + for (i = 0; i < vol->v_disks_count; i++) { + sd = &vol->v_subdisks[i]; + if (sd->sd_disk == NULL || sd->sd_disk->d_md_data == NULL) + meta->disks[i] = 0xffffffff; + else { + pd = (struct g_raid_md_jmicron_perdisk *) + sd->sd_disk->d_md_data; + meta->disks[i] = pd->pd_disk_id; + } + if (sd->sd_state < G_RAID_SUBDISK_S_STALE) + meta->flags |= JMICRON_F_BADSEC; + if (vol->v_dirty) + meta->flags |= JMICRON_F_UNSYNC; + } + + /* Put spares to their slots. */ + spares = 0; + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; + if (disk->d_state != G_RAID_DISK_S_SPARE) + continue; + meta->spare[spares] = pd->pd_disk_id; + if (++spares >= 2) + break; + } + + /* We are done. Print meta data and store them to disks. 
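+ * Each ACTIVE or SPARE disk gets a personalized copy:
+ * its own disk_id, the offset in 16-sector units and the
+ * sector count split back into 32-bit high and 16-bit low
+ * parts, reversing the math done at start time.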
*/ + if (mdi->mdio_meta != NULL) + free(mdi->mdio_meta, M_MD_JMICRON); + mdi->mdio_meta = meta; + i = 0; + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; + if (disk->d_state != G_RAID_DISK_S_ACTIVE && + disk->d_state != G_RAID_DISK_S_SPARE) + continue; + if (pd->pd_meta != NULL) { + free(pd->pd_meta, M_MD_JMICRON); + pd->pd_meta = NULL; + } + pd->pd_meta = jmicron_meta_copy(meta); + pd->pd_meta->disk_id = pd->pd_disk_id; + if ((sd = TAILQ_FIRST(&disk->d_subdisks)) != NULL) { + pd->pd_meta->offset = + (sd->sd_offset / 512) / 16; + pd->pd_meta->disk_sectors_high = + (sd->sd_size / 512) >> 16; + pd->pd_meta->disk_sectors_low = + (sd->sd_size / 512) & 0xffff; + if (sd->sd_state < G_RAID_SUBDISK_S_STALE) + pd->pd_meta->flags &= ~JMICRON_F_BADSEC; + else if (sd->sd_state < G_RAID_SUBDISK_S_ACTIVE) + pd->pd_meta->flags |= JMICRON_F_UNSYNC; + } + G_RAID_DEBUG(1, "Writing JMicron metadata to %s", + g_raid_get_diskname(disk)); + g_raid_md_jmicron_print(pd->pd_meta); + jmicron_meta_write(disk->d_consumer, pd->pd_meta); + } + return (0); +} + +static int +g_raid_md_fail_disk_jmicron(struct g_raid_md_object *md, + struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) +{ + struct g_raid_softc *sc; + struct g_raid_md_jmicron_object *mdi; + struct g_raid_md_jmicron_perdisk *pd; + struct g_raid_subdisk *sd; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_jmicron_object *)md; + pd = (struct g_raid_md_jmicron_perdisk *)tdisk->d_md_data; + + /* We can't fail disk that is not a part of array now. */ + if (pd->pd_disk_pos < 0) + return (-1); + + if (tdisk->d_consumer) + jmicron_meta_erase(tdisk->d_consumer); + + /* Change states. */ + g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); + TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_FAILED); + g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, + G_RAID_EVENT_SUBDISK); + } + + /* Write updated metadata to remaining disks. */ + g_raid_md_write_jmicron(md, NULL, NULL, tdisk); + + /* Check if anything left except placeholders. */ + if (g_raid_ndisks(sc, -1) == + g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) + g_raid_destroy_node(sc, 0); + else + g_raid_md_jmicron_refill(sc); + return (0); +} + +static int +g_raid_md_free_disk_jmicron(struct g_raid_md_object *md, + struct g_raid_disk *disk) +{ + struct g_raid_md_jmicron_perdisk *pd; + + pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; + if (pd->pd_meta != NULL) { + free(pd->pd_meta, M_MD_JMICRON); + pd->pd_meta = NULL; + } + free(pd, M_MD_JMICRON); + disk->d_md_data = NULL; + return (0); +} + +static int +g_raid_md_free_jmicron(struct g_raid_md_object *md) +{ + struct g_raid_md_jmicron_object *mdi; + + mdi = (struct g_raid_md_jmicron_object *)md; + if (!mdi->mdio_started) { + mdi->mdio_started = 0; + callout_stop(&mdi->mdio_start_co); + G_RAID_DEBUG1(1, md->mdo_softc, + "root_mount_rel %p", mdi->mdio_rootmount); + root_mount_rel(mdi->mdio_rootmount); + mdi->mdio_rootmount = NULL; + } + if (mdi->mdio_meta != NULL) { + free(mdi->mdio_meta, M_MD_JMICRON); + mdi->mdio_meta = NULL; + } + return (0); +} + +G_RAID_MD_DECLARE(g_raid_md_jmicron); diff --git a/sys/geom/raid/md_nvidia.c b/sys/geom/raid/md_nvidia.c new file mode 100644 index 0000000..dbaee0a --- /dev/null +++ b/sys/geom/raid/md_nvidia.c @@ -0,0 +1,1607 @@ +/*- + * Copyright (c) 2011 Alexander Motin + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "geom/raid/g_raid.h" +#include "g_raid_md_if.h" + +static MALLOC_DEFINE(M_MD_NVIDIA, "md_nvidia_data", "GEOM_RAID NVIDIA metadata"); + +struct nvidia_raid_conf { + uint8_t nvidia_id[8]; +#define NVIDIA_MAGIC "NVIDIA " + + uint32_t config_size; + uint32_t checksum; + uint16_t version; + uint8_t disk_number; + uint8_t dummy_0; + uint32_t total_sectors; + uint32_t sector_size; + uint8_t name[16]; + uint8_t revision[4]; + uint32_t disk_status; + + uint32_t magic_0; +#define NVIDIA_MAGIC0 0x00640044 + + uint64_t volume_id[2]; + uint8_t state; +#define NVIDIA_S_IDLE 0 +#define NVIDIA_S_INIT 2 +#define NVIDIA_S_REBUILD 3 +#define NVIDIA_S_UPGRADE 4 +#define NVIDIA_S_SYNC 5 + uint8_t array_width; + uint8_t total_disks; + uint8_t orig_array_width; + uint16_t type; +#define NVIDIA_T_RAID0 0x0080 +#define NVIDIA_T_RAID1 0x0081 +#define NVIDIA_T_RAID3 0x0083 +#define NVIDIA_T_RAID5 0x0085 /* RLQ = 00/02? */ +#define NVIDIA_T_RAID5_SYM 0x0095 /* RLQ = 03 */ +#define NVIDIA_T_RAID10 0x008a +#define NVIDIA_T_RAID01 0x8180 +#define NVIDIA_T_CONCAT 0x00ff + + uint16_t dummy_3; + uint32_t strip_sectors; + uint32_t strip_bytes; + uint32_t strip_shift; + uint32_t strip_mask; + uint32_t stripe_sectors; + uint32_t stripe_bytes; + uint32_t rebuild_lba; + uint32_t orig_type; + uint32_t orig_total_sectors; + uint32_t status; +#define NVIDIA_S_BOOTABLE 0x00000001 +#define NVIDIA_S_DEGRADED 0x00000002 + + uint32_t filler[98]; +} __packed; + +struct g_raid_md_nvidia_perdisk { + struct nvidia_raid_conf *pd_meta; + int pd_disk_pos; + off_t pd_disk_size; +}; + +struct g_raid_md_nvidia_object { + struct g_raid_md_object mdio_base; + uint64_t mdio_volume_id[2]; + struct nvidia_raid_conf *mdio_meta; + struct callout mdio_start_co; /* STARTING state timer. */ + int mdio_total_disks; + int mdio_disks_present; + int mdio_started; + int mdio_incomplete; + struct root_hold_token *mdio_rootmount; /* Root mount delay token. 
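+ * Same role as in the JMicron module: delays root file
+ * system mounting until the array starts. NVIDIA nodes are
+ * matched by the 128-bit mdio_volume_id above rather than
+ * by a computed configuration ID.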
*/ +}; + +static g_raid_md_create_t g_raid_md_create_nvidia; +static g_raid_md_taste_t g_raid_md_taste_nvidia; +static g_raid_md_event_t g_raid_md_event_nvidia; +static g_raid_md_ctl_t g_raid_md_ctl_nvidia; +static g_raid_md_write_t g_raid_md_write_nvidia; +static g_raid_md_fail_disk_t g_raid_md_fail_disk_nvidia; +static g_raid_md_free_disk_t g_raid_md_free_disk_nvidia; +static g_raid_md_free_t g_raid_md_free_nvidia; + +static kobj_method_t g_raid_md_nvidia_methods[] = { + KOBJMETHOD(g_raid_md_create, g_raid_md_create_nvidia), + KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_nvidia), + KOBJMETHOD(g_raid_md_event, g_raid_md_event_nvidia), + KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_nvidia), + KOBJMETHOD(g_raid_md_write, g_raid_md_write_nvidia), + KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_nvidia), + KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_nvidia), + KOBJMETHOD(g_raid_md_free, g_raid_md_free_nvidia), + { 0, 0 } +}; + +static struct g_raid_md_class g_raid_md_nvidia_class = { + "NVIDIA", + g_raid_md_nvidia_methods, + sizeof(struct g_raid_md_nvidia_object), + .mdc_priority = 100 +}; + +static int NVIDIANodeID = 1; + +static void +g_raid_md_nvidia_print(struct nvidia_raid_conf *meta) +{ + + if (g_raid_debug < 1) + return; + + printf("********* ATA NVIDIA RAID Metadata *********\n"); + printf("nvidia_id <%.8s>\n", meta->nvidia_id); + printf("config_size %u\n", meta->config_size); + printf("checksum 0x%08x\n", meta->checksum); + printf("version 0x%04x\n", meta->version); + printf("disk_number %d\n", meta->disk_number); + printf("dummy_0 0x%02x\n", meta->dummy_0); + printf("total_sectors %u\n", meta->total_sectors); + printf("sector_size %u\n", meta->sector_size); + printf("name <%.16s>\n", meta->name); + printf("revision 0x%02x%02x%02x%02x\n", + meta->revision[0], meta->revision[1], + meta->revision[2], meta->revision[3]); + printf("disk_status 0x%08x\n", meta->disk_status); + printf("magic_0 0x%08x\n", meta->magic_0); + printf("volume_id 0x%016jx%016jx\n", + meta->volume_id[1], meta->volume_id[0]); + printf("state 0x%02x\n", meta->state); + printf("array_width %u\n", meta->array_width); + printf("total_disks %u\n", meta->total_disks); + printf("orig_array_width %u\n", meta->orig_array_width); + printf("type 0x%04x\n", meta->type); + printf("dummy_3 0x%04x\n", meta->dummy_3); + printf("strip_sectors %u\n", meta->strip_sectors); + printf("strip_bytes %u\n", meta->strip_bytes); + printf("strip_shift %u\n", meta->strip_shift); + printf("strip_mask 0x%08x\n", meta->strip_mask); + printf("stripe_sectors %u\n", meta->stripe_sectors); + printf("stripe_bytes %u\n", meta->stripe_bytes); + printf("rebuild_lba %u\n", meta->rebuild_lba); + printf("orig_type 0x%04x\n", meta->orig_type); + printf("orig_total_sectors %u\n", meta->orig_total_sectors); + printf("status 0x%08x\n", meta->status); + printf("=================================================\n"); +} + +static struct nvidia_raid_conf * +nvidia_meta_copy(struct nvidia_raid_conf *meta) +{ + struct nvidia_raid_conf *nmeta; + + nmeta = malloc(sizeof(*meta), M_MD_NVIDIA, M_WAITOK); + memcpy(nmeta, meta, sizeof(*meta)); + return (nmeta); +} + +static int +nvidia_meta_translate_disk(struct nvidia_raid_conf *meta, int md_disk_pos) +{ + int disk_pos; + + if (md_disk_pos >= 0 && meta->type == NVIDIA_T_RAID01) { + disk_pos = (md_disk_pos / meta->array_width) + + (md_disk_pos % meta->array_width) * meta->array_width; + } else + disk_pos = md_disk_pos; + return (disk_pos); +} + +static void +nvidia_meta_get_name(struct nvidia_raid_conf *meta, char 
*buf) +{ + int i; + + strncpy(buf, meta->name, 16); + buf[16] = 0; + for (i = 15; i >= 0; i--) { + if (buf[i] > 0x20) + break; + buf[i] = 0; + } +} + +static void +nvidia_meta_put_name(struct nvidia_raid_conf *meta, char *buf) +{ + + memset(meta->name, 0x20, 16); + memcpy(meta->name, buf, MIN(strlen(buf), 16)); +} + +static struct nvidia_raid_conf * +nvidia_meta_read(struct g_consumer *cp) +{ + struct g_provider *pp; + struct nvidia_raid_conf *meta; + char *buf; + int error, i; + uint32_t checksum, *ptr; + + pp = cp->provider; + + /* Read the anchor sector. */ + buf = g_read_data(cp, + pp->mediasize - 2 * pp->sectorsize, pp->sectorsize, &error); + if (buf == NULL) { + G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", + pp->name, error); + return (NULL); + } + meta = malloc(sizeof(*meta), M_MD_NVIDIA, M_WAITOK); + memcpy(meta, buf, min(sizeof(*meta), pp->sectorsize)); + g_free(buf); + + /* Check if this is an NVIDIA RAID struct */ + if (strncmp(meta->nvidia_id, NVIDIA_MAGIC, strlen(NVIDIA_MAGIC))) { + G_RAID_DEBUG(1, "NVIDIA signature check failed on %s", pp->name); + free(meta, M_MD_NVIDIA); + return (NULL); + } + if (meta->config_size > 128 || + meta->config_size < 30) { + G_RAID_DEBUG(1, "NVIDIA metadata size looks wrong: %d", + meta->config_size); + free(meta, M_MD_NVIDIA); + return (NULL); + } + + /* Check metadata checksum. */ + for (checksum = 0, ptr = (uint32_t *)meta, + i = 0; i < meta->config_size; i++) + checksum += *ptr++; + if (checksum != 0) { + G_RAID_DEBUG(1, "NVIDIA checksum check failed on %s", pp->name); + free(meta, M_MD_NVIDIA); + return (NULL); + } + + /* Check volume state. */ + if (meta->state != NVIDIA_S_IDLE && meta->state != NVIDIA_S_INIT && + meta->state != NVIDIA_S_REBUILD && meta->state != NVIDIA_S_SYNC) { + G_RAID_DEBUG(1, "NVIDIA unknown state on %s (0x%02x)", + pp->name, meta->state); + free(meta, M_MD_NVIDIA); + return (NULL); + } + + /* Check raid type. */ + if (meta->type != NVIDIA_T_RAID0 && meta->type != NVIDIA_T_RAID1 && + meta->type != NVIDIA_T_RAID3 && meta->type != NVIDIA_T_RAID5 && + meta->type != NVIDIA_T_RAID5_SYM && + meta->type != NVIDIA_T_RAID01 && meta->type != NVIDIA_T_CONCAT) { + G_RAID_DEBUG(1, "NVIDIA unknown RAID level on %s (0x%02x)", + pp->name, meta->type); + free(meta, M_MD_NVIDIA); + return (NULL); + } + + return (meta); +} + +static int +nvidia_meta_write(struct g_consumer *cp, struct nvidia_raid_conf *meta) +{ + struct g_provider *pp; + char *buf; + int error, i; + uint32_t checksum, *ptr; + + pp = cp->provider; + + /* Recalculate checksum for case if metadata were changed. */ + meta->checksum = 0; + for (checksum = 0, ptr = (uint32_t *)meta, + i = 0; i < meta->config_size; i++) + checksum += *ptr++; + meta->checksum -= checksum; + + /* Create and fill buffer. */ + buf = malloc(pp->sectorsize, M_MD_NVIDIA, M_WAITOK | M_ZERO); + memcpy(buf, meta, sizeof(*meta)); + + /* Write metadata. 
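+ * Unlike the JMicron case, the NVIDIA anchor sector lives
+ * one sector deeper, at mediasize - 2 * sectorsize.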
*/ + error = g_write_data(cp, + pp->mediasize - 2 * pp->sectorsize, buf, pp->sectorsize); + if (error != 0) { + G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", + pp->name, error); + } + + free(buf, M_MD_NVIDIA); + return (error); +} + +static int +nvidia_meta_erase(struct g_consumer *cp) +{ + struct g_provider *pp; + char *buf; + int error; + + pp = cp->provider; + buf = malloc(pp->sectorsize, M_MD_NVIDIA, M_WAITOK | M_ZERO); + error = g_write_data(cp, + pp->mediasize - 2 * pp->sectorsize, buf, pp->sectorsize); + if (error != 0) { + G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", + pp->name, error); + } + free(buf, M_MD_NVIDIA); + return (error); +} + +static struct g_raid_disk * +g_raid_md_nvidia_get_disk(struct g_raid_softc *sc, int id) +{ + struct g_raid_disk *disk; + struct g_raid_md_nvidia_perdisk *pd; + + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; + if (pd->pd_disk_pos == id) + break; + } + return (disk); +} + +static int +g_raid_md_nvidia_supported(int level, int qual, int disks, int force) +{ + + switch (level) { + case G_RAID_VOLUME_RL_RAID0: + if (disks < 1) + return (0); + if (!force && (disks < 2 || disks > 6)) + return (0); + break; + case G_RAID_VOLUME_RL_RAID1: + if (disks < 1) + return (0); + if (!force && (disks != 2)) + return (0); + break; + case G_RAID_VOLUME_RL_RAID1E: + if (disks < 2) + return (0); + if (disks % 2 != 0) + return (0); + if (!force && (disks < 4)) + return (0); + break; + case G_RAID_VOLUME_RL_SINGLE: + if (disks != 1) + return (0); + break; + case G_RAID_VOLUME_RL_CONCAT: + if (disks < 2) + return (0); + break; + case G_RAID_VOLUME_RL_RAID5: + if (disks < 3) + return (0); + break; + default: + return (0); + } + if (qual != G_RAID_VOLUME_RLQ_NONE) + return (0); + return (1); +} + +static int +g_raid_md_nvidia_start_disk(struct g_raid_disk *disk) +{ + struct g_raid_softc *sc; + struct g_raid_subdisk *sd, *tmpsd; + struct g_raid_disk *olddisk, *tmpdisk; + struct g_raid_md_object *md; + struct g_raid_md_nvidia_object *mdi; + struct g_raid_md_nvidia_perdisk *pd, *oldpd; + struct nvidia_raid_conf *meta; + int disk_pos, resurrection = 0; + + sc = disk->d_softc; + md = sc->sc_md; + mdi = (struct g_raid_md_nvidia_object *)md; + meta = mdi->mdio_meta; + pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; + olddisk = NULL; + + /* Find disk position in metadata by it's serial. */ + if (pd->pd_meta != NULL) { + disk_pos = pd->pd_meta->disk_number; + if (disk_pos >= meta->total_disks || mdi->mdio_started) + disk_pos = -3; + } else + disk_pos = -3; + /* For RAID0+1 we need to translate order. */ + disk_pos = nvidia_meta_translate_disk(meta, disk_pos); + if (disk_pos < 0) { + G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk"); + /* If we are in the start process, that's all for now. */ + if (!mdi->mdio_started) + goto nofit; + /* + * If we have already started - try to get use of the disk. + * Try to replace OFFLINE disks first, then FAILED. + */ + TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) { + if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE && + tmpdisk->d_state != G_RAID_DISK_S_FAILED) + continue; + /* Make sure this disk is big enough. 
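+ * Same check as in the JMicron module, but two trailing
+ * sectors are reserved for the deeper-placed metadata:
+ * sd_offset + sd_size + 2 * 512 <= pd_disk_size.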
*/ + TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) { + if (sd->sd_offset + sd->sd_size + 2 * 512 > + pd->pd_disk_size) { + G_RAID_DEBUG1(1, sc, + "Disk too small (%ju < %ju)", + pd->pd_disk_size, + sd->sd_offset + sd->sd_size + 512); + break; + } + } + if (sd != NULL) + continue; + if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) { + olddisk = tmpdisk; + break; + } else if (olddisk == NULL) + olddisk = tmpdisk; + } + if (olddisk == NULL) { +nofit: + g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); + return (1); + } + oldpd = (struct g_raid_md_nvidia_perdisk *)olddisk->d_md_data; + disk_pos = oldpd->pd_disk_pos; + resurrection = 1; + } + + if (olddisk == NULL) { + /* Find placeholder by position. */ + olddisk = g_raid_md_nvidia_get_disk(sc, disk_pos); + if (olddisk == NULL) + panic("No disk at position %d!", disk_pos); + if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) { + G_RAID_DEBUG1(1, sc, "More then one disk for pos %d", + disk_pos); + g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); + return (0); + } + oldpd = (struct g_raid_md_nvidia_perdisk *)olddisk->d_md_data; + } + + /* Replace failed disk or placeholder with new disk. */ + TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) { + TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next); + TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); + sd->sd_disk = disk; + } + oldpd->pd_disk_pos = -2; + pd->pd_disk_pos = disk_pos; + + /* If it was placeholder -- destroy it. */ + if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) { + g_raid_destroy_disk(olddisk); + } else { + /* Otherwise, make it STALE_FAILED. */ + g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED); + } + + /* Welcome the new disk. */ + if (resurrection) + g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); + else// if (pd->pd_meta->disk_status == NVIDIA_S_CURRENT || + //pd->pd_meta->disk_status == NVIDIA_S_REBUILD) + g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); +// else +// g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); + TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { + + /* + * Different disks may have different sizes, + * in concat mode. Update from real disk size. + */ + if (meta->type == NVIDIA_T_CONCAT) + sd->sd_size = pd->pd_disk_size - 0x800 * 512; + + if (resurrection) { + /* New or ex-spare disk. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NEW); + } else if (meta->state == NVIDIA_S_REBUILD && + (pd->pd_meta->disk_status & 0x100)) { + /* Rebuilding disk. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_REBUILD); + sd->sd_rebuild_pos = (off_t)pd->pd_meta->rebuild_lba / + meta->array_width * pd->pd_meta->sector_size; + } else if (meta->state == NVIDIA_S_SYNC) { + /* Resyncing/dirty disk. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_RESYNC); + sd->sd_rebuild_pos = (off_t)pd->pd_meta->rebuild_lba / + meta->array_width * pd->pd_meta->sector_size; + } else { + /* Up to date disk. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_ACTIVE); + } + g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, + G_RAID_EVENT_SUBDISK); + } + + /* Update status of our need for spare. 
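+ * (The rebuild_lba stored above appears to be a volume-wide
+ * sector number, hence the division by array_width to get a
+ * per-disk byte position.)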
*/ + if (mdi->mdio_started) { + mdi->mdio_incomplete = + (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < + mdi->mdio_total_disks); + } + + return (resurrection); +} + +static void +g_disk_md_nvidia_retaste(void *arg, int pending) +{ + + G_RAID_DEBUG(1, "Array is not complete, trying to retaste."); + g_retaste(&g_raid_class); + free(arg, M_MD_NVIDIA); +} + +static void +g_raid_md_nvidia_refill(struct g_raid_softc *sc) +{ + struct g_raid_md_object *md; + struct g_raid_md_nvidia_object *mdi; + struct nvidia_raid_conf *meta; + struct g_raid_disk *disk; + struct task *task; + int update, na; + + md = sc->sc_md; + mdi = (struct g_raid_md_nvidia_object *)md; + meta = mdi->mdio_meta; + update = 0; + do { + /* Make sure we miss anything. */ + na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE); + if (na == mdi->mdio_total_disks) + break; + + G_RAID_DEBUG1(1, md->mdo_softc, + "Array is not complete (%d of %d), " + "trying to refill.", na, mdi->mdio_total_disks); + + /* Try to get use some of STALE disks. */ + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_state == G_RAID_DISK_S_STALE) { + update += g_raid_md_nvidia_start_disk(disk); + if (disk->d_state == G_RAID_DISK_S_ACTIVE) + break; + } + } + if (disk != NULL) + continue; + + /* Try to get use some of SPARE disks. */ + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_state == G_RAID_DISK_S_SPARE) { + update += g_raid_md_nvidia_start_disk(disk); + if (disk->d_state == G_RAID_DISK_S_ACTIVE) + break; + } + } + } while (disk != NULL); + + /* Write new metadata if we changed something. */ + if (update) { + g_raid_md_write_nvidia(md, NULL, NULL, NULL); + meta = mdi->mdio_meta; + } + + /* Update status of our need for spare. */ + mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < + mdi->mdio_total_disks); + + /* Request retaste hoping to find spare. */ + if (mdi->mdio_incomplete) { + task = malloc(sizeof(struct task), + M_MD_NVIDIA, M_WAITOK | M_ZERO); + TASK_INIT(task, 0, g_disk_md_nvidia_retaste, task); + taskqueue_enqueue(taskqueue_swi, task); + } +} + +static void +g_raid_md_nvidia_start(struct g_raid_softc *sc) +{ + struct g_raid_md_object *md; + struct g_raid_md_nvidia_object *mdi; + struct g_raid_md_nvidia_perdisk *pd; + struct nvidia_raid_conf *meta; + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; + off_t size; + int j, disk_pos; + char buf[17]; + + md = sc->sc_md; + mdi = (struct g_raid_md_nvidia_object *)md; + meta = mdi->mdio_meta; + + /* Create volumes and subdisks. 
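+ * NVIDIA metadata stores the total volume size, so the
+ * per-disk size is derived from it: divided by the disk
+ * count for RAID0, by half of it for RAID01 and by
+ * count - 1 for RAID5.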
*/ + nvidia_meta_get_name(meta, buf); + vol = g_raid_create_volume(sc, buf, -1); + vol->v_mediasize = (off_t)meta->total_sectors * 512; + vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; + if (meta->type == NVIDIA_T_RAID0) { + vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; + size = vol->v_mediasize / mdi->mdio_total_disks; + } else if (meta->type == NVIDIA_T_RAID1) { + vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; + size = vol->v_mediasize; + } else if (meta->type == NVIDIA_T_RAID01) { + vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; + size = vol->v_mediasize / (mdi->mdio_total_disks / 2); + } else if (meta->type == NVIDIA_T_CONCAT) { + if (mdi->mdio_total_disks == 1) + vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; + else + vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; + size = 0; + } else if (meta->type == NVIDIA_T_RAID5) { + vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; + size = vol->v_mediasize / (mdi->mdio_total_disks - 1); + } else if (meta->type == NVIDIA_T_RAID5_SYM) { + vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; +// vol->v_raid_level_qualifier = 0x03; + size = vol->v_mediasize / (mdi->mdio_total_disks - 1); + } else { + vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; + size = 0; + } + vol->v_strip_size = meta->strip_sectors * 512; //ZZZ + vol->v_disks_count = mdi->mdio_total_disks; + vol->v_sectorsize = 512; //ZZZ + for (j = 0; j < vol->v_disks_count; j++) { + sd = &vol->v_subdisks[j]; + sd->sd_offset = 0; + sd->sd_size = size; + } + g_raid_start_volume(vol); + + /* Create disk placeholders to store data for later writing. */ + for (disk_pos = 0; disk_pos < mdi->mdio_total_disks; disk_pos++) { + pd = malloc(sizeof(*pd), M_MD_NVIDIA, M_WAITOK | M_ZERO); + pd->pd_disk_pos = disk_pos; + disk = g_raid_create_disk(sc); + disk->d_md_data = (void *)pd; + disk->d_state = G_RAID_DISK_S_OFFLINE; + sd = &vol->v_subdisks[disk_pos]; + sd->sd_disk = disk; + TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); + } + + /* Make all disks found till the moment take their places. */ + do { + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_state == G_RAID_DISK_S_NONE) { + g_raid_md_nvidia_start_disk(disk); + break; + } + } + } while (disk != NULL); + + mdi->mdio_started = 1; + G_RAID_DEBUG1(0, sc, "Array started."); + g_raid_md_write_nvidia(md, NULL, NULL, NULL); + + /* Pickup any STALE/SPARE disks to refill array if needed. 
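+ * The refill loop retries start_disk() on STALE disks
+ * first, then on SPARE ones, until no more of them can be
+ * turned ACTIVE.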
*/ + g_raid_md_nvidia_refill(sc); + + g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); + + callout_stop(&mdi->mdio_start_co); + G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount); + root_mount_rel(mdi->mdio_rootmount); + mdi->mdio_rootmount = NULL; +} + +static void +g_raid_md_nvidia_new_disk(struct g_raid_disk *disk) +{ + struct g_raid_softc *sc; + struct g_raid_md_object *md; + struct g_raid_md_nvidia_object *mdi; + struct nvidia_raid_conf *pdmeta; + struct g_raid_md_nvidia_perdisk *pd; + + sc = disk->d_softc; + md = sc->sc_md; + mdi = (struct g_raid_md_nvidia_object *)md; + pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; + pdmeta = pd->pd_meta; + + if (mdi->mdio_started) { + if (g_raid_md_nvidia_start_disk(disk)) + g_raid_md_write_nvidia(md, NULL, NULL, NULL); + } else { + if (mdi->mdio_meta == NULL || + mdi->mdio_meta->disk_number >= mdi->mdio_meta->total_disks) { + G_RAID_DEBUG1(1, sc, "Newer disk"); + if (mdi->mdio_meta != NULL) + free(mdi->mdio_meta, M_MD_NVIDIA); + mdi->mdio_meta = nvidia_meta_copy(pdmeta); + mdi->mdio_total_disks = pdmeta->total_disks; + mdi->mdio_disks_present = 1; + } else if (pdmeta->disk_number < mdi->mdio_meta->total_disks) { + mdi->mdio_disks_present++; + G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)", + mdi->mdio_disks_present, + mdi->mdio_total_disks); + } else + G_RAID_DEBUG1(1, sc, "Spare disk"); + + /* If we collected all needed disks - start array. */ + if (mdi->mdio_disks_present == mdi->mdio_total_disks) + g_raid_md_nvidia_start(sc); + } +} + +static void +g_raid_nvidia_go(void *arg) +{ + struct g_raid_softc *sc; + struct g_raid_md_object *md; + struct g_raid_md_nvidia_object *mdi; + + sc = arg; + md = sc->sc_md; + mdi = (struct g_raid_md_nvidia_object *)md; + if (!mdi->mdio_started) { + G_RAID_DEBUG1(0, sc, "Force array start due to timeout."); + g_raid_event_send(sc, G_RAID_NODE_E_START, 0); + } +} + +static int +g_raid_md_create_nvidia(struct g_raid_md_object *md, struct g_class *mp, + struct g_geom **gp) +{ + struct g_raid_softc *sc; + struct g_raid_md_nvidia_object *mdi; + char name[32]; + + mdi = (struct g_raid_md_nvidia_object *)md; + arc4rand(&mdi->mdio_volume_id, 16, 0); + snprintf(name, sizeof(name), "NVIDIA-%d", + atomic_fetchadd_int(&NVIDIANodeID, 1)); + sc = g_raid_create_node(mp, name, md); + if (sc == NULL) + return (G_RAID_MD_TASTE_FAIL); + md->mdo_softc = sc; + *gp = sc->sc_geom; + return (G_RAID_MD_TASTE_NEW); +} + +static int +g_raid_md_taste_nvidia(struct g_raid_md_object *md, struct g_class *mp, + struct g_consumer *cp, struct g_geom **gp) +{ + struct g_consumer *rcp; + struct g_provider *pp; + struct g_raid_md_nvidia_object *mdi, *mdi1; + struct g_raid_softc *sc; + struct g_raid_disk *disk; + struct nvidia_raid_conf *meta; + struct g_raid_md_nvidia_perdisk *pd; + struct g_geom *geom; + int error, disk_pos, result, spare, len; + char name[32]; + uint16_t vendor; + + G_RAID_DEBUG(1, "Tasting NVIDIA on %s", cp->provider->name); + mdi = (struct g_raid_md_nvidia_object *)md; + pp = cp->provider; + + /* Read metadata from device. 
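+	 * The HBA vendor ID is queried as well: with aggressive spare mode
+	 * enabled, a disk without metadata is accepted as a spare only when
+	 * it sits on an NVIDIA controller (vendor 0x10de).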
*/ + meta = NULL; + spare = 0; + vendor = 0xffff; + disk_pos = 0; + if (g_access(cp, 1, 0, 0) != 0) + return (G_RAID_MD_TASTE_FAIL); + g_topology_unlock(); + len = 2; + if (pp->geom->rank == 1) + g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); + meta = nvidia_meta_read(cp); + g_topology_lock(); + g_access(cp, -1, 0, 0); + if (meta == NULL) { + if (g_raid_aggressive_spare) { + if (vendor == 0x10de) { + G_RAID_DEBUG(1, + "No NVIDIA metadata, forcing spare."); + spare = 2; + goto search; + } else { + G_RAID_DEBUG(1, + "NVIDIA vendor mismatch 0x%04x != 0x10de", + vendor); + } + } + return (G_RAID_MD_TASTE_FAIL); + } + + /* Check this disk position in obtained metadata. */ + disk_pos = meta->disk_number; + if (disk_pos == -1) { + G_RAID_DEBUG(1, "NVIDIA disk position not found"); + goto fail1; + } + + /* Metadata valid. Print it. */ + g_raid_md_nvidia_print(meta); + G_RAID_DEBUG(1, "NVIDIA disk position %d", disk_pos); + spare = 0;//(meta->type == NVIDIA_T_SPARE) ? 1 : 0; + +search: + /* Search for matching node. */ + sc = NULL; + mdi1 = NULL; + LIST_FOREACH(geom, &mp->geom, geom) { + sc = geom->softc; + if (sc == NULL) + continue; + if (sc->sc_stopping != 0) + continue; + if (sc->sc_md->mdo_class != md->mdo_class) + continue; + mdi1 = (struct g_raid_md_nvidia_object *)sc->sc_md; + if (spare) { + if (mdi1->mdio_incomplete) + break; + } else { + if (memcmp(&mdi1->mdio_volume_id, + &meta->volume_id, 16) == 0) + break; + } + } + + /* Found matching node. */ + if (geom != NULL) { + G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); + result = G_RAID_MD_TASTE_EXISTING; + + } else if (spare) { /* Not found needy node -- left for later. */ + G_RAID_DEBUG(1, "Spare is not needed at this time"); + goto fail1; + + } else { /* Not found matching node -- create one. */ + result = G_RAID_MD_TASTE_NEW; + memcpy(&mdi->mdio_volume_id, &meta->volume_id, 16); + snprintf(name, sizeof(name), "NVIDIA-%d", + atomic_fetchadd_int(&NVIDIANodeID, 1)); + sc = g_raid_create_node(mp, name, md); + md->mdo_softc = sc; + geom = sc->sc_geom; + callout_init(&mdi->mdio_start_co, 1); + callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, + g_raid_nvidia_go, sc); + mdi->mdio_rootmount = root_mount_hold("GRAID-NVIDIA"); + G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount); + } + + rcp = g_new_consumer(geom); + g_attach(rcp, pp); + if (g_access(rcp, 1, 1, 1) != 0) + ; //goto fail1; + + g_topology_unlock(); + sx_xlock(&sc->sc_lock); + + pd = malloc(sizeof(*pd), M_MD_NVIDIA, M_WAITOK | M_ZERO); + pd->pd_meta = meta; + if (spare == 2) { + pd->pd_disk_pos = -3; + } else { + pd->pd_disk_pos = -1; + } + pd->pd_disk_size = pp->mediasize; + disk = g_raid_create_disk(sc); + disk->d_md_data = (void *)pd; + disk->d_consumer = rcp; + rcp->private = disk; + + /* Read kernel dumping information. 
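+	 * The GEOM::kerneldump attribute reports whether kernel crash dumps
+	 * can be directed at this consumer.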
*/ + disk->d_kd.offset = 0; + disk->d_kd.length = OFF_MAX; + len = sizeof(disk->d_kd); + error = g_io_getattr("GEOM::kerneldump", rcp, &len, &disk->d_kd); + if (disk->d_kd.di.dumper == NULL) + G_RAID_DEBUG1(2, sc, "Dumping not supported by %s: %d.", + rcp->provider->name, error); + + g_raid_md_nvidia_new_disk(disk); + + sx_xunlock(&sc->sc_lock); + g_topology_lock(); + *gp = geom; + return (result); +fail1: + free(meta, M_MD_NVIDIA); + return (G_RAID_MD_TASTE_FAIL); +} + +static int +g_raid_md_event_nvidia(struct g_raid_md_object *md, + struct g_raid_disk *disk, u_int event) +{ + struct g_raid_softc *sc; + struct g_raid_subdisk *sd; + struct g_raid_md_nvidia_object *mdi; + struct g_raid_md_nvidia_perdisk *pd; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_nvidia_object *)md; + if (disk == NULL) { + switch (event) { + case G_RAID_NODE_E_START: + if (!mdi->mdio_started) { + /* Bump volume ID to drop missing disks. */ + arc4rand(&mdi->mdio_volume_id, 16, 0); + g_raid_md_nvidia_start(sc); + } + return (0); + } + return (-1); + } + pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; + switch (event) { + case G_RAID_DISK_E_DISCONNECTED: + /* If disk was assigned, just update statuses. */ + if (pd->pd_disk_pos >= 0) { + g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); + if (disk->d_consumer) { + g_raid_kill_consumer(sc, disk->d_consumer); + disk->d_consumer = NULL; + } + TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NONE); + g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, + G_RAID_EVENT_SUBDISK); + } + } else { + /* Otherwise -- delete. */ + g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); + g_raid_destroy_disk(disk); + } + + if (mdi->mdio_started) { + /* Bump volume ID to prevent disk resurrection. */ + if (pd->pd_disk_pos >= 0) + arc4rand(&mdi->mdio_volume_id, 16, 0); + + /* Write updated metadata to all disks. */ + g_raid_md_write_nvidia(md, NULL, NULL, NULL); + } + + /* Check if anything left except placeholders. 
*/ + if (g_raid_ndisks(sc, -1) == + g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) + g_raid_destroy_node(sc, 0); + else + g_raid_md_nvidia_refill(sc); + return (0); + } + return (-2); +} + +static int +g_raid_md_ctl_nvidia(struct g_raid_md_object *md, + struct gctl_req *req) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; + struct g_raid_md_nvidia_object *mdi; + struct g_raid_md_nvidia_perdisk *pd; + struct g_consumer *cp; + struct g_provider *pp; + char arg[16]; + const char *verb, *volname, *levelname, *diskname; + int *nargs, *force; + off_t size, sectorsize, strip; + intmax_t *sizearg, *striparg; + int numdisks, i, len, level, qual, update; + int error; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_nvidia_object *)md; + verb = gctl_get_param(req, "verb", NULL); + nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); + error = 0; + if (strcmp(verb, "label") == 0) { + + if (*nargs < 4) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + volname = gctl_get_asciiparam(req, "arg1"); + if (volname == NULL) { + gctl_error(req, "No volume name."); + return (-2); + } + levelname = gctl_get_asciiparam(req, "arg2"); + if (levelname == NULL) { + gctl_error(req, "No RAID level."); + return (-3); + } + if (g_raid_volume_str2level(levelname, &level, &qual)) { + gctl_error(req, "Unknown RAID level '%s'.", levelname); + return (-4); + } + numdisks = *nargs - 3; + force = gctl_get_paraml(req, "force", sizeof(*force)); + if (!g_raid_md_nvidia_supported(level, qual, numdisks, + force ? *force : 0)) { + gctl_error(req, "Unsupported RAID level " + "(0x%02x/0x%02x), or number of disks (%d).", + level, qual, numdisks); + return (-5); + } + + /* Search for disks, connect them and probe. */ + size = 0x7fffffffffffffffllu; + sectorsize = 0; + for (i = 0; i < numdisks; i++) { + snprintf(arg, sizeof(arg), "arg%d", i + 3); + diskname = gctl_get_asciiparam(req, arg); + if (diskname == NULL) { + gctl_error(req, "No disk name (%s).", arg); + error = -6; + break; + } + if (strcmp(diskname, "NONE") == 0) { + cp = NULL; + pp = NULL; + } else { + g_topology_lock(); + cp = g_raid_open_consumer(sc, diskname); + if (cp == NULL) { + gctl_error(req, "Can't open '%s'.", + diskname); + g_topology_unlock(); + error = -7; + break; + } + pp = cp->provider; + } + pd = malloc(sizeof(*pd), M_MD_NVIDIA, M_WAITOK | M_ZERO); + pd->pd_disk_pos = i; + disk = g_raid_create_disk(sc); + disk->d_md_data = (void *)pd; + disk->d_consumer = cp; + if (cp == NULL) + continue; + cp->private = disk; + g_topology_unlock(); + + /* Read kernel dumping information. */ + disk->d_kd.offset = 0; + disk->d_kd.length = OFF_MAX; + len = sizeof(disk->d_kd); + g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); + if (disk->d_kd.di.dumper == NULL) + G_RAID_DEBUG1(2, sc, + "Dumping not supported by %s.", + cp->provider->name); + + pd->pd_disk_size = pp->mediasize; + if (size > pp->mediasize) + size = pp->mediasize; + if (sectorsize < pp->sectorsize) + sectorsize = pp->sectorsize; + } + if (error != 0) + return (error); + + /* Reserve space for metadata. */ + size -= 2 * sectorsize; + + /* Handle size argument. */ + len = sizeof(*sizearg); + sizearg = gctl_get_param(req, "size", &len); + if (sizearg != NULL && len == sizeof(*sizearg) && + *sizearg > 0) { + if (*sizearg > size) { + gctl_error(req, "Size too big %lld > %lld.", + (long long)*sizearg, (long long)size); + return (-9); + } + size = *sizearg; + } + + /* Handle strip argument. 
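+		 * The default strip size is 131072 bytes (128KB); a
+		 * user-supplied value must be a positive multiple of the
+		 * sector size and no larger than 65535 sectors, presumably
+		 * because the on-disk strip_sectors field is 16 bits wide.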
 */
+		strip = 131072;
+		len = sizeof(*striparg);
+		striparg = gctl_get_param(req, "strip", &len);
+		if (striparg != NULL && len == sizeof(*striparg) &&
+		    *striparg > 0) {
+			if (*striparg < sectorsize) {
+				gctl_error(req, "Strip size too small.");
+				return (-10);
+			}
+			if (*striparg % sectorsize != 0) {
+				gctl_error(req, "Incorrect strip size.");
+				return (-11);
+			}
+			if (*striparg > 65535 * sectorsize) {
+				gctl_error(req, "Strip size too big.");
+				return (-12);
+			}
+			strip = *striparg;
+		}
+
+		/* Round size down to strip or sector. */
+		if (level == G_RAID_VOLUME_RL_RAID1)
+			size -= (size % sectorsize);
+		else if (level == G_RAID_VOLUME_RL_RAID1E &&
+		    (numdisks & 1) != 0)
+			size -= (size % (2 * strip));
+		else
+			size -= (size % strip);
+		if (size <= 0) {
+			gctl_error(req, "Size too small.");
+			return (-13);
+		}
+		if (size > 0xffffffffffffllu * sectorsize) {
+			gctl_error(req, "Size too big.");
+			return (-14);
+		}
+
+		/* We have all we need, create things: volume, ... */
+		mdi->mdio_total_disks = numdisks;
+		mdi->mdio_started = 1;
+		vol = g_raid_create_volume(sc, volname, -1);
+		vol->v_md_data = (void *)(intptr_t)0;
+		vol->v_raid_level = level;
+		vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE;
+		vol->v_strip_size = strip;
+		vol->v_disks_count = numdisks;
+		if (level == G_RAID_VOLUME_RL_RAID0 ||
+		    level == G_RAID_VOLUME_RL_CONCAT ||
+		    level == G_RAID_VOLUME_RL_SINGLE)
+			vol->v_mediasize = size * numdisks;
+		else if (level == G_RAID_VOLUME_RL_RAID1)
+			vol->v_mediasize = size;
+		else if (level == G_RAID_VOLUME_RL_RAID5)
+			vol->v_mediasize = size * (numdisks - 1);
+		else { /* RAID1E */
+			vol->v_mediasize = ((size * numdisks) / strip / 2) *
+			    strip;
+		}
+		vol->v_sectorsize = sectorsize;
+		g_raid_start_volume(vol);
+
+		/* , and subdisks. */
+		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
+			pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data;
+			sd = &vol->v_subdisks[pd->pd_disk_pos];
+			sd->sd_disk = disk;
+			sd->sd_offset = 0;
+			sd->sd_size = size;
+			TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
+			if (sd->sd_disk->d_consumer != NULL) {
+				g_raid_change_disk_state(disk,
+				    G_RAID_DISK_S_ACTIVE);
+				g_raid_change_subdisk_state(sd,
+				    G_RAID_SUBDISK_S_ACTIVE);
+				g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
+				    G_RAID_EVENT_SUBDISK);
+			} else {
+				g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE);
+			}
+		}
+
+		/* Write metadata based on created entities. */
+		G_RAID_DEBUG1(0, sc, "Array started.");
+		g_raid_md_write_nvidia(md, NULL, NULL, NULL);
+
+		/* Pickup any STALE/SPARE disks to refill array if needed. */
+		g_raid_md_nvidia_refill(sc);
+
+		g_raid_event_send(vol, G_RAID_VOLUME_E_START,
+		    G_RAID_EVENT_VOLUME);
+		return (0);
+	}
+	if (strcmp(verb, "delete") == 0) {
+
+		/* Check if some volume is still open. 
*/ + force = gctl_get_paraml(req, "force", sizeof(*force)); + if (force != NULL && *force == 0 && + g_raid_nopens(sc) != 0) { + gctl_error(req, "Some volume is still open."); + return (-4); + } + + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_consumer) + nvidia_meta_erase(disk->d_consumer); + } + g_raid_destroy_node(sc, 0); + return (0); + } + if (strcmp(verb, "remove") == 0 || + strcmp(verb, "fail") == 0) { + if (*nargs < 2) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + for (i = 1; i < *nargs; i++) { + snprintf(arg, sizeof(arg), "arg%d", i); + diskname = gctl_get_asciiparam(req, arg); + if (diskname == NULL) { + gctl_error(req, "No disk name (%s).", arg); + error = -2; + break; + } + if (strncmp(diskname, "/dev/", 5) == 0) + diskname += 5; + + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_consumer != NULL && + disk->d_consumer->provider != NULL && + strcmp(disk->d_consumer->provider->name, + diskname) == 0) + break; + } + if (disk == NULL) { + gctl_error(req, "Disk '%s' not found.", + diskname); + error = -3; + break; + } + + if (strcmp(verb, "fail") == 0) { + g_raid_md_fail_disk_nvidia(md, NULL, disk); + continue; + } + + pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; + + /* Erase metadata on deleting disk. */ + nvidia_meta_erase(disk->d_consumer); + + /* If disk was assigned, just update statuses. */ + if (pd->pd_disk_pos >= 0) { + g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); + if (disk->d_consumer) { + g_raid_kill_consumer(sc, disk->d_consumer); + disk->d_consumer = NULL; + } + TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NONE); + g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, + G_RAID_EVENT_SUBDISK); + } + } else { + /* Otherwise -- delete. */ + g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); + g_raid_destroy_disk(disk); + } + } + + /* Write updated metadata to remaining disks. */ + g_raid_md_write_nvidia(md, NULL, NULL, NULL); + + /* Check if anything left except placeholders. */ + if (g_raid_ndisks(sc, -1) == + g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) + g_raid_destroy_node(sc, 0); + else + g_raid_md_nvidia_refill(sc); + return (error); + } + if (strcmp(verb, "insert") == 0) { + if (*nargs < 2) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + update = 0; + for (i = 1; i < *nargs; i++) { + /* Get disk name. */ + snprintf(arg, sizeof(arg), "arg%d", i); + diskname = gctl_get_asciiparam(req, arg); + if (diskname == NULL) { + gctl_error(req, "No disk name (%s).", arg); + error = -3; + break; + } + + /* Try to find provider with specified name. */ + g_topology_lock(); + cp = g_raid_open_consumer(sc, diskname); + if (cp == NULL) { + gctl_error(req, "Can't open disk '%s'.", + diskname); + g_topology_unlock(); + error = -4; + break; + } + pp = cp->provider; + + pd = malloc(sizeof(*pd), M_MD_NVIDIA, M_WAITOK | M_ZERO); + pd->pd_disk_pos = -3; + pd->pd_disk_size = pp->mediasize; + + disk = g_raid_create_disk(sc); + disk->d_consumer = cp; + disk->d_consumer->private = disk; + disk->d_md_data = (void *)pd; + cp->private = disk; + g_topology_unlock(); + + /* Read kernel dumping information. */ + disk->d_kd.offset = 0; + disk->d_kd.length = OFF_MAX; + len = sizeof(disk->d_kd); + g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); + if (disk->d_kd.di.dumper == NULL) + G_RAID_DEBUG1(2, sc, + "Dumping not supported by %s.", + cp->provider->name); + + /* Welcome the "new" disk. 
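+			 * start_disk() reports whether the disk took a slot;
+			 * a disk that ends up neither ACTIVE nor SPARE does
+			 * not fit anywhere and is destroyed again.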
*/ + update += g_raid_md_nvidia_start_disk(disk); + if (disk->d_state != G_RAID_DISK_S_SPARE && + disk->d_state != G_RAID_DISK_S_ACTIVE) { + gctl_error(req, "Disk '%s' doesn't fit.", + diskname); + g_raid_destroy_disk(disk); + error = -8; + break; + } + } + + /* Write new metadata if we changed something. */ + if (update) + g_raid_md_write_nvidia(md, NULL, NULL, NULL); + return (error); + } + gctl_error(req, "Command '%s' is not supported.", verb); + return (-100); +} + +static int +g_raid_md_write_nvidia(struct g_raid_md_object *md, struct g_raid_volume *tvol, + struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; + struct g_raid_md_nvidia_object *mdi; + struct g_raid_md_nvidia_perdisk *pd; + struct nvidia_raid_conf *meta; + int i, spares; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_nvidia_object *)md; + + if (sc->sc_stopping == G_RAID_DESTROY_HARD) + return (0); + + /* There is only one volume. */ + vol = TAILQ_FIRST(&sc->sc_volumes); + + /* Fill global fields. */ + meta = malloc(sizeof(*meta), M_MD_NVIDIA, M_WAITOK | M_ZERO); + if (mdi->mdio_meta) + memcpy(meta, mdi->mdio_meta, sizeof(*meta)); + memcpy(meta->nvidia_id, NVIDIA_MAGIC, sizeof(NVIDIA_MAGIC)); + meta->config_size = 30; + meta->version = 0x0064; + meta->total_sectors = vol->v_mediasize / vol->v_sectorsize; + meta->sector_size = vol->v_sectorsize; + nvidia_meta_put_name(meta, vol->v_name); + meta->magic_0 = NVIDIA_MAGIC0; + memcpy(&meta->volume_id, &mdi->mdio_volume_id, 16); + meta->state = NVIDIA_S_IDLE; + if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) + meta->array_width = 1; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) + meta->array_width = vol->v_disks_count / 2; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) + meta->array_width = vol->v_disks_count - 1; + else + meta->array_width = vol->v_disks_count; + meta->total_disks = vol->v_disks_count; + meta->orig_array_width = meta->array_width; + if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) + meta->type = NVIDIA_T_RAID0; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) + meta->type = NVIDIA_T_RAID1; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) + meta->type = NVIDIA_T_RAID01; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT || + vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) + meta->type = NVIDIA_T_CONCAT; +// else if (vol->v_raid_level_qualifier == 0) +// meta->type = NVIDIA_T_RAID5; + else + meta->type = NVIDIA_T_RAID5_SYM; + meta->strip_sectors = vol->v_strip_size / vol->v_sectorsize; + meta->strip_bytes = vol->v_strip_size; + meta->strip_shift = ffs(meta->strip_sectors) - 1; + meta->strip_mask = meta->strip_sectors - 1; + meta->stripe_sectors = meta->strip_sectors * meta->orig_array_width; + meta->stripe_bytes = meta->stripe_sectors * vol->v_sectorsize; + meta->rebuild_lba = 0; + meta->orig_type = meta->type; + meta->orig_total_sectors = meta->total_sectors; + meta->status = 0; + + for (i = 0; i < vol->v_disks_count; i++) { + sd = &vol->v_subdisks[i]; + if ((sd->sd_state == G_RAID_SUBDISK_S_STALE || + sd->sd_state == G_RAID_SUBDISK_S_RESYNC || + vol->v_dirty) && + meta->state != NVIDIA_S_REBUILD) + meta->state = NVIDIA_S_SYNC; + else if (sd->sd_state == G_RAID_SUBDISK_S_NEW || + sd->sd_state == G_RAID_SUBDISK_S_REBUILD) + meta->state = NVIDIA_S_REBUILD; + } + + /* We are done. Print meta data and store them to disks. 
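+	 * Each ACTIVE or SPARE disk gets a private copy with its own
+	 * disk_number; for RAID0+1 the order is translated, and spares are
+	 * numbered after total_disks.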
*/ + if (mdi->mdio_meta != NULL) + free(mdi->mdio_meta, M_MD_NVIDIA); + mdi->mdio_meta = meta; + spares = 0; + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; + if (disk->d_state != G_RAID_DISK_S_ACTIVE && + disk->d_state != G_RAID_DISK_S_SPARE) + continue; + if (pd->pd_meta != NULL) { + free(pd->pd_meta, M_MD_NVIDIA); + pd->pd_meta = NULL; + } + pd->pd_meta = nvidia_meta_copy(meta); + if ((sd = TAILQ_FIRST(&disk->d_subdisks)) != NULL) { + /* For RAID0+1 we need to translate order. */ + pd->pd_meta->disk_number = + nvidia_meta_translate_disk(meta, sd->sd_pos); + if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE) { + pd->pd_meta->disk_status = 0x100; + pd->pd_meta->rebuild_lba = + sd->sd_rebuild_pos / vol->v_sectorsize * + meta->array_width; + } + } else + pd->pd_meta->disk_number = meta->total_disks + spares++; + G_RAID_DEBUG(1, "Writing NVIDIA metadata to %s", + g_raid_get_diskname(disk)); + g_raid_md_nvidia_print(pd->pd_meta); + nvidia_meta_write(disk->d_consumer, pd->pd_meta); + } + return (0); +} + +static int +g_raid_md_fail_disk_nvidia(struct g_raid_md_object *md, + struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) +{ + struct g_raid_softc *sc; + struct g_raid_md_nvidia_object *mdi; + struct g_raid_md_nvidia_perdisk *pd; + struct g_raid_subdisk *sd; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_nvidia_object *)md; + pd = (struct g_raid_md_nvidia_perdisk *)tdisk->d_md_data; + + /* We can't fail disk that is not a part of array now. */ + if (pd->pd_disk_pos < 0) + return (-1); + + /* Erase metadata to prevent disks's later resurrection. */ + if (tdisk->d_consumer) + nvidia_meta_erase(tdisk->d_consumer); + + /* Change states. */ + g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); + TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_FAILED); + g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, + G_RAID_EVENT_SUBDISK); + } + + /* Write updated metadata to remaining disks. */ + g_raid_md_write_nvidia(md, NULL, NULL, tdisk); + + /* Check if anything left except placeholders. */ + if (g_raid_ndisks(sc, -1) == + g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) + g_raid_destroy_node(sc, 0); + else + g_raid_md_nvidia_refill(sc); + return (0); +} + +static int +g_raid_md_free_disk_nvidia(struct g_raid_md_object *md, + struct g_raid_disk *disk) +{ + struct g_raid_md_nvidia_perdisk *pd; + + pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; + if (pd->pd_meta != NULL) { + free(pd->pd_meta, M_MD_NVIDIA); + pd->pd_meta = NULL; + } + free(pd, M_MD_NVIDIA); + disk->d_md_data = NULL; + return (0); +} + +static int +g_raid_md_free_nvidia(struct g_raid_md_object *md) +{ + struct g_raid_md_nvidia_object *mdi; + + mdi = (struct g_raid_md_nvidia_object *)md; + if (!mdi->mdio_started) { + mdi->mdio_started = 0; + callout_stop(&mdi->mdio_start_co); + G_RAID_DEBUG1(1, md->mdo_softc, + "root_mount_rel %p", mdi->mdio_rootmount); + root_mount_rel(mdi->mdio_rootmount); + mdi->mdio_rootmount = NULL; + } + if (mdi->mdio_meta != NULL) { + free(mdi->mdio_meta, M_MD_NVIDIA); + mdi->mdio_meta = NULL; + } + return (0); +} + +G_RAID_MD_DECLARE(g_raid_md_nvidia); diff --git a/sys/geom/raid/md_promise.c b/sys/geom/raid/md_promise.c new file mode 100644 index 0000000..b7bf070 --- /dev/null +++ b/sys/geom/raid/md_promise.c @@ -0,0 +1,1940 @@ +/*- + * Copyright (c) 2011 Alexander Motin + * All rights reserved. 
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include 
+__FBSDID("$FreeBSD$");
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "geom/raid/g_raid.h"
+#include "g_raid_md_if.h"
+
+static MALLOC_DEFINE(M_MD_PROMISE, "md_promise_data", "GEOM_RAID Promise metadata");
+
+#define	PROMISE_MAX_DISKS	8
+#define	PROMISE_MAX_SUBDISKS	2
+#define	PROMISE_META_OFFSET	14
+
+struct promise_raid_disk {
+	uint8_t		flags;			/* Subdisk status. */
+#define PROMISE_F_VALID		0x01
+#define PROMISE_F_ONLINE	0x02
+#define PROMISE_F_ASSIGNED	0x04
+#define PROMISE_F_SPARE		0x08
+#define PROMISE_F_DUPLICATE	0x10
+#define PROMISE_F_REDIR		0x20
+#define PROMISE_F_DOWN		0x40
+#define PROMISE_F_READY		0x80
+
+	uint8_t		number;			/* Position in a volume. */
+	uint8_t		channel;		/* ATA channel number. */
+	uint8_t		device;			/* ATA device number. */
+	uint64_t	id __packed;		/* Subdisk ID. */
+} __packed;
+
+struct promise_raid_conf {
+	char		promise_id[24];
+#define PROMISE_MAGIC		"Promise Technology, Inc."
+#define FREEBSD_MAGIC		"FreeBSD ATA driver RAID "
+
+	uint32_t	dummy_0;
+	uint64_t	magic_0;
+#define PROMISE_MAGIC0(x)	(((uint64_t)(x.channel) << 48) | \
+				((uint64_t)(x.device != 0) << 56))
+	uint16_t	magic_1;
+	uint32_t	magic_2;
+	uint8_t		filler1[470];
+
+	uint32_t	integrity;
+#define PROMISE_I_VALID		0x00000080
+
+	struct promise_raid_disk	disk;	/* This subdisk info. */
+	uint32_t	disk_offset;		/* Subdisk offset. */
+	uint32_t	disk_sectors;		/* Subdisk size. */
+	uint32_t	rebuild_lba;		/* Rebuild position. */
+	uint16_t	generation;		/* Generation number. */
+	uint8_t		status;			/* Volume status. */
+#define PROMISE_S_VALID		0x01
+#define PROMISE_S_ONLINE	0x02
+#define PROMISE_S_INITED	0x04
+#define PROMISE_S_READY		0x08
+#define PROMISE_S_DEGRADED	0x10
+#define PROMISE_S_MARKED	0x20
+#define PROMISE_S_MIGRATING	0x40
+#define PROMISE_S_FUNCTIONAL	0x80
+
+	uint8_t		type;			/* Volume type. */
+#define PROMISE_T_RAID0		0x00
+#define PROMISE_T_RAID1		0x01
+#define PROMISE_T_RAID3		0x02
+#define PROMISE_T_RAID5		0x04
+#define PROMISE_T_SPAN		0x08
+#define PROMISE_T_JBOD		0x10
+
+	uint8_t		total_disks;		/* Disks in this volume. */
+	uint8_t		stripe_shift;		/* Strip size. */
+	uint8_t		array_width;		/* Number of RAID0 stripes. 
*/ + uint8_t array_number; /* Global volume number. */ + uint32_t total_sectors; /* Volume size. */ + uint16_t cylinders; /* Volume geometry: C. */ + uint8_t heads; /* Volume geometry: H. */ + uint8_t sectors; /* Volume geometry: S. */ + uint64_t volume_id __packed; /* Volume ID, */ + struct promise_raid_disk disks[PROMISE_MAX_DISKS]; + /* Subdisks in this volume. */ + char name[32]; /* Volume label. */ + + uint32_t filler2[8]; + uint32_t magic_3; /* Something related to rebuild. */ + uint64_t rebuild_lba64; /* Per-volume rebuild position. */ + uint32_t magic_4; + uint32_t magic_5; + uint32_t filler3[325]; + uint32_t checksum; +} __packed; + +struct g_raid_md_promise_perdisk { + int pd_updated; + int pd_subdisks; + struct promise_raid_conf *pd_meta[PROMISE_MAX_SUBDISKS]; +}; + +struct g_raid_md_promise_pervolume { + struct promise_raid_conf *pv_meta; + uint64_t pv_id; + uint16_t pv_generation; + int pv_disks_present; + int pv_started; + struct callout pv_start_co; /* STARTING state timer. */ +}; + +static g_raid_md_create_t g_raid_md_create_promise; +static g_raid_md_taste_t g_raid_md_taste_promise; +static g_raid_md_event_t g_raid_md_event_promise; +static g_raid_md_volume_event_t g_raid_md_volume_event_promise; +static g_raid_md_ctl_t g_raid_md_ctl_promise; +static g_raid_md_write_t g_raid_md_write_promise; +static g_raid_md_fail_disk_t g_raid_md_fail_disk_promise; +static g_raid_md_free_disk_t g_raid_md_free_disk_promise; +static g_raid_md_free_volume_t g_raid_md_free_volume_promise; +static g_raid_md_free_t g_raid_md_free_promise; + +static kobj_method_t g_raid_md_promise_methods[] = { + KOBJMETHOD(g_raid_md_create, g_raid_md_create_promise), + KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_promise), + KOBJMETHOD(g_raid_md_event, g_raid_md_event_promise), + KOBJMETHOD(g_raid_md_volume_event, g_raid_md_volume_event_promise), + KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_promise), + KOBJMETHOD(g_raid_md_write, g_raid_md_write_promise), + KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_promise), + KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_promise), + KOBJMETHOD(g_raid_md_free_volume, g_raid_md_free_volume_promise), + KOBJMETHOD(g_raid_md_free, g_raid_md_free_promise), + { 0, 0 } +}; + +static struct g_raid_md_class g_raid_md_promise_class = { + "Promise", + g_raid_md_promise_methods, + sizeof(struct g_raid_md_object), + .mdc_priority = 100 +}; + + +static void +g_raid_md_promise_print(struct promise_raid_conf *meta) +{ + int i; + + if (g_raid_debug < 1) + return; + + printf("********* ATA Promise Metadata *********\n"); + printf("promise_id <%.24s>\n", meta->promise_id); + printf("disk %02x %02x %02x %02x %016jx\n", + meta->disk.flags, meta->disk.number, meta->disk.channel, + meta->disk.device, meta->disk.id); + printf("disk_offset %u\n", meta->disk_offset); + printf("disk_sectors %u\n", meta->disk_sectors); + printf("rebuild_lba %u\n", meta->rebuild_lba); + printf("generation %u\n", meta->generation); + printf("status 0x%02x\n", meta->status); + printf("type %u\n", meta->type); + printf("total_disks %u\n", meta->total_disks); + printf("stripe_shift %u\n", meta->stripe_shift); + printf("array_width %u\n", meta->array_width); + printf("array_number %u\n", meta->array_number); + printf("total_sectors %u\n", meta->total_sectors); + printf("cylinders %u\n", meta->cylinders); + printf("heads %u\n", meta->heads); + printf("sectors %u\n", meta->sectors); + printf("volume_id 0x%016jx\n", meta->volume_id); + printf("disks:\n"); + for (i = 0; i < PROMISE_MAX_DISKS; i++ ) { + printf(" %02x 
%02x %02x %02x %016jx\n",
+		    meta->disks[i].flags, meta->disks[i].number,
+		    meta->disks[i].channel, meta->disks[i].device,
+		    meta->disks[i].id);
+	}
+	printf("name <%.32s>\n", meta->name);
+	printf("magic_3 0x%08x\n", meta->magic_3);
+	printf("rebuild_lba64 %ju\n", meta->rebuild_lba64);
+	printf("magic_4 0x%08x\n", meta->magic_4);
+	printf("magic_5 0x%08x\n", meta->magic_5);
+	printf("=================================================\n");
+}
+
+static struct promise_raid_conf *
+promise_meta_copy(struct promise_raid_conf *meta)
+{
+	struct promise_raid_conf *nmeta;
+
+	nmeta = malloc(sizeof(*nmeta), M_MD_PROMISE, M_WAITOK);
+	memcpy(nmeta, meta, sizeof(*nmeta));
+	return (nmeta);
+}
+
+static int
+promise_meta_find_disk(struct promise_raid_conf *meta, uint64_t id)
+{
+	int pos;
+
+	for (pos = 0; pos < meta->total_disks; pos++) {
+		if (meta->disks[pos].id == id)
+			return (pos);
+	}
+	return (-1);
+}
+
+static int
+promise_meta_unused_range(struct promise_raid_conf **metaarr, int nsd,
+    uint32_t sectors, uint32_t *off, uint32_t *size)
+{
+	uint32_t coff, csize;
+	int i, j;
+
+	sectors -= 131072;
+	*off = 0;
+	*size = 0;
+	coff = 0;
+	csize = sectors;
+	i = 0;
+	while (1) {
+		for (j = 0; j < nsd; j++) {
+			if (metaarr[j]->disk_offset >= coff) {
+				csize = MIN(csize,
+				    metaarr[j]->disk_offset - coff);
+			}
+		}
+		if (csize > *size) {
+			*off = coff;
+			*size = csize;
+		}
+		if (i >= nsd)
+			break;
+		coff = metaarr[i]->disk_offset + metaarr[i]->disk_sectors;
+		csize = sectors - coff;
+		i++;
+	}
+	return ((*size > 0) ? 1 : 0);
+}
+
+static int
+promise_meta_translate_disk(struct g_raid_volume *vol, int md_disk_pos)
+{
+	int disk_pos, width;
+
+	if (md_disk_pos >= 0 && vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) {
+		width = vol->v_disks_count / 2;
+		disk_pos = (md_disk_pos / width) +
+		    (md_disk_pos % width) * width;
+	} else
+		disk_pos = md_disk_pos;
+	return (disk_pos);
+}
+
+static void
+promise_meta_get_name(struct promise_raid_conf *meta, char *buf)
+{
+	int i;
+
+	strncpy(buf, meta->name, 32);
+	buf[32] = 0;
+	for (i = 31; i >= 0; i--) {
+		if (buf[i] > 0x20)
+			break;
+		buf[i] = 0;
+	}
+}
+
+static void
+promise_meta_put_name(struct promise_raid_conf *meta, char *buf)
+{
+
+	memset(meta->name, 0x20, 32);
+	memcpy(meta->name, buf, MIN(strlen(buf), 32));
+}
+
+static int
+promise_meta_read(struct g_consumer *cp, struct promise_raid_conf **metaarr)
+{
+	struct g_provider *pp;
+	struct promise_raid_conf *meta;
+	char *buf;
+	int error, i, subdisks;
+	uint32_t checksum, *ptr;
+
+	pp = cp->provider;
+	subdisks = 0;
+next:
+	/* Read metadata block. */
+	buf = g_read_data(cp, pp->mediasize - pp->sectorsize *
+	    (63 - subdisks * PROMISE_META_OFFSET),
+	    pp->sectorsize * 4, &error);
+	if (buf == NULL) {
+		G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).",
+		    pp->name, error);
+		return (subdisks);
+	}
+	meta = (struct promise_raid_conf *)buf;
+
+	/* Check if this is a Promise RAID struct. */
+	if (strncmp(meta->promise_id, PROMISE_MAGIC, strlen(PROMISE_MAGIC)) &&
+	    strncmp(meta->promise_id, FREEBSD_MAGIC, strlen(FREEBSD_MAGIC))) {
+		if (subdisks == 0)
+			G_RAID_DEBUG(1,
+			    "Promise signature check failed on %s", pp->name);
+		g_free(buf);
+		return (subdisks);
+	}
+	meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK);
+	memcpy(meta, buf, MIN(sizeof(*meta), pp->sectorsize * 4));
+	g_free(buf);
+
+	/* Check metadata checksum. 
 */
+	for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++)
+		checksum += *ptr++;
+	if (checksum != meta->checksum) {
+		G_RAID_DEBUG(1, "Promise checksum check failed on %s", pp->name);
+		free(meta, M_MD_PROMISE);
+		return (subdisks);
+	}
+
+	if ((meta->integrity & PROMISE_I_VALID) == 0) {
+		G_RAID_DEBUG(1, "Promise metadata is invalid on %s", pp->name);
+		free(meta, M_MD_PROMISE);
+		return (subdisks);
+	}
+
+	if (meta->total_disks > PROMISE_MAX_DISKS) {
+		G_RAID_DEBUG(1, "Wrong number of disks on %s (%d)",
+		    pp->name, meta->total_disks);
+		free(meta, M_MD_PROMISE);
+		return (subdisks);
+	}
+
+	/* Save this part and look for next. */
+	*metaarr = meta;
+	metaarr++;
+	subdisks++;
+	if (subdisks < PROMISE_MAX_SUBDISKS)
+		goto next;
+
+	return (subdisks);
+}
+
+static int
+promise_meta_write(struct g_consumer *cp,
+    struct promise_raid_conf **metaarr, int nsd)
+{
+	struct g_provider *pp;
+	struct promise_raid_conf *meta;
+	char *buf;
+	int error, i, subdisk, fake;
+	uint32_t checksum, *ptr, off, size;
+
+	pp = cp->provider;
+	subdisk = 0;
+	fake = 0;
+next:
+	buf = malloc(pp->sectorsize * 4, M_MD_PROMISE, M_WAITOK | M_ZERO);
+	meta = NULL;
+	if (subdisk < nsd) {
+		meta = metaarr[subdisk];
+	} else if (!fake && promise_meta_unused_range(metaarr, nsd,
+	    cp->provider->mediasize / cp->provider->sectorsize,
+	    &off, &size)) {
+		/* Optionally add record for unused space. */
+		meta = (struct promise_raid_conf *)buf;
+		memcpy(&meta->promise_id[0], PROMISE_MAGIC, sizeof(PROMISE_MAGIC));
+		meta->dummy_0 = 0x00020000;
+		meta->integrity = PROMISE_I_VALID;
+		meta->disk.flags = PROMISE_F_ONLINE | PROMISE_F_VALID;
+		meta->disk.number = 0xff;
+		arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0);
+		meta->disk_offset = off;
+		meta->disk_sectors = size;
+		meta->rebuild_lba = UINT32_MAX;
+		fake = 1;
+	}
+	if (meta != NULL) {
+		/* Recalculate checksum in case the metadata has changed. 
*/ + meta->checksum = 0; + for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++) + checksum += *ptr++; + meta->checksum = checksum; + memcpy(buf, meta, MIN(pp->sectorsize * 4, sizeof(*meta))); + } + error = g_write_data(cp, pp->mediasize - pp->sectorsize * + (63 - subdisk * PROMISE_META_OFFSET), + buf, pp->sectorsize * 4); + if (error != 0) { + G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", + pp->name, error); + } + free(buf, M_MD_PROMISE); + + subdisk++; + if (subdisk < PROMISE_MAX_SUBDISKS) + goto next; + + return (error); +} + +static int +promise_meta_erase(struct g_consumer *cp) +{ + struct g_provider *pp; + char *buf; + int error, subdisk; + + pp = cp->provider; + buf = malloc(4 * pp->sectorsize, M_MD_PROMISE, M_WAITOK | M_ZERO); + for (subdisk = 0; subdisk < PROMISE_MAX_SUBDISKS; subdisk++) { + error = g_write_data(cp, pp->mediasize - pp->sectorsize * + (63 - subdisk * PROMISE_META_OFFSET), + buf, 4 * pp->sectorsize); + if (error != 0) { + G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", + pp->name, error); + } + } + free(buf, M_MD_PROMISE); + return (error); +} + +static int +promise_meta_write_spare(struct g_consumer *cp) +{ + struct promise_raid_conf *meta; + int error; + + meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO); + memcpy(&meta->promise_id[0], PROMISE_MAGIC, sizeof(PROMISE_MAGIC)); + meta->dummy_0 = 0x00020000; + meta->integrity = PROMISE_I_VALID; + meta->disk.flags = PROMISE_F_SPARE | PROMISE_F_ONLINE | PROMISE_F_VALID; + meta->disk.number = 0xff; + arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0); + meta->disk_sectors = cp->provider->mediasize / cp->provider->sectorsize; + meta->disk_sectors -= 131072; + meta->rebuild_lba = UINT32_MAX; + error = promise_meta_write(cp, &meta, 1); + free(meta, M_MD_PROMISE); + return (error); +} + +static struct g_raid_volume * +g_raid_md_promise_get_volume(struct g_raid_softc *sc, uint64_t id) +{ + struct g_raid_volume *vol; + struct g_raid_md_promise_pervolume *pv; + + TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { + pv = vol->v_md_data; + if (pv->pv_id == id) + break; + } + return (vol); +} + +static int +g_raid_md_promise_purge_volumes(struct g_raid_softc *sc) +{ + struct g_raid_volume *vol, *tvol; + struct g_raid_md_promise_pervolume *pv; + int i, res; + + res = 0; + TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tvol) { + pv = vol->v_md_data; + if (!pv->pv_started || vol->v_stopping) + continue; + for (i = 0; i < vol->v_disks_count; i++) { + if (vol->v_subdisks[i].sd_state != G_RAID_SUBDISK_S_NONE) + break; + } + if (i >= vol->v_disks_count) { + g_raid_destroy_volume(vol); + res = 1; + } + } + return (res); +} + +static int +g_raid_md_promise_purge_disks(struct g_raid_softc *sc) +{ + struct g_raid_disk *disk, *tdisk; + struct g_raid_volume *vol; + struct g_raid_md_promise_perdisk *pd; + int i, j, res; + + res = 0; + TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) { + if (disk->d_state == G_RAID_DISK_S_SPARE) + continue; + pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; + + /* Scan for deleted volumes. */ + for (i = 0; i < pd->pd_subdisks; ) { + vol = g_raid_md_promise_get_volume(sc, + pd->pd_meta[i]->volume_id); + if (vol != NULL && !vol->v_stopping) { + i++; + continue; + } + free(pd->pd_meta[i], M_MD_PROMISE); + for (j = i; j < pd->pd_subdisks - 1; j++) + pd->pd_meta[j] = pd->pd_meta[j + 1]; + pd->pd_meta[PROMISE_MAX_SUBDISKS - 1] = NULL; + pd->pd_subdisks--; + pd->pd_updated = 1; + } + + /* If there is no metadata left - erase and delete disk. 
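+		 * A disk whose subdisk records all referenced deleted
+		 * volumes carries no useful state any more, so its metadata
+		 * area is erased and the disk object is destroyed.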
 */
+		if (pd->pd_subdisks == 0) {
+			promise_meta_erase(disk->d_consumer);
+			g_raid_destroy_disk(disk);
+			res = 1;
+		}
+	}
+	return (res);
+}
+
+static int
+g_raid_md_promise_supported(int level, int qual, int disks, int force)
+{
+
+	if (disks > PROMISE_MAX_DISKS)
+		return (0);
+	switch (level) {
+	case G_RAID_VOLUME_RL_RAID0:
+		if (disks < 1)
+			return (0);
+		if (!force && disks < 2)
+			return (0);
+		break;
+	case G_RAID_VOLUME_RL_RAID1:
+		if (disks < 1)
+			return (0);
+		if (!force && (disks != 2))
+			return (0);
+		break;
+	case G_RAID_VOLUME_RL_RAID1E:
+		if (disks < 2)
+			return (0);
+		if (disks % 2 != 0)
+			return (0);
+		if (!force && (disks != 4))
+			return (0);
+		break;
+	case G_RAID_VOLUME_RL_SINGLE:
+		if (disks != 1)
+			return (0);
+		break;
+	case G_RAID_VOLUME_RL_CONCAT:
+		if (disks < 2)
+			return (0);
+		break;
+	case G_RAID_VOLUME_RL_RAID5:
+		if (disks < 3)
+			return (0);
+		break;
+	default:
+		return (0);
+	}
+	if (qual != G_RAID_VOLUME_RLQ_NONE)
+		return (0);
+	return (1);
+}
+
+static int
+g_raid_md_promise_start_disk(struct g_raid_disk *disk, int sdn,
+    struct g_raid_volume *vol)
+{
+	struct g_raid_softc *sc;
+	struct g_raid_subdisk *sd;
+	struct g_raid_md_promise_perdisk *pd;
+	struct g_raid_md_promise_pervolume *pv;
+	struct promise_raid_conf *meta;
+	off_t size;
+	int disk_pos, md_disk_pos, i, resurrection = 0;
+	uint32_t eoff, esize;
+
+	sc = disk->d_softc;
+	pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
+
+	pv = vol->v_md_data;
+	meta = pv->pv_meta;
+
+	if (sdn >= 0) {
+		/* Find disk position in metadata by its serial. */
+		md_disk_pos = promise_meta_find_disk(meta, pd->pd_meta[sdn]->disk.id);
+		/* For RAID0+1 we need to translate order. */
+		disk_pos = promise_meta_translate_disk(vol, md_disk_pos);
+	} else {
+		md_disk_pos = -1;
+		disk_pos = -1;
+	}
+	if (disk_pos < 0) {
+		G_RAID_DEBUG1(1, sc, "Disk %s is not part of the volume %s",
+		    g_raid_get_diskname(disk), vol->v_name);
+		/* Failed stale disk is useless for us. */
+		if (sdn >= 0 &&
+		    pd->pd_meta[sdn]->disk.flags & PROMISE_F_DOWN) {
+			g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED);
+			return (0);
+		}
+		/* If we were given specific metadata subdisk - erase it. */
+		if (sdn >= 0) {
+			free(pd->pd_meta[sdn], M_MD_PROMISE);
+			for (i = sdn; i < pd->pd_subdisks - 1; i++)
+				pd->pd_meta[i] = pd->pd_meta[i + 1];
+			pd->pd_meta[PROMISE_MAX_SUBDISKS - 1] = NULL;
+			pd->pd_subdisks--;
+		}
+		/* If we are in the start process, that's all for now. */
+		if (!pv->pv_started)
+			goto nofit;
+		/*
+		 * If we have already started - try to make use of the disk.
+		 * Try to replace OFFLINE disks first, then FAILED. 
+ */ + promise_meta_unused_range(pd->pd_meta, pd->pd_subdisks, + disk->d_consumer->provider->mediasize / + disk->d_consumer->provider->sectorsize, + &eoff, &esize); + if (esize == 0) { + G_RAID_DEBUG1(1, sc, "No free space on disk %s", + g_raid_get_diskname(disk)); + goto nofit; + } + size = INT64_MAX; + for (i = 0; i < vol->v_disks_count; i++) { + sd = &vol->v_subdisks[i]; + if (sd->sd_state != G_RAID_SUBDISK_S_NONE) + size = sd->sd_size; + if (sd->sd_state <= G_RAID_SUBDISK_S_FAILED && + (disk_pos < 0 || + vol->v_subdisks[i].sd_state < sd->sd_state)) + disk_pos = i; + } + if (disk_pos >= 0 && + vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT && + (off_t)esize * 512 < size) { + G_RAID_DEBUG1(1, sc, "Disk %s free space " + "is too small (%ju < %ju)", + g_raid_get_diskname(disk), + (off_t)esize * 512, size); + disk_pos = -1; + } + if (disk_pos >= 0) { + if (vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT) + esize = size / 512; + /* For RAID0+1 we need to translate order. */ + md_disk_pos = promise_meta_translate_disk(vol, disk_pos); + } else { +nofit: + if (pd->pd_subdisks == 0) { + g_raid_change_disk_state(disk, + G_RAID_DISK_S_SPARE); + } + return (0); + } + G_RAID_DEBUG1(1, sc, "Disk %s takes pos %d in the volume %s", + g_raid_get_diskname(disk), disk_pos, vol->v_name); + resurrection = 1; + } + + sd = &vol->v_subdisks[disk_pos]; + + if (resurrection && sd->sd_disk != NULL) { + g_raid_change_disk_state(sd->sd_disk, + G_RAID_DISK_S_STALE_FAILED); + TAILQ_REMOVE(&sd->sd_disk->d_subdisks, + sd, sd_next); + } + vol->v_subdisks[disk_pos].sd_disk = disk; + TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); + + /* Welcome the new disk. */ + if (resurrection) + g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); + else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN) + g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); + else + g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); + + if (resurrection) { + sd->sd_offset = (off_t)eoff * 512; + sd->sd_size = (off_t)esize * 512; + } else { + sd->sd_offset = (off_t)pd->pd_meta[sdn]->disk_offset * 512; + sd->sd_size = (off_t)pd->pd_meta[sdn]->disk_sectors * 512; + } + + if (resurrection) { + /* Stale disk, almost same as new. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NEW); + } else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN) { + /* Failed disk. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_FAILED); + } else if (meta->disks[md_disk_pos].flags & PROMISE_F_REDIR) { + /* Rebuilding disk. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_REBUILD); + if (pd->pd_meta[sdn]->generation != meta->generation) + sd->sd_rebuild_pos = 0; + else { + sd->sd_rebuild_pos = + (off_t)pd->pd_meta[sdn]->rebuild_lba * 512; + } + } else if (!(meta->disks[md_disk_pos].flags & PROMISE_F_ONLINE)) { + /* Rebuilding disk. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NEW); + } else if (pd->pd_meta[sdn]->generation != meta->generation || + (meta->status & PROMISE_S_MARKED)) { + /* Stale disk or dirty volume (unclean shutdown). */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_STALE); + } else { + /* Up to date disk. 
*/ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_ACTIVE); + } + g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, + G_RAID_EVENT_SUBDISK); + + return (resurrection); +} + +static void +g_raid_md_promise_refill(struct g_raid_softc *sc) +{ + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; + struct g_raid_md_object *md; + struct g_raid_md_promise_perdisk *pd; + struct g_raid_md_promise_pervolume *pv; + int update, updated, i, bad; + + md = sc->sc_md; +restart: + updated = 0; + TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { + pv = vol->v_md_data; + if (!pv->pv_started || vol->v_stopping) + continue; + + /* Search for subdisk that needs replacement. */ + bad = 0; + for (i = 0; i < vol->v_disks_count; i++) { + sd = &vol->v_subdisks[i]; + if (sd->sd_state == G_RAID_SUBDISK_S_NONE || + sd->sd_state == G_RAID_SUBDISK_S_FAILED) + bad = 1; + } + if (!bad) + continue; + + G_RAID_DEBUG1(1, sc, "Volume %s is not complete, " + "trying to refill.", vol->v_name); + + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + /* Skip failed. */ + if (disk->d_state < G_RAID_DISK_S_SPARE) + continue; + /* Skip already used by this volume. */ + for (i = 0; i < vol->v_disks_count; i++) { + sd = &vol->v_subdisks[i]; + if (sd->sd_disk == disk) + break; + } + if (i < vol->v_disks_count) + continue; + + /* Try to use disk if it has empty extents. */ + pd = disk->d_md_data; + if (pd->pd_subdisks < PROMISE_MAX_SUBDISKS) { + update = + g_raid_md_promise_start_disk(disk, -1, vol); + } else + update = 0; + if (update) { + g_raid_md_write_promise(md, vol, NULL, disk); + break; + } + updated += update; + } + } + if (updated) + goto restart; +} + +static void +g_raid_md_promise_start(struct g_raid_volume *vol) +{ + struct g_raid_softc *sc; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; + struct g_raid_md_object *md; + struct g_raid_md_promise_perdisk *pd; + struct g_raid_md_promise_pervolume *pv; + struct promise_raid_conf *meta; + int i; + + sc = vol->v_softc; + md = sc->sc_md; + pv = vol->v_md_data; + meta = pv->pv_meta; + + if (meta->type == PROMISE_T_RAID0) + vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; + else if (meta->type == PROMISE_T_RAID1) { + if (meta->array_width == 1) + vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; + else + vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; + } else if (meta->type == PROMISE_T_RAID3) + vol->v_raid_level = G_RAID_VOLUME_RL_RAID3; + else if (meta->type == PROMISE_T_RAID5) + vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; + else if (meta->type == PROMISE_T_SPAN) + vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; + else if (meta->type == PROMISE_T_JBOD) + vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; + else + vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; + vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; + vol->v_strip_size = 512 << meta->stripe_shift; //ZZZ + vol->v_disks_count = meta->total_disks; + vol->v_mediasize = (off_t)meta->total_sectors * 512; //ZZZ + vol->v_sectorsize = 512; //ZZZ + for (i = 0; i < vol->v_disks_count; i++) { + sd = &vol->v_subdisks[i]; + sd->sd_offset = (off_t)meta->disk_offset * 512; //ZZZ + sd->sd_size = (off_t)meta->disk_sectors * 512; //ZZZ + } + g_raid_start_volume(vol); + + /* Make all disks found till the moment take their places. 
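+	 * A disk may carry up to PROMISE_MAX_SUBDISKS metadata records; only
+	 * the records whose volume_id matches this volume are started here.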
*/ + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + pd = disk->d_md_data; + for (i = 0; i < pd->pd_subdisks; i++) { + if (pd->pd_meta[i]->volume_id == meta->volume_id) + g_raid_md_promise_start_disk(disk, i, vol); + } + } + + pv->pv_started = 1; + callout_stop(&pv->pv_start_co); + G_RAID_DEBUG1(0, sc, "Volume started."); + g_raid_md_write_promise(md, vol, NULL, NULL); + + /* Pickup any STALE/SPARE disks to refill array if needed. */ + g_raid_md_promise_refill(sc); + + g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); +} + +static void +g_raid_promise_go(void *arg) +{ + struct g_raid_volume *vol; + struct g_raid_softc *sc; + struct g_raid_md_promise_pervolume *pv; + + vol = arg; + pv = vol->v_md_data; + sc = vol->v_softc; + if (!pv->pv_started) { + G_RAID_DEBUG1(0, sc, "Force volume start due to timeout."); + g_raid_event_send(vol, G_RAID_VOLUME_E_STARTMD, + G_RAID_EVENT_VOLUME); + } +} + +static void +g_raid_md_promise_new_disk(struct g_raid_disk *disk) +{ + struct g_raid_softc *sc; + struct g_raid_md_object *md; + struct promise_raid_conf *pdmeta; + struct g_raid_md_promise_perdisk *pd; + struct g_raid_md_promise_pervolume *pv; + struct g_raid_volume *vol; + int i; + char buf[33]; + + sc = disk->d_softc; + md = sc->sc_md; + pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; + + if (pd->pd_subdisks == 0) { + g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); + g_raid_md_promise_refill(sc); + return; + } + + for (i = 0; i < pd->pd_subdisks; i++) { + pdmeta = pd->pd_meta[i]; + + /* Look for volume with matching ID. */ + vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id); + if (vol == NULL) { + promise_meta_get_name(pdmeta, buf); + vol = g_raid_create_volume(sc, buf, pdmeta->array_number); + pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO); + pv->pv_id = pdmeta->volume_id; + vol->v_md_data = pv; + callout_init(&pv->pv_start_co, 1); + callout_reset(&pv->pv_start_co, + g_raid_start_timeout * hz, + g_raid_promise_go, vol); + } else + pv = vol->v_md_data; + + /* If we haven't started yet - check metadata freshness. */ + if (pv->pv_meta == NULL || !pv->pv_started) { + if (pv->pv_meta == NULL || + ((int16_t)(pdmeta->generation - pv->pv_generation)) > 0) { + G_RAID_DEBUG1(1, sc, "Newer disk"); + if (pv->pv_meta != NULL) + free(pv->pv_meta, M_MD_PROMISE); + pv->pv_meta = promise_meta_copy(pdmeta); + pv->pv_generation = pv->pv_meta->generation; + pv->pv_disks_present = 1; + } else if (pdmeta->generation == pv->pv_generation) { + pv->pv_disks_present++; + G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)", + pv->pv_disks_present, + pv->pv_meta->total_disks); + } else { + G_RAID_DEBUG1(1, sc, "Older disk"); + } + } + } + + for (i = 0; i < pd->pd_subdisks; i++) { + pdmeta = pd->pd_meta[i]; + + /* Look for volume with matching ID. */ + vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id); + if (vol == NULL) + continue; + pv = vol->v_md_data; + + if (pv->pv_started) { + if (g_raid_md_promise_start_disk(disk, i, vol)) + g_raid_md_write_promise(md, vol, NULL, NULL); + } else { + /* If we collected all needed disks - start array. */ + if (pv->pv_disks_present == pv->pv_meta->total_disks) + g_raid_md_promise_start(vol); + } + } +} + +static int +g_raid_md_create_promise(struct g_raid_md_object *md, struct g_class *mp, + struct g_geom **gp) +{ + struct g_geom *geom; + struct g_raid_softc *sc; + + /* Search for existing node. 
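+	 * The Promise class keeps a single GEOM node for all arrays, unlike
+	 * the per-array nodes of the other formats, so an existing node is
+	 * reused whenever possible.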
 */
+	LIST_FOREACH(geom, &mp->geom, geom) {
+		sc = geom->softc;
+		if (sc == NULL)
+			continue;
+		if (sc->sc_stopping != 0)
+			continue;
+		if (sc->sc_md->mdo_class != md->mdo_class)
+			continue;
+		break;
+	}
+	if (geom != NULL) {
+		*gp = geom;
+		return (G_RAID_MD_TASTE_EXISTING);
+	}
+
+	/* Create new one if not found. */
+	sc = g_raid_create_node(mp, "Promise", md);
+	if (sc == NULL)
+		return (G_RAID_MD_TASTE_FAIL);
+	md->mdo_softc = sc;
+	*gp = sc->sc_geom;
+	return (G_RAID_MD_TASTE_NEW);
+}
+
+static int
+g_raid_md_taste_promise(struct g_raid_md_object *md, struct g_class *mp,
+    struct g_consumer *cp, struct g_geom **gp)
+{
+	struct g_consumer *rcp;
+	struct g_provider *pp;
+	struct g_raid_softc *sc;
+	struct g_raid_disk *disk;
+	struct promise_raid_conf *meta, *metaarr[4];
+	struct g_raid_md_promise_perdisk *pd;
+	struct g_geom *geom;
+	int error, i, j, result, len, subdisks;
+	char name[16];
+	uint16_t vendor;
+
+	G_RAID_DEBUG(1, "Tasting Promise on %s", cp->provider->name);
+	pp = cp->provider;
+
+	/* Read metadata from device. */
+	meta = NULL;
+	vendor = 0xffff;
+	if (g_access(cp, 1, 0, 0) != 0)
+		return (G_RAID_MD_TASTE_FAIL);
+	g_topology_unlock();
+	len = 2;
+	if (pp->geom->rank == 1)
+		g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor);
+	subdisks = promise_meta_read(cp, metaarr);
+	g_topology_lock();
+	g_access(cp, -1, 0, 0);
+	if (subdisks == 0) {
+		if (g_raid_aggressive_spare) {
+			if (vendor == 0x105a || vendor == 0x1002) {
+				G_RAID_DEBUG(1,
+				    "No Promise metadata, forcing spare.");
+				goto search;
+			} else {
+				G_RAID_DEBUG(1,
+				    "Promise/ATI vendor mismatch "
+				    "0x%04x != 0x105a/0x1002",
+				    vendor);
+			}
+		}
+		return (G_RAID_MD_TASTE_FAIL);
+	}
+
+	/* Metadata valid. Print it. */
+	for (i = 0; i < subdisks; i++)
+		g_raid_md_promise_print(metaarr[i]);
+
+	/* Purge meaningless (empty/spare) records. */
+	for (i = 0; i < subdisks; ) {
+		if (metaarr[i]->disk.flags & PROMISE_F_ASSIGNED) {
+			i++;
+			continue;
+		}
+		free(metaarr[i], M_MD_PROMISE);
+		for (j = i; j < subdisks - 1; j++)
+			metaarr[j] = metaarr[j + 1];
+		metaarr[PROMISE_MAX_SUBDISKS - 1] = NULL;
+		subdisks--;
+	}
+
+search:
+	/* Search for matching node. */
+	sc = NULL;
+	LIST_FOREACH(geom, &mp->geom, geom) {
+		sc = geom->softc;
+		if (sc == NULL)
+			continue;
+		if (sc->sc_stopping != 0)
+			continue;
+		if (sc->sc_md->mdo_class != md->mdo_class)
+			continue;
+		break;
+	}
+
+	/* Found matching node. */
+	if (geom != NULL) {
+		G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name);
+		result = G_RAID_MD_TASTE_EXISTING;
+
+	} else { /* Not found matching node -- create one. */
+		result = G_RAID_MD_TASTE_NEW;
+		snprintf(name, sizeof(name), "Promise");
+		sc = g_raid_create_node(mp, name, md);
+		md->mdo_softc = sc;
+		geom = sc->sc_geom;
+	}
+
+	rcp = g_new_consumer(geom);
+	g_attach(rcp, pp);
+	if (g_access(rcp, 1, 1, 1) != 0)
+		; //goto fail1;
+
+	g_topology_unlock();
+	sx_xlock(&sc->sc_lock);
+
+	pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO);
+	pd->pd_subdisks = subdisks;
+	for (i = 0; i < subdisks; i++)
+		pd->pd_meta[i] = metaarr[i];
+	disk = g_raid_create_disk(sc);
+	disk->d_md_data = (void *)pd;
+	disk->d_consumer = rcp;
+	rcp->private = disk;
+
+	/* Read kernel dumping information. 
*/ + disk->d_kd.offset = 0; + disk->d_kd.length = OFF_MAX; + len = sizeof(disk->d_kd); + error = g_io_getattr("GEOM::kerneldump", rcp, &len, &disk->d_kd); + if (disk->d_kd.di.dumper == NULL) + G_RAID_DEBUG1(2, sc, "Dumping not supported by %s: %d.", + rcp->provider->name, error); + + g_raid_md_promise_new_disk(disk); + + sx_xunlock(&sc->sc_lock); + g_topology_lock(); + *gp = geom; + return (result); +} + +static int +g_raid_md_event_promise(struct g_raid_md_object *md, + struct g_raid_disk *disk, u_int event) +{ + struct g_raid_softc *sc; + struct g_raid_md_promise_perdisk *pd; + + sc = md->mdo_softc; + if (disk == NULL) + return (-1); + pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; + switch (event) { + case G_RAID_DISK_E_DISCONNECTED: + /* Delete disk. */ + g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); + g_raid_destroy_disk(disk); + g_raid_md_promise_purge_volumes(sc); + + /* Write updated metadata to all disks. */ + g_raid_md_write_promise(md, NULL, NULL, NULL); + + /* Check if anything left. */ + if (g_raid_ndisks(sc, -1) == 0) + g_raid_destroy_node(sc, 0); + else + g_raid_md_promise_refill(sc); + return (0); + } + return (-2); +} + +static int +g_raid_md_volume_event_promise(struct g_raid_md_object *md, + struct g_raid_volume *vol, u_int event) +{ + struct g_raid_softc *sc; + struct g_raid_md_promise_pervolume *pv; + + sc = md->mdo_softc; + pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data; + switch (event) { + case G_RAID_VOLUME_E_STARTMD: + if (!pv->pv_started) + g_raid_md_promise_start(vol); + return (0); + } + return (-2); +} + +static int +g_raid_md_ctl_promise(struct g_raid_md_object *md, + struct gctl_req *req) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol, *vol1; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk, *disks[PROMISE_MAX_DISKS]; + struct g_raid_md_promise_perdisk *pd; + struct g_raid_md_promise_pervolume *pv; + struct g_consumer *cp; + struct g_provider *pp; + char arg[16]; + const char *verb, *volname, *levelname, *diskname; + char *tmp; + int *nargs, *force; + off_t size, sectorsize, strip; + intmax_t *sizearg, *striparg; + uint32_t offs[PROMISE_MAX_DISKS], esize; + int numdisks, i, len, level, qual; + int error; + + sc = md->mdo_softc; + verb = gctl_get_param(req, "verb", NULL); + nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); + error = 0; + if (strcmp(verb, "label") == 0) { + + if (*nargs < 4) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + volname = gctl_get_asciiparam(req, "arg1"); + if (volname == NULL) { + gctl_error(req, "No volume name."); + return (-2); + } + levelname = gctl_get_asciiparam(req, "arg2"); + if (levelname == NULL) { + gctl_error(req, "No RAID level."); + return (-3); + } + if (g_raid_volume_str2level(levelname, &level, &qual)) { + gctl_error(req, "Unknown RAID level '%s'.", levelname); + return (-4); + } + numdisks = *nargs - 3; + force = gctl_get_paraml(req, "force", sizeof(*force)); + if (!g_raid_md_promise_supported(level, qual, numdisks, + force ? *force : 0)) { + gctl_error(req, "Unsupported RAID level " + "(0x%02x/0x%02x), or number of disks (%d).", + level, qual, numdisks); + return (-5); + } + + /* Search for disks, connect them and probe. 
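+		 * A named disk may already be attached to the node, since
+		 * Promise supports several volumes per disk; in that case a
+		 * free metadata slot and an unused extent are looked up
+		 * instead of opening a new consumer.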
*/ + size = INT64_MAX; + sectorsize = 0; + bzero(disks, sizeof(disks)); + bzero(offs, sizeof(offs)); + for (i = 0; i < numdisks; i++) { + snprintf(arg, sizeof(arg), "arg%d", i + 3); + diskname = gctl_get_asciiparam(req, arg); + if (diskname == NULL) { + gctl_error(req, "No disk name (%s).", arg); + error = -6; + break; + } + if (strcmp(diskname, "NONE") == 0) + continue; + + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_consumer != NULL && + disk->d_consumer->provider != NULL && + strcmp(disk->d_consumer->provider->name, + diskname) == 0) + break; + } + if (disk != NULL) { + if (disk->d_state != G_RAID_DISK_S_ACTIVE) { + gctl_error(req, "Disk '%s' is in a " + "wrong state (%s).", diskname, + g_raid_disk_state2str(disk->d_state)); + error = -7; + break; + } + pd = disk->d_md_data; + if (pd->pd_subdisks >= PROMISE_MAX_SUBDISKS) { + gctl_error(req, "Disk '%s' already " + "used by %d volumes.", + diskname, pd->pd_subdisks); + error = -7; + break; + } + pp = disk->d_consumer->provider; + disks[i] = disk; + promise_meta_unused_range(pd->pd_meta, + pd->pd_subdisks, + pp->mediasize / pp->sectorsize, + &offs[i], &esize); + size = MIN(size, (off_t)esize * pp->sectorsize); + sectorsize = MAX(sectorsize, pp->sectorsize); + continue; + } + + g_topology_lock(); + cp = g_raid_open_consumer(sc, diskname); + if (cp == NULL) { + gctl_error(req, "Can't open disk '%s'.", + diskname); + g_topology_unlock(); + error = -8; + break; + } + pp = cp->provider; + pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO); + disk = g_raid_create_disk(sc); + disk->d_md_data = (void *)pd; + disk->d_consumer = cp; + disks[i] = disk; + cp->private = disk; + g_topology_unlock(); + + /* Read kernel dumping information. */ + disk->d_kd.offset = 0; + disk->d_kd.length = OFF_MAX; + len = sizeof(disk->d_kd); + g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); + if (disk->d_kd.di.dumper == NULL) + G_RAID_DEBUG1(2, sc, + "Dumping not supported by %s.", + cp->provider->name); + + /* Reserve some space for metadata. */ + size = MIN(size, pp->mediasize - 131072llu * pp->sectorsize); + sectorsize = MAX(sectorsize, pp->sectorsize); + } + if (error != 0) { + for (i = 0; i < numdisks; i++) { + if (disks[i] != NULL && + disks[i]->d_state == G_RAID_DISK_S_NONE) + g_raid_destroy_disk(disks[i]); + } + return (error); + } + + /* Handle size argument. */ + len = sizeof(*sizearg); + sizearg = gctl_get_param(req, "size", &len); + if (sizearg != NULL && len == sizeof(*sizearg) && + *sizearg > 0) { + if (*sizearg > size) { + gctl_error(req, "Size too big %lld > %lld.", + (long long)*sizearg, (long long)size); + return (-9); + } + size = *sizearg; + } + + /* Handle strip argument. */ + strip = 131072; + len = sizeof(*striparg); + striparg = gctl_get_param(req, "strip", &len); + if (striparg != NULL && len == sizeof(*striparg) && + *striparg > 0) { + if (*striparg < sectorsize) { + gctl_error(req, "Strip size too small."); + return (-10); + } + if (*striparg % sectorsize != 0) { + gctl_error(req, "Incorrect strip size."); + return (-11); + } + strip = *striparg; + } + + /* Round size down to strip or sector. 
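+		 * Mirrored, single and concatenated volumes only need sector
+		 * granularity; striped levels must be a multiple of the strip
+		 * (two strips for RAID1E with an odd disk count).  E.g. with a
+		 * 128KB strip, a 1000000KB subdisk is trimmed to 999936KB.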
*/ + if (level == G_RAID_VOLUME_RL_RAID1 || + level == G_RAID_VOLUME_RL_SINGLE || + level == G_RAID_VOLUME_RL_CONCAT) + size -= (size % sectorsize); + else if (level == G_RAID_VOLUME_RL_RAID1E && + (numdisks & 1) != 0) + size -= (size % (2 * strip)); + else + size -= (size % strip); + if (size <= 0) { + gctl_error(req, "Size too small."); + return (-13); + } + if (size > 0xffffffffllu * sectorsize) { + gctl_error(req, "Size too big."); + return (-14); + } + + /* We have all we need, create things: volume, ... */ + pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO); + arc4rand(&pv->pv_id, sizeof(pv->pv_id), 0); + pv->pv_generation = 0; + pv->pv_started = 1; + vol = g_raid_create_volume(sc, volname, -1); + vol->v_md_data = pv; + vol->v_raid_level = level; + vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; + vol->v_strip_size = strip; + vol->v_disks_count = numdisks; + if (level == G_RAID_VOLUME_RL_RAID0 || + level == G_RAID_VOLUME_RL_CONCAT || + level == G_RAID_VOLUME_RL_SINGLE) + vol->v_mediasize = size * numdisks; + else if (level == G_RAID_VOLUME_RL_RAID1) + vol->v_mediasize = size; + else if (level == G_RAID_VOLUME_RL_RAID3 || + level == G_RAID_VOLUME_RL_RAID5) + vol->v_mediasize = size * (numdisks - 1); + else { /* RAID1E */ + vol->v_mediasize = ((size * numdisks) / strip / 2) * + strip; + } + vol->v_sectorsize = sectorsize; + g_raid_start_volume(vol); + + /* , and subdisks. */ + for (i = 0; i < numdisks; i++) { + disk = disks[i]; + sd = &vol->v_subdisks[i]; + sd->sd_disk = disk; + sd->sd_offset = (off_t)offs[i] * 512; + sd->sd_size = size; + if (disk == NULL) + continue; + TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); + g_raid_change_disk_state(disk, + G_RAID_DISK_S_ACTIVE); + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_ACTIVE); + g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, + G_RAID_EVENT_SUBDISK); + } + + /* Write metadata based on created entities. */ + G_RAID_DEBUG1(0, sc, "Array started."); + g_raid_md_write_promise(md, vol, NULL, NULL); + + /* Pickup any STALE/SPARE disks to refill array if needed. */ + g_raid_md_promise_refill(sc); + + g_raid_event_send(vol, G_RAID_VOLUME_E_START, + G_RAID_EVENT_VOLUME); + return (0); + } + if (strcmp(verb, "add") == 0) { + + gctl_error(req, "`add` command is not applicable, " + "use `label` instead."); + return (-99); + } + if (strcmp(verb, "delete") == 0) { + + /* Full node destruction. */ + if (*nargs == 1) { + /* Check if some volume is still open. */ + force = gctl_get_paraml(req, "force", sizeof(*force)); + if (force != NULL && *force == 0 && + g_raid_nopens(sc) != 0) { + gctl_error(req, "Some volume is still open."); + return (-4); + } + + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_consumer) + promise_meta_erase(disk->d_consumer); + } + g_raid_destroy_node(sc, 0); + return (0); + } + + /* Destroy specified volume. If it was last - all node. */ + if (*nargs != 2) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + volname = gctl_get_asciiparam(req, "arg1"); + if (volname == NULL) { + gctl_error(req, "No volume name."); + return (-2); + } + + /* Search for volume. 
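+		 * The argument may be either the volume name or its numeric
+		 * global ID, so when no name matches, retry the lookup
+		 * parsing the argument as a number.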
*/
+		TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
+			if (strcmp(vol->v_name, volname) == 0)
+				break;
+		}
+		if (vol == NULL) {
+			i = strtol(volname, &tmp, 10);
+			if (tmp != volname && tmp[0] == 0) {
+				TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
+					if (vol->v_global_id == i)
+						break;
+				}
+			}
+		}
+		if (vol == NULL) {
+			gctl_error(req, "Volume '%s' not found.", volname);
+			return (-3);
+		}
+
+		/* Check if volume is still open. */
+		force = gctl_get_paraml(req, "force", sizeof(*force));
+		if (force != NULL && *force == 0 &&
+		    vol->v_provider_open != 0) {
+			gctl_error(req, "Volume is still open.");
+			return (-4);
+		}
+
+		/* Destroy volume and potentially node. */
+		i = 0;
+		TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next)
+			i++;
+		if (i >= 2) {
+			g_raid_destroy_volume(vol);
+			g_raid_md_promise_purge_disks(sc);
+			g_raid_md_write_promise(md, NULL, NULL, NULL);
+		} else {
+			TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
+				if (disk->d_consumer)
+					promise_meta_erase(disk->d_consumer);
+			}
+			g_raid_destroy_node(sc, 0);
+		}
+		return (0);
+	}
+	if (strcmp(verb, "remove") == 0 ||
+	    strcmp(verb, "fail") == 0) {
+		if (*nargs < 2) {
+			gctl_error(req, "Invalid number of arguments.");
+			return (-1);
+		}
+		for (i = 1; i < *nargs; i++) {
+			snprintf(arg, sizeof(arg), "arg%d", i);
+			diskname = gctl_get_asciiparam(req, arg);
+			if (diskname == NULL) {
+				gctl_error(req, "No disk name (%s).", arg);
+				error = -2;
+				break;
+			}
+			if (strncmp(diskname, "/dev/", 5) == 0)
+				diskname += 5;
+
+			TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
+				if (disk->d_consumer != NULL &&
+				    disk->d_consumer->provider != NULL &&
+				    strcmp(disk->d_consumer->provider->name,
+				     diskname) == 0)
+					break;
+			}
+			if (disk == NULL) {
+				gctl_error(req, "Disk '%s' not found.",
+				    diskname);
+				error = -3;
+				break;
+			}
+
+			if (strcmp(verb, "fail") == 0) {
+				g_raid_md_fail_disk_promise(md, NULL, disk);
+				continue;
+			}
+
+			pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
+
+			/* Erase metadata on deleting disk and destroy it. */
+			promise_meta_erase(disk->d_consumer);
+			g_raid_destroy_disk(disk);
+		}
+		g_raid_md_promise_purge_volumes(sc);
+
+		/* Write updated metadata to remaining disks. */
+		g_raid_md_write_promise(md, NULL, NULL, NULL);
+
+		/* Check if anything left. */
+		if (g_raid_ndisks(sc, -1) == 0)
+			g_raid_destroy_node(sc, 0);
+		else
+			g_raid_md_promise_refill(sc);
+		return (error);
+	}
+	if (strcmp(verb, "insert") == 0) {
+		if (*nargs < 2) {
+			gctl_error(req, "Invalid number of arguments.");
+			return (-1);
+		}
+		for (i = 1; i < *nargs; i++) {
+			/* Get disk name. */
+			snprintf(arg, sizeof(arg), "arg%d", i);
+			diskname = gctl_get_asciiparam(req, arg);
+			if (diskname == NULL) {
+				gctl_error(req, "No disk name (%s).", arg);
+				error = -3;
+				break;
+			}
+
+			/* Try to find provider with specified name. */
+			g_topology_lock();
+			cp = g_raid_open_consumer(sc, diskname);
+			if (cp == NULL) {
+				gctl_error(req, "Can't open disk '%s'.",
+				    diskname);
+				g_topology_unlock();
+				error = -4;
+				break;
+			}
+			pp = cp->provider;
+			g_topology_unlock();
+
+			pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO);
+
+			disk = g_raid_create_disk(sc);
+			disk->d_consumer = cp;
+			disk->d_consumer->private = disk;
+			disk->d_md_data = (void *)pd;
+			cp->private = disk;
+
+			/* Read kernel dumping information. 
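+			 * Probe the new consumer for dump support the same
+			 * way taste does, before handing the disk over to
+			 * the node as a spare.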
*/ + disk->d_kd.offset = 0; + disk->d_kd.length = OFF_MAX; + len = sizeof(disk->d_kd); + g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); + if (disk->d_kd.di.dumper == NULL) + G_RAID_DEBUG1(2, sc, + "Dumping not supported by %s.", + cp->provider->name); + + /* Welcome the "new" disk. */ + g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); + promise_meta_write_spare(cp); + g_raid_md_promise_refill(sc); + } + return (error); + } + return (-100); +} + +static int +g_raid_md_write_promise(struct g_raid_md_object *md, struct g_raid_volume *tvol, + struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; + struct g_raid_md_promise_perdisk *pd; + struct g_raid_md_promise_pervolume *pv; + struct promise_raid_conf *meta; + off_t rebuild_lba64; + int i, j, pos, rebuild; + + sc = md->mdo_softc; + + if (sc->sc_stopping == G_RAID_DESTROY_HARD) + return (0); + + /* Generate new per-volume metadata for affected volumes. */ + TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { + if (vol->v_stopping) + continue; + + /* Skip volumes not related to specified targets. */ + if (tvol != NULL && vol != tvol) + continue; + if (tsd != NULL && vol != tsd->sd_volume) + continue; + if (tdisk != NULL) { + for (i = 0; i < vol->v_disks_count; i++) { + if (vol->v_subdisks[i].sd_disk == tdisk) + break; + } + if (i >= vol->v_disks_count) + continue; + } + + pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data; + pv->pv_generation++; + + meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO); + if (pv->pv_meta != NULL) + memcpy(meta, pv->pv_meta, sizeof(*meta)); + memcpy(meta->promise_id, PROMISE_MAGIC, sizeof(PROMISE_MAGIC)); + meta->dummy_0 = 0x00020000; + meta->integrity = PROMISE_I_VALID; + + meta->generation = pv->pv_generation; + meta->status = PROMISE_S_VALID | PROMISE_S_ONLINE | + PROMISE_S_INITED | PROMISE_S_READY; + if (vol->v_state <= G_RAID_VOLUME_S_DEGRADED) + meta->status |= PROMISE_S_DEGRADED; + if (vol->v_dirty) + meta->status |= PROMISE_S_MARKED; /* XXX: INVENTED! */ + if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0 || + vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) + meta->type = PROMISE_T_RAID0; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || + vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) + meta->type = PROMISE_T_RAID1; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3) + meta->type = PROMISE_T_RAID3; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) + meta->type = PROMISE_T_RAID5; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT) + meta->type = PROMISE_T_SPAN; + else + meta->type = PROMISE_T_JBOD; + meta->total_disks = vol->v_disks_count; + meta->stripe_shift = ffs(vol->v_strip_size / 1024); + meta->array_width = vol->v_disks_count; + if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || + vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) + meta->array_width /= 2; + meta->array_number = vol->v_global_id; + meta->total_sectors = vol->v_mediasize / vol->v_sectorsize; + meta->cylinders = meta->total_sectors / (255 * 63) - 1; + meta->heads = 254; + meta->sectors = 63; + meta->volume_id = pv->pv_id; + rebuild_lba64 = UINT64_MAX; + rebuild = 0; + for (i = 0; i < vol->v_disks_count; i++) { + sd = &vol->v_subdisks[i]; + /* For RAID0+1 we need to translate order. 
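+			 * The on-disk disk numbering differs from g_raid's
+			 * plain subdisk order for RAID0+1 layouts, so the
+			 * index is remapped before meta->disks[] is touched.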
*/ + pos = promise_meta_translate_disk(vol, i); + meta->disks[pos].flags = PROMISE_F_VALID | + PROMISE_F_ASSIGNED; + if (sd->sd_state == G_RAID_SUBDISK_S_NONE) { + meta->disks[pos].flags |= 0; + } else if (sd->sd_state == G_RAID_SUBDISK_S_FAILED) { + meta->disks[pos].flags |= + PROMISE_F_DOWN | PROMISE_F_REDIR; + } else if (sd->sd_state <= G_RAID_SUBDISK_S_REBUILD) { + meta->disks[pos].flags |= + PROMISE_F_ONLINE | PROMISE_F_REDIR; + if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) { + rebuild_lba64 = MIN(rebuild_lba64, + sd->sd_rebuild_pos / 512); + } else + rebuild_lba64 = 0; + rebuild = 1; + } else { + meta->disks[pos].flags |= PROMISE_F_ONLINE; + if (sd->sd_state < G_RAID_SUBDISK_S_ACTIVE) { + meta->status |= PROMISE_S_MARKED; + if (sd->sd_state == G_RAID_SUBDISK_S_RESYNC) { + rebuild_lba64 = MIN(rebuild_lba64, + sd->sd_rebuild_pos / 512); + } else + rebuild_lba64 = 0; + } + } + if (pv->pv_meta != NULL) { + meta->disks[pos].id = pv->pv_meta->disks[pos].id; + } else { + meta->disks[pos].number = i * 2; + arc4rand(&meta->disks[pos].id, + sizeof(meta->disks[pos].id), 0); + } + } + promise_meta_put_name(meta, vol->v_name); + + /* Try to mimic AMD BIOS rebuild/resync behavior. */ + if (rebuild_lba64 != UINT64_MAX) { + if (rebuild) + meta->magic_3 = 0x03040010UL; /* Rebuild? */ + else + meta->magic_3 = 0x03040008UL; /* Resync? */ + /* Translate from per-disk to per-volume LBA. */ + if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || + vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { + rebuild_lba64 *= meta->array_width; + } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 || + vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) { + rebuild_lba64 *= meta->array_width - 1; + } else + rebuild_lba64 = 0; + } else + meta->magic_3 = 0x03000000UL; + meta->rebuild_lba64 = rebuild_lba64; + meta->magic_4 = 0x04010101UL; + + /* Replace per-volume metadata with new. */ + if (pv->pv_meta != NULL) + free(pv->pv_meta, M_MD_PROMISE); + pv->pv_meta = meta; + + /* Copy new metadata to the disks, adding or replacing old. */ + for (i = 0; i < vol->v_disks_count; i++) { + sd = &vol->v_subdisks[i]; + disk = sd->sd_disk; + if (disk == NULL) + continue; + /* For RAID0+1 we need to translate order. 
*/
+			pos = promise_meta_translate_disk(vol, i);
+			pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
+			for (j = 0; j < pd->pd_subdisks; j++) {
+				if (pd->pd_meta[j]->volume_id == meta->volume_id)
+					break;
+			}
+			if (j == pd->pd_subdisks)
+				pd->pd_subdisks++;
+			if (pd->pd_meta[j] != NULL)
+				free(pd->pd_meta[j], M_MD_PROMISE);
+			pd->pd_meta[j] = promise_meta_copy(meta);
+			pd->pd_meta[j]->disk = meta->disks[pos];
+			pd->pd_meta[j]->disk.number = pos;
+			pd->pd_meta[j]->disk_offset = sd->sd_offset / 512;
+			pd->pd_meta[j]->disk_sectors = sd->sd_size / 512;
+			if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) {
+				pd->pd_meta[j]->rebuild_lba =
+				    sd->sd_rebuild_pos / 512;
+			} else if (sd->sd_state < G_RAID_SUBDISK_S_REBUILD)
+				pd->pd_meta[j]->rebuild_lba = 0;
+			else
+				pd->pd_meta[j]->rebuild_lba = UINT32_MAX;
+			pd->pd_updated = 1;
+		}
+	}
+
+	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
+		pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
+		if (disk->d_state != G_RAID_DISK_S_ACTIVE)
+			continue;
+		if (!pd->pd_updated)
+			continue;
+		G_RAID_DEBUG(1, "Writing Promise metadata to %s",
+		    g_raid_get_diskname(disk));
+		for (i = 0; i < pd->pd_subdisks; i++)
+			g_raid_md_promise_print(pd->pd_meta[i]);
+		promise_meta_write(disk->d_consumer,
+		    pd->pd_meta, pd->pd_subdisks);
+		pd->pd_updated = 0;
+	}
+
+	return (0);
+}
+
+static int
+g_raid_md_fail_disk_promise(struct g_raid_md_object *md,
+    struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
+{
+	struct g_raid_softc *sc;
+	struct g_raid_md_promise_perdisk *pd;
+	struct g_raid_subdisk *sd;
+	int i, pos;
+
+	sc = md->mdo_softc;
+	pd = (struct g_raid_md_promise_perdisk *)tdisk->d_md_data;
+
+	/* We can't fail disk that is not a part of array now. */
+	if (tdisk->d_state != G_RAID_DISK_S_ACTIVE)
+		return (-1);
+
+	/*
+	 * Mark disk as failed in metadata and try to write that metadata
+	 * to the disk itself to prevent its later resurrection as STALE.
+	 */
+	if (pd->pd_subdisks > 0 && tdisk->d_consumer != NULL)
+		G_RAID_DEBUG(1, "Writing Promise metadata to %s",
+		    g_raid_get_diskname(tdisk));
+	for (i = 0; i < pd->pd_subdisks; i++) {
+		pd->pd_meta[i]->disk.flags |=
+		    PROMISE_F_DOWN | PROMISE_F_REDIR;
+		pos = pd->pd_meta[i]->disk.number;
+		if (pos >= 0 && pos < PROMISE_MAX_DISKS) {
+			pd->pd_meta[i]->disks[pos].flags |=
+			    PROMISE_F_DOWN | PROMISE_F_REDIR;
+		}
+		g_raid_md_promise_print(pd->pd_meta[i]);
+	}
+	if (tdisk->d_consumer != NULL)
+		promise_meta_write(tdisk->d_consumer,
+		    pd->pd_meta, pd->pd_subdisks);
+
+	/* Change states. */
+	g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED);
+	TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) {
+		g_raid_change_subdisk_state(sd,
+		    G_RAID_SUBDISK_S_FAILED);
+		g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED,
+		    G_RAID_EVENT_SUBDISK);
+	}
+
+	/* Write updated metadata to remaining disks. 
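+	 * The failed disk already received its last-gasp copy above and is
+	 * no longer ACTIVE, so the write below only touches its siblings.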
*/ + g_raid_md_write_promise(md, NULL, NULL, tdisk); + + g_raid_md_promise_refill(sc); + return (0); +} + +static int +g_raid_md_free_disk_promise(struct g_raid_md_object *md, + struct g_raid_disk *disk) +{ + struct g_raid_md_promise_perdisk *pd; + int i; + + pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; + for (i = 0; i < pd->pd_subdisks; i++) { + if (pd->pd_meta[i] != NULL) { + free(pd->pd_meta[i], M_MD_PROMISE); + pd->pd_meta[i] = NULL; + } + } + free(pd, M_MD_PROMISE); + disk->d_md_data = NULL; + return (0); +} + +static int +g_raid_md_free_volume_promise(struct g_raid_md_object *md, + struct g_raid_volume *vol) +{ + struct g_raid_md_promise_pervolume *pv; + + pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data; + if (pv && pv->pv_meta != NULL) { + free(pv->pv_meta, M_MD_PROMISE); + pv->pv_meta = NULL; + } + if (pv && !pv->pv_started) { + pv->pv_started = 1; + callout_stop(&pv->pv_start_co); + } + return (0); +} + +static int +g_raid_md_free_promise(struct g_raid_md_object *md) +{ + + return (0); +} + +G_RAID_MD_DECLARE(g_raid_md_promise); diff --git a/sys/geom/raid/md_sii.c b/sys/geom/raid/md_sii.c new file mode 100644 index 0000000..305accd --- /dev/null +++ b/sys/geom/raid/md_sii.c @@ -0,0 +1,1692 @@ +/*- + * Copyright (c) 2011 Alexander Motin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "geom/raid/g_raid.h" +#include "g_raid_md_if.h" + +static MALLOC_DEFINE(M_MD_SII, "md_sii_data", "GEOM_RAID SiI metadata"); + +struct sii_raid_conf { + uint16_t ata_params_00_53[54]; + uint64_t total_sectors; /* 54 - 57 */ + uint16_t ata_params_58_81[72]; + uint16_t product_id; /* 130 */ + uint16_t vendor_id; /* 131 */ + uint16_t version_minor; /* 132 */ + uint16_t version_major; /* 133 */ + uint8_t timestamp[6]; /* 134 - 136 */ + uint16_t strip_sectors; /* 137 */ + uint16_t dummy_2; + uint8_t disk_number; /* 139 */ + uint8_t type; +#define SII_T_RAID0 0x00 +#define SII_T_RAID1 0x01 +#define SII_T_RAID01 0x02 +#define SII_T_SPARE 0x03 +#define SII_T_CONCAT 0x04 +#define SII_T_RAID5 0x10 +#define SII_T_RESERVED 0xfd +#define SII_T_JBOD 0xff + + uint8_t raid0_disks; /* 140 */ + uint8_t raid0_ident; + uint8_t raid1_disks; /* 141 */ + uint8_t raid1_ident; + uint64_t rebuild_lba; /* 142 - 145 */ + uint32_t generation; /* 146 - 147 */ + uint8_t disk_status; /* 148 */ +#define SII_S_CURRENT 0x01 +#define SII_S_REBUILD 0x02 +#define SII_S_DROPPED 0x03 +#define SII_S_REMOVED 0x04 + + uint8_t raid_status; +#define SII_S_ONLINE 0x01 +#define SII_S_AVAILABLE 0x02 + + uint8_t raid_location; /* 149 */ + uint8_t disk_location; + uint8_t auto_rebuild; /* 150 */ +#define SII_R_REBUILD 0x00 +#define SII_R_NOREBUILD 0xff + + uint8_t dummy_3; + uint8_t name[16]; /* 151 - 158 */ + uint16_t checksum; /* 159 */ + uint16_t ata_params_160_255[96]; +} __packed; + +struct g_raid_md_sii_perdisk { + struct sii_raid_conf *pd_meta; + int pd_disk_pos; + off_t pd_disk_size; +}; + +struct g_raid_md_sii_object { + struct g_raid_md_object mdio_base; + uint8_t mdio_timestamp[6]; + uint8_t mdio_location; + uint32_t mdio_generation; + struct sii_raid_conf *mdio_meta; + struct callout mdio_start_co; /* STARTING state timer. */ + int mdio_total_disks; + int mdio_disks_present; + int mdio_started; + int mdio_incomplete; + struct root_hold_token *mdio_rootmount; /* Root mount delay token. 
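+	 * Held while the array is assembling so that root-on-RAID can
+	 * wait for all components; released once the volume starts or
+	 * the start timer forces a start.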
*/ +}; + +static g_raid_md_create_t g_raid_md_create_sii; +static g_raid_md_taste_t g_raid_md_taste_sii; +static g_raid_md_event_t g_raid_md_event_sii; +static g_raid_md_ctl_t g_raid_md_ctl_sii; +static g_raid_md_write_t g_raid_md_write_sii; +static g_raid_md_fail_disk_t g_raid_md_fail_disk_sii; +static g_raid_md_free_disk_t g_raid_md_free_disk_sii; +static g_raid_md_free_t g_raid_md_free_sii; + +static kobj_method_t g_raid_md_sii_methods[] = { + KOBJMETHOD(g_raid_md_create, g_raid_md_create_sii), + KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_sii), + KOBJMETHOD(g_raid_md_event, g_raid_md_event_sii), + KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_sii), + KOBJMETHOD(g_raid_md_write, g_raid_md_write_sii), + KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_sii), + KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_sii), + KOBJMETHOD(g_raid_md_free, g_raid_md_free_sii), + { 0, 0 } +}; + +static struct g_raid_md_class g_raid_md_sii_class = { + "SiI", + g_raid_md_sii_methods, + sizeof(struct g_raid_md_sii_object), + .mdc_priority = 100 +}; + +static void +g_raid_md_sii_print(struct sii_raid_conf *meta) +{ + + if (g_raid_debug < 1) + return; + + printf("********* ATA SiI RAID Metadata *********\n"); + printf("total_sectors %llu\n", + (long long unsigned)meta->total_sectors); + printf("product_id 0x%04x\n", meta->product_id); + printf("vendor_id 0x%04x\n", meta->vendor_id); + printf("version_minor 0x%04x\n", meta->version_minor); + printf("version_major 0x%04x\n", meta->version_major); + printf("timestamp 0x%02x%02x%02x%02x%02x%02x\n", + meta->timestamp[5], meta->timestamp[4], meta->timestamp[3], + meta->timestamp[2], meta->timestamp[1], meta->timestamp[0]); + printf("strip_sectors %d\n", meta->strip_sectors); + printf("disk_number %d\n", meta->disk_number); + printf("type 0x%02x\n", meta->type); + printf("raid0_disks %d\n", meta->raid0_disks); + printf("raid0_ident %d\n", meta->raid0_ident); + printf("raid1_disks %d\n", meta->raid1_disks); + printf("raid1_ident %d\n", meta->raid1_ident); + printf("rebuild_lba %llu\n", + (long long unsigned)meta->rebuild_lba); + printf("generation %d\n", meta->generation); + printf("disk_status %d\n", meta->disk_status); + printf("raid_status %d\n", meta->raid_status); + printf("raid_location %d\n", meta->raid_location); + printf("disk_location %d\n", meta->disk_location); + printf("auto_rebuild %d\n", meta->auto_rebuild); + printf("name <%.16s>\n", meta->name); + printf("checksum 0x%04x\n", meta->checksum); + printf("=================================================\n"); +} + +static struct sii_raid_conf * +sii_meta_copy(struct sii_raid_conf *meta) +{ + struct sii_raid_conf *nmeta; + + nmeta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK); + memcpy(nmeta, meta, sizeof(*meta)); + return (nmeta); +} + +static int +sii_meta_total_disks(struct sii_raid_conf *meta) +{ + + switch (meta->type) { + case SII_T_RAID0: + case SII_T_RAID5: + case SII_T_CONCAT: + return (meta->raid0_disks); + case SII_T_RAID1: + return (meta->raid1_disks); + case SII_T_RAID01: + return (meta->raid0_disks * meta->raid1_disks); + case SII_T_SPARE: + case SII_T_JBOD: + return (1); + } + return (0); +} + +static int +sii_meta_disk_pos(struct sii_raid_conf *meta, struct sii_raid_conf *pdmeta) +{ + + if (pdmeta->type == SII_T_SPARE) + return (-3); + + if (memcmp(&meta->timestamp, &pdmeta->timestamp, 6) != 0) + return (-1); + + switch (pdmeta->type) { + case SII_T_RAID0: + case SII_T_RAID1: + case SII_T_RAID5: + case SII_T_CONCAT: + return (pdmeta->disk_number); + case SII_T_RAID01: + return 
(pdmeta->raid1_ident * pdmeta->raid1_disks + + pdmeta->raid0_ident); + case SII_T_JBOD: + return (0); + } + return (-1); +} + +static void +sii_meta_get_name(struct sii_raid_conf *meta, char *buf) +{ + int i; + + strncpy(buf, meta->name, 16); + buf[16] = 0; + for (i = 15; i >= 0; i--) { + if (buf[i] > 0x20) + break; + buf[i] = 0; + } +} + +static void +sii_meta_put_name(struct sii_raid_conf *meta, char *buf) +{ + + memset(meta->name, 0x20, 16); + memcpy(meta->name, buf, MIN(strlen(buf), 16)); +} + +static struct sii_raid_conf * +sii_meta_read(struct g_consumer *cp) +{ + struct g_provider *pp; + struct sii_raid_conf *meta; + char *buf; + int error, i; + uint16_t checksum, *ptr; + + pp = cp->provider; + + /* Read the anchor sector. */ + buf = g_read_data(cp, + pp->mediasize - pp->sectorsize, pp->sectorsize, &error); + if (buf == NULL) { + G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", + pp->name, error); + return (NULL); + } + meta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK); + memcpy(meta, buf, min(sizeof(*meta), pp->sectorsize)); + g_free(buf); + + /* Check vendor ID. */ + if (meta->vendor_id != 0x1095) { + G_RAID_DEBUG(1, "SiI vendor ID check failed on %s (0x%04x)", + pp->name, meta->vendor_id); + free(meta, M_MD_SII); + return (NULL); + } + + /* Check metadata major version. */ + if (meta->version_major != 2) { + G_RAID_DEBUG(1, "SiI version check failed on %s (%d.%d)", + pp->name, meta->version_major, meta->version_minor); + free(meta, M_MD_SII); + return (NULL); + } + + /* Check metadata checksum. */ + for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i <= 159; i++) + checksum += *ptr++; + if (checksum != 0) { + G_RAID_DEBUG(1, "SiI checksum check failed on %s", pp->name); + free(meta, M_MD_SII); + return (NULL); + } + + /* Check raid type. */ + if (meta->type != SII_T_RAID0 && meta->type != SII_T_RAID1 && + meta->type != SII_T_RAID01 && meta->type != SII_T_SPARE && + meta->type != SII_T_RAID5 && meta->type != SII_T_CONCAT && + meta->type != SII_T_JBOD) { + G_RAID_DEBUG(1, "SiI unknown RAID level on %s (0x%02x)", + pp->name, meta->type); + free(meta, M_MD_SII); + return (NULL); + } + + return (meta); +} + +static int +sii_meta_write(struct g_consumer *cp, struct sii_raid_conf *meta) +{ + struct g_provider *pp; + char *buf; + int error, i; + uint16_t checksum, *ptr; + + pp = cp->provider; + + /* Recalculate checksum for case if metadata were changed. */ + meta->checksum = 0; + for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i < 159; i++) + checksum += *ptr++; + meta->checksum -= checksum; + + /* Create and fill buffer. */ + buf = malloc(pp->sectorsize, M_MD_SII, M_WAITOK | M_ZERO); + memcpy(buf, meta, sizeof(*meta)); + + /* Write 4 copies of metadata. */ + for (i = 0; i < 4; i++) { + error = g_write_data(cp, + pp->mediasize - (pp->sectorsize * (1 + 0x200 * i)), + buf, pp->sectorsize); + if (error != 0) { + G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", + pp->name, error); + break; + } + } + + free(buf, M_MD_SII); + return (error); +} + +static int +sii_meta_erase(struct g_consumer *cp) +{ + struct g_provider *pp; + char *buf; + int error, i; + + pp = cp->provider; + buf = malloc(pp->sectorsize, M_MD_SII, M_WAITOK | M_ZERO); + /* Write 4 copies of metadata. 
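+	 * All copies sit near the end of the device, 0x200 sectors apart;
+	 * sii_meta_read() above only looks at the first (anchor) copy in
+	 * the very last sector.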
*/
+	for (i = 0; i < 4; i++) {
+		error = g_write_data(cp,
+		    pp->mediasize - (pp->sectorsize * (1 + 0x200 * i)),
+		    buf, pp->sectorsize);
+		if (error != 0) {
+			G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).",
+			    pp->name, error);
+		}
+	}
+	free(buf, M_MD_SII);
+	return (error);
+}
+
+static int
+sii_meta_write_spare(struct g_consumer *cp)
+{
+	struct sii_raid_conf *meta;
+	int error;
+
+	meta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK | M_ZERO);
+	meta->total_sectors = cp->provider->mediasize /
+	    cp->provider->sectorsize - 0x800;
+	meta->vendor_id = 0x1095;
+	meta->version_minor = 0;
+	meta->version_major = 2;
+	meta->timestamp[0] = arc4random();
+	meta->timestamp[1] = arc4random();
+	meta->timestamp[2] = arc4random();
+	meta->timestamp[3] = arc4random();
+	meta->timestamp[4] = arc4random();
+	meta->timestamp[5] = arc4random();
+	meta->type = SII_T_SPARE;
+	meta->generation = 1;
+	meta->raid1_ident = 0xff;
+	meta->raid_location = arc4random();
+	error = sii_meta_write(cp, meta);
+	free(meta, M_MD_SII);
+	return (error);
+}
+
+static struct g_raid_disk *
+g_raid_md_sii_get_disk(struct g_raid_softc *sc, int id)
+{
+	struct g_raid_disk *disk;
+	struct g_raid_md_sii_perdisk *pd;
+
+	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
+		pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data;
+		if (pd->pd_disk_pos == id)
+			break;
+	}
+	return (disk);
+}
+
+static int
+g_raid_md_sii_supported(int level, int qual, int disks, int force)
+{
+
+	if (disks > 8)
+		return (0);
+	switch (level) {
+	case G_RAID_VOLUME_RL_RAID0:
+		if (disks < 1)
+			return (0);
+		if (!force && (disks < 2 || disks > 6))
+			return (0);
+		break;
+	case G_RAID_VOLUME_RL_RAID1:
+		if (disks < 1)
+			return (0);
+		if (!force && (disks != 2))
+			return (0);
+		break;
+	case G_RAID_VOLUME_RL_RAID1E:
+		if (disks < 2)
+			return (0);
+		if (disks % 2 != 0)
+			return (0);
+		if (!force && (disks < 4))
+			return (0);
+		break;
+	case G_RAID_VOLUME_RL_SINGLE:
+		if (disks != 1)
+			return (0);
+		break;
+	case G_RAID_VOLUME_RL_CONCAT:
+		if (disks < 2)
+			return (0);
+		break;
+	case G_RAID_VOLUME_RL_RAID5:
+		if (disks < 3)
+			return (0);
+		break;
+	default:
+		return (0);
+	}
+	if (qual != G_RAID_VOLUME_RLQ_NONE)
+		return (0);
+	return (1);
+}
+
+static int
+g_raid_md_sii_start_disk(struct g_raid_disk *disk)
+{
+	struct g_raid_softc *sc;
+	struct g_raid_subdisk *sd, *tmpsd;
+	struct g_raid_disk *olddisk, *tmpdisk;
+	struct g_raid_md_object *md;
+	struct g_raid_md_sii_object *mdi;
+	struct g_raid_md_sii_perdisk *pd, *oldpd;
+	struct sii_raid_conf *meta;
+	int disk_pos, resurrection = 0;
+
+	sc = disk->d_softc;
+	md = sc->sc_md;
+	mdi = (struct g_raid_md_sii_object *)md;
+	meta = mdi->mdio_meta;
+	pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data;
+	olddisk = NULL;
+
+	/* Find disk position in metadata by its serial. */
+	if (pd->pd_meta != NULL)
+		disk_pos = sii_meta_disk_pos(meta, pd->pd_meta);
+	else
+		disk_pos = -3;
+	if (disk_pos < 0) {
+		G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk");
+		/* If we are in the start process, that's all for now. */
+		if (!mdi->mdio_started)
+			goto nofit;
+		/*
+		 * If we have already started - try to get use of the disk.
+		 * Try to replace OFFLINE disks first, then FAILED.
+		 */
+		TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) {
+			if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE &&
+			    tmpdisk->d_state != G_RAID_DISK_S_FAILED)
+				continue;
+			/* Make sure this disk is big enough. 
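+			 * Every subdisk that lived on the old slot must fit,
+			 * keeping at least one trailing sector free for the
+			 * metadata anchor.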
*/
+			TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) {
+				if (sd->sd_offset + sd->sd_size + 512 >
+				    pd->pd_disk_size) {
+					G_RAID_DEBUG1(1, sc,
+					    "Disk too small (%ju < %ju)",
+					    pd->pd_disk_size,
+					    sd->sd_offset + sd->sd_size + 512);
+					break;
+				}
+			}
+			if (sd != NULL)
+				continue;
+			if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) {
+				olddisk = tmpdisk;
+				break;
+			} else if (olddisk == NULL)
+				olddisk = tmpdisk;
+		}
+		if (olddisk == NULL) {
+nofit:
+			if (disk_pos == -3 || pd->pd_disk_pos == -3) {
+				g_raid_change_disk_state(disk,
+				    G_RAID_DISK_S_SPARE);
+				return (1);
+			} else {
+				g_raid_change_disk_state(disk,
+				    G_RAID_DISK_S_STALE);
+				return (0);
+			}
+		}
+		oldpd = (struct g_raid_md_sii_perdisk *)olddisk->d_md_data;
+		disk_pos = oldpd->pd_disk_pos;
+		resurrection = 1;
+	}
+
+	if (olddisk == NULL) {
+		/* Find placeholder by position. */
+		olddisk = g_raid_md_sii_get_disk(sc, disk_pos);
+		if (olddisk == NULL)
+			panic("No disk at position %d!", disk_pos);
+		if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) {
+			G_RAID_DEBUG1(1, sc, "More than one disk for pos %d",
+			    disk_pos);
+			g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE);
+			return (0);
+		}
+		oldpd = (struct g_raid_md_sii_perdisk *)olddisk->d_md_data;
+	}
+
+	/* Replace failed disk or placeholder with new disk. */
+	TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) {
+		TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next);
+		TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
+		sd->sd_disk = disk;
+	}
+	oldpd->pd_disk_pos = -2;
+	pd->pd_disk_pos = disk_pos;
+
+	/* If it was placeholder -- destroy it. */
+	if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) {
+		g_raid_destroy_disk(olddisk);
+	} else {
+		/* Otherwise, make it STALE_FAILED. */
+		g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED);
+	}
+
+	/* Welcome the new disk. */
+	if (resurrection)
+		g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
+	else if (pd->pd_meta->disk_status == SII_S_CURRENT ||
+	    pd->pd_meta->disk_status == SII_S_REBUILD)
+		g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
+	else
+		g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED);
+	TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
+
+		/*
+		 * Different disks may have different sizes in concat
+		 * mode. Update from real disk size.
+		 */
+		if (meta->type == SII_T_CONCAT || meta->type == SII_T_JBOD)
+			sd->sd_size = pd->pd_disk_size - 0x800 * 512;
+
+		if (resurrection) {
+			/* New or ex-spare disk. */
+			g_raid_change_subdisk_state(sd,
+			    G_RAID_SUBDISK_S_NEW);
+		} else if (pd->pd_meta->disk_status == SII_S_REBUILD) {
+			/* Rebuilding disk. */
+			g_raid_change_subdisk_state(sd,
+			    G_RAID_SUBDISK_S_REBUILD);
+			if (pd->pd_meta->generation == meta->generation)
+				sd->sd_rebuild_pos = pd->pd_meta->rebuild_lba * 512;
+			else
+				sd->sd_rebuild_pos = 0;
+		} else if (pd->pd_meta->disk_status == SII_S_CURRENT) {
+			if (pd->pd_meta->raid_status == SII_S_ONLINE ||
+			    pd->pd_meta->generation != meta->generation) {
+				/* Dirty or resyncing disk. */
+				g_raid_change_subdisk_state(sd,
+				    G_RAID_SUBDISK_S_STALE);
+			} else {
+				/* Up to date disk. */
+				g_raid_change_subdisk_state(sd,
+				    G_RAID_SUBDISK_S_ACTIVE);
+			}
+		} else {
+			g_raid_change_subdisk_state(sd,
+			    G_RAID_SUBDISK_S_FAILED);
+		}
+		g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
+		    G_RAID_EVENT_SUBDISK);
+	}
+
+	/* Update status of our need for spare. 
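+	 * The mdio_incomplete flag set here is what later lets a tasted
+	 * metadata-less disk attach as a spare (see the spare path in
+	 * taste below).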
*/
+	if (mdi->mdio_started) {
+		mdi->mdio_incomplete =
+		    (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) <
+		     mdi->mdio_total_disks);
+	}
+
+	return (resurrection);
+}
+
+static void
+g_disk_md_sii_retaste(void *arg, int pending)
+{
+
+	G_RAID_DEBUG(1, "Array is not complete, trying to retaste.");
+	g_retaste(&g_raid_class);
+	free(arg, M_MD_SII);
+}
+
+static void
+g_raid_md_sii_refill(struct g_raid_softc *sc)
+{
+	struct g_raid_md_object *md;
+	struct g_raid_md_sii_object *mdi;
+	struct sii_raid_conf *meta;
+	struct g_raid_disk *disk;
+	struct task *task;
+	int update, na;
+
+	md = sc->sc_md;
+	mdi = (struct g_raid_md_sii_object *)md;
+	meta = mdi->mdio_meta;
+	update = 0;
+	do {
+		/* Make sure we don't miss anything. */
+		na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE);
+		if (na == mdi->mdio_total_disks)
+			break;
+
+		G_RAID_DEBUG1(1, md->mdo_softc,
+		    "Array is not complete (%d of %d), "
+		    "trying to refill.", na, mdi->mdio_total_disks);
+
+		/* Try to get use of some of the STALE disks. */
+		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
+			if (disk->d_state == G_RAID_DISK_S_STALE) {
+				update += g_raid_md_sii_start_disk(disk);
+				if (disk->d_state == G_RAID_DISK_S_ACTIVE)
+					break;
+			}
+		}
+		if (disk != NULL)
+			continue;
+
+		/* Try to get use of some of the SPARE disks. */
+		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
+			if (disk->d_state == G_RAID_DISK_S_SPARE) {
+				update += g_raid_md_sii_start_disk(disk);
+				if (disk->d_state == G_RAID_DISK_S_ACTIVE)
+					break;
+			}
+		}
+	} while (disk != NULL);
+
+	/* Write new metadata if we changed something. */
+	if (update) {
+		g_raid_md_write_sii(md, NULL, NULL, NULL);
+		meta = mdi->mdio_meta;
+	}
+
+	/* Update status of our need for spare. */
+	mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) <
+	    mdi->mdio_total_disks);
+
+	/* Request retaste hoping to find spare. */
+	if (mdi->mdio_incomplete) {
+		task = malloc(sizeof(struct task),
+		    M_MD_SII, M_WAITOK | M_ZERO);
+		TASK_INIT(task, 0, g_disk_md_sii_retaste, task);
+		taskqueue_enqueue(taskqueue_swi, task);
+	}
+}
+
+static void
+g_raid_md_sii_start(struct g_raid_softc *sc)
+{
+	struct g_raid_md_object *md;
+	struct g_raid_md_sii_object *mdi;
+	struct g_raid_md_sii_perdisk *pd;
+	struct sii_raid_conf *meta;
+	struct g_raid_volume *vol;
+	struct g_raid_subdisk *sd;
+	struct g_raid_disk *disk, *best;
+	off_t size;
+	int j, disk_pos;
+	uint32_t gendiff, bestgendiff;
+	char buf[17];
+
+	md = sc->sc_md;
+	mdi = (struct g_raid_md_sii_object *)md;
+	meta = mdi->mdio_meta;
+
+	/* Create volumes and subdisks. 
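+	 * One volume only: the SiI metadata format describes a single
+	 * volume per disk set, its RAID level encoded in meta->type and
+	 * mapped below onto g_raid's generic levels.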
*/ + sii_meta_get_name(meta, buf); + vol = g_raid_create_volume(sc, buf, -1); + vol->v_mediasize = (off_t)meta->total_sectors * 512; + if (meta->type == SII_T_RAID0) { + vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; + size = vol->v_mediasize / mdi->mdio_total_disks; + } else if (meta->type == SII_T_RAID1) { + vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; + size = vol->v_mediasize; + } else if (meta->type == SII_T_RAID01) { + vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; + size = vol->v_mediasize / (mdi->mdio_total_disks / 2); + } else if (meta->type == SII_T_CONCAT) { + if (mdi->mdio_total_disks == 1) + vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; + else + vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; + size = 0; + } else if (meta->type == SII_T_RAID5) { + vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; + size = vol->v_mediasize / (mdi->mdio_total_disks - 1); + } else if (meta->type == SII_T_JBOD) { + vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; + size = 0; + } else { + vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; + size = 0; + } + vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; + vol->v_strip_size = meta->strip_sectors * 512; //ZZZ + vol->v_disks_count = mdi->mdio_total_disks; + vol->v_sectorsize = 512; //ZZZ + for (j = 0; j < vol->v_disks_count; j++) { + sd = &vol->v_subdisks[j]; + sd->sd_offset = 0; + sd->sd_size = size; + } + g_raid_start_volume(vol); + + /* Create disk placeholders to store data for later writing. */ + for (disk_pos = 0; disk_pos < mdi->mdio_total_disks; disk_pos++) { + pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); + pd->pd_disk_pos = disk_pos; + disk = g_raid_create_disk(sc); + disk->d_md_data = (void *)pd; + disk->d_state = G_RAID_DISK_S_OFFLINE; + sd = &vol->v_subdisks[disk_pos]; + sd->sd_disk = disk; + TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); + } + + /* + * Make all disks found till the moment take their places + * in order of their generation numbers. + */ + do { + best = NULL; + bestgendiff = 0xffffffff; + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_state != G_RAID_DISK_S_NONE) + continue; + pd = disk->d_md_data; + if (pd->pd_meta == NULL) + gendiff = 0xfffffffe; + else + gendiff = meta->generation - + pd->pd_meta->generation; + if (gendiff < bestgendiff) { + best = disk; + bestgendiff = gendiff; + } + } + if (best != NULL) + g_raid_md_sii_start_disk(best); + } while (best != NULL); + + mdi->mdio_started = 1; + G_RAID_DEBUG1(0, sc, "Array started."); + g_raid_md_write_sii(md, NULL, NULL, NULL); + + /* Pickup any STALE/SPARE disks to refill array if needed. 
*/ + g_raid_md_sii_refill(sc); + + g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); + + callout_stop(&mdi->mdio_start_co); + G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount); + root_mount_rel(mdi->mdio_rootmount); + mdi->mdio_rootmount = NULL; +} + +static void +g_raid_md_sii_new_disk(struct g_raid_disk *disk) +{ + struct g_raid_softc *sc; + struct g_raid_md_object *md; + struct g_raid_md_sii_object *mdi; + struct sii_raid_conf *pdmeta; + struct g_raid_md_sii_perdisk *pd; + + sc = disk->d_softc; + md = sc->sc_md; + mdi = (struct g_raid_md_sii_object *)md; + pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; + pdmeta = pd->pd_meta; + + if (mdi->mdio_started) { + if (g_raid_md_sii_start_disk(disk)) + g_raid_md_write_sii(md, NULL, NULL, NULL); + } else { + if (mdi->mdio_meta == NULL || + ((int32_t)(pdmeta->generation - mdi->mdio_generation)) > 0) { + G_RAID_DEBUG1(1, sc, "Newer disk"); + if (mdi->mdio_meta != NULL) + free(mdi->mdio_meta, M_MD_SII); + mdi->mdio_meta = sii_meta_copy(pdmeta); + mdi->mdio_generation = mdi->mdio_meta->generation; + mdi->mdio_total_disks = sii_meta_total_disks(pdmeta); + mdi->mdio_disks_present = 1; + } else if (pdmeta->generation == mdi->mdio_generation) { + mdi->mdio_disks_present++; + G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)", + mdi->mdio_disks_present, + mdi->mdio_total_disks); + } else { + G_RAID_DEBUG1(1, sc, "Older disk"); + } + + /* If we collected all needed disks - start array. */ + if (mdi->mdio_disks_present == mdi->mdio_total_disks) + g_raid_md_sii_start(sc); + } +} + +static void +g_raid_sii_go(void *arg) +{ + struct g_raid_softc *sc; + struct g_raid_md_object *md; + struct g_raid_md_sii_object *mdi; + + sc = arg; + md = sc->sc_md; + mdi = (struct g_raid_md_sii_object *)md; + if (!mdi->mdio_started) { + G_RAID_DEBUG1(0, sc, "Force array start due to timeout."); + g_raid_event_send(sc, G_RAID_NODE_E_START, 0); + } +} + +static int +g_raid_md_create_sii(struct g_raid_md_object *md, struct g_class *mp, + struct g_geom **gp) +{ + struct g_raid_softc *sc; + struct g_raid_md_sii_object *mdi; + char name[32]; + + mdi = (struct g_raid_md_sii_object *)md; + mdi->mdio_timestamp[5] = arc4random(); + mdi->mdio_timestamp[4] = arc4random(); + mdi->mdio_timestamp[3] = arc4random(); + mdi->mdio_timestamp[2] = arc4random(); + mdi->mdio_timestamp[1] = arc4random(); + mdi->mdio_timestamp[0] = arc4random(); + mdi->mdio_location = arc4random(); + mdi->mdio_generation = 0; + snprintf(name, sizeof(name), "SiI-%02x%02x%02x%02x%02x%02x", + mdi->mdio_timestamp[5], mdi->mdio_timestamp[4], + mdi->mdio_timestamp[3], mdi->mdio_timestamp[2], + mdi->mdio_timestamp[1], mdi->mdio_timestamp[0]); + sc = g_raid_create_node(mp, name, md); + if (sc == NULL) + return (G_RAID_MD_TASTE_FAIL); + md->mdo_softc = sc; + *gp = sc->sc_geom; + return (G_RAID_MD_TASTE_NEW); +} + +static int +g_raid_md_taste_sii(struct g_raid_md_object *md, struct g_class *mp, + struct g_consumer *cp, struct g_geom **gp) +{ + struct g_consumer *rcp; + struct g_provider *pp; + struct g_raid_md_sii_object *mdi, *mdi1; + struct g_raid_softc *sc; + struct g_raid_disk *disk; + struct sii_raid_conf *meta; + struct g_raid_md_sii_perdisk *pd; + struct g_geom *geom; + int error, disk_pos, result, spare, len; + char name[32]; + uint16_t vendor; + + G_RAID_DEBUG(1, "Tasting SiI on %s", cp->provider->name); + mdi = (struct g_raid_md_sii_object *)md; + pp = cp->provider; + + /* Read metadata from device. 
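+	 * Open the consumer read-only just long enough to fetch the HBA
+	 * vendor (used by the aggressive-spare heuristic below) and the
+	 * anchor metadata sector, then drop the access again.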
*/ + meta = NULL; + spare = 0; + vendor = 0xffff; + disk_pos = 0; + if (g_access(cp, 1, 0, 0) != 0) + return (G_RAID_MD_TASTE_FAIL); + g_topology_unlock(); + len = 2; + if (pp->geom->rank == 1) + g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); + meta = sii_meta_read(cp); + g_topology_lock(); + g_access(cp, -1, 0, 0); + if (meta == NULL) { + if (g_raid_aggressive_spare) { + if (vendor == 0x1095) { + G_RAID_DEBUG(1, + "No SiI metadata, forcing spare."); + spare = 2; + goto search; + } else { + G_RAID_DEBUG(1, + "SiI vendor mismatch 0x%04x != 0x1095", + vendor); + } + } + return (G_RAID_MD_TASTE_FAIL); + } + + /* Check this disk position in obtained metadata. */ + disk_pos = sii_meta_disk_pos(meta, meta); + if (disk_pos == -1) { + G_RAID_DEBUG(1, "SiI disk position not found"); + goto fail1; + } + + /* Metadata valid. Print it. */ + g_raid_md_sii_print(meta); + G_RAID_DEBUG(1, "SiI disk position %d", disk_pos); + spare = (meta->type == SII_T_SPARE) ? 1 : 0; + +search: + /* Search for matching node. */ + sc = NULL; + mdi1 = NULL; + LIST_FOREACH(geom, &mp->geom, geom) { + sc = geom->softc; + if (sc == NULL) + continue; + if (sc->sc_stopping != 0) + continue; + if (sc->sc_md->mdo_class != md->mdo_class) + continue; + mdi1 = (struct g_raid_md_sii_object *)sc->sc_md; + if (spare) { + if (mdi1->mdio_incomplete) + break; + } else { + if (mdi1->mdio_location == meta->raid_location && + memcmp(&mdi1->mdio_timestamp, + &meta->timestamp, 6) == 0) + break; + } + } + + /* Found matching node. */ + if (geom != NULL) { + G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); + result = G_RAID_MD_TASTE_EXISTING; + + } else if (spare) { /* Not found needy node -- left for later. */ + G_RAID_DEBUG(1, "Spare is not needed at this time"); + goto fail1; + + } else { /* Not found matching node -- create one. */ + result = G_RAID_MD_TASTE_NEW; + memcpy(&mdi->mdio_timestamp, &meta->timestamp, 6); + mdi->mdio_location = meta->raid_location; + snprintf(name, sizeof(name), "SiI-%02x%02x%02x%02x%02x%02x", + mdi->mdio_timestamp[5], mdi->mdio_timestamp[4], + mdi->mdio_timestamp[3], mdi->mdio_timestamp[2], + mdi->mdio_timestamp[1], mdi->mdio_timestamp[0]); + sc = g_raid_create_node(mp, name, md); + md->mdo_softc = sc; + geom = sc->sc_geom; + callout_init(&mdi->mdio_start_co, 1); + callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, + g_raid_sii_go, sc); + mdi->mdio_rootmount = root_mount_hold("GRAID-SiI"); + G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount); + } + + rcp = g_new_consumer(geom); + g_attach(rcp, pp); + if (g_access(rcp, 1, 1, 1) != 0) + ; //goto fail1; + + g_topology_unlock(); + sx_xlock(&sc->sc_lock); + + pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); + pd->pd_meta = meta; + if (spare == 2) { + pd->pd_disk_pos = -3; + } else { + pd->pd_disk_pos = -1; + } + pd->pd_disk_size = pp->mediasize; + disk = g_raid_create_disk(sc); + disk->d_md_data = (void *)pd; + disk->d_consumer = rcp; + rcp->private = disk; + + /* Read kernel dumping information. 
*/ + disk->d_kd.offset = 0; + disk->d_kd.length = OFF_MAX; + len = sizeof(disk->d_kd); + error = g_io_getattr("GEOM::kerneldump", rcp, &len, &disk->d_kd); + if (disk->d_kd.di.dumper == NULL) + G_RAID_DEBUG1(2, sc, "Dumping not supported by %s: %d.", + rcp->provider->name, error); + + g_raid_md_sii_new_disk(disk); + + sx_xunlock(&sc->sc_lock); + g_topology_lock(); + *gp = geom; + return (result); +fail1: + free(meta, M_MD_SII); + return (G_RAID_MD_TASTE_FAIL); +} + +static int +g_raid_md_event_sii(struct g_raid_md_object *md, + struct g_raid_disk *disk, u_int event) +{ + struct g_raid_softc *sc; + struct g_raid_subdisk *sd; + struct g_raid_md_sii_object *mdi; + struct g_raid_md_sii_perdisk *pd; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_sii_object *)md; + if (disk == NULL) { + switch (event) { + case G_RAID_NODE_E_START: + if (!mdi->mdio_started) + g_raid_md_sii_start(sc); + return (0); + } + return (-1); + } + pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; + switch (event) { + case G_RAID_DISK_E_DISCONNECTED: + /* If disk was assigned, just update statuses. */ + if (pd->pd_disk_pos >= 0) { + g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); + if (disk->d_consumer) { + g_raid_kill_consumer(sc, disk->d_consumer); + disk->d_consumer = NULL; + } + TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NONE); + g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, + G_RAID_EVENT_SUBDISK); + } + } else { + /* Otherwise -- delete. */ + g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); + g_raid_destroy_disk(disk); + } + + /* Write updated metadata to all disks. */ + g_raid_md_write_sii(md, NULL, NULL, NULL); + + /* Check if anything left except placeholders. */ + if (g_raid_ndisks(sc, -1) == + g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) + g_raid_destroy_node(sc, 0); + else + g_raid_md_sii_refill(sc); + return (0); + } + return (-2); +} + +static int +g_raid_md_ctl_sii(struct g_raid_md_object *md, + struct gctl_req *req) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; + struct g_raid_md_sii_object *mdi; + struct g_raid_md_sii_perdisk *pd; + struct g_consumer *cp; + struct g_provider *pp; + char arg[16]; + const char *verb, *volname, *levelname, *diskname; + int *nargs, *force; + off_t size, sectorsize, strip; + intmax_t *sizearg, *striparg; + int numdisks, i, len, level, qual, update; + int error; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_sii_object *)md; + verb = gctl_get_param(req, "verb", NULL); + nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); + error = 0; + if (strcmp(verb, "label") == 0) { + + if (*nargs < 4) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + volname = gctl_get_asciiparam(req, "arg1"); + if (volname == NULL) { + gctl_error(req, "No volume name."); + return (-2); + } + levelname = gctl_get_asciiparam(req, "arg2"); + if (levelname == NULL) { + gctl_error(req, "No RAID level."); + return (-3); + } + if (g_raid_volume_str2level(levelname, &level, &qual)) { + gctl_error(req, "Unknown RAID level '%s'.", levelname); + return (-4); + } + numdisks = *nargs - 3; + force = gctl_get_paraml(req, "force", sizeof(*force)); + if (!g_raid_md_sii_supported(level, qual, numdisks, + force ? *force : 0)) { + gctl_error(req, "Unsupported RAID level " + "(0x%02x/0x%02x), or number of disks (%d).", + level, qual, numdisks); + return (-5); + } + + /* Search for disks, connect them and probe. 
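+		 * As in the Promise module: open each named provider, while
+		 * "NONE" creates a consumer-less placeholder that will be
+		 * left OFFLINE in the volume.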
*/
+		size = 0x7fffffffffffffffllu;
+		sectorsize = 0;
+		for (i = 0; i < numdisks; i++) {
+			snprintf(arg, sizeof(arg), "arg%d", i + 3);
+			diskname = gctl_get_asciiparam(req, arg);
+			if (diskname == NULL) {
+				gctl_error(req, "No disk name (%s).", arg);
+				error = -6;
+				break;
+			}
+			if (strcmp(diskname, "NONE") == 0) {
+				cp = NULL;
+				pp = NULL;
+			} else {
+				g_topology_lock();
+				cp = g_raid_open_consumer(sc, diskname);
+				if (cp == NULL) {
+					gctl_error(req, "Can't open '%s'.",
+					    diskname);
+					g_topology_unlock();
+					error = -7;
+					break;
+				}
+				pp = cp->provider;
+			}
+			pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO);
+			pd->pd_disk_pos = i;
+			disk = g_raid_create_disk(sc);
+			disk->d_md_data = (void *)pd;
+			disk->d_consumer = cp;
+			if (cp == NULL)
+				continue;
+			cp->private = disk;
+			g_topology_unlock();
+
+			/* Read kernel dumping information. */
+			disk->d_kd.offset = 0;
+			disk->d_kd.length = OFF_MAX;
+			len = sizeof(disk->d_kd);
+			g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd);
+			if (disk->d_kd.di.dumper == NULL)
+				G_RAID_DEBUG1(2, sc,
+				    "Dumping not supported by %s.",
+				    cp->provider->name);
+
+			pd->pd_disk_size = pp->mediasize;
+			if (size > pp->mediasize)
+				size = pp->mediasize;
+			if (sectorsize < pp->sectorsize)
+				sectorsize = pp->sectorsize;
+		}
+		if (error != 0)
+			return (error);
+
+		/* Reserve space for metadata. */
+		size -= 0x800 * sectorsize;
+
+		/* Handle size argument. */
+		len = sizeof(*sizearg);
+		sizearg = gctl_get_param(req, "size", &len);
+		if (sizearg != NULL && len == sizeof(*sizearg) &&
+		    *sizearg > 0) {
+			if (*sizearg > size) {
+				gctl_error(req, "Size too big %lld > %lld.",
+				    (long long)*sizearg, (long long)size);
+				return (-9);
+			}
+			size = *sizearg;
+		}
+
+		/* Handle strip argument. */
+		strip = 131072;
+		len = sizeof(*striparg);
+		striparg = gctl_get_param(req, "strip", &len);
+		if (striparg != NULL && len == sizeof(*striparg) &&
+		    *striparg > 0) {
+			if (*striparg < sectorsize) {
+				gctl_error(req, "Strip size too small.");
+				return (-10);
+			}
+			if (*striparg % sectorsize != 0) {
+				gctl_error(req, "Incorrect strip size.");
+				return (-11);
+			}
+			if (*striparg > 65535 * sectorsize) {
+				gctl_error(req, "Strip size too big.");
+				return (-12);
+			}
+			strip = *striparg;
+		}
+
+		/* Round size down to strip or sector. */
+		if (level == G_RAID_VOLUME_RL_RAID1)
+			size -= (size % sectorsize);
+		else if (level == G_RAID_VOLUME_RL_RAID1E &&
+		    (numdisks & 1) != 0)
+			size -= (size % (2 * strip));
+		else
+			size -= (size % strip);
+		if (size <= 0) {
+			gctl_error(req, "Size too small.");
+			return (-13);
+		}
+		if (size > 0xffffffffffffllu * sectorsize) {
+			gctl_error(req, "Size too big.");
+			return (-14);
+		}
+
+		/* We have all we need, create things: volume, ... */
+		mdi->mdio_total_disks = numdisks;
+		mdi->mdio_started = 1;
+		vol = g_raid_create_volume(sc, volname, -1);
+		vol->v_md_data = (void *)(intptr_t)0;
+		vol->v_raid_level = level;
+		vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE;
+		vol->v_strip_size = strip;
+		vol->v_disks_count = numdisks;
+		if (level == G_RAID_VOLUME_RL_RAID0 ||
+		    level == G_RAID_VOLUME_RL_CONCAT ||
+		    level == G_RAID_VOLUME_RL_SINGLE)
+			vol->v_mediasize = size * numdisks;
+		else if (level == G_RAID_VOLUME_RL_RAID1)
+			vol->v_mediasize = size;
+		else if (level == G_RAID_VOLUME_RL_RAID5)
+			vol->v_mediasize = size * (numdisks - 1);
+		else { /* RAID1E */
+			vol->v_mediasize = ((size * numdisks) / strip / 2) *
+			    strip;
+		}
+		vol->v_sectorsize = sectorsize;
+		g_raid_start_volume(vol);
+
+		/* , and subdisks. 
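+		 * Disks with a real consumer become ACTIVE subdisks right
+		 * away; the "NONE" placeholders stay OFFLINE until a disk
+		 * is inserted for them.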
*/ + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; + sd = &vol->v_subdisks[pd->pd_disk_pos]; + sd->sd_disk = disk; + sd->sd_offset = 0; + sd->sd_size = size; + TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); + if (sd->sd_disk->d_consumer != NULL) { + g_raid_change_disk_state(disk, + G_RAID_DISK_S_ACTIVE); + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_ACTIVE); + g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, + G_RAID_EVENT_SUBDISK); + } else { + g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); + } + } + + /* Write metadata based on created entities. */ + G_RAID_DEBUG1(0, sc, "Array started."); + g_raid_md_write_sii(md, NULL, NULL, NULL); + + /* Pickup any STALE/SPARE disks to refill array if needed. */ + g_raid_md_sii_refill(sc); + + g_raid_event_send(vol, G_RAID_VOLUME_E_START, + G_RAID_EVENT_VOLUME); + return (0); + } + if (strcmp(verb, "delete") == 0) { + + /* Check if some volume is still open. */ + force = gctl_get_paraml(req, "force", sizeof(*force)); + if (force != NULL && *force == 0 && + g_raid_nopens(sc) != 0) { + gctl_error(req, "Some volume is still open."); + return (-4); + } + + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_consumer) + sii_meta_erase(disk->d_consumer); + } + g_raid_destroy_node(sc, 0); + return (0); + } + if (strcmp(verb, "remove") == 0 || + strcmp(verb, "fail") == 0) { + if (*nargs < 2) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + for (i = 1; i < *nargs; i++) { + snprintf(arg, sizeof(arg), "arg%d", i); + diskname = gctl_get_asciiparam(req, arg); + if (diskname == NULL) { + gctl_error(req, "No disk name (%s).", arg); + error = -2; + break; + } + if (strncmp(diskname, "/dev/", 5) == 0) + diskname += 5; + + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_consumer != NULL && + disk->d_consumer->provider != NULL && + strcmp(disk->d_consumer->provider->name, + diskname) == 0) + break; + } + if (disk == NULL) { + gctl_error(req, "Disk '%s' not found.", + diskname); + error = -3; + break; + } + + if (strcmp(verb, "fail") == 0) { + g_raid_md_fail_disk_sii(md, NULL, disk); + continue; + } + + pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; + + /* Erase metadata on deleting disk. */ + sii_meta_erase(disk->d_consumer); + + /* If disk was assigned, just update statuses. */ + if (pd->pd_disk_pos >= 0) { + g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); + if (disk->d_consumer) { + g_raid_kill_consumer(sc, disk->d_consumer); + disk->d_consumer = NULL; + } + TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NONE); + g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, + G_RAID_EVENT_SUBDISK); + } + } else { + /* Otherwise -- delete. */ + g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); + g_raid_destroy_disk(disk); + } + } + + /* Write updated metadata to remaining disks. */ + g_raid_md_write_sii(md, NULL, NULL, NULL); + + /* Check if anything left except placeholders. */ + if (g_raid_ndisks(sc, -1) == + g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) + g_raid_destroy_node(sc, 0); + else + g_raid_md_sii_refill(sc); + return (error); + } + if (strcmp(verb, "insert") == 0) { + if (*nargs < 2) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + update = 0; + for (i = 1; i < *nargs; i++) { + /* Get disk name. 
*/ + snprintf(arg, sizeof(arg), "arg%d", i); + diskname = gctl_get_asciiparam(req, arg); + if (diskname == NULL) { + gctl_error(req, "No disk name (%s).", arg); + error = -3; + break; + } + + /* Try to find provider with specified name. */ + g_topology_lock(); + cp = g_raid_open_consumer(sc, diskname); + if (cp == NULL) { + gctl_error(req, "Can't open disk '%s'.", + diskname); + g_topology_unlock(); + error = -4; + break; + } + pp = cp->provider; + + pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); + pd->pd_disk_pos = -3; + pd->pd_disk_size = pp->mediasize; + + disk = g_raid_create_disk(sc); + disk->d_consumer = cp; + disk->d_consumer->private = disk; + disk->d_md_data = (void *)pd; + cp->private = disk; + g_topology_unlock(); + + /* Read kernel dumping information. */ + disk->d_kd.offset = 0; + disk->d_kd.length = OFF_MAX; + len = sizeof(disk->d_kd); + g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); + if (disk->d_kd.di.dumper == NULL) + G_RAID_DEBUG1(2, sc, + "Dumping not supported by %s.", + cp->provider->name); + + /* Welcome the "new" disk. */ + update += g_raid_md_sii_start_disk(disk); + if (disk->d_state == G_RAID_DISK_S_SPARE) { + sii_meta_write_spare(cp); + g_raid_destroy_disk(disk); + } else if (disk->d_state != G_RAID_DISK_S_ACTIVE) { + gctl_error(req, "Disk '%s' doesn't fit.", + diskname); + g_raid_destroy_disk(disk); + error = -8; + break; + } + } + + /* Write new metadata if we changed something. */ + if (update) + g_raid_md_write_sii(md, NULL, NULL, NULL); + return (error); + } + gctl_error(req, "Command '%s' is not supported.", verb); + return (-100); +} + +static int +g_raid_md_write_sii(struct g_raid_md_object *md, struct g_raid_volume *tvol, + struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; + struct g_raid_md_sii_object *mdi; + struct g_raid_md_sii_perdisk *pd; + struct sii_raid_conf *meta; + int i; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_sii_object *)md; + + if (sc->sc_stopping == G_RAID_DESTROY_HARD) + return (0); + + /* Bump generation. Newly written metadata may differ from previous. */ + mdi->mdio_generation++; + + /* There is only one volume. */ + vol = TAILQ_FIRST(&sc->sc_volumes); + + /* Fill global fields. 
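+	 * The vendor_id below, 0x1095, is Silicon Image's PCI vendor ID,
+	 * and this module always writes metadata revision 2.0.  For scale
+	 * (hypothetical numbers): a 1 TB volume with 512-byte sectors
+	 * stores total_sectors of about 1.95 billion.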
*/ + meta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK | M_ZERO); + if (mdi->mdio_meta) + memcpy(meta, mdi->mdio_meta, sizeof(*meta)); + meta->total_sectors = vol->v_mediasize / vol->v_sectorsize; + meta->vendor_id = 0x1095; + meta->version_minor = 0; + meta->version_major = 2; + memcpy(&meta->timestamp, &mdi->mdio_timestamp, 6); + meta->strip_sectors = vol->v_strip_size / vol->v_sectorsize; + if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) { + meta->type = SII_T_RAID0; + meta->raid0_disks = vol->v_disks_count; + meta->raid1_disks = 0xff; + } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) { + meta->type = SII_T_RAID1; + meta->raid0_disks = 0xff; + meta->raid1_disks = vol->v_disks_count; + } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { + meta->type = SII_T_RAID01; + meta->raid0_disks = vol->v_disks_count / 2; + meta->raid1_disks = 2; + } else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT || + vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) { + meta->type = SII_T_JBOD; + meta->raid0_disks = vol->v_disks_count; + meta->raid1_disks = 0xff; + } else { + meta->type = SII_T_RAID5; + meta->raid0_disks = vol->v_disks_count; + meta->raid1_disks = 0xff; + } + meta->generation = mdi->mdio_generation; + meta->raid_status = vol->v_dirty ? SII_S_ONLINE : SII_S_AVAILABLE; + for (i = 0; i < vol->v_disks_count; i++) { + sd = &vol->v_subdisks[i]; + if (sd->sd_state == G_RAID_SUBDISK_S_STALE || + sd->sd_state == G_RAID_SUBDISK_S_RESYNC) + meta->raid_status = SII_S_ONLINE; + } + meta->raid_location = mdi->mdio_location; + sii_meta_put_name(meta, vol->v_name); + + /* We are done. Print meta data and store them to disks. */ + if (mdi->mdio_meta != NULL) + free(mdi->mdio_meta, M_MD_SII); + mdi->mdio_meta = meta; + i = 0; + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; + if (disk->d_state != G_RAID_DISK_S_ACTIVE) + continue; + if (pd->pd_meta != NULL) { + free(pd->pd_meta, M_MD_SII); + pd->pd_meta = NULL; + } + pd->pd_meta = sii_meta_copy(meta); + if ((sd = TAILQ_FIRST(&disk->d_subdisks)) != NULL) { + if (sd->sd_state < G_RAID_SUBDISK_S_NEW) + pd->pd_meta->disk_status = SII_S_DROPPED; + else if (sd->sd_state < G_RAID_SUBDISK_S_STALE) { + pd->pd_meta->disk_status = SII_S_REBUILD; + pd->pd_meta->rebuild_lba = + sd->sd_rebuild_pos / vol->v_sectorsize; + } else + pd->pd_meta->disk_status = SII_S_CURRENT; + if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) { + pd->pd_meta->disk_number = sd->sd_pos; + pd->pd_meta->raid0_ident = 0xff; + pd->pd_meta->raid1_ident = 0; + } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { + pd->pd_meta->disk_number = sd->sd_pos / meta->raid1_disks; + pd->pd_meta->raid0_ident = sd->sd_pos % meta->raid1_disks; + pd->pd_meta->raid1_ident = sd->sd_pos / meta->raid1_disks; + } else { + pd->pd_meta->disk_number = sd->sd_pos; + pd->pd_meta->raid0_ident = 0; + pd->pd_meta->raid1_ident = 0xff; + } + } + G_RAID_DEBUG(1, "Writing SiI metadata to %s", + g_raid_get_diskname(disk)); + g_raid_md_sii_print(pd->pd_meta); + sii_meta_write(disk->d_consumer, pd->pd_meta); + } + return (0); +} + +static int +g_raid_md_fail_disk_sii(struct g_raid_md_object *md, + struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) +{ + struct g_raid_softc *sc; + struct g_raid_md_sii_object *mdi; + struct g_raid_md_sii_perdisk *pd; + struct g_raid_subdisk *sd; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_sii_object *)md; + pd = (struct g_raid_md_sii_perdisk *)tdisk->d_md_data; + + /* We can't fail disk that is not a part of array now. 
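+	 * A negative pd_disk_pos means the disk owns no slot in the array
+	 * (compare the "insert" handler above, which parks new disks at
+	 * position -3 until they are adopted), so there is nothing to fail.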
+	 */
+	if (pd->pd_disk_pos < 0)
+		return (-1);
+
+	/*
+	 * Mark disk as failed in metadata and try to write that metadata
+	 * to the disk itself to prevent its later resurrection as STALE.
+	 */
+	if (tdisk->d_consumer) {
+		if (pd->pd_meta) {
+			pd->pd_meta->disk_status = SII_S_REMOVED;
+			sii_meta_write(tdisk->d_consumer, pd->pd_meta);
+		} else
+			sii_meta_erase(tdisk->d_consumer);
+	}
+
+	/* Change states. */
+	g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED);
+	TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) {
+		g_raid_change_subdisk_state(sd,
+		    G_RAID_SUBDISK_S_FAILED);
+		g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED,
+		    G_RAID_EVENT_SUBDISK);
+	}
+
+	/* Write updated metadata to remaining disks. */
+	g_raid_md_write_sii(md, NULL, NULL, tdisk);
+
+	/* Check if anything left except placeholders. */
+	if (g_raid_ndisks(sc, -1) ==
+	    g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE))
+		g_raid_destroy_node(sc, 0);
+	else
+		g_raid_md_sii_refill(sc);
+	return (0);
+}
+
+static int
+g_raid_md_free_disk_sii(struct g_raid_md_object *md,
+    struct g_raid_disk *disk)
+{
+	struct g_raid_md_sii_perdisk *pd;
+
+	pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data;
+	if (pd->pd_meta != NULL) {
+		free(pd->pd_meta, M_MD_SII);
+		pd->pd_meta = NULL;
+	}
+	free(pd, M_MD_SII);
+	disk->d_md_data = NULL;
+	return (0);
+}
+
+static int
+g_raid_md_free_sii(struct g_raid_md_object *md)
+{
+	struct g_raid_md_sii_object *mdi;
+
+	mdi = (struct g_raid_md_sii_object *)md;
+	if (!mdi->mdio_started) {
+		mdi->mdio_started = 0;
+		callout_stop(&mdi->mdio_start_co);
+		G_RAID_DEBUG1(1, md->mdo_softc,
+		    "root_mount_rel %p", mdi->mdio_rootmount);
+		root_mount_rel(mdi->mdio_rootmount);
+		mdi->mdio_rootmount = NULL;
+	}
+	if (mdi->mdio_meta != NULL) {
+		free(mdi->mdio_meta, M_MD_SII);
+		mdi->mdio_meta = NULL;
+	}
+	return (0);
+}
+
+G_RAID_MD_DECLARE(g_raid_md_sii);
diff --git a/sys/geom/raid/tr_concat.c b/sys/geom/raid/tr_concat.c
new file mode 100644
index 0000000..c5f2913
--- /dev/null
+++ b/sys/geom/raid/tr_concat.c
@@ -0,0 +1,343 @@
+/*-
+ * Copyright (c) 2010 Alexander Motin
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "geom/raid/g_raid.h" +#include "g_raid_tr_if.h" + +static MALLOC_DEFINE(M_TR_CONCAT, "tr_concat_data", "GEOM_RAID CONCAT data"); + +struct g_raid_tr_concat_object { + struct g_raid_tr_object trso_base; + int trso_starting; + int trso_stopped; +}; + +static g_raid_tr_taste_t g_raid_tr_taste_concat; +static g_raid_tr_event_t g_raid_tr_event_concat; +static g_raid_tr_start_t g_raid_tr_start_concat; +static g_raid_tr_stop_t g_raid_tr_stop_concat; +static g_raid_tr_iostart_t g_raid_tr_iostart_concat; +static g_raid_tr_iodone_t g_raid_tr_iodone_concat; +static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_concat; +static g_raid_tr_free_t g_raid_tr_free_concat; + +static kobj_method_t g_raid_tr_concat_methods[] = { + KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_concat), + KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_concat), + KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_concat), + KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_concat), + KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_concat), + KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_concat), + KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_concat), + KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_concat), + { 0, 0 } +}; + +static struct g_raid_tr_class g_raid_tr_concat_class = { + "CONCAT", + g_raid_tr_concat_methods, + sizeof(struct g_raid_tr_concat_object), + .trc_priority = 50 +}; + +static int +g_raid_tr_taste_concat(struct g_raid_tr_object *tr, struct g_raid_volume *volume) +{ + struct g_raid_tr_concat_object *trs; + + trs = (struct g_raid_tr_concat_object *)tr; + if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_SINGLE && + tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_CONCAT && + !(tr->tro_volume->v_disks_count == 1 && + tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_UNKNOWN)) + return (G_RAID_TR_TASTE_FAIL); + trs->trso_starting = 1; + return (G_RAID_TR_TASTE_SUCCEED); +} + +static int +g_raid_tr_update_state_concat(struct g_raid_volume *vol) +{ + struct g_raid_tr_concat_object *trs; + struct g_raid_softc *sc; + off_t size; + u_int s; + int i, n, f; + + sc = vol->v_softc; + trs = (struct g_raid_tr_concat_object *)vol->v_tr; + if (trs->trso_stopped) + s = G_RAID_VOLUME_S_STOPPED; + else if (trs->trso_starting) + s = G_RAID_VOLUME_S_STARTING; + else { + n = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); + f = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_FAILED); + if (n + f == vol->v_disks_count) { + if (f == 0) + s = G_RAID_VOLUME_S_OPTIMAL; + else + s = G_RAID_VOLUME_S_SUBOPTIMAL; + } else + s = G_RAID_VOLUME_S_BROKEN; + } + if (s != vol->v_state) { + + /* + * Some metadata modules may not know CONCAT volume + * mediasize until all disks connected. Recalculate. + */ + if (G_RAID_VOLUME_S_ALIVE(s) && + !G_RAID_VOLUME_S_ALIVE(vol->v_state)) { + size = 0; + for (i = 0; i < vol->v_disks_count; i++) { + if (vol->v_subdisks[i].sd_state != + G_RAID_SUBDISK_S_NONE) + size += vol->v_subdisks[i].sd_size; + } + vol->v_mediasize = size; + } + + g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ? 
+ G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, + G_RAID_EVENT_VOLUME); + g_raid_change_volume_state(vol, s); + if (!trs->trso_starting && !trs->trso_stopped) + g_raid_write_metadata(sc, vol, NULL, NULL); + } + return (0); +} + +static int +g_raid_tr_event_concat(struct g_raid_tr_object *tr, + struct g_raid_subdisk *sd, u_int event) +{ + struct g_raid_tr_concat_object *trs; + struct g_raid_softc *sc; + struct g_raid_volume *vol; + int state; + + trs = (struct g_raid_tr_concat_object *)tr; + vol = tr->tro_volume; + sc = vol->v_softc; + + state = sd->sd_state; + if (state != G_RAID_SUBDISK_S_NONE && + state != G_RAID_SUBDISK_S_FAILED && + state != G_RAID_SUBDISK_S_ACTIVE) { + G_RAID_DEBUG1(1, sc, + "Promote subdisk %s:%d from %s to ACTIVE.", + vol->v_name, sd->sd_pos, + g_raid_subdisk_state2str(sd->sd_state)); + g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); + } + if (state != sd->sd_state && + !trs->trso_starting && !trs->trso_stopped) + g_raid_write_metadata(sc, vol, sd, NULL); + g_raid_tr_update_state_concat(vol); + return (0); +} + +static int +g_raid_tr_start_concat(struct g_raid_tr_object *tr) +{ + struct g_raid_tr_concat_object *trs; + struct g_raid_volume *vol; + + trs = (struct g_raid_tr_concat_object *)tr; + vol = tr->tro_volume; + trs->trso_starting = 0; + g_raid_tr_update_state_concat(vol); + return (0); +} + +static int +g_raid_tr_stop_concat(struct g_raid_tr_object *tr) +{ + struct g_raid_tr_concat_object *trs; + struct g_raid_volume *vol; + + trs = (struct g_raid_tr_concat_object *)tr; + vol = tr->tro_volume; + trs->trso_starting = 0; + trs->trso_stopped = 1; + g_raid_tr_update_state_concat(vol); + return (0); +} + +static void +g_raid_tr_iostart_concat(struct g_raid_tr_object *tr, struct bio *bp) +{ + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct bio_queue_head queue; + struct bio *cbp; + char *addr; + off_t offset, length, remain; + u_int no; + + vol = tr->tro_volume; + if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL && + vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL) { + g_raid_iodone(bp, EIO); + return; + } + if (bp->bio_cmd == BIO_FLUSH) { + g_raid_tr_flush_common(tr, bp); + return; + } + + offset = bp->bio_offset; + remain = bp->bio_length; + addr = bp->bio_data; + no = 0; + while (no < vol->v_disks_count && + offset >= vol->v_subdisks[no].sd_size) { + offset -= vol->v_subdisks[no].sd_size; + no++; + } + KASSERT(no < vol->v_disks_count, + ("Request starts after volume end (%ju)", bp->bio_offset)); + bioq_init(&queue); + do { + sd = &vol->v_subdisks[no]; + length = MIN(sd->sd_size - offset, remain); + cbp = g_clone_bio(bp); + if (cbp == NULL) + goto failure; + cbp->bio_offset = offset; + cbp->bio_data = addr; + cbp->bio_length = length; + cbp->bio_caller1 = sd; + bioq_insert_tail(&queue, cbp); + remain -= length; + addr += length; + offset = 0; + no++; + KASSERT(no < vol->v_disks_count || remain == 0, + ("Request ends after volume end (%ju, %ju)", + bp->bio_offset, bp->bio_length)); + } while (remain > 0); + for (cbp = bioq_first(&queue); cbp != NULL; + cbp = bioq_first(&queue)) { + bioq_remove(&queue, cbp); + sd = cbp->bio_caller1; + cbp->bio_caller1 = NULL; + g_raid_subdisk_iostart(sd, cbp); + } + return; +failure: + for (cbp = bioq_first(&queue); cbp != NULL; + cbp = bioq_first(&queue)) { + bioq_remove(&queue, cbp); + g_destroy_bio(cbp); + } + if (bp->bio_error == 0) + bp->bio_error = ENOMEM; + g_raid_iodone(bp, bp->bio_error); +} + +static int +g_raid_tr_kerneldump_concat(struct g_raid_tr_object *tr, + void *virtual, vm_offset_t physical, off_t 
boffset, size_t blength) +{ + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + char *addr; + off_t offset, length, remain; + int error, no; + + vol = tr->tro_volume; + if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL) + return (ENXIO); + + offset = boffset; + remain = blength; + addr = virtual; + no = 0; + while (no < vol->v_disks_count && + offset >= vol->v_subdisks[no].sd_size) { + offset -= vol->v_subdisks[no].sd_size; + no++; + } + KASSERT(no < vol->v_disks_count, + ("Request starts after volume end (%ju)", boffset)); + do { + sd = &vol->v_subdisks[no]; + length = MIN(sd->sd_size - offset, remain); + error = g_raid_subdisk_kerneldump(&vol->v_subdisks[no], + addr, 0, offset, length); + if (error != 0) + return (error); + remain -= length; + addr += length; + offset = 0; + no++; + KASSERT(no < vol->v_disks_count || remain == 0, + ("Request ends after volume end (%ju, %zu)", + boffset, blength)); + } while (remain > 0); + return (0); +} + +static void +g_raid_tr_iodone_concat(struct g_raid_tr_object *tr, + struct g_raid_subdisk *sd,struct bio *bp) +{ + struct bio *pbp; + + pbp = bp->bio_parent; + if (pbp->bio_error == 0) + pbp->bio_error = bp->bio_error; + g_destroy_bio(bp); + pbp->bio_inbed++; + if (pbp->bio_children == pbp->bio_inbed) { + pbp->bio_completed = pbp->bio_length; + g_raid_iodone(pbp, bp->bio_error); + } +} + +static int +g_raid_tr_free_concat(struct g_raid_tr_object *tr) +{ + + return (0); +} + +G_RAID_TR_DECLARE(g_raid_tr_concat); diff --git a/sys/geom/raid/tr_raid0.c b/sys/geom/raid/tr_raid0.c new file mode 100644 index 0000000..0fb45a6 --- /dev/null +++ b/sys/geom/raid/tr_raid0.c @@ -0,0 +1,326 @@ +/*- + * Copyright (c) 2010 Alexander Motin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "geom/raid/g_raid.h" +#include "g_raid_tr_if.h" + +static MALLOC_DEFINE(M_TR_RAID0, "tr_raid0_data", "GEOM_RAID RAID0 data"); + +struct g_raid_tr_raid0_object { + struct g_raid_tr_object trso_base; + int trso_starting; + int trso_stopped; +}; + +static g_raid_tr_taste_t g_raid_tr_taste_raid0; +static g_raid_tr_event_t g_raid_tr_event_raid0; +static g_raid_tr_start_t g_raid_tr_start_raid0; +static g_raid_tr_stop_t g_raid_tr_stop_raid0; +static g_raid_tr_iostart_t g_raid_tr_iostart_raid0; +static g_raid_tr_iodone_t g_raid_tr_iodone_raid0; +static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid0; +static g_raid_tr_free_t g_raid_tr_free_raid0; + +static kobj_method_t g_raid_tr_raid0_methods[] = { + KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid0), + KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid0), + KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid0), + KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid0), + KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid0), + KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid0), + KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid0), + KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid0), + { 0, 0 } +}; + +static struct g_raid_tr_class g_raid_tr_raid0_class = { + "RAID0", + g_raid_tr_raid0_methods, + sizeof(struct g_raid_tr_raid0_object), + .trc_priority = 100 +}; + +static int +g_raid_tr_taste_raid0(struct g_raid_tr_object *tr, struct g_raid_volume *volume) +{ + struct g_raid_tr_raid0_object *trs; + + trs = (struct g_raid_tr_raid0_object *)tr; + if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID0 || + tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_NONE) + return (G_RAID_TR_TASTE_FAIL); + trs->trso_starting = 1; + return (G_RAID_TR_TASTE_SUCCEED); +} + +static int +g_raid_tr_update_state_raid0(struct g_raid_volume *vol) +{ + struct g_raid_tr_raid0_object *trs; + struct g_raid_softc *sc; + u_int s; + int n, f; + + sc = vol->v_softc; + trs = (struct g_raid_tr_raid0_object *)vol->v_tr; + if (trs->trso_stopped) + s = G_RAID_VOLUME_S_STOPPED; + else if (trs->trso_starting) + s = G_RAID_VOLUME_S_STARTING; + else { + n = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); + f = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_FAILED); + if (n + f == vol->v_disks_count) { + if (f == 0) + s = G_RAID_VOLUME_S_OPTIMAL; + else + s = G_RAID_VOLUME_S_SUBOPTIMAL; + } else + s = G_RAID_VOLUME_S_BROKEN; + } + if (s != vol->v_state) { + g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ? 
+ G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, + G_RAID_EVENT_VOLUME); + g_raid_change_volume_state(vol, s); + if (!trs->trso_starting && !trs->trso_stopped) + g_raid_write_metadata(sc, vol, NULL, NULL); + } + return (0); +} + +static int +g_raid_tr_event_raid0(struct g_raid_tr_object *tr, + struct g_raid_subdisk *sd, u_int event) +{ + struct g_raid_tr_raid0_object *trs; + struct g_raid_softc *sc; + struct g_raid_volume *vol; + int state; + + trs = (struct g_raid_tr_raid0_object *)tr; + vol = tr->tro_volume; + sc = vol->v_softc; + + state = sd->sd_state; + if (state != G_RAID_SUBDISK_S_NONE && + state != G_RAID_SUBDISK_S_FAILED && + state != G_RAID_SUBDISK_S_ACTIVE) { + G_RAID_DEBUG1(1, sc, + "Promote subdisk %s:%d from %s to ACTIVE.", + vol->v_name, sd->sd_pos, + g_raid_subdisk_state2str(sd->sd_state)); + g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); + } + if (state != sd->sd_state && + !trs->trso_starting && !trs->trso_stopped) + g_raid_write_metadata(sc, vol, sd, NULL); + g_raid_tr_update_state_raid0(vol); + return (0); +} + +static int +g_raid_tr_start_raid0(struct g_raid_tr_object *tr) +{ + struct g_raid_tr_raid0_object *trs; + struct g_raid_volume *vol; + + trs = (struct g_raid_tr_raid0_object *)tr; + vol = tr->tro_volume; + trs->trso_starting = 0; + g_raid_tr_update_state_raid0(vol); + return (0); +} + +static int +g_raid_tr_stop_raid0(struct g_raid_tr_object *tr) +{ + struct g_raid_tr_raid0_object *trs; + struct g_raid_volume *vol; + + trs = (struct g_raid_tr_raid0_object *)tr; + vol = tr->tro_volume; + trs->trso_starting = 0; + trs->trso_stopped = 1; + g_raid_tr_update_state_raid0(vol); + return (0); +} + +static void +g_raid_tr_iostart_raid0(struct g_raid_tr_object *tr, struct bio *bp) +{ + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct bio_queue_head queue; + struct bio *cbp; + char *addr; + off_t offset, start, length, nstripe, remain; + u_int no, strip_size; + + vol = tr->tro_volume; + if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL && + vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL) { + g_raid_iodone(bp, EIO); + return; + } + if (bp->bio_cmd == BIO_FLUSH) { + g_raid_tr_flush_common(tr, bp); + return; + } + addr = bp->bio_data; + strip_size = vol->v_strip_size; + + /* Stripe number. */ + nstripe = bp->bio_offset / strip_size; + /* Start position in stripe. */ + start = bp->bio_offset % strip_size; + /* Disk number. */ + no = nstripe % vol->v_disks_count; + /* Stripe start position in disk. */ + offset = (nstripe / vol->v_disks_count) * strip_size; + /* Length of data to operate. 
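+	 * As a worked example with hypothetical numbers: a request at
+	 * offset 300 KB on a 3-disk volume with a 128 KB strip has
+	 * nstripe = 2 and start = 44 KB, so it begins on disk 2 % 3 = 2
+	 * at disk offset 0; a long request then walks disks 2, 0, 1, ...,
+	 * advancing the disk offset by one strip each time it wraps back
+	 * to disk 0.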
+	 */
+	remain = bp->bio_length;
+
+	bioq_init(&queue);
+	do {
+		length = MIN(strip_size - start, remain);
+		cbp = g_clone_bio(bp);
+		if (cbp == NULL)
+			goto failure;
+		cbp->bio_offset = offset + start;
+		cbp->bio_data = addr;
+		cbp->bio_length = length;
+		cbp->bio_caller1 = &vol->v_subdisks[no];
+		bioq_insert_tail(&queue, cbp);
+		if (++no >= vol->v_disks_count) {
+			no = 0;
+			offset += strip_size;
+		}
+		remain -= length;
+		addr += length;
+		start = 0;
+	} while (remain > 0);
+	for (cbp = bioq_first(&queue); cbp != NULL;
+	    cbp = bioq_first(&queue)) {
+		bioq_remove(&queue, cbp);
+		sd = cbp->bio_caller1;
+		cbp->bio_caller1 = NULL;
+		g_raid_subdisk_iostart(sd, cbp);
+	}
+	return;
+failure:
+	for (cbp = bioq_first(&queue); cbp != NULL;
+	    cbp = bioq_first(&queue)) {
+		bioq_remove(&queue, cbp);
+		g_destroy_bio(cbp);
+	}
+	if (bp->bio_error == 0)
+		bp->bio_error = ENOMEM;
+	g_raid_iodone(bp, bp->bio_error);
+}
+
+static int
+g_raid_tr_kerneldump_raid0(struct g_raid_tr_object *tr,
+    void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
+{
+	struct g_raid_volume *vol;
+	char *addr;
+	off_t offset, start, length, nstripe, remain;
+	u_int no, strip_size;
+	int error;
+
+	vol = tr->tro_volume;
+	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL)
+		return (ENXIO);
+	addr = virtual;
+	strip_size = vol->v_strip_size;
+
+	/* Stripe number. */
+	nstripe = boffset / strip_size;
+	/* Start position in stripe. */
+	start = boffset % strip_size;
+	/* Disk number. */
+	no = nstripe % vol->v_disks_count;
+	/* Stripe start position in disk. */
+	offset = (nstripe / vol->v_disks_count) * strip_size;
+	/* Length of data to operate. */
+	remain = blength;
+
+	do {
+		length = MIN(strip_size - start, remain);
+		error = g_raid_subdisk_kerneldump(&vol->v_subdisks[no],
+		    addr, 0, offset + start, length);
+		if (error != 0)
+			return (error);
+		if (++no >= vol->v_disks_count) {
+			no = 0;
+			offset += strip_size;
+		}
+		remain -= length;
+		addr += length;
+		start = 0;
+	} while (remain > 0);
+	return (0);
+}
+
+static void
+g_raid_tr_iodone_raid0(struct g_raid_tr_object *tr,
+    struct g_raid_subdisk *sd, struct bio *bp)
+{
+	struct bio *pbp;
+
+	pbp = bp->bio_parent;
+	if (pbp->bio_error == 0)
+		pbp->bio_error = bp->bio_error;
+	g_destroy_bio(bp);
+	pbp->bio_inbed++;
+	if (pbp->bio_children == pbp->bio_inbed) {
+		pbp->bio_completed = pbp->bio_length;
+		g_raid_iodone(pbp, bp->bio_error);
+	}
+}
+
+static int
+g_raid_tr_free_raid0(struct g_raid_tr_object *tr)
+{
+
+	return (0);
+}
+
+G_RAID_TR_DECLARE(g_raid_tr_raid0);
diff --git a/sys/geom/raid/tr_raid1.c b/sys/geom/raid/tr_raid1.c
new file mode 100644
index 0000000..b5e4953
--- /dev/null
+++ b/sys/geom/raid/tr_raid1.c
@@ -0,0 +1,993 @@
+/*-
+ * Copyright (c) 2010 Alexander Motin
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "geom/raid/g_raid.h"
+#include "g_raid_tr_if.h"
+
+SYSCTL_DECL(_kern_geom_raid);
+SYSCTL_NODE(_kern_geom_raid, OID_AUTO, raid1, CTLFLAG_RW, 0,
+    "RAID1 parameters");
+
+#define RAID1_REBUILD_SLAB	(1 << 20)	/* One transaction in a rebuild */
+static int g_raid1_rebuild_slab = RAID1_REBUILD_SLAB;
+TUNABLE_INT("kern.geom.raid.raid1.rebuild_slab_size",
+    &g_raid1_rebuild_slab);
+SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_slab_size, CTLFLAG_RW,
+    &g_raid1_rebuild_slab, 0,
+    "Amount of the disk to rebuild each read/write cycle of the rebuild.");
+
+#define RAID1_REBUILD_FAIR_IO	20	/* use 1/x of the available I/O */
+static int g_raid1_rebuild_fair_io = RAID1_REBUILD_FAIR_IO;
+TUNABLE_INT("kern.geom.raid.raid1.rebuild_fair_io",
+    &g_raid1_rebuild_fair_io);
+SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_fair_io, CTLFLAG_RW,
+    &g_raid1_rebuild_fair_io, 0,
+    "Fraction of the I/O bandwidth to use for rebuild when the disk is busy.");
+
+#define RAID1_REBUILD_CLUSTER_IDLE	100
+static int g_raid1_rebuild_cluster_idle = RAID1_REBUILD_CLUSTER_IDLE;
+TUNABLE_INT("kern.geom.raid.raid1.rebuild_cluster_idle",
+    &g_raid1_rebuild_cluster_idle);
+SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RW,
+    &g_raid1_rebuild_cluster_idle, 0,
+    "Number of slabs to do each time we trigger a rebuild cycle");
+
+#define RAID1_REBUILD_META_UPDATE	1024	/* update meta data every 1GB or so */
+static int g_raid1_rebuild_meta_update = RAID1_REBUILD_META_UPDATE;
+TUNABLE_INT("kern.geom.raid.raid1.rebuild_meta_update",
+    &g_raid1_rebuild_meta_update);
+SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_meta_update, CTLFLAG_RW,
+    &g_raid1_rebuild_meta_update, 0,
+    "When to update the meta data.");
+
+static MALLOC_DEFINE(M_TR_RAID1, "tr_raid1_data", "GEOM_RAID RAID1 data");
+
+#define TR_RAID1_NONE 0
+#define TR_RAID1_REBUILD 1
+#define TR_RAID1_RESYNC 2
+
+#define TR_RAID1_F_DOING_SOME	0x1
+#define TR_RAID1_F_LOCKED	0x2
+#define TR_RAID1_F_ABORT	0x4
+
+struct g_raid_tr_raid1_object {
+	struct g_raid_tr_object	 trso_base;
+	int			 trso_starting;
+	int			 trso_stopping;
+	int			 trso_type;
+	int			 trso_recover_slabs;	/* slabs before rest */
+	int			 trso_fair_io;
+	int			 trso_meta_update;
+	int			 trso_flags;
+	struct g_raid_subdisk	*trso_failed_sd;	/* like per volume */
+	void			*trso_buffer;		/* Buffer space */
+	struct bio		 trso_bio;
+};
+
+static g_raid_tr_taste_t g_raid_tr_taste_raid1;
+static g_raid_tr_event_t g_raid_tr_event_raid1;
+static g_raid_tr_start_t g_raid_tr_start_raid1;
+static g_raid_tr_stop_t g_raid_tr_stop_raid1;
+static g_raid_tr_iostart_t g_raid_tr_iostart_raid1;
+static g_raid_tr_iodone_t g_raid_tr_iodone_raid1;
+static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1;
+static g_raid_tr_locked_t g_raid_tr_locked_raid1;
+static g_raid_tr_idle_t g_raid_tr_idle_raid1;
+static g_raid_tr_free_t g_raid_tr_free_raid1;
+
+static kobj_method_t g_raid_tr_raid1_methods[] = {
+	KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid1),
+	KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid1),
+	KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid1),
+	KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid1),
+	KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid1),
+	KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid1),
+	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1),
+	KOBJMETHOD(g_raid_tr_locked, g_raid_tr_locked_raid1),
+	KOBJMETHOD(g_raid_tr_idle, g_raid_tr_idle_raid1),
+	KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid1),
+	{ 0, 0 }
+};
+
+static struct g_raid_tr_class g_raid_tr_raid1_class = {
+	"RAID1",
+	g_raid_tr_raid1_methods,
+	sizeof(struct g_raid_tr_raid1_object),
+	.trc_priority = 100
+};
+
+static void g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr);
+static void g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr,
+    struct g_raid_subdisk *sd);
+
+static int
+g_raid_tr_taste_raid1(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
+{
+	struct g_raid_tr_raid1_object *trs;
+
+	trs = (struct g_raid_tr_raid1_object *)tr;
+	if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1 ||
+	    tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_NONE)
+		return (G_RAID_TR_TASTE_FAIL);
+	trs->trso_starting = 1;
+	return (G_RAID_TR_TASTE_SUCCEED);
+}
+
+static int
+g_raid_tr_update_state_raid1(struct g_raid_volume *vol,
+    struct g_raid_subdisk *sd)
+{
+	struct g_raid_tr_raid1_object *trs;
+	struct g_raid_softc *sc;
+	struct g_raid_subdisk *tsd, *bestsd;
+	u_int s;
+	int i, na, ns;
+
+	sc = vol->v_softc;
+	trs = (struct g_raid_tr_raid1_object *)vol->v_tr;
+	if (trs->trso_stopping &&
+	    (trs->trso_flags & TR_RAID1_F_DOING_SOME) == 0)
+		s = G_RAID_VOLUME_S_STOPPED;
+	else if (trs->trso_starting)
+		s = G_RAID_VOLUME_S_STARTING;
+	else {
+		/* Make sure we have at least one ACTIVE disk. */
+		na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
+		if (na == 0) {
+			/*
+			 * Critical situation! We have no active disks!
+			 * Choose the best disk we have to make it active.
+			 */
+			bestsd = &vol->v_subdisks[0];
+			for (i = 1; i < vol->v_disks_count; i++) {
+				tsd = &vol->v_subdisks[i];
+				if (tsd->sd_state > bestsd->sd_state)
+					bestsd = tsd;
+				else if (tsd->sd_state == bestsd->sd_state &&
+				    (tsd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
+				     tsd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
+				    tsd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
+					bestsd = tsd;
+			}
+			if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED) {
+				/* We found a reasonable candidate. */
+				G_RAID_DEBUG1(1, sc,
+				    "Promote subdisk %s:%d from %s to ACTIVE.",
+				    vol->v_name, bestsd->sd_pos,
+				    g_raid_subdisk_state2str(bestsd->sd_state));
+				g_raid_change_subdisk_state(bestsd,
+				    G_RAID_SUBDISK_S_ACTIVE);
+				g_raid_write_metadata(sc,
+				    vol, bestsd, bestsd->sd_disk);
+			}
+		}
+		na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
+		ns = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
+		    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
+		if (na == vol->v_disks_count)
+			s = G_RAID_VOLUME_S_OPTIMAL;
+		else if (na + ns == vol->v_disks_count)
+			s = G_RAID_VOLUME_S_SUBOPTIMAL;
+		else if (na > 0)
+			s = G_RAID_VOLUME_S_DEGRADED;
+		else
+			s = G_RAID_VOLUME_S_BROKEN;
+		g_raid_tr_raid1_maybe_rebuild(vol->v_tr, sd);
+	}
+	if (s != vol->v_state) {
+		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
+ G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, + G_RAID_EVENT_VOLUME); + g_raid_change_volume_state(vol, s); + if (!trs->trso_starting && !trs->trso_stopping) + g_raid_write_metadata(sc, vol, NULL, NULL); + } + return (0); +} + +static void +g_raid_tr_raid1_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd, + struct g_raid_disk *disk) +{ + /* + * We don't fail the last disk in the pack, since it still has decent + * data on it and that's better than failing the disk if it is the root + * file system. + * + * XXX should this be controlled via a tunable? It makes sense for + * the volume that has / on it. I can't think of a case where we'd + * want the volume to go away on this kind of event. + */ + if (g_raid_nsubdisks(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == 1 && + g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == sd) + return; + g_raid_fail_disk(sc, sd, disk); +} + +static void +g_raid_tr_raid1_rebuild_some(struct g_raid_tr_object *tr) +{ + struct g_raid_tr_raid1_object *trs; + struct g_raid_subdisk *sd, *good_sd; + struct bio *bp; + + trs = (struct g_raid_tr_raid1_object *)tr; + if (trs->trso_flags & TR_RAID1_F_DOING_SOME) + return; + sd = trs->trso_failed_sd; + good_sd = g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE); + if (good_sd == NULL) { + g_raid_tr_raid1_rebuild_abort(tr); + return; + } + bp = &trs->trso_bio; + memset(bp, 0, sizeof(*bp)); + bp->bio_offset = sd->sd_rebuild_pos; + bp->bio_length = MIN(g_raid1_rebuild_slab, + sd->sd_size - sd->sd_rebuild_pos); + bp->bio_data = trs->trso_buffer; + bp->bio_cmd = BIO_READ; + bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; + bp->bio_caller1 = good_sd; + trs->trso_flags |= TR_RAID1_F_DOING_SOME; + trs->trso_flags |= TR_RAID1_F_LOCKED; + g_raid_lock_range(sd->sd_volume, /* Lock callback starts I/O */ + bp->bio_offset, bp->bio_length, NULL, bp); +} + +static void +g_raid_tr_raid1_rebuild_done(struct g_raid_tr_raid1_object *trs) +{ + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + + vol = trs->trso_base.tro_volume; + sd = trs->trso_failed_sd; + g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk); + free(trs->trso_buffer, M_TR_RAID1); + trs->trso_buffer = NULL; + trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; + trs->trso_type = TR_RAID1_NONE; + trs->trso_recover_slabs = 0; + trs->trso_failed_sd = NULL; + g_raid_tr_update_state_raid1(vol, NULL); +} + +static void +g_raid_tr_raid1_rebuild_finish(struct g_raid_tr_object *tr) +{ + struct g_raid_tr_raid1_object *trs; + struct g_raid_subdisk *sd; + + trs = (struct g_raid_tr_raid1_object *)tr; + sd = trs->trso_failed_sd; + G_RAID_DEBUG1(0, tr->tro_volume->v_softc, + "Subdisk %s:%d-%s rebuild completed.", + sd->sd_volume->v_name, sd->sd_pos, + sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); + g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); + sd->sd_rebuild_pos = 0; + g_raid_tr_raid1_rebuild_done(trs); +} + +static void +g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr) +{ + struct g_raid_tr_raid1_object *trs; + struct g_raid_subdisk *sd; + struct g_raid_volume *vol; + off_t len; + + vol = tr->tro_volume; + trs = (struct g_raid_tr_raid1_object *)tr; + sd = trs->trso_failed_sd; + if (trs->trso_flags & TR_RAID1_F_DOING_SOME) { + G_RAID_DEBUG1(1, vol->v_softc, + "Subdisk %s:%d-%s rebuild is aborting.", + sd->sd_volume->v_name, sd->sd_pos, + sd->sd_disk ? 
g_raid_get_diskname(sd->sd_disk) : "[none]"); + trs->trso_flags |= TR_RAID1_F_ABORT; + } else { + G_RAID_DEBUG1(0, vol->v_softc, + "Subdisk %s:%d-%s rebuild aborted.", + sd->sd_volume->v_name, sd->sd_pos, + sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); + trs->trso_flags &= ~TR_RAID1_F_ABORT; + if (trs->trso_flags & TR_RAID1_F_LOCKED) { + trs->trso_flags &= ~TR_RAID1_F_LOCKED; + len = MIN(g_raid1_rebuild_slab, + sd->sd_size - sd->sd_rebuild_pos); + g_raid_unlock_range(tr->tro_volume, + sd->sd_rebuild_pos, len); + } + g_raid_tr_raid1_rebuild_done(trs); + } +} + +static void +g_raid_tr_raid1_rebuild_start(struct g_raid_tr_object *tr) +{ + struct g_raid_volume *vol; + struct g_raid_tr_raid1_object *trs; + struct g_raid_subdisk *sd, *fsd; + + vol = tr->tro_volume; + trs = (struct g_raid_tr_raid1_object *)tr; + if (trs->trso_failed_sd) { + G_RAID_DEBUG1(1, vol->v_softc, + "Already rebuild in start rebuild. pos %jd\n", + (intmax_t)trs->trso_failed_sd->sd_rebuild_pos); + return; + } + sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_ACTIVE); + if (sd == NULL) { + G_RAID_DEBUG1(1, vol->v_softc, + "No active disk to rebuild. night night."); + return; + } + fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC); + if (fsd == NULL) + fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD); + if (fsd == NULL) { + fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE); + if (fsd != NULL) { + fsd->sd_rebuild_pos = 0; + g_raid_change_subdisk_state(fsd, + G_RAID_SUBDISK_S_RESYNC); + g_raid_write_metadata(vol->v_softc, vol, fsd, NULL); + } else { + fsd = g_raid_get_subdisk(vol, + G_RAID_SUBDISK_S_UNINITIALIZED); + if (fsd == NULL) + fsd = g_raid_get_subdisk(vol, + G_RAID_SUBDISK_S_NEW); + if (fsd != NULL) { + fsd->sd_rebuild_pos = 0; + g_raid_change_subdisk_state(fsd, + G_RAID_SUBDISK_S_REBUILD); + g_raid_write_metadata(vol->v_softc, + vol, fsd, NULL); + } + } + } + if (fsd == NULL) { + G_RAID_DEBUG1(1, vol->v_softc, + "No failed disk to rebuild. night night."); + return; + } + trs->trso_failed_sd = fsd; + G_RAID_DEBUG1(0, vol->v_softc, + "Subdisk %s:%d-%s rebuild start at %jd.", + fsd->sd_volume->v_name, fsd->sd_pos, + fsd->sd_disk ? g_raid_get_diskname(fsd->sd_disk) : "[none]", + trs->trso_failed_sd->sd_rebuild_pos); + trs->trso_type = TR_RAID1_REBUILD; + trs->trso_buffer = malloc(g_raid1_rebuild_slab, M_TR_RAID1, M_WAITOK); + trs->trso_meta_update = g_raid1_rebuild_meta_update; + g_raid_tr_raid1_rebuild_some(tr); +} + + +static void +g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr, + struct g_raid_subdisk *sd) +{ + struct g_raid_volume *vol; + struct g_raid_tr_raid1_object *trs; + int na, nr; + + /* + * If we're stopping, don't do anything. If we don't have at least one + * good disk and one bad disk, we don't do anything. And if there's a + * 'good disk' stored in the trs, then we're in progress and we punt. + * If we make it past all these checks, we need to rebuild. 
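+	 * For reference, the candidate order used by rebuild_start() above
+	 * is: a RESYNC already in progress, then REBUILD, then STALE
+	 * (restarted as RESYNC from position 0), and finally
+	 * UNINITIALIZED/NEW (started as REBUILD from position 0).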
+ */ + vol = tr->tro_volume; + trs = (struct g_raid_tr_raid1_object *)tr; + if (trs->trso_stopping) + return; + na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); + nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) + + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); + switch(trs->trso_type) { + case TR_RAID1_NONE: + if (na == 0) + return; + if (nr == 0) { + nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) + + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED); + if (nr == 0) + return; + } + g_raid_tr_raid1_rebuild_start(tr); + break; + case TR_RAID1_REBUILD: + if (na == 0 || nr == 0 || trs->trso_failed_sd == sd) + g_raid_tr_raid1_rebuild_abort(tr); + break; + case TR_RAID1_RESYNC: + break; + } +} + +static int +g_raid_tr_event_raid1(struct g_raid_tr_object *tr, + struct g_raid_subdisk *sd, u_int event) +{ + + g_raid_tr_update_state_raid1(tr->tro_volume, sd); + return (0); +} + +static int +g_raid_tr_start_raid1(struct g_raid_tr_object *tr) +{ + struct g_raid_tr_raid1_object *trs; + struct g_raid_volume *vol; + + trs = (struct g_raid_tr_raid1_object *)tr; + vol = tr->tro_volume; + trs->trso_starting = 0; + g_raid_tr_update_state_raid1(vol, NULL); + return (0); +} + +static int +g_raid_tr_stop_raid1(struct g_raid_tr_object *tr) +{ + struct g_raid_tr_raid1_object *trs; + struct g_raid_volume *vol; + + trs = (struct g_raid_tr_raid1_object *)tr; + vol = tr->tro_volume; + trs->trso_starting = 0; + trs->trso_stopping = 1; + g_raid_tr_update_state_raid1(vol, NULL); + return (0); +} + +/* + * Select the disk to read from. Take into account: subdisk state, running + * error recovery, average disk load, head position and possible cache hits. + */ +#define ABS(x) (((x) >= 0) ? (x) : (-(x))) +static struct g_raid_subdisk * +g_raid_tr_raid1_select_read_disk(struct g_raid_volume *vol, struct bio *bp, + u_int mask) +{ + struct g_raid_subdisk *sd, *best; + int i, prio, bestprio; + + best = NULL; + bestprio = INT_MAX; + for (i = 0; i < vol->v_disks_count; i++) { + sd = &vol->v_subdisks[i]; + if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE && + ((sd->sd_state != G_RAID_SUBDISK_S_REBUILD && + sd->sd_state != G_RAID_SUBDISK_S_RESYNC) || + bp->bio_offset + bp->bio_length > sd->sd_rebuild_pos)) + continue; + if ((mask & (1 << i)) != 0) + continue; + prio = G_RAID_SUBDISK_LOAD(sd); + prio += min(sd->sd_recovery, 255) << 22; + prio += (G_RAID_SUBDISK_S_ACTIVE - sd->sd_state) << 16; + /* If disk head is precisely in position - highly prefer it. */ + if (G_RAID_SUBDISK_POS(sd) == bp->bio_offset) + prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE; + else + /* If disk head is close to position - prefer it. 
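+		 * ("Close" here means within G_RAID_SUBDISK_TRACK_SIZE.)
+		 * Summing up the scoring above: per-disk load forms the low
+		 * bits, each running recovery weighs 1 << 22, every state
+		 * step below ACTIVE weighs 1 << 16, and head locality earns
+		 * a bonus of one or two G_RAID_SUBDISK_LOAD_SCALE units.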
+		 */
+		if (ABS(G_RAID_SUBDISK_POS(sd) - bp->bio_offset) <
+		    G_RAID_SUBDISK_TRACK_SIZE)
+			prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
+		if (prio < bestprio) {
+			best = sd;
+			bestprio = prio;
+		}
+	}
+	return (best);
+}
+
+static void
+g_raid_tr_iostart_raid1_read(struct g_raid_tr_object *tr, struct bio *bp)
+{
+	struct g_raid_subdisk *sd;
+	struct bio *cbp;
+
+	sd = g_raid_tr_raid1_select_read_disk(tr->tro_volume, bp, 0);
+	KASSERT(sd != NULL, ("No active disks in volume %s.",
+	    tr->tro_volume->v_name));
+
+	cbp = g_clone_bio(bp);
+	if (cbp == NULL) {
+		g_raid_iodone(bp, ENOMEM);
+		return;
+	}
+
+	g_raid_subdisk_iostart(sd, cbp);
+}
+
+static void
+g_raid_tr_iostart_raid1_write(struct g_raid_tr_object *tr, struct bio *bp)
+{
+	struct g_raid_softc *sc;
+	struct g_raid_volume *vol;
+	struct g_raid_subdisk *sd;
+	struct bio_queue_head queue;
+	struct bio *cbp;
+	int i;
+
+	vol = tr->tro_volume;
+	sc = vol->v_softc;
+
+	/*
+	 * Allocate all bios before sending any request, so we can return
+	 * ENOMEM in a nice and clean way.
+	 */
+	bioq_init(&queue);
+	for (i = 0; i < vol->v_disks_count; i++) {
+		sd = &vol->v_subdisks[i];
+		switch (sd->sd_state) {
+		case G_RAID_SUBDISK_S_ACTIVE:
+			break;
+		case G_RAID_SUBDISK_S_REBUILD:
+			/*
+			 * When rebuilding, only part of this subdisk is
+			 * writable, the rest will be written as part of
+			 * that process.
+			 */
+			if (bp->bio_offset >= sd->sd_rebuild_pos)
+				continue;
+			break;
+		case G_RAID_SUBDISK_S_STALE:
+		case G_RAID_SUBDISK_S_RESYNC:
+			/*
+			 * Resyncing still writes on the theory that the
+			 * resync'd disk is very close and writing it will
+			 * keep it that way better if we keep up while
+			 * resyncing.
+			 */
+			break;
+		default:
+			continue;
+		}
+		cbp = g_clone_bio(bp);
+		if (cbp == NULL)
+			goto failure;
+		cbp->bio_caller1 = sd;
+		bioq_insert_tail(&queue, cbp);
+	}
+	for (cbp = bioq_first(&queue); cbp != NULL;
+	    cbp = bioq_first(&queue)) {
+		bioq_remove(&queue, cbp);
+		sd = cbp->bio_caller1;
+		cbp->bio_caller1 = NULL;
+		g_raid_subdisk_iostart(sd, cbp);
+	}
+	return;
+failure:
+	for (cbp = bioq_first(&queue); cbp != NULL;
+	    cbp = bioq_first(&queue)) {
+		bioq_remove(&queue, cbp);
+		g_destroy_bio(cbp);
+	}
+	if (bp->bio_error == 0)
+		bp->bio_error = ENOMEM;
+	g_raid_iodone(bp, bp->bio_error);
+}
+
+static void
+g_raid_tr_iostart_raid1(struct g_raid_tr_object *tr, struct bio *bp)
+{
+	struct g_raid_volume *vol;
+	struct g_raid_tr_raid1_object *trs;
+
+	vol = tr->tro_volume;
+	trs = (struct g_raid_tr_raid1_object *)tr;
+	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
+	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
+	    vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
+		g_raid_iodone(bp, EIO);
+		return;
+	}
+	/*
+	 * If we're rebuilding, squeeze in rebuild activity every so often,
+	 * even when the disk is busy.  Be sure to only count real I/O
+	 * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
+	 * by this module.
+	 */
+	if (trs->trso_failed_sd != NULL &&
+	    !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
+		/* Cut the new or currently running rebuild round short.
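+		 * With the defaults this admits roughly one rebuild slab
+		 * (1 MB) per g_raid1_rebuild_fair_io = 20 regular I/Os on a
+		 * busy volume, while an idle volume instead gets
+		 * g_raid1_rebuild_cluster_idle = 100 slabs per idle call.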
*/ + trs->trso_recover_slabs = 0; + if (--trs->trso_fair_io <= 0) { + trs->trso_fair_io = g_raid1_rebuild_fair_io; + g_raid_tr_raid1_rebuild_some(tr); + } + } + switch (bp->bio_cmd) { + case BIO_READ: + g_raid_tr_iostart_raid1_read(tr, bp); + break; + case BIO_WRITE: + g_raid_tr_iostart_raid1_write(tr, bp); + break; + case BIO_DELETE: + g_raid_iodone(bp, EIO); + break; + case BIO_FLUSH: + g_raid_tr_flush_common(tr, bp); + break; + default: + KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)", + bp->bio_cmd, vol->v_name)); + break; + } +} + +static void +g_raid_tr_iodone_raid1(struct g_raid_tr_object *tr, + struct g_raid_subdisk *sd, struct bio *bp) +{ + struct bio *cbp; + struct g_raid_subdisk *nsd; + struct g_raid_volume *vol; + struct bio *pbp; + struct g_raid_tr_raid1_object *trs; + uintptr_t *mask; + int error, do_write; + + trs = (struct g_raid_tr_raid1_object *)tr; + vol = tr->tro_volume; + if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) { + /* + * This operation is part of a rebuild or resync operation. + * See what work just got done, then schedule the next bit of + * work, if any. Rebuild/resync is done a little bit at a + * time. Either when a timeout happens, or after we get a + * bunch of I/Os to the disk (to make sure an active system + * will complete in a sane amount of time). + * + * We are setup to do differing amounts of work for each of + * these cases. so long as the slabs is smallish (less than + * 50 or so, I'd guess, but that's just a WAG), we shouldn't + * have any bio starvation issues. For active disks, we do + * 5MB of data, for inactive ones, we do 50MB. + */ + if (trs->trso_type == TR_RAID1_REBUILD) { + if (bp->bio_cmd == BIO_READ) { + + /* Immediately abort rebuild, if requested. */ + if (trs->trso_flags & TR_RAID1_F_ABORT) { + trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; + g_raid_tr_raid1_rebuild_abort(tr); + return; + } + + /* On read error, skip and cross fingers. */ + if (bp->bio_error != 0) { + G_RAID_LOGREQ(0, bp, + "Read error during rebuild (%d), " + "possible data loss!", + bp->bio_error); + goto rebuild_round_done; + } + + /* + * The read operation finished, queue the + * write and get out. + */ + G_RAID_LOGREQ(4, bp, "rebuild read done. %d", + bp->bio_error); + bp->bio_cmd = BIO_WRITE; + bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; + bp->bio_offset = bp->bio_offset; + bp->bio_length = bp->bio_length; + G_RAID_LOGREQ(4, bp, "Queueing rebuild write."); + g_raid_subdisk_iostart(trs->trso_failed_sd, bp); + } else { + /* + * The write operation just finished. Do + * another. We keep cloning the master bio + * since it has the right buffers allocated to + * it. + */ + G_RAID_LOGREQ(4, bp, + "rebuild write done. 
Error %d", + bp->bio_error); + nsd = trs->trso_failed_sd; + if (bp->bio_error != 0 || + trs->trso_flags & TR_RAID1_F_ABORT) { + if ((trs->trso_flags & + TR_RAID1_F_ABORT) == 0) { + g_raid_tr_raid1_fail_disk(sd->sd_softc, + nsd, nsd->sd_disk); + } + trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; + g_raid_tr_raid1_rebuild_abort(tr); + return; + } +rebuild_round_done: + nsd = trs->trso_failed_sd; + trs->trso_flags &= ~TR_RAID1_F_LOCKED; + g_raid_unlock_range(sd->sd_volume, + bp->bio_offset, bp->bio_length); + nsd->sd_rebuild_pos += bp->bio_length; + if (nsd->sd_rebuild_pos >= nsd->sd_size) { + g_raid_tr_raid1_rebuild_finish(tr); + return; + } + + /* Abort rebuild if we are stopping */ + if (trs->trso_stopping) { + trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; + g_raid_tr_raid1_rebuild_abort(tr); + return; + } + + if (--trs->trso_meta_update <= 0) { + g_raid_write_metadata(vol->v_softc, + vol, nsd, nsd->sd_disk); + trs->trso_meta_update = + g_raid1_rebuild_meta_update; + } + trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; + if (--trs->trso_recover_slabs <= 0) + return; + g_raid_tr_raid1_rebuild_some(tr); + } + } else if (trs->trso_type == TR_RAID1_RESYNC) { + /* + * read good sd, read bad sd in parallel. when both + * done, compare the buffers. write good to the bad + * if different. do the next bit of work. + */ + panic("Somehow, we think we're doing a resync"); + } + return; + } + pbp = bp->bio_parent; + pbp->bio_inbed++; + if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) { + /* + * Read failed on first drive. Retry the read error on + * another disk drive, if available, before erroring out the + * read. + */ + sd->sd_disk->d_read_errs++; + G_RAID_LOGREQ(0, bp, + "Read error (%d), %d read errors total", + bp->bio_error, sd->sd_disk->d_read_errs); + + /* + * If there are too many read errors, we move to degraded. + * XXX Do we want to FAIL the drive (eg, make the user redo + * everything to get it back in sync), or just degrade the + * drive, which kicks off a resync? + */ + do_write = 1; + if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh) { + g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk); + if (pbp->bio_children == 1) + do_write = 0; + } + + /* + * Find the other disk, and try to do the I/O to it. + */ + mask = (uintptr_t *)(&pbp->bio_driver2); + if (pbp->bio_children == 1) { + /* Save original subdisk. */ + pbp->bio_driver1 = do_write ? sd : NULL; + *mask = 0; + } + *mask |= 1 << sd->sd_pos; + nsd = g_raid_tr_raid1_select_read_disk(vol, pbp, *mask); + if (nsd != NULL && (cbp = g_clone_bio(pbp)) != NULL) { + g_destroy_bio(bp); + G_RAID_LOGREQ(2, cbp, "Retrying read from %d", + nsd->sd_pos); + if (pbp->bio_children == 2 && do_write) { + sd->sd_recovery++; + cbp->bio_caller1 = nsd; + pbp->bio_pflags = G_RAID_BIO_FLAG_LOCKED; + /* Lock callback starts I/O */ + g_raid_lock_range(sd->sd_volume, + cbp->bio_offset, cbp->bio_length, pbp, cbp); + } else { + g_raid_subdisk_iostart(nsd, cbp); + } + return; + } + /* + * We can't retry. Return the original error by falling + * through. This will happen when there's only one good disk. + * We don't need to fail the raid, since its actual state is + * based on the state of the subdisks. + */ + G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it"); + } + if (bp->bio_cmd == BIO_READ && + bp->bio_error == 0 && + pbp->bio_children > 1 && + pbp->bio_driver1 != NULL) { + /* + * If it was a read, and bio_children is >1, then we just + * recovered the data from the second drive. 
We should try to
+		 * write that data to the first drive if sector remapping is
+		 * enabled.  A write should put the data in a new place on the
+		 * disk, remapping the bad sector.  Do we need to do that by
+		 * queueing a request to the main worker thread?  It doesn't
+		 * affect the return code of this current read, and can be
+		 * done at our leisure.  However, to make the code simpler, it
+		 * is done synchronously.
+		 */
+		G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
+		cbp = g_clone_bio(pbp);
+		if (cbp != NULL) {
+			g_destroy_bio(bp);
+			cbp->bio_cmd = BIO_WRITE;
+			cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
+			G_RAID_LOGREQ(2, cbp,
+			    "Attempting bad sector remap on failing drive.");
+			g_raid_subdisk_iostart(pbp->bio_driver1, cbp);
+			return;
+		}
+	}
+	if (pbp->bio_pflags & G_RAID_BIO_FLAG_LOCKED) {
+		/*
+		 * We're done with a recovery, mark the range as unlocked.
+		 * For any write errors, we aggressively fail the disk since
+		 * there was both a READ and a WRITE error at this location.
+		 * Both types of errors generally indicate the drive is on
+		 * the verge of total failure anyway.  Better to stop trusting
+		 * it now.  However, we need to reset error to 0 in that case
+		 * because we're not failing the original I/O which succeeded.
+		 */
+		if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
+			G_RAID_LOGREQ(0, bp, "Remap write failed: "
+			    "failing subdisk.");
+			g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk);
+			bp->bio_error = 0;
+		}
+		if (pbp->bio_driver1 != NULL) {
+			((struct g_raid_subdisk *)pbp->bio_driver1)
+			    ->sd_recovery--;
+		}
+		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
+		g_raid_unlock_range(sd->sd_volume, bp->bio_offset,
+		    bp->bio_length);
+	}
+	error = bp->bio_error;
+	g_destroy_bio(bp);
+	if (pbp->bio_children == pbp->bio_inbed) {
+		pbp->bio_completed = pbp->bio_length;
+		g_raid_iodone(pbp, error);
+	}
+}
+
+static int
+g_raid_tr_kerneldump_raid1(struct g_raid_tr_object *tr,
+    void *virtual, vm_offset_t physical, off_t offset, size_t length)
+{
+	struct g_raid_volume *vol;
+	struct g_raid_subdisk *sd;
+	int error, i, ok;
+
+	vol = tr->tro_volume;
+	error = 0;
+	ok = 0;
+	for (i = 0; i < vol->v_disks_count; i++) {
+		sd = &vol->v_subdisks[i];
+		switch (sd->sd_state) {
+		case G_RAID_SUBDISK_S_ACTIVE:
+			break;
+		case G_RAID_SUBDISK_S_REBUILD:
+			/*
+			 * When rebuilding, only part of this subdisk is
+			 * writable, the rest will be written as part of
+			 * that process.
+			 */
+			if (offset >= sd->sd_rebuild_pos)
+				continue;
+			break;
+		case G_RAID_SUBDISK_S_STALE:
+		case G_RAID_SUBDISK_S_RESYNC:
+			/*
+			 * Resyncing still writes on the theory that the
+			 * resync'd disk is very close and writing it will
+			 * keep it that way better if we keep up while
+			 * resyncing.
+			 */
+			break;
+		default:
+			continue;
+		}
+		error = g_raid_subdisk_kerneldump(sd,
+		    virtual, physical, offset, length);
+		if (error == 0)
+			ok++;
+	}
+	return (ok > 0 ?
0 : error); +} + +static int +g_raid_tr_locked_raid1(struct g_raid_tr_object *tr, void *argp) +{ + struct bio *bp; + struct g_raid_subdisk *sd; + + bp = (struct bio *)argp; + sd = (struct g_raid_subdisk *)bp->bio_caller1; + g_raid_subdisk_iostart(sd, bp); + + return (0); +} + +static int +g_raid_tr_idle_raid1(struct g_raid_tr_object *tr) +{ + struct g_raid_tr_raid1_object *trs; + + trs = (struct g_raid_tr_raid1_object *)tr; + trs->trso_fair_io = g_raid1_rebuild_fair_io; + trs->trso_recover_slabs = g_raid1_rebuild_cluster_idle; + if (trs->trso_type == TR_RAID1_REBUILD) + g_raid_tr_raid1_rebuild_some(tr); + return (0); +} + +static int +g_raid_tr_free_raid1(struct g_raid_tr_object *tr) +{ + struct g_raid_tr_raid1_object *trs; + + trs = (struct g_raid_tr_raid1_object *)tr; + + if (trs->trso_buffer != NULL) { + free(trs->trso_buffer, M_TR_RAID1); + trs->trso_buffer = NULL; + } + return (0); +} + +G_RAID_TR_DECLARE(g_raid_tr_raid1); diff --git a/sys/geom/raid/tr_raid1e.c b/sys/geom/raid/tr_raid1e.c new file mode 100644 index 0000000..9ebe218 --- /dev/null +++ b/sys/geom/raid/tr_raid1e.c @@ -0,0 +1,1227 @@ +/*- + * Copyright (c) 2010 Alexander Motin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "geom/raid/g_raid.h"
+#include "g_raid_tr_if.h"
+
+#define N	2	/* Number of copies of each data strip. */
+
+SYSCTL_DECL(_kern_geom_raid);
+SYSCTL_NODE(_kern_geom_raid, OID_AUTO, raid1e, CTLFLAG_RW, 0,
+    "RAID1E parameters");
+
+#define RAID1E_REBUILD_SLAB	(1 << 20)	/* One transaction in a rebuild */
+static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
+TUNABLE_INT("kern.geom.raid.raid1e.rebuild_slab_size",
+    &g_raid1e_rebuild_slab);
+SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RW,
+    &g_raid1e_rebuild_slab, 0,
+    "Amount of the disk to rebuild in each read/write cycle of the rebuild.");
+
+#define RAID1E_REBUILD_FAIR_IO	20	/* use 1/x of the available I/O */
+static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
+TUNABLE_INT("kern.geom.raid.raid1e.rebuild_fair_io",
+    &g_raid1e_rebuild_fair_io);
+SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RW,
+    &g_raid1e_rebuild_fair_io, 0,
+    "Fraction of the I/O bandwidth to use for rebuild when the disk is busy.");
+
+#define RAID1E_REBUILD_CLUSTER_IDLE	100
+static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
+TUNABLE_INT("kern.geom.raid.raid1e.rebuild_cluster_idle",
+    &g_raid1e_rebuild_cluster_idle);
+SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RW,
+    &g_raid1e_rebuild_cluster_idle, 0,
+    "Number of slabs to process each time we trigger a rebuild cycle.");
+
+#define RAID1E_REBUILD_META_UPDATE	1024	/* update metadata every 1GB or so */
+static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
+TUNABLE_INT("kern.geom.raid.raid1e.rebuild_meta_update",
+    &g_raid1e_rebuild_meta_update);
+SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RW,
+    &g_raid1e_rebuild_meta_update, 0,
+    "How often to update the metadata.");
+
+static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");
+
+#define TR_RAID1E_NONE		0
+#define TR_RAID1E_REBUILD	1
+#define TR_RAID1E_RESYNC	2
+
+#define TR_RAID1E_F_DOING_SOME	0x1
+#define TR_RAID1E_F_LOCKED	0x2
+#define TR_RAID1E_F_ABORT	0x4
+
+struct g_raid_tr_raid1e_object {
+	struct g_raid_tr_object	 trso_base;
+	int			 trso_starting;
+	int			 trso_stopping;
+	int			 trso_type;
+	int			 trso_recover_slabs;	/* slabs before rest */
+	int			 trso_fair_io;
+	int			 trso_meta_update;
+	int			 trso_flags;
+	struct g_raid_subdisk	*trso_failed_sd;	/* failed subdisk being rebuilt */
+	void			*trso_buffer;		/* Buffer space */
+	off_t			 trso_lock_pos;		/* Locked range start. */
+	off_t			 trso_lock_len;		/* Locked range length. */
+	struct bio		 trso_bio;
+};
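As a rough illustration of how the four tunables above interact, here is a standalone sketch (illustrative only; it simply multiplies out the defaults defined above): an idle rebuild burst covers about 100 MiB, metadata is refreshed roughly every 1 GiB of rebuilt data, and on a busy volume one slab is rebuilt per twenty user I/Os.

	#include <stdio.h>

	int
	main(void)
	{
		long long slab = 1LL << 20;	/* rebuild_slab_size: 1 MiB */
		int fair_io = 20;		/* one burst per 20 user I/Os */
		int cluster_idle = 100;		/* slabs per idle burst */
		int meta_update = 1024;		/* slabs between metadata writes */

		printf("idle burst:      %lld MiB\n", slab * cluster_idle >> 20);
		printf("metadata period: %lld MiB\n", slab * meta_update >> 20);
		printf("busy volume:     1 rebuild slab per %d user I/Os\n",
		    fair_io);
		return (0);
	}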
+
+static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
+static g_raid_tr_event_t g_raid_tr_event_raid1e;
+static g_raid_tr_start_t g_raid_tr_start_raid1e;
+static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
+static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
+static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
+static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
+static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
+static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
+static g_raid_tr_free_t g_raid_tr_free_raid1e;
+
+static kobj_method_t g_raid_tr_raid1e_methods[] = {
+	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_raid1e),
+	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_raid1e),
+	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_raid1e),
+	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_raid1e),
+	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_raid1e),
+	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_raid1e),
+	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
+	KOBJMETHOD(g_raid_tr_locked,	g_raid_tr_locked_raid1e),
+	KOBJMETHOD(g_raid_tr_idle,	g_raid_tr_idle_raid1e),
+	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_raid1e),
+	{ 0, 0 }
+};
+
+static struct g_raid_tr_class g_raid_tr_raid1e_class = {
+	"RAID1E",
+	g_raid_tr_raid1e_methods,
+	sizeof(struct g_raid_tr_raid1e_object),
+	.trc_priority = 200
+};
+
+static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
+static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
+    struct g_raid_subdisk *sd);
+static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
+    int no, off_t off, off_t len, u_int mask);
+
+/*
+ * Translate a virtual (volume) offset into the number of the disk holding
+ * the first copy, the strip base offset on that disk, and the offset
+ * within the strip.
+ */
+static inline void
+V2P(struct g_raid_volume *vol, off_t virt,
+    int *disk, off_t *offset, off_t *start)
+{
+	off_t nstrip;
+	u_int strip_size;
+
+	strip_size = vol->v_strip_size;
+	/* Strip number. */
+	nstrip = virt / strip_size;
+	/* Start position in strip. */
+	*start = virt % strip_size;
+	/* Disk number. */
+	*disk = (nstrip * N) % vol->v_disks_count;
+	/* Strip start position in disk. */
+	*offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
+}
+
+/*
+ * Translate a physical disk position back into a virtual (volume) offset
+ * and the index of the copy it belongs to; the inverse of V2P().
+ */
+static inline void
+P2V(struct g_raid_volume *vol, int disk, off_t offset,
+    off_t *virt, int *copy)
+{
+	off_t nstrip, start;
+	u_int strip_size;
+
+	strip_size = vol->v_strip_size;
+	/* Start position in strip. */
+	start = offset % strip_size;
+	/* Physical strip number. */
+	nstrip = (offset / strip_size) * vol->v_disks_count + disk;
+	/* Index of the physical strip (copy) inside the virtual strip. */
+	*copy = nstrip % N;
+	/* Offset in virtual space. */
+	*virt = (nstrip / N) * strip_size + start;
+}
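The V2P/P2V arithmetic above is the heart of the RAID1E layout, so here is a standalone userland rendition of it that can be compiled and run. The volume geometry (3 disks, 64 KiB strips) is invented for the example; the formulas are the same as in the driver, and the loop checks that p2v() round-trips what v2p() produced.

	#include <stdio.h>

	#define N 2	/* copies per strip, as in the driver */

	struct geom { int disks; long long strip; };

	/* Virtual offset -> disk of the first copy, strip base, in-strip start. */
	static void
	v2p(const struct geom *g, long long virt,
	    int *disk, long long *base, long long *start)
	{
		long long nstrip = virt / g->strip;

		*start = virt % g->strip;
		*disk = (int)((nstrip * N) % g->disks);
		*base = ((nstrip * N) / g->disks) * g->strip;
	}

	/* Disk position -> virtual offset and copy index; inverse of v2p(). */
	static void
	p2v(const struct geom *g, int disk, long long offset,
	    long long *virt, int *copy)
	{
		long long start = offset % g->strip;
		long long nstrip = (offset / g->strip) * g->disks + disk;

		*copy = (int)(nstrip % N);
		*virt = (nstrip / N) * g->strip + start;
	}

	int
	main(void)
	{
		struct geom g = { 3, 65536 };
		long long base, start, back;
		int disk, copy;

		for (long long virt = 0; virt < 4 * g.strip; virt += g.strip) {
			v2p(&g, virt, &disk, &base, &start);
			p2v(&g, disk, base + start, &back, &copy);
			printf("virt %7lld -> disk %d base %7lld "
			    "(copy %d, round trip %7lld)\n",
			    virt, disk, base, copy, back);
		}
		return (0);
	}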
+
+static int
+g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
+{
+	struct g_raid_tr_raid1e_object *trs;
+
+	trs = (struct g_raid_tr_raid1e_object *)tr;
+	if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
+	    tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_NONE)
+		return (G_RAID_TR_TASTE_FAIL);
+	trs->trso_starting = 1;
+	return (G_RAID_TR_TASTE_SUCCEED);
+}
+
+static int
+g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
+{
+	struct g_raid_tr_raid1e_object *trs;
+	struct g_raid_softc *sc;
+	struct g_raid_subdisk *sd, *bestsd, *worstsd;
+	int i, j, state, sstate;
+
+	sc = vol->v_softc;
+	trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
+	state = G_RAID_VOLUME_S_OPTIMAL;
+	for (i = 0; i < vol->v_disks_count / N; i++) {
+		bestsd = &vol->v_subdisks[i * N];
+		for (j = 1; j < N; j++) {
+			sd = &vol->v_subdisks[i * N + j];
+			if (sd->sd_state > bestsd->sd_state)
+				bestsd = sd;
+			else if (sd->sd_state == bestsd->sd_state &&
+			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
+			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
+			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
+				bestsd = sd;
+		}
+		if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
+		    bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
+			/* We found a reasonable candidate. */
+			G_RAID_DEBUG1(1, sc,
+			    "Promote subdisk %s:%d from %s to ACTIVE.",
+			    vol->v_name, bestsd->sd_pos,
+			    g_raid_subdisk_state2str(bestsd->sd_state));
+			g_raid_change_subdisk_state(bestsd,
+			    G_RAID_SUBDISK_S_ACTIVE);
+			g_raid_write_metadata(sc,
+			    vol, bestsd, bestsd->sd_disk);
+		}
+		worstsd = &vol->v_subdisks[i * N];
+		for (j = 1; j < N; j++) {
+			sd = &vol->v_subdisks[i * N + j];
+			if (sd->sd_state < worstsd->sd_state)
+				worstsd = sd;
+		}
+		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
+			sstate = G_RAID_VOLUME_S_OPTIMAL;
+		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
+			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
+		else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
+			sstate = G_RAID_VOLUME_S_DEGRADED;
+		else
+			sstate = G_RAID_VOLUME_S_BROKEN;
+		if (sstate < state)
+			state = sstate;
+	}
+	return (state);
+}
+
+static int
+g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
+{
+	struct g_raid_tr_raid1e_object *trs;
+	struct g_raid_softc *sc;
+	struct g_raid_subdisk *sd, *bestsd, *worstsd;
+	int i, j, state, sstate;
+
+	sc = vol->v_softc;
+	trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
+	if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
+	    vol->v_disks_count)
+		return (G_RAID_VOLUME_S_OPTIMAL);
+	for (i = 0; i < vol->v_disks_count; i++) {
+		sd = &vol->v_subdisks[i];
+		if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
+			/* We found a reasonable candidate. */
+			G_RAID_DEBUG1(1, sc,
+			    "Promote subdisk %s:%d from %s to STALE.",
+			    vol->v_name, sd->sd_pos,
+			    g_raid_subdisk_state2str(sd->sd_state));
+			g_raid_change_subdisk_state(sd,
+			    G_RAID_SUBDISK_S_STALE);
+			g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
+		}
+	}
+	state = G_RAID_VOLUME_S_OPTIMAL;
+	for (i = 0; i < vol->v_disks_count; i++) {
+		bestsd = &vol->v_subdisks[i];
+		worstsd = &vol->v_subdisks[i];
+		for (j = 1; j < N; j++) {
+			sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
+			if (sd->sd_state > bestsd->sd_state)
+				bestsd = sd;
+			else if (sd->sd_state == bestsd->sd_state &&
+			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
+			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
+			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
+				bestsd = sd;
+			if (sd->sd_state < worstsd->sd_state)
+				worstsd = sd;
+		}
+		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
+			sstate = G_RAID_VOLUME_S_OPTIMAL;
+		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
+			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
+		else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
+			sstate = G_RAID_VOLUME_S_DEGRADED;
+		else
+			sstate = G_RAID_VOLUME_S_BROKEN;
+		if (sstate < state)
+			state = sstate;
+	}
+	return (state);
+}
+
+static int
+g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
+    struct g_raid_subdisk *sd)
+{
+	struct g_raid_tr_raid1e_object *trs;
+	struct g_raid_softc *sc;
+	u_int s;
+
+	sc = vol->v_softc;
+	trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
+	if (trs->trso_stopping &&
+	    (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
+		s = G_RAID_VOLUME_S_STOPPED;
+	else if (trs->trso_starting)
+		s = G_RAID_VOLUME_S_STARTING;
+	else {
+		if ((vol->v_disks_count % N) == 0)
+			s = g_raid_tr_update_state_raid1e_even(vol);
+		else
+			s = g_raid_tr_update_state_raid1e_odd(vol);
+	}
+	if (s != vol->v_state) {
+		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
+		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
+		    G_RAID_EVENT_VOLUME);
+		g_raid_change_volume_state(vol, s);
+		if (!trs->trso_starting && !trs->trso_stopping)
+			g_raid_write_metadata(sc, vol, NULL, NULL);
+	}
+	if (!trs->trso_starting && !trs->trso_stopping)
+		g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
+	return (0);
+}
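The state functions above reduce each group of N copies to a best and a worst subdisk and take the worst result over all groups. A minimal userland rendition of that aggregation, with N = 2, follows; the enum orderings here are invented for the example (the real values come from g_raid.h), and only their relative order matters.

	#include <stdio.h>

	/* Invented health order: higher is healthier. */
	enum sd { SD_FAILED, SD_NEW, SD_REBUILD, SD_RESYNC, SD_STALE, SD_ACTIVE };
	enum vs { VS_BROKEN, VS_DEGRADED, VS_SUBOPTIMAL, VS_OPTIMAL };

	/* State of one two-way mirror pair, following the rules above. */
	static enum vs
	pair_state(enum sd a, enum sd b)
	{
		enum sd best = a > b ? a : b;
		enum sd worst = a < b ? a : b;

		if (worst == SD_ACTIVE)
			return (VS_OPTIMAL);	/* both copies in sync */
		if (worst >= SD_STALE)
			return (VS_SUBOPTIMAL);	/* stale but usable copy */
		if (best == SD_ACTIVE)
			return (VS_DEGRADED);	/* one good copy left */
		return (VS_BROKEN);
	}

	int
	main(void)
	{
		static const char *vn[] =
		    { "BROKEN", "DEGRADED", "SUBOPTIMAL", "OPTIMAL" };
		/* The volume takes the worst state over all of its pairs. */
		enum vs a = pair_state(SD_ACTIVE, SD_ACTIVE);
		enum vs b = pair_state(SD_ACTIVE, SD_REBUILD);

		printf("volume state: %s\n", vn[a < b ? a : b]);
		return (0);
	}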
+
+static void
+g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
+    struct g_raid_disk *disk)
+{
+	/*
+	 * We don't fail the last disk in the pack, since it still has decent
+	 * data on it and that's better than failing the disk if it is the
+	 * root file system.
+	 *
+	 * XXX should this be controlled via a tunable?  It makes sense for
+	 * the volume that has / on it.  I can't think of a case where we'd
+	 * want the volume to go away on this kind of event.
+	 */
+	if (g_raid_nsubdisks(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == 1 &&
+	    g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == sd)
+		return;
+	g_raid_fail_disk(sc, sd, disk);
+}
+
+static void
+g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
+{
+	struct g_raid_volume *vol;
+	struct g_raid_subdisk *sd;
+
+	vol = trs->trso_base.tro_volume;
+	sd = trs->trso_failed_sd;
+	g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
+	free(trs->trso_buffer, M_TR_RAID1E);
+	trs->trso_buffer = NULL;
+	trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
+	trs->trso_type = TR_RAID1E_NONE;
+	trs->trso_recover_slabs = 0;
+	trs->trso_failed_sd = NULL;
+	g_raid_tr_update_state_raid1e(vol, NULL);
+}
+
+static void
+g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
+{
+	struct g_raid_tr_raid1e_object *trs;
+	struct g_raid_subdisk *sd;
+
+	trs = (struct g_raid_tr_raid1e_object *)tr;
+	sd = trs->trso_failed_sd;
+	G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
+	    "Subdisk %s:%d-%s rebuild completed.",
+	    sd->sd_volume->v_name, sd->sd_pos,
+	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
+	g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
+	sd->sd_rebuild_pos = 0;
+	g_raid_tr_raid1e_rebuild_done(trs);
+}
+
+static void
+g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
+{
+	struct g_raid_tr_raid1e_object *trs;
+	struct g_raid_subdisk *sd;
+	struct g_raid_volume *vol;
+
+	vol = tr->tro_volume;
+	trs = (struct g_raid_tr_raid1e_object *)tr;
+	sd = trs->trso_failed_sd;
+	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
+		G_RAID_DEBUG1(1, vol->v_softc,
+		    "Subdisk %s:%d-%s rebuild is aborting.",
+		    sd->sd_volume->v_name, sd->sd_pos,
+		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
+		trs->trso_flags |= TR_RAID1E_F_ABORT;
+	} else {
+		G_RAID_DEBUG1(0, vol->v_softc,
+		    "Subdisk %s:%d-%s rebuild aborted.",
+		    sd->sd_volume->v_name, sd->sd_pos,
+		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
+		trs->trso_flags &= ~TR_RAID1E_F_ABORT;
+		if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
+			trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
+			g_raid_unlock_range(tr->tro_volume,
+			    trs->trso_lock_pos, trs->trso_lock_len);
+		}
+		g_raid_tr_raid1e_rebuild_done(trs);
+	}
+}
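The rebuild loop in g_raid_tr_raid1e_rebuild_some() below advances in slab-sized steps, clamped so a step never runs past the subdisk and, with an odd disk count, never crosses a strip boundary. A simplified sketch of that clamping (the values are invented, and the in-strip start is derived directly from the position rather than via V2P as the driver does):

	#include <stdio.h>

	static long long
	min64(long long a, long long b)
	{
		return (a < b ? a : b);
	}

	int
	main(void)
	{
		long long slab = 1LL << 20;		/* rebuild_slab_size */
		long long strip = 64 << 10;		/* hypothetical strip size */
		long long size = 3LL << 20;		/* hypothetical subdisk size */
		long long pos = size - (32 << 10);	/* rebuild near the end */
		long long start = pos % strip;		/* offset inside this strip */
		long long len;

		/* Never run past the end of the subdisk... */
		len = min64(slab, size - pos);
		/* ...and with an odd disk count, stop at the strip boundary. */
		len = min64(len, strip - start);
		printf("rebuild %lld KiB this cycle\n", len >> 10);
		return (0);
	}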
+
+static void
+g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
+{
+	struct g_raid_tr_raid1e_object *trs;
+	struct g_raid_softc *sc;
+	struct g_raid_volume *vol;
+	struct g_raid_subdisk *sd;
+	struct bio *bp;
+	off_t len, virtual, vend, offset, start;
+	int disk, copy, best;
+
+	trs = (struct g_raid_tr_raid1e_object *)tr;
+	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
+		return;
+	vol = tr->tro_volume;
+	sc = vol->v_softc;
+	sd = trs->trso_failed_sd;
+
+	while (1) {
+		if (sd->sd_rebuild_pos >= sd->sd_size) {
+			g_raid_tr_raid1e_rebuild_finish(tr);
+			return;
+		}
+		/* Get virtual offset from physical rebuild position. */
+		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
+		/* Get physical offset back to get first stripe position. */
+		V2P(vol, virtual, &disk, &offset, &start);
+		/* Calculate contiguous data length. */
+		len = MIN(g_raid1e_rebuild_slab,
+		    sd->sd_size - sd->sd_rebuild_pos);
+		if ((vol->v_disks_count % N) != 0)
+			len = MIN(len, vol->v_strip_size - start);
+		/* Find disk with most accurate data. */
+		best = g_raid_tr_raid1e_select_read_disk(vol, disk,
+		    offset + start, len, 0);
+		if (best < 0) {
+			/* There is no valid disk. */
+			g_raid_tr_raid1e_rebuild_abort(tr);
+			return;
+		} else if (best != copy) {
+			/* Some other disk has better data. */
+			break;
+		}
+		/* We have the most accurate data.  Skip the range. */
+		G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
+		    sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
+		sd->sd_rebuild_pos += len;
+	}
+
+	bp = &trs->trso_bio;
+	memset(bp, 0, sizeof(*bp));
+	bp->bio_offset = offset + start +
+	    ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
+	bp->bio_length = len;
+	bp->bio_data = trs->trso_buffer;
+	bp->bio_cmd = BIO_READ;
+	bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
+	bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
+	G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
+	/*
+	 * If we are crossing a stripe boundary, correct the affected virtual
+	 * range we should lock.
+	 */
+	if (start + len > vol->v_strip_size) {
+		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
+		len = vend - virtual;
+	}
+	trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
+	trs->trso_flags |= TR_RAID1E_F_LOCKED;
+	trs->trso_lock_pos = virtual;
+	trs->trso_lock_len = len;
+	/* Lock callback starts I/O. */
+	g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
+}
+
+static void
+g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
+{
+	struct g_raid_volume *vol;
+	struct g_raid_tr_raid1e_object *trs;
+	struct g_raid_subdisk *sd;
+
+	vol = tr->tro_volume;
+	trs = (struct g_raid_tr_raid1e_object *)tr;
+	if (trs->trso_failed_sd) {
+		G_RAID_DEBUG1(1, vol->v_softc,
+		    "Rebuild already in progress. pos %jd\n",
+		    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
+		return;
+	}
+	sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
+	if (sd == NULL)
+		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
+	if (sd == NULL) {
+		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
+		if (sd != NULL) {
+			sd->sd_rebuild_pos = 0;
+			g_raid_change_subdisk_state(sd,
+			    G_RAID_SUBDISK_S_RESYNC);
+			g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
+		} else {
+			sd = g_raid_get_subdisk(vol,
+			    G_RAID_SUBDISK_S_UNINITIALIZED);
+			if (sd == NULL)
+				sd = g_raid_get_subdisk(vol,
+				    G_RAID_SUBDISK_S_NEW);
+			if (sd != NULL) {
+				sd->sd_rebuild_pos = 0;
+				g_raid_change_subdisk_state(sd,
+				    G_RAID_SUBDISK_S_REBUILD);
+				g_raid_write_metadata(vol->v_softc,
+				    vol, sd, NULL);
+			}
+		}
+	}
+	if (sd == NULL) {
+		G_RAID_DEBUG1(1, vol->v_softc,
+		    "No failed disk to rebuild.  Night night.");
+		return;
+	}
+	trs->trso_failed_sd = sd;
+	G_RAID_DEBUG1(0, vol->v_softc,
+	    "Subdisk %s:%d-%s rebuild start at %jd.",
+	    sd->sd_volume->v_name, sd->sd_pos,
+	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
+	    trs->trso_failed_sd->sd_rebuild_pos);
+	trs->trso_type = TR_RAID1E_REBUILD;
+	trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
+	trs->trso_meta_update = g_raid1e_rebuild_meta_update;
+	g_raid_tr_raid1e_rebuild_some(tr);
+}
+
+static void
+g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
+    struct g_raid_subdisk *sd)
+{
+	struct g_raid_volume *vol;
+	struct g_raid_tr_raid1e_object *trs;
+	int nr;
+
+	vol = tr->tro_volume;
+	trs = (struct g_raid_tr_raid1e_object *)tr;
+	if (trs->trso_stopping)
+		return;
+	nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
+	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
+	switch (trs->trso_type) {
+	case TR_RAID1E_NONE:
+		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
+			return;
+		if (nr == 0) {
+			nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
+			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
+			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
+			if (nr == 0)
+				return;
+		}
+		g_raid_tr_raid1e_rebuild_start(tr);
+		break;
+	case TR_RAID1E_REBUILD:
+		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
+		    trs->trso_failed_sd == sd)
+			g_raid_tr_raid1e_rebuild_abort(tr);
+		break;
+	case TR_RAID1E_RESYNC:
+		break;
+	}
+}
+
+static int
+g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
+    struct g_raid_subdisk *sd, u_int event)
+{
+
+	g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
+	return (0);
+}
+
+static int
+g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
+{
+	struct g_raid_tr_raid1e_object *trs;
+	struct g_raid_volume *vol;
+
+	trs = (struct g_raid_tr_raid1e_object *)tr;
+	vol = tr->tro_volume;
+	trs->trso_starting = 0;
+	g_raid_tr_update_state_raid1e(vol, NULL);
+	return (0);
+}
+
+static int
+g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
+{
+	struct g_raid_tr_raid1e_object *trs;
+	struct g_raid_volume *vol;
+
+	trs = (struct g_raid_tr_raid1e_object *)tr;
+	vol = tr->tro_volume;
+	trs->trso_starting = 0;
+	trs->trso_stopping = 1;
+	g_raid_tr_update_state_raid1e(vol, NULL);
+	return (0);
+}
+
+/*
+ * Select the disk to read from.  Take into account: subdisk state, running
+ * error recovery, average disk load, head position and possible cache hits.
+ */
+#define ABS(x)		(((x) >= 0) ? (x) : (-(x)))
+static int
+g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
+    int no, off_t off, off_t len, u_int mask)
+{
+	struct g_raid_subdisk *sd;
+	off_t offset;
+	int i, best, prio, bestprio;
+
+	best = -1;
+	bestprio = INT_MAX;
+	for (i = 0; i < N; i++) {
+		sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
+		offset = off;
+		if (no + i >= vol->v_disks_count)
+			offset += vol->v_strip_size;
+
+		prio = G_RAID_SUBDISK_LOAD(sd);
+		if ((mask & (1 << sd->sd_pos)) != 0)
+			continue;
+		switch (sd->sd_state) {
+		case G_RAID_SUBDISK_S_ACTIVE:
+			break;
+		case G_RAID_SUBDISK_S_RESYNC:
+			if (offset + len < sd->sd_rebuild_pos)
+				break;
+			/* FALLTHROUGH */
+		case G_RAID_SUBDISK_S_STALE:
+			prio += i << 24;
+			break;
+		case G_RAID_SUBDISK_S_REBUILD:
+			if (offset + len < sd->sd_rebuild_pos)
+				break;
+			/* FALLTHROUGH */
+		default:
+			continue;
+		}
+		prio += min(sd->sd_recovery, 255) << 16;
+		/* If disk head is precisely in position - highly prefer it. */
+		if (G_RAID_SUBDISK_POS(sd) == offset)
+			prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
+		else
+		/* If disk head is close to position - prefer it. */
+		if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
+		    G_RAID_SUBDISK_TRACK_SIZE)
+			prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
+		if (prio < bestprio) {
+			bestprio = prio;
+			best = i;
+		}
+	}
+	return (best);
+}
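The scoring above packs several criteria into one integer: the copy index for stale copies in the top byte, the recovery count in the next, and the raw load below that, with bonuses for head locality. A standalone rendition of the same computation, with invented stand-ins for the driver's load macros:

	#include <stdio.h>
	#include <limits.h>

	#define LOAD_SCALE 256			/* stands in for G_RAID_SUBDISK_LOAD_SCALE */
	#define TRACK_SIZE (2 * 1024 * 1024)	/* stands in for G_RAID_SUBDISK_TRACK_SIZE */

	struct cand {
		int load;		/* outstanding I/O load */
		int recovery;		/* recoveries active on this subdisk */
		int stale;		/* copy not fully in sync */
		long long head, off;	/* last head position vs. request offset */
	};

	static int
	prio(const struct cand *c, int i)
	{
		long long d = c->head - c->off;
		int p = c->load;

		if (c->stale)
			p += i << 24;		/* strongly prefer in-sync copies */
		p += (c->recovery < 255 ? c->recovery : 255) << 16;
		if (d == 0)
			p -= 2 * LOAD_SCALE;	/* head exactly in position */
		else if ((d < 0 ? -d : d) < TRACK_SIZE)
			p -= 1 * LOAD_SCALE;	/* head close by */
		return (p);
	}

	int
	main(void)
	{
		struct cand c0 = { 10, 0, 0, 4096, 4096 };	/* head in place */
		struct cand c1 = { 0, 0, 0, 1 << 30, 4096 };	/* head far away */

		printf("copy 0: %d, copy 1: %d -> read copy %d\n",
		    prio(&c0, 0), prio(&c1, 1),
		    prio(&c0, 0) < prio(&c1, 1) ? 0 : 1);
		return (0);
	}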
+
+static void
+g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
+{
+	struct g_raid_volume *vol;
+	struct g_raid_subdisk *sd;
+	struct bio_queue_head queue;
+	struct bio *cbp;
+	char *addr;
+	off_t offset, start, length, remain;
+	u_int strip_size;
+	int no, best;
+
+	vol = tr->tro_volume;
+	addr = bp->bio_data;
+	strip_size = vol->v_strip_size;
+	V2P(vol, bp->bio_offset, &no, &offset, &start);
+	remain = bp->bio_length;
+	bioq_init(&queue);
+	while (remain > 0) {
+		length = MIN(strip_size - start, remain);
+		best = g_raid_tr_raid1e_select_read_disk(vol,
+		    no, offset, length, 0);
+		KASSERT(best >= 0, ("No readable disk in volume %s!",
+		    vol->v_name));
+		no += best;
+		if (no >= vol->v_disks_count) {
+			no -= vol->v_disks_count;
+			offset += strip_size;
+		}
+		cbp = g_clone_bio(bp);
+		if (cbp == NULL)
+			goto failure;
+		cbp->bio_offset = offset + start;
+		cbp->bio_data = addr;
+		cbp->bio_length = length;
+		cbp->bio_caller1 = &vol->v_subdisks[no];
+		bioq_insert_tail(&queue, cbp);
+		no += N - best;
+		if (no >= vol->v_disks_count) {
+			no -= vol->v_disks_count;
+			offset += strip_size;
+		}
+		remain -= length;
+		addr += length;
+		start = 0;
+	}
+	for (cbp = bioq_first(&queue); cbp != NULL;
+	    cbp = bioq_first(&queue)) {
+		bioq_remove(&queue, cbp);
+		sd = cbp->bio_caller1;
+		cbp->bio_caller1 = NULL;
+		g_raid_subdisk_iostart(sd, cbp);
+	}
+	return;
+failure:
+	for (cbp = bioq_first(&queue); cbp != NULL;
+	    cbp = bioq_first(&queue)) {
+		bioq_remove(&queue, cbp);
+		g_destroy_bio(cbp);
+	}
+	if (bp->bio_error == 0)
+		bp->bio_error = ENOMEM;
+	g_raid_iodone(bp, bp->bio_error);
+}
+
+static void
+g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
+{
+	struct g_raid_volume *vol;
+	struct g_raid_subdisk *sd;
+	struct bio_queue_head queue;
+	struct bio *cbp;
+	char *addr;
+	off_t offset, start, length, remain;
+	u_int strip_size;
+	int i, no;
+
+	vol = tr->tro_volume;
+	addr = bp->bio_data;
+	strip_size = vol->v_strip_size;
+	V2P(vol, bp->bio_offset, &no, &offset, &start);
+	remain = bp->bio_length;
+	bioq_init(&queue);
+	while (remain > 0) {
+		length = MIN(strip_size - start, remain);
+		for (i = 0; i < N; i++) {
+			sd = &vol->v_subdisks[no];
+			switch (sd->sd_state) {
+			case G_RAID_SUBDISK_S_ACTIVE:
+			case G_RAID_SUBDISK_S_STALE:
+			case G_RAID_SUBDISK_S_RESYNC:
+				break;
+			case G_RAID_SUBDISK_S_REBUILD:
+				if (offset + start >= sd->sd_rebuild_pos)
+					goto nextdisk;
+				break;
+			default:
+				goto nextdisk;
+			}
+			cbp = g_clone_bio(bp);
+			if (cbp == NULL)
+				goto failure;
+			cbp->bio_offset = offset + start;
+			cbp->bio_data = addr;
+			cbp->bio_length = length;
+			cbp->bio_caller1 = sd;
+			bioq_insert_tail(&queue, cbp);
+nextdisk:
+			if (++no >= vol->v_disks_count) {
+				no = 0;
+				offset += strip_size;
+			}
+		}
+		remain -= length;
+		addr += length;
+		start = 0;
+	}
+	for (cbp = bioq_first(&queue); cbp != NULL;
+	    cbp = bioq_first(&queue)) {
+		bioq_remove(&queue, cbp);
+		sd = cbp->bio_caller1;
+		cbp->bio_caller1 = NULL;
+		g_raid_subdisk_iostart(sd, cbp);
+	}
+	return;
+failure:
+	for (cbp = bioq_first(&queue); cbp != NULL;
+	    cbp = bioq_first(&queue)) {
+		bioq_remove(&queue, cbp);
+		g_destroy_bio(cbp);
+	}
+	if (bp->bio_error == 0)
+		bp->bio_error = ENOMEM;
+	g_raid_iodone(bp, bp->bio_error);
+}
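Both starters above use an all-or-nothing staging pattern: every clone is queued first, and only once all clones exist are any dispatched, so a mid-request allocation failure never leaves a half-issued I/O. The same idea, reduced to plain malloc/free for illustration (names here are invented):

	#include <stdio.h>
	#include <stdlib.h>

	/* Allocate every piece up front; on any failure, roll back and bail. */
	static int
	stage_all(void **v, int n, size_t sz)
	{
		int i, j;

		for (i = 0; i < n; i++) {
			if ((v[i] = malloc(sz)) == NULL) {
				for (j = 0; j < i; j++)
					free(v[j]);	/* undo partial staging */
				return (-1);	/* ENOMEM-style failure */
			}
		}
		return (0);	/* only now is it safe to dispatch the pieces */
	}

	int
	main(void)
	{
		void *v[4];
		int i;

		if (stage_all(v, 4, 512) == 0) {
			printf("all 4 pieces staged; dispatching\n");
			for (i = 0; i < 4; i++)
				free(v[i]);
		}
		return (0);
	}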
+
+static void
+g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
+{
+	struct g_raid_volume *vol;
+	struct g_raid_tr_raid1e_object *trs;
+
+	vol = tr->tro_volume;
+	trs = (struct g_raid_tr_raid1e_object *)tr;
+	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
+	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
+	    vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
+		g_raid_iodone(bp, EIO);
+		return;
+	}
+	/*
+	 * If we're rebuilding, squeeze in rebuild activity every so often,
+	 * even when the disk is busy.  Be sure to only count real I/O
+	 * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
+	 * by this module.
+	 */
+	if (trs->trso_failed_sd != NULL &&
+	    !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
+		/* Cut short this new or already running rebuild round. */
+		trs->trso_recover_slabs = 0;
+		if (--trs->trso_fair_io <= 0) {
+			trs->trso_fair_io = g_raid1e_rebuild_fair_io;
+			g_raid_tr_raid1e_rebuild_some(tr);
+		}
+	}
+	switch (bp->bio_cmd) {
+	case BIO_READ:
+		g_raid_tr_iostart_raid1e_read(tr, bp);
+		break;
+	case BIO_WRITE:
+		g_raid_tr_iostart_raid1e_write(tr, bp);
+		break;
+	case BIO_DELETE:
+		g_raid_iodone(bp, EIO);
+		break;
+	case BIO_FLUSH:
+		g_raid_tr_flush_common(tr, bp);
+		break;
+	default:
+		KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
+		    bp->bio_cmd, vol->v_name));
+		break;
+	}
+}
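The fair-I/O counter above guarantees the rebuild forward progress even under constant load: every rebuild_fair_io user I/Os, one rebuild slab is squeezed in. A minimal sketch of that pacing with the default value of 20:

	#include <stdio.h>

	int
	main(void)
	{
		int fair_io = 20, counter = 20, bursts = 0;

		for (int io = 1; io <= 100; io++) {	/* 100 user I/Os arrive */
			if (--counter <= 0) {
				counter = fair_io;	/* reset the budget */
				bursts++;		/* ...and rebuild one slab */
			}
		}
		printf("100 user I/Os -> %d rebuild bursts\n", bursts);
		return (0);
	}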
+
+static void
+g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
+    struct g_raid_subdisk *sd, struct bio *bp)
+{
+	struct bio *cbp;
+	struct g_raid_subdisk *nsd;
+	struct g_raid_volume *vol;
+	struct bio *pbp;
+	struct g_raid_tr_raid1e_object *trs;
+	off_t virtual, offset, start;
+	uintptr_t mask;
+	int error, do_write, copy, disk, best;
+
+	trs = (struct g_raid_tr_raid1e_object *)tr;
+	vol = tr->tro_volume;
+	if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
+		if (trs->trso_type == TR_RAID1E_REBUILD) {
+			nsd = trs->trso_failed_sd;
+			if (bp->bio_cmd == BIO_READ) {
+
+				/* Immediately abort rebuild, if requested. */
+				if (trs->trso_flags & TR_RAID1E_F_ABORT) {
+					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
+					g_raid_tr_raid1e_rebuild_abort(tr);
+					return;
+				}
+
+				/* On read error, skip and cross fingers. */
+				if (bp->bio_error != 0) {
+					G_RAID_LOGREQ(0, bp,
+					    "Read error during rebuild (%d), "
+					    "possible data loss!",
+					    bp->bio_error);
+					goto rebuild_round_done;
+				}
+
+				/*
+				 * The read operation finished, queue the
+				 * write and get out.
+				 */
+				G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
+				    bp->bio_error);
+				bp->bio_cmd = BIO_WRITE;
+				bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
+				bp->bio_offset = nsd->sd_rebuild_pos;
+				G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
+				g_raid_subdisk_iostart(nsd, bp);
+			} else {
+				/*
+				 * The write operation just finished.  Do
+				 * another.  We keep cloning the master bio
+				 * since it has the right buffers allocated to
+				 * it.
+				 */
+				G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
+				    bp->bio_error);
+				if (bp->bio_error != 0 ||
+				    trs->trso_flags & TR_RAID1E_F_ABORT) {
+					if ((trs->trso_flags &
+					    TR_RAID1E_F_ABORT) == 0) {
+						g_raid_tr_raid1e_fail_disk(sd->sd_softc,
+						    nsd, nsd->sd_disk);
+					}
+					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
+					g_raid_tr_raid1e_rebuild_abort(tr);
+					return;
+				}
+rebuild_round_done:
+				trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
+				g_raid_unlock_range(tr->tro_volume,
+				    trs->trso_lock_pos, trs->trso_lock_len);
+				nsd->sd_rebuild_pos += bp->bio_length;
+				if (nsd->sd_rebuild_pos >= nsd->sd_size) {
+					g_raid_tr_raid1e_rebuild_finish(tr);
+					return;
+				}
+
+				/* Abort rebuild if we are stopping. */
+				if (trs->trso_stopping) {
+					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
+					g_raid_tr_raid1e_rebuild_abort(tr);
+					return;
+				}
+
+				if (--trs->trso_meta_update <= 0) {
+					g_raid_write_metadata(vol->v_softc,
+					    vol, nsd, nsd->sd_disk);
+					trs->trso_meta_update =
+					    g_raid1e_rebuild_meta_update;
+					/* Compensate for short rebuild I/Os. */
+					if ((vol->v_disks_count % N) != 0 &&
+					    vol->v_strip_size <
+					     g_raid1e_rebuild_slab) {
+						trs->trso_meta_update *=
+						    g_raid1e_rebuild_slab;
+						trs->trso_meta_update /=
+						    vol->v_strip_size;
+					}
+				}
+				trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
+				if (--trs->trso_recover_slabs <= 0)
+					return;
+				/* Run next rebuild iteration. */
+				g_raid_tr_raid1e_rebuild_some(tr);
+			}
+		} else if (trs->trso_type == TR_RAID1E_RESYNC) {
+			/*
+			 * Read the good sd and the bad sd in parallel.  When
+			 * both are done, compare the buffers and write the
+			 * good data to the bad sd if they differ.  Then do
+			 * the next bit of work.
+			 */
+			panic("Somehow, we think we're doing a resync");
+		}
+		return;
+	}
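The error-recovery path below threads its state through the cloned bio's bio_caller2 as a bit mask: one bit per copy that has already been tried, plus (as the code below suggests, bit 31 appears to flag a pending recovery write) a marker that a remap write should follow a successful retry. A small sketch of that layout:

	#include <stdio.h>
	#include <stdint.h>

	#define RECOVERY_PENDING (1U << 31)	/* a remap write will follow */

	int
	main(void)
	{
		uint32_t mask = 0;

		mask |= 1U << 0;		/* copy 0 already failed the read */
		mask |= RECOVERY_PENDING;	/* recovery write-back requested */
		printf("tried copy 1: %s, recovery pending: %s\n",
		    (mask & (1U << 1)) ? "yes" : "no",
		    (mask & RECOVERY_PENDING) ? "yes" : "no");
		return (0);
	}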
+	pbp = bp->bio_parent;
+	pbp->bio_inbed++;
+	mask = (intptr_t)bp->bio_caller2;
+	if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
+		/*
+		 * Read failed on first drive.  Retry the read error on
+		 * another disk drive, if available, before erroring out the
+		 * read.
+		 */
+		sd->sd_disk->d_read_errs++;
+		G_RAID_LOGREQ(0, bp,
+		    "Read error (%d), %d read errors total",
+		    bp->bio_error, sd->sd_disk->d_read_errs);
+
+		/*
+		 * If there are too many read errors, we move to degraded.
+		 * XXX Do we want to FAIL the drive (eg, make the user redo
+		 * everything to get it back in sync), or just degrade the
+		 * drive, which kicks off a resync?
+		 */
+		do_write = 0;
+		if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
+			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
+		else if (mask == 0)
+			do_write = 1;
+
+		/* Restore what we were doing. */
+		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
+		V2P(vol, virtual, &disk, &offset, &start);
+
+		/* Find the other disk, and try to do the I/O to it. */
+		mask |= 1 << copy;
+		best = g_raid_tr_raid1e_select_read_disk(vol,
+		    disk, offset, start, mask);
+		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
+			disk += best;
+			if (disk >= vol->v_disks_count) {
+				disk -= vol->v_disks_count;
+				offset += vol->v_strip_size;
+			}
+			cbp->bio_offset = offset + start;
+			cbp->bio_length = bp->bio_length;
+			cbp->bio_data = bp->bio_data;
+			g_destroy_bio(bp);
+			nsd = &vol->v_subdisks[disk];
+			G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
+			    nsd->sd_pos);
+			if (do_write)
+				mask |= 1U << 31;
+			if ((mask & (1U << 31)) != 0)
+				sd->sd_recovery++;
+			cbp->bio_caller2 = (void *)mask;
+			if (do_write) {
+				cbp->bio_caller1 = nsd;
+				/* Lock callback starts I/O. */
+				g_raid_lock_range(sd->sd_volume,
+				    virtual, cbp->bio_length, pbp, cbp);
+			} else {
+				g_raid_subdisk_iostart(nsd, cbp);
+			}
+			return;
+		}
+		/*
+		 * We can't retry.  Return the original error by falling
+		 * through.  This will happen when there's only one good disk.
+		 * We don't need to fail the raid, since its actual state is
+		 * based on the state of the subdisks.
+		 */
+		G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
+	}
+	if (bp->bio_cmd == BIO_READ &&
+	    bp->bio_error == 0 &&
+	    (mask & (1U << 31)) != 0) {
+		G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
+
+		/* Restore what we were doing. */
+		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
+		V2P(vol, virtual, &disk, &offset, &start);
+
+		/* Find best disk to write. */
+		best = g_raid_tr_raid1e_select_read_disk(vol,
+		    disk, offset, start, ~mask);
+		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
+			disk += best;
+			if (disk >= vol->v_disks_count) {
+				disk -= vol->v_disks_count;
+				offset += vol->v_strip_size;
+			}
+			cbp->bio_offset = offset + start;
+			cbp->bio_length = bp->bio_length;
+			cbp->bio_data = bp->bio_data;
+			cbp->bio_cmd = BIO_WRITE;
+			cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
+			cbp->bio_caller2 = (void *)mask;
+			g_destroy_bio(bp);
+			G_RAID_LOGREQ(2, cbp,
+			    "Attempting bad sector remap on failing drive.");
+			g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
+			return;
+		}
+	}
+	if ((mask & (1U << 31)) != 0) {
+		/*
+		 * We're done with a recovery, mark the range as unlocked.
+		 * For any write errors, we aggressively fail the disk since
+		 * there was both a READ and a WRITE error at this location.
+		 * Both types of errors generally indicate that the drive is
+		 * on the verge of total failure anyway.  Better to stop
+		 * trusting it now.  However, we need to reset error to 0 in
+		 * that case because we're not failing the original I/O which
+		 * succeeded.
+		 */
+
+		/* Restore what we were doing. */
+		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
+		V2P(vol, virtual, &disk, &offset, &start);
+
+		for (copy = 0; copy < N; copy++) {
+			if ((mask & (1 << copy)) != 0)
+				vol->v_subdisks[(disk + copy) %
+				    vol->v_disks_count].sd_recovery--;
+		}
+
+		if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
+			G_RAID_LOGREQ(0, bp, "Remap write failed: "
+			    "failing subdisk.");
+			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
+			bp->bio_error = 0;
+		}
+		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
+		g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
+	}
+	error = bp->bio_error;
+	g_destroy_bio(bp);
+	if (pbp->bio_children == pbp->bio_inbed) {
+		pbp->bio_completed = pbp->bio_length;
+		g_raid_iodone(pbp, error);
+	}
+}
+
+static int
+g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr,
+    void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
+{
+	struct g_raid_volume *vol;
+	struct g_raid_subdisk *sd;
+	struct bio_queue_head queue;
+	char *addr;
+	off_t offset, start, length, remain;
+	u_int strip_size;
+	int i, error, no;
+
+	vol = tr->tro_volume;
+	addr = virtual;
+	strip_size = vol->v_strip_size;
+	V2P(vol, boffset, &no, &offset, &start);
+	remain = blength;
+	bioq_init(&queue);
+	while (remain > 0) {
+		length = MIN(strip_size - start, remain);
+		for (i = 0; i < N; i++) {
+			sd = &vol->v_subdisks[no];
+			switch (sd->sd_state) {
+			case G_RAID_SUBDISK_S_ACTIVE:
+			case G_RAID_SUBDISK_S_STALE:
+			case G_RAID_SUBDISK_S_RESYNC:
+				break;
+			case G_RAID_SUBDISK_S_REBUILD:
+				if (offset + start >= sd->sd_rebuild_pos)
+					goto nextdisk;
+				break;
+			default:
+				goto nextdisk;
+			}
+			error = g_raid_subdisk_kerneldump(sd,
+			    addr, 0, offset + start, length);
+			if (error != 0)
+				return (error);
+nextdisk:
+			if (++no >= vol->v_disks_count) {
+				no = 0;
+				offset += strip_size;
+			}
+		}
+		remain -= length;
+		addr += length;
+		start = 0;
+	}
+	return (0);
+}
+
+static int
+g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
+{
+	struct bio *bp;
+	struct g_raid_subdisk *sd;
+
+	bp = (struct bio *)argp;
+	sd = (struct g_raid_subdisk *)bp->bio_caller1;
+	g_raid_subdisk_iostart(sd, bp);
+
+	return (0);
+}
+
+static int
+g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
+{
+	struct g_raid_tr_raid1e_object *trs;
+	struct g_raid_volume *vol;
+
+	vol = tr->tro_volume;
+	trs = (struct g_raid_tr_raid1e_object *)tr;
+	trs->trso_fair_io = g_raid1e_rebuild_fair_io;
+	trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
+	/* Compensate for short rebuild I/Os. */
+	if ((vol->v_disks_count % N) != 0 &&
+	    vol->v_strip_size < g_raid1e_rebuild_slab) {
+		trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
+		trs->trso_recover_slabs /= vol->v_strip_size;
+	}
+	if (trs->trso_type == TR_RAID1E_REBUILD)
+		g_raid_tr_raid1e_rebuild_some(tr);
+	return (0);
+}
+
+static int
+g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
+{
+	struct g_raid_tr_raid1e_object *trs;
+
+	trs = (struct g_raid_tr_raid1e_object *)tr;
+
+	if (trs->trso_buffer != NULL) {
+		free(trs->trso_buffer, M_TR_RAID1E);
+		trs->trso_buffer = NULL;
+	}
+	return (0);
+}
+
+G_RAID_TR_DECLARE(g_raid_tr_raid1e);
diff --git a/sys/modules/geom/Makefile b/sys/modules/geom/Makefile
index 0b2e3e8..ca7d7e6 100644
--- a/sys/modules/geom/Makefile
+++ b/sys/modules/geom/Makefile
@@ -18,6 +18,7 @@ SUBDIR=	geom_bde \
 	geom_nop \
 	geom_part \
 	geom_pc98 \
+	geom_raid \
 	geom_raid3 \
 	geom_sched \
 	geom_shsec \
diff --git a/sys/modules/geom/geom_raid/Makefile b/sys/modules/geom/geom_raid/Makefile
new file mode 100644
index 0000000..4487807
--- /dev/null
+++ b/sys/modules/geom/geom_raid/Makefile
@@ -0,0 +1,19 @@
+# $FreeBSD$
+
+.PATH: ${.CURDIR}/../../../geom/raid
+
+KMOD=	geom_raid
+SRCS=	g_raid.c
+SRCS+=	g_raid_ctl.c
+SRCS+=	bus_if.h device_if.h
+SRCS+=	g_raid_md_if.h g_raid_md_if.c
+SRCS+=	g_raid_tr_if.h g_raid_tr_if.c
+
+SRCS+=	md_intel.c md_jmicron.c md_nvidia.c md_promise.c md_sii.c
+
+SRCS+=	tr_concat.c tr_raid0.c tr_raid1.c tr_raid1e.c
+
+MFILES=	kern/bus_if.m kern/device_if.m
+MFILES+= geom/raid/g_raid_md_if.m geom/raid/g_raid_tr_if.m
+
+.include
--
cgit v1.1