From 8dab5b050118631ed065f01515a1e2617f8e98de Mon Sep 17 00:00:00 2001
From: mav
Date: Thu, 24 Mar 2011 21:31:32 +0000
Subject: MFgraid/head: Add new RAID GEOM class, which is going to replace
 ataraid(4) in supporting various BIOS-based software RAIDs.

Unlike ataraid(4), this implementation does not depend on the legacy ata(4)
subsystem and can be used with any disk drivers, including the new CAM-based
ones (ahci(4), siis(4), mvs(4), ata(4) with `options ATA_CAM`).

To make the code more readable and extensible, this implementation follows
a modular design: a core part plus two sets of modules implementing support
for different metadata formats and RAID levels.

Support for the following popular metadata formats is now implemented:
Intel, JMicron, NVIDIA, Promise (also used by AMD/ATI) and SiliconImage.

The following RAID levels are now supported: RAID0, RAID1, RAID1E, RAID10,
SINGLE, CONCAT.

For all of these RAID levels and metadata formats this class supports the
full cycle of volume operations: reading, writing, creation, deletion, disk
removal and insertion, rebuilding, dirty shutdown detection and
resynchronization, bad sector recovery, faulty disk tracking, and hot-spare
disks.
For the Intel and Promise formats there is support for multiple volumes per
disk set.

See the graid(8) manual page for additional details.

Co-authored by: imp
Sponsored by: Cisco Systems, Inc. and iXsystems, Inc.
---
 etc/mtree/BSD.include.dist | 2 +
 include/Makefile | 2 +-
 sbin/geom/class/Makefile | 1 +
 sbin/geom/class/raid/Makefile | 10 +
 sbin/geom/class/raid/geom_raid.c | 91 ++
 sbin/geom/class/raid/graid.8 | 266 ++++
 sys/conf/NOTES | 1 +
 sys/conf/files | 13 +
 sys/conf/options | 1 +
 sys/geom/raid/g_raid.c | 2340 +++++++++++++++++++++++++++++++++++
 sys/geom/raid/g_raid.h | 403 ++++++
 sys/geom/raid/g_raid_ctl.c | 217 ++++
 sys/geom/raid/g_raid_md_if.m | 156 +++
 sys/geom/raid/g_raid_tr_if.m | 118 ++
 sys/geom/raid/md_intel.c | 2323 ++++++++++++++++++++++++++++++++++
 sys/geom/raid/md_jmicron.c | 1582 +++++++++++++++++++++++
 sys/geom/raid/md_nvidia.c | 1607 ++++++++++++++++++++++++
 sys/geom/raid/md_promise.c | 1940 +++++++++++++++++++++++++++++
 sys/geom/raid/md_sii.c | 1692 +++++++++++++++++++++++++
 sys/geom/raid/tr_concat.c | 343 +++++
 sys/geom/raid/tr_raid0.c | 326 +++++
 sys/geom/raid/tr_raid1.c | 993 +++++++++++++++
 sys/geom/raid/tr_raid1e.c | 1227 ++++++++++++++++++
 sys/modules/geom/Makefile | 1 +
 sys/modules/geom/geom_raid/Makefile | 19 +
 25 files changed, 15673 insertions(+), 1 deletion(-)
 create mode 100644 sbin/geom/class/raid/Makefile
 create mode 100644 sbin/geom/class/raid/geom_raid.c
 create mode 100644 sbin/geom/class/raid/graid.8
 create mode 100644 sys/geom/raid/g_raid.c
 create mode 100644 sys/geom/raid/g_raid.h
 create mode 100644 sys/geom/raid/g_raid_ctl.c
 create mode 100644 sys/geom/raid/g_raid_md_if.m
 create mode 100644 sys/geom/raid/g_raid_tr_if.m
 create mode 100644 sys/geom/raid/md_intel.c
 create mode 100644 sys/geom/raid/md_jmicron.c
 create mode 100644 sys/geom/raid/md_nvidia.c
 create mode 100644 sys/geom/raid/md_promise.c
 create mode 100644 sys/geom/raid/md_sii.c
 create mode 100644 sys/geom/raid/tr_concat.c
 create mode 100644 sys/geom/raid/tr_raid0.c
 create mode 100644 sys/geom/raid/tr_raid1.c
 create mode 100644 sys/geom/raid/tr_raid1e.c
 create mode 100644 sys/modules/geom/geom_raid/Makefile

diff --git a/etc/mtree/BSD.include.dist b/etc/mtree/BSD.include.dist
index b227bdb..a19eddc 100644
--- a/etc/mtree/BSD.include.dist
+++ b/etc/mtree/BSD.include.dist
@@ -190,6 +190,8 @@
         ..
         nop
         ..
+        raid
+        ..
         raid3
         ..
shsec diff --git a/include/Makefile b/include/Makefile index 249db95..9bcced5 100644 --- a/include/Makefile +++ b/include/Makefile @@ -47,7 +47,7 @@ LSUBDIRS= cam/ata cam/scsi \ ${_fs_nwfs} fs/portalfs fs/procfs fs/smbfs fs/udf fs/unionfs \ geom/cache geom/concat geom/eli geom/gate geom/journal geom/label \ geom/mirror geom/mountver geom/multipath geom/nop \ - geom/raid3 geom/shsec geom/stripe geom/virstor \ + geom/raid geom/raid3 geom/shsec geom/stripe geom/virstor \ netgraph/atm netgraph/netflow \ security/audit \ security/mac_biba security/mac_bsdextended security/mac_lomac \ diff --git a/sbin/geom/class/Makefile b/sbin/geom/class/Makefile index 0611cdd..912561f 100644 --- a/sbin/geom/class/Makefile +++ b/sbin/geom/class/Makefile @@ -14,6 +14,7 @@ SUBDIR+=mountver SUBDIR+=multipath SUBDIR+=nop SUBDIR+=part +SUBDIR+=raid SUBDIR+=raid3 SUBDIR+=sched SUBDIR+=shsec diff --git a/sbin/geom/class/raid/Makefile b/sbin/geom/class/raid/Makefile new file mode 100644 index 0000000..743f690 --- /dev/null +++ b/sbin/geom/class/raid/Makefile @@ -0,0 +1,10 @@ +# $FreeBSD$ + +.PATH: ${.CURDIR}/../../misc + +GEOM_CLASS= raid + +DPADD= ${LIBMD} +LDADD= -lmd + +.include diff --git a/sbin/geom/class/raid/geom_raid.c b/sbin/geom/class/raid/geom_raid.c new file mode 100644 index 0000000..2f16295 --- /dev/null +++ b/sbin/geom/class/raid/geom_raid.c @@ -0,0 +1,91 @@ +/*- + * Copyright (c) 2010 Alexander Motin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +uint32_t lib_version = G_LIB_VERSION; +uint32_t version = G_RAID_VERSION; + +struct g_command class_commands[] = { + { "label", G_FLAG_VERBOSE, NULL, + { + { 'f', "force", NULL, G_TYPE_BOOL }, + { 'S', "size", G_VAL_OPTIONAL, G_TYPE_NUMBER }, + { 's', "strip", G_VAL_OPTIONAL, G_TYPE_NUMBER }, + G_OPT_SENTINEL + }, + "[-fv] [-S size] [-s stripsize] format label level prov ..." 
+	},
+	{ "add", G_FLAG_VERBOSE, NULL,
+	    {
+		{ 'f', "force", NULL, G_TYPE_BOOL },
+		{ 'S', "size", G_VAL_OPTIONAL, G_TYPE_NUMBER },
+		{ 's', "strip", G_VAL_OPTIONAL, G_TYPE_NUMBER },
+		G_OPT_SENTINEL
+	    },
+	    "[-fv] [-S size] [-s stripsize] name label level"
+	},
+	{ "delete", G_FLAG_VERBOSE, NULL,
+	    {
+		{ 'f', "force", NULL, G_TYPE_BOOL },
+		G_OPT_SENTINEL
+	    },
+	    "[-fv] name [label|num]"
+	},
+	{ "insert", G_FLAG_VERBOSE, NULL, G_NULL_OPTS,
+	    "[-v] name prov ..."
+	},
+	{ "remove", G_FLAG_VERBOSE, NULL, G_NULL_OPTS,
+	    "[-v] name prov ..."
+	},
+	{ "fail", G_FLAG_VERBOSE, NULL, G_NULL_OPTS,
+	    "[-v] name prov ..."
+	},
+	{ "stop", G_FLAG_VERBOSE, NULL,
+	    {
+		{ 'f', "force", NULL, G_TYPE_BOOL },
+		G_OPT_SENTINEL
+	    },
+	    "[-fv] name"
+	},
+	G_CMD_SENTINEL
+};
+
diff --git a/sbin/geom/class/raid/graid.8 b/sbin/geom/class/raid/graid.8
new file mode 100644
index 0000000..d1c92a2
--- /dev/null
+++ b/sbin/geom/class/raid/graid.8
@@ -0,0 +1,266 @@
+.\" Copyright (c) 2010 Alexander Motin
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd March 22, 2011
+.Dt GRAID 8
+.Os
+.Sh NAME
+.Nm graid
+.Nd "control utility for software RAID devices"
+.Sh SYNOPSIS
+.Nm
+.Cm label
+.Op Fl f
+.Op Fl S Ar size
+.Op Fl s Ar strip
+.Ar format
+.Ar label
+.Ar level
+.Ar prov ...
+.Nm
+.Cm add
+.Op Fl f
+.Op Fl S Ar size
+.Op Fl s Ar strip
+.Ar name
+.Ar label
+.Ar level
+.Nm
+.Cm delete
+.Op Fl f
+.Ar name
+.Op Ar label | Ar num
+.Nm
+.Cm insert
+.Ar name
+.Ar prov ...
+.Nm
+.Cm remove
+.Ar name
+.Ar prov ...
+.Nm
+.Cm fail
+.Ar name
+.Ar prov ...
+.Nm
+.Cm stop
+.Op Fl fv
+.Ar name ...
+.Nm
+.Cm list
+.Nm
+.Cm status
+.Nm
+.Cm load
+.Nm
+.Cm unload
+.Sh DESCRIPTION
+The
+.Nm
+utility is used to manage software RAID configurations supported by the
+GEOM RAID class.
+The GEOM RAID class uses on-disk metadata to provide access to software-RAID
+volumes defined by different RAID BIOSes.
+Depending on the RAID BIOS type and its metadata format, different subsets of
+configurations and features are supported.
+To allow booting from a RAID volume, the metadata format should match the
+RAID BIOS type and its capabilities.
+To guarantee that these match, it is recommended to create volumes via the
+RAID BIOS interface; experienced users may instead create them with this
+utility.
+.Pp
+The first argument to
+.Nm
+indicates an action to be performed:
+.Bl -tag -width ".Cm destroy"
+.It Cm label
+Create an array with a single volume.
+The
+.Ar format
+argument specifies the on-disk metadata format to use for this array,
+such as "Intel".
+The
+.Ar label
+argument specifies the label of the created volume.
+The
+.Ar level
+argument specifies the RAID level of the created volume, such as:
+"RAID0", "RAID1", etc.
+The subsequent list enumerates providers to use as array components.
+The special name "NONE" can be used to reserve space for absent disks.
+The order of components can be important, depending on the specific RAID level
+and metadata format.
+For example, "graid label Intel data RAID1 ada0 ada1" creates a mirrored
+volume labeled "data" with Intel metadata on the two given disks.
+.Pp
+Additional options include:
+.Bl -tag -width ".Fl s Ar strip"
+.It Fl f
+Force creation of the specified configuration if it is officially
+unsupported but technically possible.
+.It Fl S Ar size
+Use
+.Ar size
+bytes on each component for this volume.
+Should be used if several volumes per array are planned, or if smaller
+components are going to be inserted later.
+Defaults to the size of the smallest component.
+.It Fl s Ar strip
+Specifies the strip size in bytes.
+Defaults to 131072.
+.El
+.It Cm add
+Create another volume on the existing array.
+The
+.Ar name
+argument is the name of the existing array, as reported by the label command.
+The rest of the arguments are the same as for the label command.
+.It Cm delete
+Delete volume(s) from the existing array.
+When the last volume is deleted, the array is also deleted and its metadata
+erased.
+The
+.Ar name
+argument is the name of the existing array.
+The optional
+.Ar label
+or
+.Ar num
+argument specifies the volume to delete.
+.Pp
+Additional options include:
+.Bl -tag -width ".Fl f"
+.It Fl f
+Delete the volume(s) even if they are still open.
+.El
+.It Cm insert
+Insert the specified provider(s) into the specified array in place of the
+first missing or failed components.
+If there are no such components, mark the disk(s) as spare.
+.It Cm remove
+Remove the specified provider(s) from the specified array and erase metadata.
+If there are spare disks present, the removed disk(s) will be replaced by
+spares.
+.It Cm fail
+Mark the given disk(s) as failed, removing them from active use unless
+absolutely necessary due to exhausted redundancy.
+If there are spare disks present, the failed disk(s) will be replaced with
+one of them.
+.It Cm stop
+Stop the given array.
+The metadata will not be erased.
+.Pp
+Additional options include:
+.Bl -tag -width ".Fl f"
+.It Fl f
+Stop the given array even if some of its volumes are open.
+.El
+.It Cm list
+See
+.Xr geom 8 .
+.It Cm status
+See
+.Xr geom 8 .
+.It Cm load
+See
+.Xr geom 8 .
+.It Cm unload
+See
+.Xr geom 8 .
+.El
+.Pp
+Additional options include:
+.Bl -tag -width ".Fl v"
+.It Fl v
+Be more verbose.
+.El
+.Sh SUPPORTED METADATA FORMATS
+The GEOM RAID class follows a modular design, allowing different metadata
+formats to be used.
+Support is currently implemented for the following formats:
+.Bl -tag -width "Intel"
+.It Intel
+The format used by Intel RAID BIOS.
+Supports up to two volumes per array.
+Supports configurations: RAID0 (2+ disks), RAID1 (2 disks),
+RAID5 (3+ disks), RAID10 (4 disks).
+Configurations not supported by Intel RAID BIOS, but enforceable at your own
+risk: RAID1 (3+ disks), RAID1E (3+ disks), RAID10 (6+ disks).
+.It JMicron
+The format used by JMicron RAID BIOS.
+Supports one volume per array.
+Supports configurations: RAID0 (2+ disks), RAID1 (2 disks),
+RAID10 (4 disks), CONCAT (2+ disks).
+Configurations not supported by JMicron RAID BIOS, but enforceable at your own
+risk: RAID1 (3+ disks), RAID1E (3+ disks), RAID10 (6+ disks), RAID5 (3+ disks).
+.It NVIDIA
+The format used by NVIDIA MediaShield RAID BIOS.
+Supports one volume per array.
+Supports configurations: RAID0 (2+ disks), RAID1 (2 disks),
+RAID5 (3+ disks), RAID10 (4+ disks), SINGLE (1 disk), CONCAT (2+ disks).
+Configurations not supported by NVIDIA MediaShield RAID BIOS, but enforceable
+at your own risk: RAID1 (3+ disks).
+.It Promise
+The format used by Promise and AMD/ATI RAID BIOSes and the FreeBSD ataraid(4)
+driver.
+Supports multiple volumes per array.
+Each disk can be split to be used by up to two arbitrary volumes.
+Supports configurations: RAID0 (2+ disks), RAID1 (2 disks),
+RAID5 (3+ disks), RAID10 (4 disks), SINGLE (1 disk), CONCAT (2+ disks).
+Configurations not supported by RAID BIOSes, but enforceable at your
+own risk: RAID1 (3+ disks), RAID10 (6+ disks).
+.It SiI
+The format used by SiliconImage RAID BIOS.
+Supports one volume per array.
+Supports configurations: RAID0 (2+ disks), RAID1 (2 disks),
+RAID5 (3+ disks), RAID10 (4 disks), SINGLE (1 disk), CONCAT (2+ disks).
+Configurations not supported by SiliconImage RAID BIOS, but enforceable at
+your own risk: RAID1 (3+ disks), RAID10 (6+ disks).
+.El
+.Sh SUPPORTED RAID LEVELS
+The GEOM RAID class follows a modular design, allowing different RAID levels
+to be used.
+Support for the following RAID levels is currently implemented: RAID0, RAID1,
+RAID1E, RAID10, SINGLE, CONCAT.
+.Sh RAID LEVEL MIGRATION
+The GEOM RAID class has no support for RAID level migration, which some
+metadata formats allow.
+If a migration was started from the BIOS or in some other way, make sure to
+complete it there.
+Do not run the GEOM RAID class on migrating volumes, on pain of possible data
+corruption!
+.Sh EXIT STATUS
+Exit status is 0 on success, and non-zero if the command fails.
+.Sh SEE ALSO
+.Xr geom 4 ,
+.Xr geom 8 ,
+.Xr vinum 8
+.Sh HISTORY
+The
+.Nm
+utility appeared in
+.Fx 9.0 .
+.Sh AUTHORS
+.An Alexander Motin Aq mav@FreeBSD.org
+.An M. Warner Losh Aq imp@FreeBSD.org
diff --git a/sys/conf/NOTES b/sys/conf/NOTES
index cf8064f..851b9b8 100644
--- a/sys/conf/NOTES
+++ b/sys/conf/NOTES
@@ -163,6 +163,7 @@ options 	GEOM_PART_MBR		# MBR partitioning
 options 	GEOM_PART_PC98		# PC-9800 disk partitioning
 options 	GEOM_PART_VTOC8		# SMI VTOC8 disk label
 options 	GEOM_PC98		# NEC PC9800 partitioning
+options 	GEOM_RAID		# Soft RAID functionality.
 options 	GEOM_RAID3		# RAID3 functionality.
 options 	GEOM_SHSEC		# Shared secret.
 options 	GEOM_STRIPE		# Disk striping.
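The sys/conf/files additions below wire in the modular split the commit message describes: a core (g_raid.c), two kobj(9) interfaces (g_raid_md_if.m for metadata modules, g_raid_tr_if.m for transformation modules), and pluggable md_*.c / tr_*.c implementations. As a reading aid, here is a minimal, hypothetical sketch of the registration pattern a transformation module follows. The lowercase method names mirror the G_RAID_TR_* calls visible in g_raid.c below; everything containing "example" is a placeholder, G_RAID_TR_TASTE_SUCCEED is assumed from g_raid.h, and the declaration macro that links a class into g_raid_tr_classes is omitted.

/*
 * Hypothetical sketch (not part of this commit): skeleton of a
 * transformation module.  The g_raid_tr_*_t typedefs are generated
 * from g_raid_tr_if.m at build time.
 */
static g_raid_tr_taste_t g_raid_tr_taste_example;
static g_raid_tr_iostart_t g_raid_tr_iostart_example;	/* body omitted */
static g_raid_tr_iodone_t g_raid_tr_iodone_example;	/* body omitted */

static int
g_raid_tr_taste_example(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
{

	/* Claim only the RAID level this module implements. */
	if (vol->v_raid_level != G_RAID_VOLUME_RL_SINGLE)
		return (G_RAID_TR_TASTE_FAIL);
	return (G_RAID_TR_TASTE_SUCCEED);	/* assumed success constant */
}

/* The kobj method table binds g_raid_tr_if.m methods to this module. */
static kobj_method_t g_raid_tr_example_methods[] = {
	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_example),
	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_example),
	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_example),
	KOBJMETHOD_END
};

static struct g_raid_tr_class g_raid_tr_example_class = {
	"EXAMPLE",
	g_raid_tr_example_methods,
	sizeof(struct g_raid_tr_object),
};

g_raid_start_volume() in g_raid.c walks g_raid_tr_classes, creates a kobj object for each class, and keeps the first one whose taste method does not return G_RAID_TR_TASTE_FAIL.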
diff --git a/sys/conf/files b/sys/conf/files index 8af90a4..bced838 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -2115,6 +2115,19 @@ geom/part/g_part_gpt.c optional geom_part_gpt geom/part/g_part_mbr.c optional geom_part_mbr geom/part/g_part_pc98.c optional geom_part_pc98 geom/part/g_part_vtoc8.c optional geom_part_vtoc8 +geom/raid/g_raid.c optional geom_raid +geom/raid/g_raid_ctl.c optional geom_raid +geom/raid/g_raid_md_if.m optional geom_raid +geom/raid/g_raid_tr_if.m optional geom_raid +geom/raid/md_intel.c optional geom_raid +geom/raid/md_jmicron.c optional geom_raid +geom/raid/md_nvidia.c optional geom_raid +geom/raid/md_promise.c optional geom_raid +geom/raid/md_sii.c optional geom_raid +geom/raid/tr_concat.c optional geom_raid +geom/raid/tr_raid0.c optional geom_raid +geom/raid/tr_raid1.c optional geom_raid +geom/raid/tr_raid1e.c optional geom_raid geom/raid3/g_raid3.c optional geom_raid3 geom/raid3/g_raid3_ctl.c optional geom_raid3 geom/shsec/g_shsec.c optional geom_shsec diff --git a/sys/conf/options b/sys/conf/options index b3642e9..a507d69 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -102,6 +102,7 @@ GEOM_PART_MBR opt_geom.h GEOM_PART_PC98 opt_geom.h GEOM_PART_VTOC8 opt_geom.h GEOM_PC98 opt_geom.h +GEOM_RAID opt_geom.h GEOM_RAID3 opt_geom.h GEOM_SHSEC opt_geom.h GEOM_STRIPE opt_geom.h diff --git a/sys/geom/raid/g_raid.c b/sys/geom/raid/g_raid.c new file mode 100644 index 0000000..eebb360 --- /dev/null +++ b/sys/geom/raid/g_raid.c @@ -0,0 +1,2340 @@ +/*- + * Copyright (c) 2010 Alexander Motin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "g_raid_md_if.h" +#include "g_raid_tr_if.h" + +static MALLOC_DEFINE(M_RAID, "raid_data", "GEOM_RAID Data"); + +SYSCTL_DECL(_kern_geom); +SYSCTL_NODE(_kern_geom, OID_AUTO, raid, CTLFLAG_RW, 0, "GEOM_RAID stuff"); +u_int g_raid_aggressive_spare = 0; +TUNABLE_INT("kern.geom.raid.aggressive_spare", &g_raid_aggressive_spare); +SYSCTL_UINT(_kern_geom_raid, OID_AUTO, aggressive_spare, CTLFLAG_RW, + &g_raid_aggressive_spare, 0, "Use disks without metadata as spare"); +u_int g_raid_debug = 2; +TUNABLE_INT("kern.geom.raid.debug", &g_raid_debug); +SYSCTL_UINT(_kern_geom_raid, OID_AUTO, debug, CTLFLAG_RW, &g_raid_debug, 0, + "Debug level"); +int g_raid_read_err_thresh = 10; +TUNABLE_INT("kern.geom.raid.read_err_thresh", &g_raid_read_err_thresh); +SYSCTL_UINT(_kern_geom_raid, OID_AUTO, read_err_thresh, CTLFLAG_RW, + &g_raid_read_err_thresh, 0, + "Number of read errors equated to disk failure"); +u_int g_raid_start_timeout = 30; +TUNABLE_INT("kern.geom.raid.start_timeout", &g_raid_start_timeout); +SYSCTL_UINT(_kern_geom_raid, OID_AUTO, start_timeout, CTLFLAG_RW, + &g_raid_start_timeout, 0, + "Time to wait for all array components"); +static u_int g_raid_clean_time = 5; +TUNABLE_INT("kern.geom.raid.clean_time", &g_raid_clean_time); +SYSCTL_UINT(_kern_geom_raid, OID_AUTO, clean_time, CTLFLAG_RW, + &g_raid_clean_time, 0, "Mark volume as clean when idling"); +static u_int g_raid_disconnect_on_failure = 1; +TUNABLE_INT("kern.geom.raid.disconnect_on_failure", + &g_raid_disconnect_on_failure); +SYSCTL_UINT(_kern_geom_raid, OID_AUTO, disconnect_on_failure, CTLFLAG_RW, + &g_raid_disconnect_on_failure, 0, "Disconnect component on I/O failure."); +static u_int g_raid_name_format = 0; +TUNABLE_INT("kern.geom.raid.name_format", &g_raid_name_format); +SYSCTL_UINT(_kern_geom_raid, OID_AUTO, name_format, CTLFLAG_RW, + &g_raid_name_format, 0, "Providers name format."); +static u_int g_raid_idle_threshold = 1000000; +TUNABLE_INT("kern.geom.raid.idle_threshold", &g_raid_idle_threshold); +SYSCTL_UINT(_kern_geom_raid, OID_AUTO, idle_threshold, CTLFLAG_RW, + &g_raid_idle_threshold, 1000000, + "Time in microseconds to consider a volume idle."); + +#define MSLEEP(rv, ident, mtx, priority, wmesg, timeout) do { \ + G_RAID_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \ + rv = msleep((ident), (mtx), (priority), (wmesg), (timeout)); \ + G_RAID_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \ +} while (0) + +LIST_HEAD(, g_raid_md_class) g_raid_md_classes = + LIST_HEAD_INITIALIZER(g_raid_md_classes); + +LIST_HEAD(, g_raid_tr_class) g_raid_tr_classes = + LIST_HEAD_INITIALIZER(g_raid_tr_classes); + +LIST_HEAD(, g_raid_volume) g_raid_volumes = + LIST_HEAD_INITIALIZER(g_raid_volumes); + +static eventhandler_tag g_raid_pre_sync = NULL; +static int g_raid_started = 0; + +static int g_raid_destroy_geom(struct gctl_req *req, struct g_class *mp, + struct g_geom *gp); +static g_taste_t g_raid_taste; +static void g_raid_init(struct g_class *mp); +static void g_raid_fini(struct g_class *mp); + +struct g_class g_raid_class = { + .name = G_RAID_CLASS_NAME, + .version = G_VERSION, + .ctlreq = g_raid_ctl, + .taste = g_raid_taste, + .destroy_geom = g_raid_destroy_geom, + .init = g_raid_init, + .fini = g_raid_fini +}; + +static void g_raid_destroy_provider(struct g_raid_volume *vol); +static int g_raid_update_disk(struct 
g_raid_disk *disk, u_int event); +static int g_raid_update_subdisk(struct g_raid_subdisk *subdisk, u_int event); +static int g_raid_update_volume(struct g_raid_volume *vol, u_int event); +static int g_raid_update_node(struct g_raid_softc *sc, u_int event); +static void g_raid_dumpconf(struct sbuf *sb, const char *indent, + struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); +static void g_raid_start(struct bio *bp); +static void g_raid_start_request(struct bio *bp); +static void g_raid_disk_done(struct bio *bp); +static void g_raid_poll(struct g_raid_softc *sc); + +static const char * +g_raid_node_event2str(int event) +{ + + switch (event) { + case G_RAID_NODE_E_WAKE: + return ("WAKE"); + case G_RAID_NODE_E_START: + return ("START"); + default: + return ("INVALID"); + } +} + +const char * +g_raid_disk_state2str(int state) +{ + + switch (state) { + case G_RAID_DISK_S_NONE: + return ("NONE"); + case G_RAID_DISK_S_OFFLINE: + return ("OFFLINE"); + case G_RAID_DISK_S_FAILED: + return ("FAILED"); + case G_RAID_DISK_S_STALE_FAILED: + return ("STALE_FAILED"); + case G_RAID_DISK_S_SPARE: + return ("SPARE"); + case G_RAID_DISK_S_STALE: + return ("STALE"); + case G_RAID_DISK_S_ACTIVE: + return ("ACTIVE"); + default: + return ("INVALID"); + } +} + +static const char * +g_raid_disk_event2str(int event) +{ + + switch (event) { + case G_RAID_DISK_E_DISCONNECTED: + return ("DISCONNECTED"); + default: + return ("INVALID"); + } +} + +const char * +g_raid_subdisk_state2str(int state) +{ + + switch (state) { + case G_RAID_SUBDISK_S_NONE: + return ("NONE"); + case G_RAID_SUBDISK_S_FAILED: + return ("FAILED"); + case G_RAID_SUBDISK_S_NEW: + return ("NEW"); + case G_RAID_SUBDISK_S_REBUILD: + return ("REBUILD"); + case G_RAID_SUBDISK_S_UNINITIALIZED: + return ("UNINITIALIZED"); + case G_RAID_SUBDISK_S_STALE: + return ("STALE"); + case G_RAID_SUBDISK_S_RESYNC: + return ("RESYNC"); + case G_RAID_SUBDISK_S_ACTIVE: + return ("ACTIVE"); + default: + return ("INVALID"); + } +} + +static const char * +g_raid_subdisk_event2str(int event) +{ + + switch (event) { + case G_RAID_SUBDISK_E_NEW: + return ("NEW"); + case G_RAID_SUBDISK_E_DISCONNECTED: + return ("DISCONNECTED"); + default: + return ("INVALID"); + } +} + +const char * +g_raid_volume_state2str(int state) +{ + + switch (state) { + case G_RAID_VOLUME_S_STARTING: + return ("STARTING"); + case G_RAID_VOLUME_S_BROKEN: + return ("BROKEN"); + case G_RAID_VOLUME_S_DEGRADED: + return ("DEGRADED"); + case G_RAID_VOLUME_S_SUBOPTIMAL: + return ("SUBOPTIMAL"); + case G_RAID_VOLUME_S_OPTIMAL: + return ("OPTIMAL"); + case G_RAID_VOLUME_S_UNSUPPORTED: + return ("UNSUPPORTED"); + case G_RAID_VOLUME_S_STOPPED: + return ("STOPPED"); + default: + return ("INVALID"); + } +} + +static const char * +g_raid_volume_event2str(int event) +{ + + switch (event) { + case G_RAID_VOLUME_E_UP: + return ("UP"); + case G_RAID_VOLUME_E_DOWN: + return ("DOWN"); + case G_RAID_VOLUME_E_START: + return ("START"); + case G_RAID_VOLUME_E_STARTMD: + return ("STARTMD"); + default: + return ("INVALID"); + } +} + +const char * +g_raid_volume_level2str(int level, int qual) +{ + + switch (level) { + case G_RAID_VOLUME_RL_RAID0: + return ("RAID0"); + case G_RAID_VOLUME_RL_RAID1: + return ("RAID1"); + case G_RAID_VOLUME_RL_RAID3: + return ("RAID3"); + case G_RAID_VOLUME_RL_RAID4: + return ("RAID4"); + case G_RAID_VOLUME_RL_RAID5: + return ("RAID5"); + case G_RAID_VOLUME_RL_RAID6: + return ("RAID6"); + case G_RAID_VOLUME_RL_RAID1E: + return ("RAID1E"); + case G_RAID_VOLUME_RL_SINGLE: + return 
("SINGLE"); + case G_RAID_VOLUME_RL_CONCAT: + return ("CONCAT"); + case G_RAID_VOLUME_RL_RAID5E: + return ("RAID5E"); + case G_RAID_VOLUME_RL_RAID5EE: + return ("RAID5EE"); + default: + return ("UNKNOWN"); + } +} + +int +g_raid_volume_str2level(const char *str, int *level, int *qual) +{ + + *level = G_RAID_VOLUME_RL_UNKNOWN; + *qual = G_RAID_VOLUME_RLQ_NONE; + if (strcasecmp(str, "RAID0") == 0) + *level = G_RAID_VOLUME_RL_RAID0; + else if (strcasecmp(str, "RAID1") == 0) + *level = G_RAID_VOLUME_RL_RAID1; + else if (strcasecmp(str, "RAID3") == 0) + *level = G_RAID_VOLUME_RL_RAID3; + else if (strcasecmp(str, "RAID4") == 0) + *level = G_RAID_VOLUME_RL_RAID4; + else if (strcasecmp(str, "RAID5") == 0) + *level = G_RAID_VOLUME_RL_RAID5; + else if (strcasecmp(str, "RAID6") == 0) + *level = G_RAID_VOLUME_RL_RAID6; + else if (strcasecmp(str, "RAID10") == 0 || + strcasecmp(str, "RAID1E") == 0) + *level = G_RAID_VOLUME_RL_RAID1E; + else if (strcasecmp(str, "SINGLE") == 0) + *level = G_RAID_VOLUME_RL_SINGLE; + else if (strcasecmp(str, "CONCAT") == 0) + *level = G_RAID_VOLUME_RL_CONCAT; + else if (strcasecmp(str, "RAID5E") == 0) + *level = G_RAID_VOLUME_RL_RAID5E; + else if (strcasecmp(str, "RAID5EE") == 0) + *level = G_RAID_VOLUME_RL_RAID5EE; + else + return (-1); + return (0); +} + +const char * +g_raid_get_diskname(struct g_raid_disk *disk) +{ + + if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL) + return ("[unknown]"); + return (disk->d_consumer->provider->name); +} + +void +g_raid_report_disk_state(struct g_raid_disk *disk) +{ + struct g_raid_subdisk *sd; + int len, state; + uint32_t s; + + if (disk->d_consumer == NULL) + return; + if (disk->d_state == G_RAID_DISK_S_FAILED || + disk->d_state == G_RAID_DISK_S_STALE_FAILED) { + s = G_STATE_FAILED; + } else { + state = G_RAID_SUBDISK_S_ACTIVE; + TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { + if (sd->sd_state < state) + state = sd->sd_state; + } + if (state == G_RAID_SUBDISK_S_FAILED) + s = G_STATE_FAILED; + else if (state == G_RAID_SUBDISK_S_NEW || + state == G_RAID_SUBDISK_S_REBUILD) + s = G_STATE_REBUILD; + else if (state == G_RAID_SUBDISK_S_STALE || + state == G_RAID_SUBDISK_S_RESYNC) + s = G_STATE_RESYNC; + else + s = G_STATE_ACTIVE; + } + len = sizeof(s); + g_io_getattr("GEOM::setstate", disk->d_consumer, &len, &s); + G_RAID_DEBUG1(2, disk->d_softc, "Disk %s state reported as %d.", + g_raid_get_diskname(disk), s); +} + +void +g_raid_change_disk_state(struct g_raid_disk *disk, int state) +{ + + G_RAID_DEBUG1(0, disk->d_softc, "Disk %s state changed from %s to %s.", + g_raid_get_diskname(disk), + g_raid_disk_state2str(disk->d_state), + g_raid_disk_state2str(state)); + disk->d_state = state; + g_raid_report_disk_state(disk); +} + +void +g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state) +{ + + G_RAID_DEBUG1(0, sd->sd_softc, + "Subdisk %s:%d-%s state changed from %s to %s.", + sd->sd_volume->v_name, sd->sd_pos, + sd->sd_disk ? 
g_raid_get_diskname(sd->sd_disk) : "[none]", + g_raid_subdisk_state2str(sd->sd_state), + g_raid_subdisk_state2str(state)); + sd->sd_state = state; + if (sd->sd_disk) + g_raid_report_disk_state(sd->sd_disk); +} + +void +g_raid_change_volume_state(struct g_raid_volume *vol, int state) +{ + + G_RAID_DEBUG1(0, vol->v_softc, + "Volume %s state changed from %s to %s.", + vol->v_name, + g_raid_volume_state2str(vol->v_state), + g_raid_volume_state2str(state)); + vol->v_state = state; +} + +/* + * --- Events handling functions --- + * Events in geom_raid are used to maintain subdisks and volumes status + * from one thread to simplify locking. + */ +static void +g_raid_event_free(struct g_raid_event *ep) +{ + + free(ep, M_RAID); +} + +int +g_raid_event_send(void *arg, int event, int flags) +{ + struct g_raid_softc *sc; + struct g_raid_event *ep; + int error; + + if ((flags & G_RAID_EVENT_VOLUME) != 0) { + sc = ((struct g_raid_volume *)arg)->v_softc; + } else if ((flags & G_RAID_EVENT_DISK) != 0) { + sc = ((struct g_raid_disk *)arg)->d_softc; + } else if ((flags & G_RAID_EVENT_SUBDISK) != 0) { + sc = ((struct g_raid_subdisk *)arg)->sd_softc; + } else { + sc = arg; + } + ep = malloc(sizeof(*ep), M_RAID, + sx_xlocked(&sc->sc_lock) ? M_WAITOK : M_NOWAIT); + if (ep == NULL) + return (ENOMEM); + ep->e_tgt = arg; + ep->e_event = event; + ep->e_flags = flags; + ep->e_error = 0; + G_RAID_DEBUG1(4, sc, "Sending event %p. Waking up %p.", ep, sc); + mtx_lock(&sc->sc_queue_mtx); + TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next); + mtx_unlock(&sc->sc_queue_mtx); + wakeup(sc); + + if ((flags & G_RAID_EVENT_WAIT) == 0) + return (0); + + sx_assert(&sc->sc_lock, SX_XLOCKED); + G_RAID_DEBUG1(4, sc, "Sleeping on %p.", ep); + sx_xunlock(&sc->sc_lock); + while ((ep->e_flags & G_RAID_EVENT_DONE) == 0) { + mtx_lock(&sc->sc_queue_mtx); + MSLEEP(error, ep, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:event", + hz * 5); + } + error = ep->e_error; + g_raid_event_free(ep); + sx_xlock(&sc->sc_lock); + return (error); +} + +static void +g_raid_event_cancel(struct g_raid_softc *sc, void *tgt) +{ + struct g_raid_event *ep, *tmpep; + + sx_assert(&sc->sc_lock, SX_XLOCKED); + + mtx_lock(&sc->sc_queue_mtx); + TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { + if (ep->e_tgt != tgt) + continue; + TAILQ_REMOVE(&sc->sc_events, ep, e_next); + if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) + g_raid_event_free(ep); + else { + ep->e_error = ECANCELED; + wakeup(ep); + } + } + mtx_unlock(&sc->sc_queue_mtx); +} + +static int +g_raid_event_check(struct g_raid_softc *sc, void *tgt) +{ + struct g_raid_event *ep; + int res = 0; + + sx_assert(&sc->sc_lock, SX_XLOCKED); + + mtx_lock(&sc->sc_queue_mtx); + TAILQ_FOREACH(ep, &sc->sc_events, e_next) { + if (ep->e_tgt != tgt) + continue; + res = 1; + break; + } + mtx_unlock(&sc->sc_queue_mtx); + return (res); +} + +/* + * Return the number of disks in given state. + * If state is equal to -1, count all connected disks. + */ +u_int +g_raid_ndisks(struct g_raid_softc *sc, int state) +{ + struct g_raid_disk *disk; + u_int n; + + sx_assert(&sc->sc_lock, SX_LOCKED); + + n = 0; + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_state == state || state == -1) + n++; + } + return (n); +} + +/* + * Return the number of subdisks in given state. + * If state is equal to -1, count all connected disks. 
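+ * For example, g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) returns the
+ * number of fully synchronized members, which a transformation module can
+ * consult when deciding whether a volume is OPTIMAL or DEGRADED.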
+ */ +u_int +g_raid_nsubdisks(struct g_raid_volume *vol, int state) +{ + struct g_raid_subdisk *subdisk; + struct g_raid_softc *sc; + u_int i, n ; + + sc = vol->v_softc; + sx_assert(&sc->sc_lock, SX_LOCKED); + + n = 0; + for (i = 0; i < vol->v_disks_count; i++) { + subdisk = &vol->v_subdisks[i]; + if ((state == -1 && + subdisk->sd_state != G_RAID_SUBDISK_S_NONE) || + subdisk->sd_state == state) + n++; + } + return (n); +} + +/* + * Return the first subdisk in given state. + * If state is equal to -1, then the first connected disks. + */ +struct g_raid_subdisk * +g_raid_get_subdisk(struct g_raid_volume *vol, int state) +{ + struct g_raid_subdisk *sd; + struct g_raid_softc *sc; + u_int i; + + sc = vol->v_softc; + sx_assert(&sc->sc_lock, SX_LOCKED); + + for (i = 0; i < vol->v_disks_count; i++) { + sd = &vol->v_subdisks[i]; + if ((state == -1 && + sd->sd_state != G_RAID_SUBDISK_S_NONE) || + sd->sd_state == state) + return (sd); + } + return (NULL); +} + +struct g_consumer * +g_raid_open_consumer(struct g_raid_softc *sc, const char *name) +{ + struct g_consumer *cp; + struct g_provider *pp; + + g_topology_assert(); + + if (strncmp(name, "/dev/", 5) == 0) + name += 5; + pp = g_provider_by_name(name); + if (pp == NULL) + return (NULL); + cp = g_new_consumer(sc->sc_geom); + if (g_attach(cp, pp) != 0) { + g_destroy_consumer(cp); + return (NULL); + } + if (g_access(cp, 1, 1, 1) != 0) { + g_detach(cp); + g_destroy_consumer(cp); + return (NULL); + } + return (cp); +} + +static u_int +g_raid_nrequests(struct g_raid_softc *sc, struct g_consumer *cp) +{ + struct bio *bp; + u_int nreqs = 0; + + mtx_lock(&sc->sc_queue_mtx); + TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { + if (bp->bio_from == cp) + nreqs++; + } + mtx_unlock(&sc->sc_queue_mtx); + return (nreqs); +} + +u_int +g_raid_nopens(struct g_raid_softc *sc) +{ + struct g_raid_volume *vol; + u_int opens; + + opens = 0; + TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { + if (vol->v_provider_open != 0) + opens++; + } + return (opens); +} + +static int +g_raid_consumer_is_busy(struct g_raid_softc *sc, struct g_consumer *cp) +{ + + if (cp->index > 0) { + G_RAID_DEBUG1(2, sc, + "I/O requests for %s exist, can't destroy it now.", + cp->provider->name); + return (1); + } + if (g_raid_nrequests(sc, cp) > 0) { + G_RAID_DEBUG1(2, sc, + "I/O requests for %s in queue, can't destroy it now.", + cp->provider->name); + return (1); + } + return (0); +} + +static void +g_raid_destroy_consumer(void *arg, int flags __unused) +{ + struct g_consumer *cp; + + g_topology_assert(); + + cp = arg; + G_RAID_DEBUG(1, "Consumer %s destroyed.", cp->provider->name); + g_detach(cp); + g_destroy_consumer(cp); +} + +void +g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp) +{ + struct g_provider *pp; + int retaste_wait; + + g_topology_assert_not(); + + g_topology_lock(); + cp->private = NULL; + if (g_raid_consumer_is_busy(sc, cp)) + goto out; + pp = cp->provider; + retaste_wait = 0; + if (cp->acw == 1) { + if ((pp->geom->flags & G_GEOM_WITHER) == 0) + retaste_wait = 1; + } + if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) + g_access(cp, -cp->acr, -cp->acw, -cp->ace); + if (retaste_wait) { + /* + * After retaste event was send (inside g_access()), we can send + * event to detach and destroy consumer. + * A class, which has consumer to the given provider connected + * will not receive retaste event for the provider. 
+ * This is the way how I ignore retaste events when I close + * consumers opened for write: I detach and destroy consumer + * after retaste event is sent. + */ + g_post_event(g_raid_destroy_consumer, cp, M_WAITOK, NULL); + goto out; + } + G_RAID_DEBUG(1, "Consumer %s destroyed.", pp->name); + g_detach(cp); + g_destroy_consumer(cp); +out: + g_topology_unlock(); +} + +static void +g_raid_orphan(struct g_consumer *cp) +{ + struct g_raid_disk *disk; + + g_topology_assert(); + + disk = cp->private; + if (disk == NULL) + return; + g_raid_event_send(disk, G_RAID_DISK_E_DISCONNECTED, + G_RAID_EVENT_DISK); +} + +static int +g_raid_clean(struct g_raid_volume *vol, int acw) +{ + struct g_raid_softc *sc; + int timeout; + + sc = vol->v_softc; + g_topology_assert_not(); + sx_assert(&sc->sc_lock, SX_XLOCKED); + +// if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0) +// return (0); + if (!vol->v_dirty) + return (0); + if (vol->v_writes > 0) + return (0); + if (acw > 0 || (acw == -1 && + vol->v_provider != NULL && vol->v_provider->acw > 0)) { + timeout = g_raid_clean_time - (time_uptime - vol->v_last_write); + if (timeout > 0) + return (timeout); + } + vol->v_dirty = 0; + G_RAID_DEBUG1(1, sc, "Volume %s marked as clean.", + vol->v_name); + g_raid_write_metadata(sc, vol, NULL, NULL); + return (0); +} + +static void +g_raid_dirty(struct g_raid_volume *vol) +{ + struct g_raid_softc *sc; + + sc = vol->v_softc; + g_topology_assert_not(); + sx_assert(&sc->sc_lock, SX_XLOCKED); + +// if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0) +// return; + vol->v_dirty = 1; + G_RAID_DEBUG1(1, sc, "Volume %s marked as dirty.", + vol->v_name); + g_raid_write_metadata(sc, vol, NULL, NULL); +} + +void +g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct bio_queue_head queue; + struct bio *cbp; + int i; + + vol = tr->tro_volume; + sc = vol->v_softc; + + /* + * Allocate all bios before sending any request, so we can return + * ENOMEM in nice and clean way. 
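+ * If any clone allocation fails, the failure path below frees the clones
+ * already allocated and completes the original request with ENOMEM rather
+ * than leaving it partially issued.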
+ */ + bioq_init(&queue); + for (i = 0; i < vol->v_disks_count; i++) { + sd = &vol->v_subdisks[i]; + if (sd->sd_state == G_RAID_SUBDISK_S_NONE || + sd->sd_state == G_RAID_SUBDISK_S_FAILED) + continue; + cbp = g_clone_bio(bp); + if (cbp == NULL) + goto failure; + cbp->bio_caller1 = sd; + bioq_insert_tail(&queue, cbp); + } + for (cbp = bioq_first(&queue); cbp != NULL; + cbp = bioq_first(&queue)) { + bioq_remove(&queue, cbp); + sd = cbp->bio_caller1; + cbp->bio_caller1 = NULL; + g_raid_subdisk_iostart(sd, cbp); + } + return; +failure: + for (cbp = bioq_first(&queue); cbp != NULL; + cbp = bioq_first(&queue)) { + bioq_remove(&queue, cbp); + g_destroy_bio(cbp); + } + if (bp->bio_error == 0) + bp->bio_error = ENOMEM; + g_raid_iodone(bp, bp->bio_error); +} + +static void +g_raid_tr_kerneldump_common_done(struct bio *bp) +{ + + bp->bio_flags |= BIO_DONE; +} + +int +g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr, + void *virtual, vm_offset_t physical, off_t offset, size_t length) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol; + struct bio bp; + + vol = tr->tro_volume; + sc = vol->v_softc; + + bzero(&bp, sizeof(bp)); + bp.bio_cmd = BIO_WRITE; + bp.bio_done = g_raid_tr_kerneldump_common_done; + bp.bio_attribute = NULL; + bp.bio_offset = offset; + bp.bio_length = length; + bp.bio_data = virtual; + bp.bio_to = vol->v_provider; + + g_raid_start(&bp); + while (!(bp.bio_flags & BIO_DONE)) { + G_RAID_DEBUG1(4, sc, "Poll..."); + g_raid_poll(sc); + DELAY(10); + } + + return (bp.bio_error != 0 ? EIO : 0); +} + +static int +g_raid_dump(void *arg, + void *virtual, vm_offset_t physical, off_t offset, size_t length) +{ + struct g_raid_volume *vol; + int error; + + vol = (struct g_raid_volume *)arg; + G_RAID_DEBUG1(3, vol->v_softc, "Dumping at off %llu len %llu.", + (long long unsigned)offset, (long long unsigned)length); + + error = G_RAID_TR_KERNELDUMP(vol->v_tr, + virtual, physical, offset, length); + return (error); +} + +static void +g_raid_kerneldump(struct g_raid_softc *sc, struct bio *bp) +{ + struct g_kerneldump *gkd; + struct g_provider *pp; + struct g_raid_volume *vol; + + gkd = (struct g_kerneldump*)bp->bio_data; + pp = bp->bio_to; + vol = pp->private; + g_trace(G_T_TOPOLOGY, "g_raid_kerneldump(%s, %jd, %jd)", + pp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length); + gkd->di.dumper = g_raid_dump; + gkd->di.priv = vol; + gkd->di.blocksize = vol->v_sectorsize; + gkd->di.maxiosize = DFLTPHYS; + gkd->di.mediaoffset = gkd->offset; + if ((gkd->offset + gkd->length) > vol->v_mediasize) + gkd->length = vol->v_mediasize - gkd->offset; + gkd->di.mediasize = gkd->length; + g_io_deliver(bp, 0); +} + +static void +g_raid_start(struct bio *bp) +{ + struct g_raid_softc *sc; + + sc = bp->bio_to->geom->softc; + /* + * If sc == NULL or there are no valid disks, provider's error + * should be set and g_raid_start() should not be called at all. 
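+ * Nothing is served from this context itself: supported commands are
+ * queued for the worker thread (or picked up by g_raid_poll() while
+ * dumping); only the GEOM::kerneldump attribute is answered in place.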
+ */ +// KASSERT(sc != NULL && sc->sc_state == G_RAID_VOLUME_S_RUNNING, +// ("Provider's error should be set (error=%d)(mirror=%s).", +// bp->bio_to->error, bp->bio_to->name)); + G_RAID_LOGREQ(3, bp, "Request received."); + + switch (bp->bio_cmd) { + case BIO_READ: + case BIO_WRITE: + case BIO_DELETE: + case BIO_FLUSH: + break; + case BIO_GETATTR: + if (!strcmp(bp->bio_attribute, "GEOM::kerneldump")) + g_raid_kerneldump(sc, bp); + else + g_io_deliver(bp, EOPNOTSUPP); + return; + default: + g_io_deliver(bp, EOPNOTSUPP); + return; + } + mtx_lock(&sc->sc_queue_mtx); + bioq_disksort(&sc->sc_queue, bp); + mtx_unlock(&sc->sc_queue_mtx); + if (!dumping) { + G_RAID_DEBUG1(4, sc, "Waking up %p.", sc); + wakeup(sc); + } +} + +static int +g_raid_bio_overlaps(const struct bio *bp, off_t lstart, off_t len) +{ + /* + * 5 cases: + * (1) bp entirely below NO + * (2) bp entirely above NO + * (3) bp start below, but end in range YES + * (4) bp entirely within YES + * (5) bp starts within, ends above YES + * + * lock range 10-19 (offset 10 length 10) + * (1) 1-5: first if kicks it out + * (2) 30-35: second if kicks it out + * (3) 5-15: passes both ifs + * (4) 12-14: passes both ifs + * (5) 19-20: passes both + */ + off_t lend = lstart + len - 1; + off_t bstart = bp->bio_offset; + off_t bend = bp->bio_offset + bp->bio_length - 1; + + if (bend < lstart) + return (0); + if (lend < bstart) + return (0); + return (1); +} + +static int +g_raid_is_in_locked_range(struct g_raid_volume *vol, const struct bio *bp) +{ + struct g_raid_lock *lp; + + sx_assert(&vol->v_softc->sc_lock, SX_LOCKED); + + LIST_FOREACH(lp, &vol->v_locks, l_next) { + if (g_raid_bio_overlaps(bp, lp->l_offset, lp->l_length)) + return (1); + } + return (0); +} + +static void +g_raid_start_request(struct bio *bp) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol; + + sc = bp->bio_to->geom->softc; + sx_assert(&sc->sc_lock, SX_LOCKED); + vol = bp->bio_to->private; + + /* + * Check to see if this item is in a locked range. If so, + * queue it to our locked queue and return. We'll requeue + * it when the range is unlocked. Internal I/O for the + * rebuild/rescan/recovery process is excluded from this + * check so we can actually do the recovery. + */ + if (!(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL) && + g_raid_is_in_locked_range(vol, bp)) { + G_RAID_LOGREQ(3, bp, "Defer request."); + bioq_insert_tail(&vol->v_locked, bp); + return; + } + + /* + * If we're actually going to do the write/delete, then + * update the idle stats for the volume. + */ + if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) { + if (!vol->v_dirty) + g_raid_dirty(vol); + vol->v_writes++; + } + + /* + * Put request onto inflight queue, so we can check if new + * synchronization requests don't collide with it. Then tell + * the transformation layer to start the I/O. 
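+ * A mirroring transformation, for example, may clone such a request once
+ * per active subdisk and submit the pieces through g_raid_subdisk_iostart().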
+ */ + bioq_insert_tail(&vol->v_inflight, bp); + G_RAID_LOGREQ(4, bp, "Request started"); + G_RAID_TR_IOSTART(vol->v_tr, bp); +} + +static void +g_raid_finish_with_locked_ranges(struct g_raid_volume *vol, struct bio *bp) +{ + off_t off, len; + struct bio *nbp; + struct g_raid_lock *lp; + + vol->v_pending_lock = 0; + LIST_FOREACH(lp, &vol->v_locks, l_next) { + if (lp->l_pending) { + off = lp->l_offset; + len = lp->l_length; + lp->l_pending = 0; + TAILQ_FOREACH(nbp, &vol->v_inflight.queue, bio_queue) { + if (g_raid_bio_overlaps(nbp, off, len)) + lp->l_pending++; + } + if (lp->l_pending) { + vol->v_pending_lock = 1; + G_RAID_DEBUG1(4, vol->v_softc, + "Deferred lock(%jd, %jd) has %d pending", + (intmax_t)off, (intmax_t)(off + len), + lp->l_pending); + continue; + } + G_RAID_DEBUG1(4, vol->v_softc, + "Deferred lock of %jd to %jd completed", + (intmax_t)off, (intmax_t)(off + len)); + G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg); + } + } +} + +void +g_raid_iodone(struct bio *bp, int error) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol; + + sc = bp->bio_to->geom->softc; + sx_assert(&sc->sc_lock, SX_LOCKED); + vol = bp->bio_to->private; + G_RAID_LOGREQ(3, bp, "Request done: %d.", error); + + /* Update stats if we done write/delete. */ + if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) { + vol->v_writes--; + vol->v_last_write = time_uptime; + } + + bioq_remove(&vol->v_inflight, bp); + if (vol->v_pending_lock && g_raid_is_in_locked_range(vol, bp)) + g_raid_finish_with_locked_ranges(vol, bp); + getmicrouptime(&vol->v_last_done); + g_io_deliver(bp, error); +} + +int +g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len, + struct bio *ignore, void *argp) +{ + struct g_raid_softc *sc; + struct g_raid_lock *lp; + struct bio *bp; + + sc = vol->v_softc; + lp = malloc(sizeof(*lp), M_RAID, M_WAITOK | M_ZERO); + LIST_INSERT_HEAD(&vol->v_locks, lp, l_next); + lp->l_offset = off; + lp->l_length = len; + lp->l_callback_arg = argp; + + lp->l_pending = 0; + TAILQ_FOREACH(bp, &vol->v_inflight.queue, bio_queue) { + if (bp != ignore && g_raid_bio_overlaps(bp, off, len)) + lp->l_pending++; + } + + /* + * If there are any writes that are pending, we return EBUSY. All + * callers will have to wait until all pending writes clear. + */ + if (lp->l_pending > 0) { + vol->v_pending_lock = 1; + G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd deferred %d pend", + (intmax_t)off, (intmax_t)(off+len), lp->l_pending); + return (EBUSY); + } + G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd", + (intmax_t)off, (intmax_t)(off+len)); + G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg); + return (0); +} + +int +g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len) +{ + struct g_raid_lock *lp; + struct g_raid_softc *sc; + struct bio *bp; + + sc = vol->v_softc; + LIST_FOREACH(lp, &vol->v_locks, l_next) { + if (lp->l_offset == off && lp->l_length == len) { + LIST_REMOVE(lp, l_next); + /* XXX + * Right now we just put them all back on the queue + * and hope for the best. We hope this because any + * locked ranges will go right back on this list + * when the worker thread runs. 
+ * XXX + */ + G_RAID_DEBUG1(4, sc, "Unlocked %jd to %jd", + (intmax_t)lp->l_offset, + (intmax_t)(lp->l_offset+lp->l_length)); + mtx_lock(&sc->sc_queue_mtx); + while ((bp = bioq_takefirst(&vol->v_locked)) != NULL) + bioq_disksort(&sc->sc_queue, bp); + mtx_unlock(&sc->sc_queue_mtx); + free(lp, M_RAID); + return (0); + } + } + return (EINVAL); +} + +void +g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp) +{ + struct g_consumer *cp; + struct g_raid_disk *disk, *tdisk; + + bp->bio_caller1 = sd; + + /* + * Make sure that the disk is present. Generally it is a task of + * transformation layers to not send requests to absent disks, but + * it is better to be safe and report situation then sorry. + */ + if (sd->sd_disk == NULL) { + G_RAID_LOGREQ(0, bp, "Warning! I/O request to an absent disk!"); +nodisk: + bp->bio_from = NULL; + bp->bio_to = NULL; + bp->bio_error = ENXIO; + g_raid_disk_done(bp); + return; + } + disk = sd->sd_disk; + if (disk->d_state != G_RAID_DISK_S_ACTIVE && + disk->d_state != G_RAID_DISK_S_FAILED) { + G_RAID_LOGREQ(0, bp, "Warning! I/O request to a disk in a " + "wrong state (%s)!", g_raid_disk_state2str(disk->d_state)); + goto nodisk; + } + + cp = disk->d_consumer; + bp->bio_from = cp; + bp->bio_to = cp->provider; + cp->index++; + + /* Update average disks load. */ + TAILQ_FOREACH(tdisk, &sd->sd_softc->sc_disks, d_next) { + if (tdisk->d_consumer == NULL) + tdisk->d_load = 0; + else + tdisk->d_load = (tdisk->d_consumer->index * + G_RAID_SUBDISK_LOAD_SCALE + tdisk->d_load * 7) / 8; + } + + disk->d_last_offset = bp->bio_offset + bp->bio_length; + if (dumping) { + G_RAID_LOGREQ(3, bp, "Sending dumping request."); + if (bp->bio_cmd == BIO_WRITE) { + bp->bio_error = g_raid_subdisk_kerneldump(sd, + bp->bio_data, 0, bp->bio_offset, bp->bio_length); + } else + bp->bio_error = EOPNOTSUPP; + g_raid_disk_done(bp); + } else { + bp->bio_done = g_raid_disk_done; + bp->bio_offset += sd->sd_offset; + G_RAID_LOGREQ(3, bp, "Sending request."); + g_io_request(bp, cp); + } +} + +int +g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd, + void *virtual, vm_offset_t physical, off_t offset, size_t length) +{ + + if (sd->sd_disk == NULL) + return (ENXIO); + if (sd->sd_disk->d_kd.di.dumper == NULL) + return (EOPNOTSUPP); + return (dump_write(&sd->sd_disk->d_kd.di, + virtual, physical, + sd->sd_disk->d_kd.di.mediaoffset + sd->sd_offset + offset, + length)); +} + +static void +g_raid_disk_done(struct bio *bp) +{ + struct g_raid_softc *sc; + struct g_raid_subdisk *sd; + + sd = bp->bio_caller1; + sc = sd->sd_softc; + mtx_lock(&sc->sc_queue_mtx); + bioq_disksort(&sc->sc_queue, bp); + mtx_unlock(&sc->sc_queue_mtx); + if (!dumping) + wakeup(sc); +} + +static void +g_raid_disk_done_request(struct bio *bp) +{ + struct g_raid_softc *sc; + struct g_raid_disk *disk; + struct g_raid_subdisk *sd; + struct g_raid_volume *vol; + + g_topology_assert_not(); + + G_RAID_LOGREQ(3, bp, "Disk request done: %d.", bp->bio_error); + sd = bp->bio_caller1; + sc = sd->sd_softc; + vol = sd->sd_volume; + if (bp->bio_from != NULL) { + bp->bio_from->index--; + disk = bp->bio_from->private; + if (disk == NULL) + g_raid_kill_consumer(sc, bp->bio_from); + } + bp->bio_offset -= sd->sd_offset; + + G_RAID_TR_IODONE(vol->v_tr, sd, bp); +} + +static void +g_raid_handle_event(struct g_raid_softc *sc, struct g_raid_event *ep) +{ + + if ((ep->e_flags & G_RAID_EVENT_VOLUME) != 0) + ep->e_error = g_raid_update_volume(ep->e_tgt, ep->e_event); + else if ((ep->e_flags & G_RAID_EVENT_DISK) != 0) + ep->e_error = 
g_raid_update_disk(ep->e_tgt, ep->e_event); + else if ((ep->e_flags & G_RAID_EVENT_SUBDISK) != 0) + ep->e_error = g_raid_update_subdisk(ep->e_tgt, ep->e_event); + else + ep->e_error = g_raid_update_node(ep->e_tgt, ep->e_event); + if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) { + KASSERT(ep->e_error == 0, + ("Error cannot be handled.")); + g_raid_event_free(ep); + } else { + ep->e_flags |= G_RAID_EVENT_DONE; + G_RAID_DEBUG1(4, sc, "Waking up %p.", ep); + mtx_lock(&sc->sc_queue_mtx); + wakeup(ep); + mtx_unlock(&sc->sc_queue_mtx); + } +} + +/* + * Worker thread. + */ +static void +g_raid_worker(void *arg) +{ + struct g_raid_softc *sc; + struct g_raid_event *ep; + struct g_raid_volume *vol; + struct bio *bp; + struct timeval now, t; + int timeout, rv; + + sc = arg; + thread_lock(curthread); + sched_prio(curthread, PRIBIO); + thread_unlock(curthread); + + sx_xlock(&sc->sc_lock); + for (;;) { + mtx_lock(&sc->sc_queue_mtx); + /* + * First take a look at events. + * This is important to handle events before any I/O requests. + */ + bp = NULL; + vol = NULL; + rv = 0; + ep = TAILQ_FIRST(&sc->sc_events); + if (ep != NULL) + TAILQ_REMOVE(&sc->sc_events, ep, e_next); + else if ((bp = bioq_takefirst(&sc->sc_queue)) != NULL) + ; + else { + getmicrouptime(&now); + t = now; + TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { + if (bioq_first(&vol->v_inflight) == NULL && + vol->v_tr && + timevalcmp(&vol->v_last_done, &t, < )) + t = vol->v_last_done; + } + timevalsub(&t, &now); + timeout = g_raid_idle_threshold + + t.tv_sec * 1000000 + t.tv_usec; + if (timeout > 0) { + /* + * Two steps to avoid overflows at HZ=1000 + * and idle timeouts > 2.1s. Some rounding + * errors can occur, but they are < 1tick, + * which is deemed to be close enough for + * this purpose. + */ + int micpertic = 1000000 / hz; + timeout = (timeout + micpertic - 1) / micpertic; + sx_xunlock(&sc->sc_lock); + MSLEEP(rv, sc, &sc->sc_queue_mtx, + PRIBIO | PDROP, "-", timeout); + sx_xlock(&sc->sc_lock); + goto process; + } else + rv = EWOULDBLOCK; + } + mtx_unlock(&sc->sc_queue_mtx); +process: + if (ep != NULL) { + g_raid_handle_event(sc, ep); + } else if (bp != NULL) { + if (bp->bio_to != NULL && + bp->bio_to->geom == sc->sc_geom) + g_raid_start_request(bp); + else + g_raid_disk_done_request(bp); + } else if (rv == EWOULDBLOCK) { + TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { + if (vol->v_writes == 0 && vol->v_dirty) + g_raid_clean(vol, -1); + if (bioq_first(&vol->v_inflight) == NULL && + vol->v_tr) { + t.tv_sec = g_raid_idle_threshold / 1000000; + t.tv_usec = g_raid_idle_threshold % 1000000; + timevaladd(&t, &vol->v_last_done); + getmicrouptime(&now); + if (timevalcmp(&t, &now, <= )) { + G_RAID_TR_IDLE(vol->v_tr); + vol->v_last_done = now; + } + } + } + } + if (sc->sc_stopping == G_RAID_DESTROY_HARD) + g_raid_destroy_node(sc, 1); /* May not return. */ + } +} + +static void +g_raid_poll(struct g_raid_softc *sc) +{ + struct g_raid_event *ep; + struct bio *bp; + + sx_xlock(&sc->sc_lock); + mtx_lock(&sc->sc_queue_mtx); + /* + * First take a look at events. + * This is important to handle events before any I/O requests. 
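+ * Unlike the worker loop above, this polled variant never sleeps and
+ * services at most one event or one queued request per call; it exists
+ * for the kernel dump path, where normal thread scheduling is unavailable.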
+ */ + ep = TAILQ_FIRST(&sc->sc_events); + if (ep != NULL) { + TAILQ_REMOVE(&sc->sc_events, ep, e_next); + mtx_unlock(&sc->sc_queue_mtx); + g_raid_handle_event(sc, ep); + goto out; + } + bp = bioq_takefirst(&sc->sc_queue); + if (bp != NULL) { + mtx_unlock(&sc->sc_queue_mtx); + if (bp->bio_from == NULL || + bp->bio_from->geom != sc->sc_geom) + g_raid_start_request(bp); + else + g_raid_disk_done_request(bp); + } +out: + sx_xunlock(&sc->sc_lock); +} + +static void +g_raid_launch_provider(struct g_raid_volume *vol) +{ + struct g_raid_disk *disk; + struct g_raid_softc *sc; + struct g_provider *pp; + char name[G_RAID_MAX_VOLUMENAME]; + off_t off; + + sc = vol->v_softc; + sx_assert(&sc->sc_lock, SX_LOCKED); + + g_topology_lock(); + /* Try to name provider with volume name. */ + snprintf(name, sizeof(name), "raid/%s", vol->v_name); + if (g_raid_name_format == 0 || vol->v_name[0] == 0 || + g_provider_by_name(name) != NULL) { + /* Otherwise use sequential volume number. */ + snprintf(name, sizeof(name), "raid/r%d", vol->v_global_id); + } + pp = g_new_providerf(sc->sc_geom, "%s", name); + pp->private = vol; + pp->mediasize = vol->v_mediasize; + pp->sectorsize = vol->v_sectorsize; + pp->stripesize = 0; + pp->stripeoffset = 0; + if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || + vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 || + vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE || + vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT) { + if ((disk = vol->v_subdisks[0].sd_disk) != NULL && + disk->d_consumer != NULL && + disk->d_consumer->provider != NULL) { + pp->stripesize = disk->d_consumer->provider->stripesize; + off = disk->d_consumer->provider->stripeoffset; + pp->stripeoffset = off + vol->v_subdisks[0].sd_offset; + if (off > 0) + pp->stripeoffset %= off; + } + if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3) { + pp->stripesize *= (vol->v_disks_count - 1); + pp->stripeoffset *= (vol->v_disks_count - 1); + } + } else + pp->stripesize = vol->v_strip_size; + vol->v_provider = pp; + g_error_provider(pp, 0); + g_topology_unlock(); + G_RAID_DEBUG1(0, sc, "Provider %s for volume %s created.", + pp->name, vol->v_name); +} + +static void +g_raid_destroy_provider(struct g_raid_volume *vol) +{ + struct g_raid_softc *sc; + struct g_provider *pp; + struct bio *bp, *tmp; + + g_topology_assert_not(); + sc = vol->v_softc; + pp = vol->v_provider; + KASSERT(pp != NULL, ("NULL provider (volume=%s).", vol->v_name)); + + g_topology_lock(); + g_error_provider(pp, ENXIO); + mtx_lock(&sc->sc_queue_mtx); + TAILQ_FOREACH_SAFE(bp, &sc->sc_queue.queue, bio_queue, tmp) { + if (bp->bio_to != pp) + continue; + bioq_remove(&sc->sc_queue, bp); + g_io_deliver(bp, ENXIO); + } + mtx_unlock(&sc->sc_queue_mtx); + G_RAID_DEBUG1(0, sc, "Provider %s for volume %s destroyed.", + pp->name, vol->v_name); + g_wither_provider(pp, ENXIO); + g_topology_unlock(); + vol->v_provider = NULL; +} + +/* + * Update device state. 
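+ * Volume events either create or destroy the volume's provider (UP/DOWN),
+ * start the transformation module (START), or are passed on to the
+ * metadata module.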
+ */ +static int +g_raid_update_volume(struct g_raid_volume *vol, u_int event) +{ + struct g_raid_softc *sc; + + sc = vol->v_softc; + sx_assert(&sc->sc_lock, SX_XLOCKED); + + G_RAID_DEBUG1(2, sc, "Event %s for volume %s.", + g_raid_volume_event2str(event), + vol->v_name); + switch (event) { + case G_RAID_VOLUME_E_DOWN: + if (vol->v_provider != NULL) + g_raid_destroy_provider(vol); + break; + case G_RAID_VOLUME_E_UP: + if (vol->v_provider == NULL) + g_raid_launch_provider(vol); + break; + case G_RAID_VOLUME_E_START: + if (vol->v_tr) + G_RAID_TR_START(vol->v_tr); + return (0); + default: + if (sc->sc_md) + G_RAID_MD_VOLUME_EVENT(sc->sc_md, vol, event); + return (0); + } + + /* Manage root mount release. */ + if (vol->v_starting) { + vol->v_starting = 0; + G_RAID_DEBUG1(1, sc, "root_mount_rel %p", vol->v_rootmount); + root_mount_rel(vol->v_rootmount); + vol->v_rootmount = NULL; + } + if (vol->v_stopping && vol->v_provider_open == 0) + g_raid_destroy_volume(vol); + return (0); +} + +/* + * Update subdisk state. + */ +static int +g_raid_update_subdisk(struct g_raid_subdisk *sd, u_int event) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol; + + sc = sd->sd_softc; + vol = sd->sd_volume; + sx_assert(&sc->sc_lock, SX_XLOCKED); + + G_RAID_DEBUG1(2, sc, "Event %s for subdisk %s:%d-%s.", + g_raid_subdisk_event2str(event), + vol->v_name, sd->sd_pos, + sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); + if (vol->v_tr) + G_RAID_TR_EVENT(vol->v_tr, sd, event); + + return (0); +} + +/* + * Update disk state. + */ +static int +g_raid_update_disk(struct g_raid_disk *disk, u_int event) +{ + struct g_raid_softc *sc; + + sc = disk->d_softc; + sx_assert(&sc->sc_lock, SX_XLOCKED); + + G_RAID_DEBUG1(2, sc, "Event %s for disk %s.", + g_raid_disk_event2str(event), + g_raid_get_diskname(disk)); + + if (sc->sc_md) + G_RAID_MD_EVENT(sc->sc_md, disk, event); + return (0); +} + +/* + * Node event. + */ +static int +g_raid_update_node(struct g_raid_softc *sc, u_int event) +{ + sx_assert(&sc->sc_lock, SX_XLOCKED); + + G_RAID_DEBUG1(2, sc, "Event %s for the array.", + g_raid_node_event2str(event)); + + if (event == G_RAID_NODE_E_WAKE) + return (0); + if (sc->sc_md) + G_RAID_MD_EVENT(sc->sc_md, NULL, event); + return (0); +} + +static int +g_raid_access(struct g_provider *pp, int acr, int acw, int ace) +{ + struct g_raid_volume *vol; + struct g_raid_softc *sc; + int dcr, dcw, dce, opens, error = 0; + + g_topology_assert(); + sc = pp->geom->softc; + vol = pp->private; + KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name)); + KASSERT(vol != NULL, ("NULL volume (provider=%s).", pp->name)); + + G_RAID_DEBUG1(2, sc, "Access request for %s: r%dw%de%d.", pp->name, + acr, acw, ace); + + dcr = pp->acr + acr; + dcw = pp->acw + acw; + dce = pp->ace + ace; + + g_topology_unlock(); + sx_xlock(&sc->sc_lock); + /* Deny new opens while dying. */ + if (sc->sc_stopping != 0 && (acr > 0 || acw > 0 || ace > 0)) { + error = ENXIO; + goto out; + } + if (dcw == 0 && vol->v_dirty) + g_raid_clean(vol, dcw); + vol->v_provider_open += acr + acw + ace; + /* Handle delayed node destruction. */ + if (sc->sc_stopping == G_RAID_DESTROY_DELAYED && + vol->v_provider_open == 0) { + /* Count open volumes. */ + opens = g_raid_nopens(sc); + if (opens == 0) { + sc->sc_stopping = G_RAID_DESTROY_HARD; + /* Wake up worker to make it selfdestruct. */ + g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0); + } + } + /* Handle open volume destruction. 
*/
+    if (vol->v_stopping && vol->v_provider_open == 0)
+        g_raid_destroy_volume(vol);
+out:
+    sx_xunlock(&sc->sc_lock);
+    g_topology_lock();
+    return (error);
+}
+
+struct g_raid_softc *
+g_raid_create_node(struct g_class *mp,
+    const char *name, struct g_raid_md_object *md)
+{
+    struct g_raid_softc *sc;
+    struct g_geom *gp;
+    int error;
+
+    g_topology_assert();
+    G_RAID_DEBUG(1, "Creating array %s.", name);
+
+    gp = g_new_geomf(mp, "%s", name);
+    sc = malloc(sizeof(*sc), M_RAID, M_WAITOK | M_ZERO);
+    gp->start = g_raid_start;
+    gp->orphan = g_raid_orphan;
+    gp->access = g_raid_access;
+    gp->dumpconf = g_raid_dumpconf;
+
+    sc->sc_md = md;
+    sc->sc_geom = gp;
+    sc->sc_flags = 0;
+    TAILQ_INIT(&sc->sc_volumes);
+    TAILQ_INIT(&sc->sc_disks);
+    sx_init(&sc->sc_lock, "graid:lock");
+    mtx_init(&sc->sc_queue_mtx, "graid:queue", NULL, MTX_DEF);
+    TAILQ_INIT(&sc->sc_events);
+    bioq_init(&sc->sc_queue);
+    gp->softc = sc;
+    error = kproc_create(g_raid_worker, sc, &sc->sc_worker, 0, 0,
+        "g_raid %s", name);
+    if (error != 0) {
+        G_RAID_DEBUG(0, "Cannot create kernel thread for %s.", name);
+        mtx_destroy(&sc->sc_queue_mtx);
+        sx_destroy(&sc->sc_lock);
+        g_destroy_geom(sc->sc_geom);
+        free(sc, M_RAID);
+        return (NULL);
+    }
+
+    G_RAID_DEBUG1(0, sc, "Array %s created.", name);
+    return (sc);
+}
+
+struct g_raid_volume *
+g_raid_create_volume(struct g_raid_softc *sc, const char *name, int id)
+{
+    struct g_raid_volume *vol, *vol1;
+    int i;
+
+    G_RAID_DEBUG1(1, sc, "Creating volume %s.", name);
+    vol = malloc(sizeof(*vol), M_RAID, M_WAITOK | M_ZERO);
+    vol->v_softc = sc;
+    strlcpy(vol->v_name, name, G_RAID_MAX_VOLUMENAME);
+    vol->v_state = G_RAID_VOLUME_S_STARTING;
+    vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN;
+    vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_UNKNOWN;
+    bioq_init(&vol->v_inflight);
+    bioq_init(&vol->v_locked);
+    LIST_INIT(&vol->v_locks);
+    for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) {
+        vol->v_subdisks[i].sd_softc = sc;
+        vol->v_subdisks[i].sd_volume = vol;
+        vol->v_subdisks[i].sd_pos = i;
+        vol->v_subdisks[i].sd_state = G_RAID_SUBDISK_S_NONE;
+    }
+
+    /* Find free ID for this volume. */
+    g_topology_lock();
+    vol1 = vol;
+    if (id >= 0) {
+        LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) {
+            if (vol1->v_global_id == id)
+                break;
+        }
+    }
+    if (vol1 != NULL) {
+        for (id = 0; ; id++) {
+            LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) {
+                if (vol1->v_global_id == id)
+                    break;
+            }
+            if (vol1 == NULL)
+                break;
+        }
+    }
+    vol->v_global_id = id;
+    LIST_INSERT_HEAD(&g_raid_volumes, vol, v_global_next);
+    g_topology_unlock();
+
+    /* Delay root mounting.
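(editor's note: root_mount_hold() keeps a root-on-RAID boot from racing volume assembly; g_raid_update_volume() releases the token once the volume starts or fails)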
*/ + vol->v_rootmount = root_mount_hold("GRAID"); + G_RAID_DEBUG1(1, sc, "root_mount_hold %p", vol->v_rootmount); + vol->v_starting = 1; + TAILQ_INSERT_TAIL(&sc->sc_volumes, vol, v_next); + return (vol); +} + +struct g_raid_disk * +g_raid_create_disk(struct g_raid_softc *sc) +{ + struct g_raid_disk *disk; + + G_RAID_DEBUG1(1, sc, "Creating disk."); + disk = malloc(sizeof(*disk), M_RAID, M_WAITOK | M_ZERO); + disk->d_softc = sc; + disk->d_state = G_RAID_DISK_S_NONE; + TAILQ_INIT(&disk->d_subdisks); + TAILQ_INSERT_TAIL(&sc->sc_disks, disk, d_next); + return (disk); +} + +int g_raid_start_volume(struct g_raid_volume *vol) +{ + struct g_raid_tr_class *class; + struct g_raid_tr_object *obj; + int status; + + G_RAID_DEBUG1(2, vol->v_softc, "Starting volume %s.", vol->v_name); + LIST_FOREACH(class, &g_raid_tr_classes, trc_list) { + G_RAID_DEBUG1(2, vol->v_softc, + "Tasting volume %s for %s transformation.", + vol->v_name, class->name); + obj = (void *)kobj_create((kobj_class_t)class, M_RAID, + M_WAITOK); + obj->tro_class = class; + obj->tro_volume = vol; + status = G_RAID_TR_TASTE(obj, vol); + if (status != G_RAID_TR_TASTE_FAIL) + break; + kobj_delete((kobj_t)obj, M_RAID); + } + if (class == NULL) { + G_RAID_DEBUG1(0, vol->v_softc, + "No transformation module found for %s.", + vol->v_name); + vol->v_tr = NULL; + g_raid_change_volume_state(vol, G_RAID_VOLUME_S_UNSUPPORTED); + g_raid_event_send(vol, G_RAID_VOLUME_E_DOWN, + G_RAID_EVENT_VOLUME); + return (-1); + } + G_RAID_DEBUG1(2, vol->v_softc, + "Transformation module %s chosen for %s.", + class->name, vol->v_name); + vol->v_tr = obj; + return (0); +} + +int +g_raid_destroy_node(struct g_raid_softc *sc, int worker) +{ + struct g_raid_volume *vol, *tmpv; + struct g_raid_disk *disk, *tmpd; + int error = 0; + + sc->sc_stopping = G_RAID_DESTROY_HARD; + TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tmpv) { + if (g_raid_destroy_volume(vol)) + error = EBUSY; + } + if (error) + return (error); + TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tmpd) { + if (g_raid_destroy_disk(disk)) + error = EBUSY; + } + if (error) + return (error); + if (sc->sc_md) { + G_RAID_MD_FREE(sc->sc_md); + kobj_delete((kobj_t)sc->sc_md, M_RAID); + sc->sc_md = NULL; + } + if (sc->sc_geom != NULL) { + G_RAID_DEBUG1(0, sc, "Array %s destroyed.", sc->sc_name); + g_topology_lock(); + sc->sc_geom->softc = NULL; + g_wither_geom(sc->sc_geom, ENXIO); + g_topology_unlock(); + sc->sc_geom = NULL; + } else + G_RAID_DEBUG(1, "Array destroyed."); + if (worker) { + g_raid_event_cancel(sc, sc); + mtx_destroy(&sc->sc_queue_mtx); + sx_xunlock(&sc->sc_lock); + sx_destroy(&sc->sc_lock); + wakeup(&sc->sc_stopping); + free(sc, M_RAID); + curthread->td_pflags &= ~TDP_GEOM; + G_RAID_DEBUG(1, "Thread exiting."); + kproc_exit(0); + } else { + /* Wake up worker to make it selfdestruct. 
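(editor's note: the worker, seeing sc_stopping == G_RAID_DESTROY_HARD, is expected to re-enter g_raid_destroy_node() with worker set and free the node)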
*/ + g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0); + } + return (0); +} + +int +g_raid_destroy_volume(struct g_raid_volume *vol) +{ + struct g_raid_softc *sc; + struct g_raid_disk *disk; + int i; + + sc = vol->v_softc; + G_RAID_DEBUG1(2, sc, "Destroying volume %s.", vol->v_name); + vol->v_stopping = 1; + if (vol->v_state != G_RAID_VOLUME_S_STOPPED) { + if (vol->v_tr) { + G_RAID_TR_STOP(vol->v_tr); + return (EBUSY); + } else + vol->v_state = G_RAID_VOLUME_S_STOPPED; + } + if (g_raid_event_check(sc, vol) != 0) + return (EBUSY); + if (vol->v_provider != NULL) + return (EBUSY); + if (vol->v_provider_open != 0) + return (EBUSY); + if (vol->v_tr) { + G_RAID_TR_FREE(vol->v_tr); + kobj_delete((kobj_t)vol->v_tr, M_RAID); + vol->v_tr = NULL; + } + if (vol->v_rootmount) + root_mount_rel(vol->v_rootmount); + g_topology_lock(); + LIST_REMOVE(vol, v_global_next); + g_topology_unlock(); + TAILQ_REMOVE(&sc->sc_volumes, vol, v_next); + for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) { + g_raid_event_cancel(sc, &vol->v_subdisks[i]); + disk = vol->v_subdisks[i].sd_disk; + if (disk == NULL) + continue; + TAILQ_REMOVE(&disk->d_subdisks, &vol->v_subdisks[i], sd_next); + } + G_RAID_DEBUG1(2, sc, "Volume %s destroyed.", vol->v_name); + if (sc->sc_md) + G_RAID_MD_FREE_VOLUME(sc->sc_md, vol); + g_raid_event_cancel(sc, vol); + free(vol, M_RAID); + if (sc->sc_stopping == G_RAID_DESTROY_HARD) { + /* Wake up worker to let it selfdestruct. */ + g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0); + } + return (0); +} + +int +g_raid_destroy_disk(struct g_raid_disk *disk) +{ + struct g_raid_softc *sc; + struct g_raid_subdisk *sd, *tmp; + + sc = disk->d_softc; + G_RAID_DEBUG1(2, sc, "Destroying disk."); + if (disk->d_consumer) { + g_raid_kill_consumer(sc, disk->d_consumer); + disk->d_consumer = NULL; + } + TAILQ_FOREACH_SAFE(sd, &disk->d_subdisks, sd_next, tmp) { + g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); + g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, + G_RAID_EVENT_SUBDISK); + TAILQ_REMOVE(&disk->d_subdisks, sd, sd_next); + sd->sd_disk = NULL; + } + TAILQ_REMOVE(&sc->sc_disks, disk, d_next); + if (sc->sc_md) + G_RAID_MD_FREE_DISK(sc->sc_md, disk); + g_raid_event_cancel(sc, disk); + free(disk, M_RAID); + return (0); +} + +int +g_raid_destroy(struct g_raid_softc *sc, int how) +{ + int opens; + + g_topology_assert_not(); + if (sc == NULL) + return (ENXIO); + sx_assert(&sc->sc_lock, SX_XLOCKED); + + /* Count open volumes. */ + opens = g_raid_nopens(sc); + + /* React on some opened volumes. */ + if (opens > 0) { + switch (how) { + case G_RAID_DESTROY_SOFT: + G_RAID_DEBUG1(1, sc, + "%d volumes are still open.", + opens); + return (EBUSY); + case G_RAID_DESTROY_DELAYED: + G_RAID_DEBUG1(1, sc, + "Array will be destroyed on last close."); + sc->sc_stopping = G_RAID_DESTROY_DELAYED; + return (EBUSY); + case G_RAID_DESTROY_HARD: + G_RAID_DEBUG1(1, sc, + "%d volumes are still open.", + opens); + } + } + + /* Mark node for destruction. */ + sc->sc_stopping = G_RAID_DESTROY_HARD; + /* Wake up worker to let it selfdestruct. */ + g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0); + /* Sleep until node destroyed. 
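(editor's note: woken by the wakeup(&sc->sc_stopping) issued in g_raid_destroy_node() above; PDROP releases sc_lock while sleeping so the worker can run)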
*/
+    sx_sleep(&sc->sc_stopping, &sc->sc_lock,
+        PRIBIO | PDROP, "r:destroy", 0);
+    return (0);
+}
+
+static void
+g_raid_taste_orphan(struct g_consumer *cp)
+{
+
+    KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
+        cp->provider->name));
+}
+
+static struct g_geom *
+g_raid_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
+{
+    struct g_consumer *cp;
+    struct g_geom *gp, *geom;
+    struct g_raid_md_class *class;
+    struct g_raid_md_object *obj;
+    int status;
+
+    g_topology_assert();
+    g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
+    G_RAID_DEBUG(2, "Tasting provider %s.", pp->name);
+
+    gp = g_new_geomf(mp, "raid:taste");
+    /*
+     * This orphan function should never be called.
+     */
+    gp->orphan = g_raid_taste_orphan;
+    cp = g_new_consumer(gp);
+    g_attach(cp, pp);
+
+    geom = NULL;
+    LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
+        G_RAID_DEBUG(2, "Tasting provider %s for %s metadata.",
+            pp->name, class->name);
+        obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
+            M_WAITOK);
+        obj->mdo_class = class;
+        status = G_RAID_MD_TASTE(obj, mp, cp, &geom);
+        if (status != G_RAID_MD_TASTE_NEW)
+            kobj_delete((kobj_t)obj, M_RAID);
+        if (status != G_RAID_MD_TASTE_FAIL)
+            break;
+    }
+
+    g_detach(cp);
+    g_destroy_consumer(cp);
+    g_destroy_geom(gp);
+    G_RAID_DEBUG(2, "Tasting provider %s done.", pp->name);
+    return (geom);
+}
+
+int
+g_raid_create_node_format(const char *format, struct g_geom **gp)
+{
+    struct g_raid_md_class *class;
+    struct g_raid_md_object *obj;
+    int status;
+
+    G_RAID_DEBUG(2, "Creating array for %s metadata.", format);
+    LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
+        if (strcasecmp(class->name, format) == 0)
+            break;
+    }
+    if (class == NULL) {
+        G_RAID_DEBUG(1, "No support for %s metadata.", format);
+        return (G_RAID_MD_TASTE_FAIL);
+    }
+    obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
+        M_WAITOK);
+    obj->mdo_class = class;
+    status = G_RAID_MD_CREATE(obj, &g_raid_class, gp);
+    if (status != G_RAID_MD_TASTE_NEW)
+        kobj_delete((kobj_t)obj, M_RAID);
+    return (status);
+}
+
+static int
+g_raid_destroy_geom(struct gctl_req *req __unused,
+    struct g_class *mp __unused, struct g_geom *gp)
+{
+    struct g_raid_softc *sc;
+    int error;
+
+    g_topology_unlock();
+    sc = gp->softc;
+    sx_xlock(&sc->sc_lock);
+    g_cancel_event(sc);
+    error = g_raid_destroy(gp->softc, G_RAID_DESTROY_SOFT);
+    if (error != 0)
+        sx_xunlock(&sc->sc_lock);
+    g_topology_lock();
+    return (error);
+}
+
+void
+g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol,
+    struct g_raid_subdisk *sd, struct g_raid_disk *disk)
+{
+
+    if (sc->sc_stopping == G_RAID_DESTROY_HARD)
+        return;
+    if (sc->sc_md)
+        G_RAID_MD_WRITE(sc->sc_md, vol, sd, disk);
+}
+
+void
+g_raid_fail_disk(struct g_raid_softc *sc,
+    struct g_raid_subdisk *sd, struct g_raid_disk *disk)
+{
+
+    if (disk == NULL)
+        disk = sd->sd_disk;
+    if (disk == NULL) {
+        G_RAID_DEBUG1(0, sc, "Warning! Fail request to an absent disk!");
+        return;
+    }
+    if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
+        G_RAID_DEBUG1(0, sc, "Warning! Fail request to a disk in a "
+            "wrong state (%s)!", g_raid_disk_state2str(disk->d_state));
+        return;
+    }
+    if (sc->sc_md)
+        G_RAID_MD_FAIL_DISK(sc->sc_md, sd, disk);
+}
+
+static void
+g_raid_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
+    struct g_consumer *cp, struct g_provider *pp)
+{
+    struct g_raid_softc *sc;
+    struct g_raid_volume *vol;
+    struct g_raid_subdisk *sd;
+    struct g_raid_disk *disk;
+    int i, s;
+
+    g_topology_assert();
+
+    sc = gp->softc;
+    if (sc == NULL)
+        return;
+    if (pp != NULL) {
+        vol = pp->private;
+        g_topology_unlock();
+        sx_xlock(&sc->sc_lock);
+        sbuf_printf(sb, "%s<Label>%s</Label>\n", indent,
+            vol->v_name);
+        sbuf_printf(sb, "%s<RAIDLevel>%s</RAIDLevel>\n", indent,
+            g_raid_volume_level2str(vol->v_raid_level,
+            vol->v_raid_level_qualifier));
+        sbuf_printf(sb,
+            "%s<Transformation>%s</Transformation>\n", indent,
+            vol->v_tr ? vol->v_tr->tro_class->name : "NONE");
+        sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
+            vol->v_disks_count);
+        sbuf_printf(sb, "%s<Strip>%u</Strip>\n", indent,
+            vol->v_strip_size);
+        sbuf_printf(sb, "%s<State>%s</State>\n", indent,
+            g_raid_volume_state2str(vol->v_state));
+        sbuf_printf(sb, "%s<Dirty>%s</Dirty>\n", indent,
+            vol->v_dirty ? "Yes" : "No");
+        sbuf_printf(sb, "%s<Subdisks>", indent);
+        for (i = 0; i < vol->v_disks_count; i++) {
+            sd = &vol->v_subdisks[i];
+            if (sd->sd_disk != NULL &&
+                sd->sd_disk->d_consumer != NULL) {
+                sbuf_printf(sb, "%s ",
+                    g_raid_get_diskname(sd->sd_disk));
+            } else {
+                sbuf_printf(sb, "NONE ");
+            }
+            sbuf_printf(sb, "(%s",
+                g_raid_subdisk_state2str(sd->sd_state));
+            if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
+                sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
+                sbuf_printf(sb, " %d%%",
+                    (int)(sd->sd_rebuild_pos * 100 /
+                    sd->sd_size));
+            }
+            sbuf_printf(sb, ")");
+            if (i + 1 < vol->v_disks_count)
+                sbuf_printf(sb, ", ");
+        }
+        sbuf_printf(sb, "</Subdisks>\n");
+        sx_xunlock(&sc->sc_lock);
+        g_topology_lock();
+    } else if (cp != NULL) {
+        disk = cp->private;
+        if (disk == NULL)
+            return;
+        g_topology_unlock();
+        sx_xlock(&sc->sc_lock);
+        sbuf_printf(sb, "%s<State>%s", indent,
+            g_raid_disk_state2str(disk->d_state));
+        if (!TAILQ_EMPTY(&disk->d_subdisks)) {
+            sbuf_printf(sb, " (");
+            TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
+                sbuf_printf(sb, "%s",
+                    g_raid_subdisk_state2str(sd->sd_state));
+                if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
+                    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
+                    sbuf_printf(sb, " %d%%",
+                        (int)(sd->sd_rebuild_pos * 100 /
+                        sd->sd_size));
+                }
+                if (TAILQ_NEXT(sd, sd_next))
+                    sbuf_printf(sb, ", ");
+            }
+            sbuf_printf(sb, ")");
+        }
+        sbuf_printf(sb, "</State>\n");
+        sbuf_printf(sb, "%s<Subdisks>", indent);
+        TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
+            sbuf_printf(sb, "r%d(%s):%d@%ju",
+                sd->sd_volume->v_global_id,
+                sd->sd_volume->v_name,
+                sd->sd_pos, sd->sd_offset);
+            if (TAILQ_NEXT(sd, sd_next))
+                sbuf_printf(sb, ", ");
+        }
+        sbuf_printf(sb, "</Subdisks>\n");
+        sbuf_printf(sb, "%s<ReadErrors>%d</ReadErrors>\n", indent,
+            disk->d_read_errs);
+        sx_xunlock(&sc->sc_lock);
+        g_topology_lock();
+    } else {
+        g_topology_unlock();
+        sx_xlock(&sc->sc_lock);
+        if (sc->sc_md) {
+            sbuf_printf(sb, "%s<Metadata>%s</Metadata>\n", indent,
+                sc->sc_md->mdo_class->name);
+        }
+        if (!TAILQ_EMPTY(&sc->sc_volumes)) {
+            s = 0xff;
+            TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
+                if (vol->v_state < s)
+                    s = vol->v_state;
+            }
+            sbuf_printf(sb, "%s<State>%s</State>\n", indent,
+                g_raid_volume_state2str(s));
+        }
+        sx_xunlock(&sc->sc_lock);
+        g_topology_lock();
+    }
+}
+
+static void
+g_raid_shutdown_pre_sync(void *arg, int howto)
+{
+    struct g_class *mp;
+    struct g_geom *gp, *gp2;
+    struct g_raid_softc *sc;
+    int error;
+
+    mp = arg;
+    DROP_GIANT();
+    g_topology_lock();
+    LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
+        if
((sc = gp->softc) == NULL) + continue; + g_topology_unlock(); + sx_xlock(&sc->sc_lock); + g_cancel_event(sc); + error = g_raid_destroy(sc, G_RAID_DESTROY_DELAYED); + if (error != 0) + sx_xunlock(&sc->sc_lock); + g_topology_lock(); + } + g_topology_unlock(); + PICKUP_GIANT(); +} + +static void +g_raid_init(struct g_class *mp) +{ + + g_raid_pre_sync = EVENTHANDLER_REGISTER(shutdown_pre_sync, + g_raid_shutdown_pre_sync, mp, SHUTDOWN_PRI_FIRST); + if (g_raid_pre_sync == NULL) + G_RAID_DEBUG(0, "Warning! Cannot register shutdown event."); + g_raid_started = 1; +} + +static void +g_raid_fini(struct g_class *mp) +{ + + if (g_raid_pre_sync != NULL) + EVENTHANDLER_DEREGISTER(shutdown_pre_sync, g_raid_pre_sync); + g_raid_started = 0; +} + +int +g_raid_md_modevent(module_t mod, int type, void *arg) +{ + struct g_raid_md_class *class, *c, *nc; + int error; + + error = 0; + class = arg; + switch (type) { + case MOD_LOAD: + c = LIST_FIRST(&g_raid_md_classes); + if (c == NULL || c->mdc_priority > class->mdc_priority) + LIST_INSERT_HEAD(&g_raid_md_classes, class, mdc_list); + else { + while ((nc = LIST_NEXT(c, mdc_list)) != NULL && + nc->mdc_priority < class->mdc_priority) + c = nc; + LIST_INSERT_AFTER(c, class, mdc_list); + } + if (g_raid_started) + g_retaste(&g_raid_class); + break; + case MOD_UNLOAD: + LIST_REMOVE(class, mdc_list); + break; + default: + error = EOPNOTSUPP; + break; + } + + return (error); +} + +int +g_raid_tr_modevent(module_t mod, int type, void *arg) +{ + struct g_raid_tr_class *class, *c, *nc; + int error; + + error = 0; + class = arg; + switch (type) { + case MOD_LOAD: + c = LIST_FIRST(&g_raid_tr_classes); + if (c == NULL || c->trc_priority > class->trc_priority) + LIST_INSERT_HEAD(&g_raid_tr_classes, class, trc_list); + else { + while ((nc = LIST_NEXT(c, trc_list)) != NULL && + nc->trc_priority < class->trc_priority) + c = nc; + LIST_INSERT_AFTER(c, class, trc_list); + } + break; + case MOD_UNLOAD: + LIST_REMOVE(class, trc_list); + break; + default: + error = EOPNOTSUPP; + break; + } + + return (error); +} + +/* + * Use local implementation of DECLARE_GEOM_CLASS(g_raid_class, g_raid) + * to reduce module priority, allowing submodules to register them first. + */ +static moduledata_t g_raid_mod = { + "g_raid", + g_modevent, + &g_raid_class +}; +DECLARE_MODULE(g_raid, g_raid_mod, SI_SUB_DRIVERS, SI_ORDER_THIRD); +MODULE_VERSION(geom_raid, 0); diff --git a/sys/geom/raid/g_raid.h b/sys/geom/raid/g_raid.h new file mode 100644 index 0000000..1c14ad6 --- /dev/null +++ b/sys/geom/raid/g_raid.h @@ -0,0 +1,403 @@ +/*- + * Copyright (c) 2010 Alexander Motin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _G_RAID_H_
+#define _G_RAID_H_
+
+#include <sys/param.h>
+#include <sys/kobj.h>
+#include <sys/bio.h>
+#include <sys/time.h>
+
+#define G_RAID_CLASS_NAME "RAID"
+
+#define G_RAID_MAGIC "GEOM::RAID"
+
+#define G_RAID_VERSION 0
+
+struct g_raid_md_object;
+struct g_raid_tr_object;
+
+#define G_RAID_DEVICE_FLAG_NOAUTOSYNC 0x0000000000000001ULL
+#define G_RAID_DEVICE_FLAG_NOFAILSYNC 0x0000000000000002ULL
+#define G_RAID_DEVICE_FLAG_MASK (G_RAID_DEVICE_FLAG_NOAUTOSYNC | \
+    G_RAID_DEVICE_FLAG_NOFAILSYNC)
+
+#ifdef _KERNEL
+extern u_int g_raid_aggressive_spare;
+extern u_int g_raid_debug;
+extern int g_raid_read_err_thresh;
+extern u_int g_raid_start_timeout;
+extern struct g_class g_raid_class;
+
+#define G_RAID_DEBUG(lvl, fmt, ...) do { \
+    if (g_raid_debug >= (lvl)) { \
+        if (g_raid_debug > 0) { \
+            printf("GEOM_RAID[%u]: " fmt "\n", \
+                lvl, ## __VA_ARGS__); \
+        } else { \
+            printf("GEOM_RAID: " fmt "\n", \
+                ## __VA_ARGS__); \
+        } \
+    } \
+} while (0)
+#define G_RAID_DEBUG1(lvl, sc, fmt, ...) do { \
+    if (g_raid_debug >= (lvl)) { \
+        if (g_raid_debug > 0) { \
+            printf("GEOM_RAID[%u]: %s: " fmt "\n", \
+                lvl, (sc)->sc_name, ## __VA_ARGS__); \
+        } else { \
+            printf("GEOM_RAID: %s: " fmt "\n", \
+                (sc)->sc_name, ## __VA_ARGS__); \
+        } \
+    } \
+} while (0)
+#define G_RAID_LOGREQ(lvl, bp, fmt, ...) do { \
+    if (g_raid_debug >= (lvl)) { \
+        if (g_raid_debug > 0) { \
+            printf("GEOM_RAID[%u]: " fmt " ", \
+                lvl, ## __VA_ARGS__); \
+        } else \
+            printf("GEOM_RAID: " fmt " ", ## __VA_ARGS__); \
+        g_print_bio(bp); \
+        printf("\n"); \
+    } \
+} while (0)
+
+/*
+ * Flags we use to distinguish I/O initiated by the TR layer to maintain
+ * the volume's characteristics, fix subdisks, extra copies of data, etc.
+ *
+ * G_RAID_BIO_FLAG_SYNC    I/O to update an extra copy of the data
+ *                         for RAID volumes that maintain extra data
+ *                         and need to rebuild that data.
+ * G_RAID_BIO_FLAG_REMAP   I/O done to try to provoke a subdisk into
+ *                         doing some desirable action such as bad
+ *                         block remapping after we detect a bad part
+ *                         of the disk.
+ * G_RAID_BIO_FLAG_LOCKED  I/O holds range lock that should be released.
+ *
+ * and the following meta item:
+ * G_RAID_BIO_FLAG_SPECIAL Any of the I/O flags that need to make it
+ *                         through the range locking which would
+ *                         otherwise defer the I/O until after that
+ *                         range is unlocked.
+ */
+#define G_RAID_BIO_FLAG_SYNC 0x01
+#define G_RAID_BIO_FLAG_REMAP 0x02
+#define G_RAID_BIO_FLAG_SPECIAL \
+    (G_RAID_BIO_FLAG_SYNC|G_RAID_BIO_FLAG_REMAP)
+#define G_RAID_BIO_FLAG_LOCKED 0x80
+
+struct g_raid_lock {
+    off_t l_offset;
+    off_t l_length;
+    void *l_callback_arg;
+    int l_pending;
+    LIST_ENTRY(g_raid_lock) l_next;
+};
+
+#define G_RAID_EVENT_WAIT 0x01
+#define G_RAID_EVENT_VOLUME 0x02
+#define G_RAID_EVENT_SUBDISK 0x04
+#define G_RAID_EVENT_DISK 0x08
+#define G_RAID_EVENT_DONE 0x10
+struct g_raid_event {
+    void *e_tgt;
+    int e_event;
+    int e_flags;
+    int e_error;
+    TAILQ_ENTRY(g_raid_event) e_next;
+};
+
+#define G_RAID_DISK_S_NONE 0x00 /* State is unknown.
*/ +#define G_RAID_DISK_S_OFFLINE 0x01 /* Missing disk placeholder. */ +#define G_RAID_DISK_S_FAILED 0x02 /* Failed. */ +#define G_RAID_DISK_S_STALE_FAILED 0x03 /* Old failed. */ +#define G_RAID_DISK_S_SPARE 0x04 /* Hot-spare. */ +#define G_RAID_DISK_S_STALE 0x05 /* Old disk, unused now. */ +#define G_RAID_DISK_S_ACTIVE 0x06 /* Operational. */ + +#define G_RAID_DISK_E_DISCONNECTED 0x01 + +struct g_raid_disk { + struct g_raid_softc *d_softc; /* Back-pointer to softc. */ + struct g_consumer *d_consumer; /* GEOM disk consumer. */ + void *d_md_data; /* Disk's metadata storage. */ + struct g_kerneldump d_kd; /* Kernel dumping method/args. */ + uint64_t d_flags; /* Additional flags. */ + u_int d_state; /* Disk state. */ + u_int d_load; /* Disk average load. */ + off_t d_last_offset; /* Last head offset. */ + int d_read_errs; /* Count of the read errors */ + TAILQ_HEAD(, g_raid_subdisk) d_subdisks; /* List of subdisks. */ + TAILQ_ENTRY(g_raid_disk) d_next; /* Next disk in the node. */ +}; + +#define G_RAID_SUBDISK_S_NONE 0x00 /* Absent. */ +#define G_RAID_SUBDISK_S_FAILED 0x01 /* Failed. */ +#define G_RAID_SUBDISK_S_NEW 0x02 /* Blank. */ +#define G_RAID_SUBDISK_S_REBUILD 0x03 /* Blank + rebuild. */ +#define G_RAID_SUBDISK_S_UNINITIALIZED 0x04 /* Disk of the new volume. */ +#define G_RAID_SUBDISK_S_STALE 0x05 /* Dirty. */ +#define G_RAID_SUBDISK_S_RESYNC 0x06 /* Dirty + check/repair. */ +#define G_RAID_SUBDISK_S_ACTIVE 0x07 /* Usable. */ + +#define G_RAID_SUBDISK_E_NEW 0x01 /* A new subdisk has arrived */ +#define G_RAID_SUBDISK_E_FAILED 0x02 /* A subdisk failed, but remains in volume */ +#define G_RAID_SUBDISK_E_DISCONNECTED 0x03 /* A subdisk removed from volume. */ +#define G_RAID_SUBDISK_E_FIRST_TR_PRIVATE 0x80 /* translation private events */ + +#define G_RAID_SUBDISK_POS(sd) \ + ((sd)->sd_disk ? ((sd)->sd_disk->d_last_offset - (sd)->sd_offset) : 0) +#define G_RAID_SUBDISK_TRACK_SIZE (1 * 1024 * 1024) +#define G_RAID_SUBDISK_LOAD(sd) \ + ((sd)->sd_disk ? ((sd)->sd_disk->d_load) : 0) +#define G_RAID_SUBDISK_LOAD_SCALE 256 + +struct g_raid_subdisk { + struct g_raid_softc *sd_softc; /* Back-pointer to softc. */ + struct g_raid_disk *sd_disk; /* Where this subdisk lives. */ + struct g_raid_volume *sd_volume; /* Volume, sd is a part of. */ + off_t sd_offset; /* Offset on the disk. */ + off_t sd_size; /* Size on the disk. */ + u_int sd_pos; /* Position in volume. */ + u_int sd_state; /* Subdisk state. */ + off_t sd_rebuild_pos; /* Rebuild position. */ + int sd_recovery; /* Count of recovery reqs. */ + TAILQ_ENTRY(g_raid_subdisk) sd_next; /* Next subdisk on disk. 
*/ +}; + +#define G_RAID_MAX_SUBDISKS 16 +#define G_RAID_MAX_VOLUMENAME 32 + +#define G_RAID_VOLUME_S_STARTING 0x00 +#define G_RAID_VOLUME_S_BROKEN 0x01 +#define G_RAID_VOLUME_S_DEGRADED 0x02 +#define G_RAID_VOLUME_S_SUBOPTIMAL 0x03 +#define G_RAID_VOLUME_S_OPTIMAL 0x04 +#define G_RAID_VOLUME_S_UNSUPPORTED 0x05 +#define G_RAID_VOLUME_S_STOPPED 0x06 + +#define G_RAID_VOLUME_S_ALIVE(s) \ + ((s) == G_RAID_VOLUME_S_DEGRADED || \ + (s) == G_RAID_VOLUME_S_SUBOPTIMAL || \ + (s) == G_RAID_VOLUME_S_OPTIMAL) + +#define G_RAID_VOLUME_E_DOWN 0x00 +#define G_RAID_VOLUME_E_UP 0x01 +#define G_RAID_VOLUME_E_START 0x10 +#define G_RAID_VOLUME_E_STARTMD 0x11 + +#define G_RAID_VOLUME_RL_RAID0 0x00 +#define G_RAID_VOLUME_RL_RAID1 0x01 +#define G_RAID_VOLUME_RL_RAID3 0x03 +#define G_RAID_VOLUME_RL_RAID4 0x04 +#define G_RAID_VOLUME_RL_RAID5 0x05 +#define G_RAID_VOLUME_RL_RAID6 0x06 +#define G_RAID_VOLUME_RL_RAID1E 0x11 +#define G_RAID_VOLUME_RL_SINGLE 0x0f +#define G_RAID_VOLUME_RL_CONCAT 0x1f +#define G_RAID_VOLUME_RL_RAID5E 0x15 +#define G_RAID_VOLUME_RL_RAID5EE 0x25 +#define G_RAID_VOLUME_RL_UNKNOWN 0xff + +#define G_RAID_VOLUME_RLQ_NONE 0x00 +#define G_RAID_VOLUME_RLQ_UNKNOWN 0xff + +struct g_raid_volume; + +struct g_raid_volume { + struct g_raid_softc *v_softc; /* Back-pointer to softc. */ + struct g_provider *v_provider; /* GEOM provider. */ + struct g_raid_subdisk v_subdisks[G_RAID_MAX_SUBDISKS]; + /* Subdisks of this volume. */ + void *v_md_data; /* Volume's metadata storage. */ + struct g_raid_tr_object *v_tr; /* Transformation object. */ + char v_name[G_RAID_MAX_VOLUMENAME]; + /* Volume name. */ + u_int v_state; /* Volume state. */ + u_int v_raid_level; /* Array RAID level. */ + u_int v_raid_level_qualifier; /* RAID level det. */ + u_int v_disks_count; /* Number of disks in array. */ + u_int v_strip_size; /* Array strip size. */ + u_int v_sectorsize; /* Volume sector size. */ + off_t v_mediasize; /* Volume media size. */ + struct bio_queue_head v_inflight; /* In-flight write requests. */ + struct bio_queue_head v_locked; /* Blocked I/O requests. */ + LIST_HEAD(, g_raid_lock) v_locks; /* List of locked regions. */ + int v_pending_lock; /* writes to locked region */ + int v_dirty; /* Volume is DIRTY. */ + struct timeval v_last_done; /* Time of the last I/O. */ + time_t v_last_write; /* Time of the last write. */ + u_int v_writes; /* Number of active writes. */ + struct root_hold_token *v_rootmount; /* Root mount delay token. */ + int v_starting; /* Volume is starting */ + int v_stopping; /* Volume is stopping */ + int v_provider_open; /* Number of opens. */ + int v_global_id; /* Global volume ID (rX). */ + TAILQ_ENTRY(g_raid_volume) v_next; /* List of volumes entry. */ + LIST_ENTRY(g_raid_volume) v_global_next; /* Global list entry. */ +}; + +#define G_RAID_NODE_E_WAKE 0x00 +#define G_RAID_NODE_E_START 0x01 + +struct g_raid_softc { + struct g_raid_md_object *sc_md; /* Metadata object. */ + struct g_geom *sc_geom; /* GEOM class instance. */ + uint64_t sc_flags; /* Additional flags. */ + TAILQ_HEAD(, g_raid_volume) sc_volumes; /* List of volumes. */ + TAILQ_HEAD(, g_raid_disk) sc_disks; /* List of disks. */ + struct sx sc_lock; /* Main node lock. */ + struct proc *sc_worker; /* Worker process. */ + struct mtx sc_queue_mtx; /* Worker queues lock. */ + TAILQ_HEAD(, g_raid_event) sc_events; /* Worker events queue. */ + struct bio_queue_head sc_queue; /* Worker I/O queue. */ + int sc_stopping; /* Node is stopping */ +}; +#define sc_name sc_geom->name + +/* + * KOBJ parent class of metadata processing modules. 
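+ *
+ * Editor's illustration (not in the original patch): a metadata module
+ * fills a kobj method table and declares itself roughly as follows,
+ * where "foo" is a hypothetical format name (compare md_intel.c below):
+ *
+ *    static kobj_method_t g_raid_md_foo_methods[] = {
+ *        KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_foo),
+ *        KOBJMETHOD(g_raid_md_write, g_raid_md_write_foo),
+ *        { 0, 0 }
+ *    };
+ *    static struct g_raid_md_class g_raid_md_foo_class = {
+ *        "Foo", g_raid_md_foo_methods,
+ *        sizeof(struct g_raid_md_foo_object),
+ *        .mdc_priority = 100
+ *    };
+ *    G_RAID_MD_DECLARE(g_raid_md_foo);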
+ */ +struct g_raid_md_class { + KOBJ_CLASS_FIELDS; + int mdc_priority; + LIST_ENTRY(g_raid_md_class) mdc_list; +}; + +/* + * KOBJ instance of metadata processing module. + */ +struct g_raid_md_object { + KOBJ_FIELDS; + struct g_raid_md_class *mdo_class; + struct g_raid_softc *mdo_softc; /* Back-pointer to softc. */ +}; + +int g_raid_md_modevent(module_t, int, void *); + +#define G_RAID_MD_DECLARE(name) \ + static moduledata_t name##_mod = { \ + #name, \ + g_raid_md_modevent, \ + &name##_class \ + }; \ + DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND); \ + MODULE_DEPEND(name, geom_raid, 0, 0, 0) + +/* + * KOBJ parent class of data transformation modules. + */ +struct g_raid_tr_class { + KOBJ_CLASS_FIELDS; + int trc_priority; + LIST_ENTRY(g_raid_tr_class) trc_list; +}; + +/* + * KOBJ instance of data transformation module. + */ +struct g_raid_tr_object { + KOBJ_FIELDS; + struct g_raid_tr_class *tro_class; + struct g_raid_volume *tro_volume; /* Back-pointer to volume. */ +}; + +int g_raid_tr_modevent(module_t, int, void *); + +#define G_RAID_TR_DECLARE(name) \ + static moduledata_t name##_mod = { \ + #name, \ + g_raid_tr_modevent, \ + &name##_class \ + }; \ + DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST); \ + MODULE_DEPEND(name, geom_raid, 0, 0, 0) + +const char * g_raid_volume_level2str(int level, int qual); +int g_raid_volume_str2level(const char *str, int *level, int *qual); +const char * g_raid_volume_state2str(int state); +const char * g_raid_subdisk_state2str(int state); +const char * g_raid_disk_state2str(int state); + +struct g_raid_softc * g_raid_create_node(struct g_class *mp, + const char *name, struct g_raid_md_object *md); +int g_raid_create_node_format(const char *format, struct g_geom **gp); +struct g_raid_volume * g_raid_create_volume(struct g_raid_softc *sc, + const char *name, int id); +struct g_raid_disk * g_raid_create_disk(struct g_raid_softc *sc); +const char * g_raid_get_diskname(struct g_raid_disk *disk); + +int g_raid_start_volume(struct g_raid_volume *vol); + +int g_raid_destroy_node(struct g_raid_softc *sc, int worker); +int g_raid_destroy_volume(struct g_raid_volume *vol); +int g_raid_destroy_disk(struct g_raid_disk *disk); + +void g_raid_iodone(struct bio *bp, int error); +void g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp); +int g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd, + void *virtual, vm_offset_t physical, off_t offset, size_t length); + +struct g_consumer *g_raid_open_consumer(struct g_raid_softc *sc, + const char *name); +void g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp); + +void g_raid_report_disk_state(struct g_raid_disk *disk); +void g_raid_change_disk_state(struct g_raid_disk *disk, int state); +void g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state); +void g_raid_change_volume_state(struct g_raid_volume *vol, int state); + +void g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol, + struct g_raid_subdisk *sd, struct g_raid_disk *disk); +void g_raid_fail_disk(struct g_raid_softc *sc, + struct g_raid_subdisk *sd, struct g_raid_disk *disk); + +void g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp); +int g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr, + void *virtual, vm_offset_t physical, off_t offset, size_t length); + +u_int g_raid_ndisks(struct g_raid_softc *sc, int state); +u_int g_raid_nsubdisks(struct g_raid_volume *vol, int state); +u_int g_raid_nopens(struct g_raid_softc *sc); +struct 
g_raid_subdisk * g_raid_get_subdisk(struct g_raid_volume *vol, + int state); +#define G_RAID_DESTROY_SOFT 0 +#define G_RAID_DESTROY_DELAYED 1 +#define G_RAID_DESTROY_HARD 2 +int g_raid_destroy(struct g_raid_softc *sc, int how); +int g_raid_event_send(void *arg, int event, int flags); +int g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len, + struct bio *ignore, void *argp); +int g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len); + +g_ctl_req_t g_raid_ctl; +#endif /* _KERNEL */ + +#endif /* !_G_RAID_H_ */ diff --git a/sys/geom/raid/g_raid_ctl.c b/sys/geom/raid/g_raid_ctl.c new file mode 100644 index 0000000..028aa94 --- /dev/null +++ b/sys/geom/raid/g_raid_ctl.c @@ -0,0 +1,217 @@ +/*- + * Copyright (c) 2010 Alexander Motin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "g_raid_md_if.h"
+
+
+static struct g_raid_softc *
+g_raid_find_node(struct g_class *mp, const char *name)
+{
+    struct g_raid_softc *sc;
+    struct g_geom *gp;
+
+    LIST_FOREACH(gp, &mp->geom, geom) {
+        sc = gp->softc;
+        if (sc == NULL)
+            continue;
+        if (sc->sc_stopping != 0)
+            continue;
+        if (strcasecmp(sc->sc_name, name) == 0)
+            return (sc);
+    }
+    return (NULL);
+}
+
+static void
+g_raid_ctl_label(struct gctl_req *req, struct g_class *mp)
+{
+    struct g_geom *geom;
+    struct g_raid_softc *sc;
+    const char *format;
+    int *nargs;
+    int crstatus, ctlstatus;
+    char buf[64];
+
+    nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+    if (nargs == NULL) {
+        gctl_error(req, "No '%s' argument.", "nargs");
+        return;
+    }
+    if (*nargs < 4) {
+        gctl_error(req, "Invalid number of arguments.");
+        return;
+    }
+    format = gctl_get_asciiparam(req, "arg0");
+    if (format == NULL) {
+        gctl_error(req, "No format received.");
+        return;
+    }
+    crstatus = g_raid_create_node_format(format, &geom);
+    if (crstatus == G_RAID_MD_TASTE_FAIL) {
+        gctl_error(req, "Failed to create array with format '%s'.",
+            format);
+        return;
+    }
+    sc = (struct g_raid_softc *)geom->softc;
+    g_topology_unlock();
+    sx_xlock(&sc->sc_lock);
+    ctlstatus = G_RAID_MD_CTL(sc->sc_md, req);
+    if (ctlstatus < 0) {
+        gctl_error(req, "Command failed: %d.", ctlstatus);
+        if (crstatus == G_RAID_MD_TASTE_NEW)
+            g_raid_destroy_node(sc, 0);
+    } else {
+        if (crstatus == G_RAID_MD_TASTE_NEW)
+            snprintf(buf, sizeof(buf), "%s created\n", sc->sc_name);
+        else
+            snprintf(buf, sizeof(buf), "%s reused\n", sc->sc_name);
+        gctl_set_param_err(req, "output", buf, strlen(buf) + 1);
+    }
+    sx_xunlock(&sc->sc_lock);
+    g_topology_lock();
+}
+
+static void
+g_raid_ctl_stop(struct gctl_req *req, struct g_class *mp)
+{
+    struct g_raid_softc *sc;
+    const char *nodename;
+    int *nargs, *force;
+    int error, how;
+
+    nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+    if (nargs == NULL) {
+        gctl_error(req, "No '%s' argument.", "nargs");
+        return;
+    }
+    if (*nargs != 1) {
+        gctl_error(req, "Invalid number of arguments.");
+        return;
+    }
+    nodename = gctl_get_asciiparam(req, "arg0");
+    if (nodename == NULL) {
+        gctl_error(req, "No array name received.");
+        return;
+    }
+    sc = g_raid_find_node(mp, nodename);
+    if (sc == NULL) {
+        gctl_error(req, "Array '%s' not found.", nodename);
+        return;
+    }
+    force = gctl_get_paraml(req, "force", sizeof(*force));
+    if (force != NULL && *force)
+        how = G_RAID_DESTROY_HARD;
+    else
+        how = G_RAID_DESTROY_SOFT;
+    g_topology_unlock();
+    sx_xlock(&sc->sc_lock);
+    error = g_raid_destroy(sc, how);
+    if (error != 0)
+        sx_xunlock(&sc->sc_lock);
+    g_topology_lock();
+}
+
+static void
+g_raid_ctl_other(struct gctl_req *req, struct g_class *mp)
+{
+    struct g_raid_softc *sc;
+    const char *nodename;
+    int *nargs;
+    int ctlstatus;
+
+    nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+    if (nargs == NULL) {
+        gctl_error(req, "No '%s' argument.", "nargs");
+        return;
+    }
+    if (*nargs < 1) {
+        gctl_error(req, "Invalid number of arguments.");
+        return;
+    }
+    nodename = gctl_get_asciiparam(req, "arg0");
+    if (nodename == NULL) {
+        gctl_error(req, "No array name received.");
+        return;
+    }
+    sc = g_raid_find_node(mp, nodename);
+    if (sc == NULL) {
+        gctl_error(req, "Array '%s' not found.", nodename);
+        return;
+    }
+    g_topology_unlock();
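+    /*
+     * Editor's note: as in the label/stop handlers above, the GEOM
+     * topology lock may not be held while acquiring the sleepable
+     * per-array sc_lock, so it is dropped first and re-taken on the
+     * way out.
+     */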
+    sx_xlock(&sc->sc_lock);
+    if (sc->sc_md != NULL) {
+        ctlstatus = G_RAID_MD_CTL(sc->sc_md, req);
+        if (ctlstatus < 0)
+            gctl_error(req, "Command failed: %d.", ctlstatus);
+    }
+    sx_xunlock(&sc->sc_lock);
+    g_topology_lock();
+}
+
+void
+g_raid_ctl(struct gctl_req *req, struct g_class *mp, const char *verb)
+{
+    uint32_t *version;
+
+    g_topology_assert();
+
+    version = gctl_get_paraml(req, "version", sizeof(*version));
+    if (version == NULL) {
+        gctl_error(req, "No '%s' argument.", "version");
+        return;
+    }
+    if (*version != G_RAID_VERSION) {
+        gctl_error(req, "Userland and kernel parts are out of sync.");
+        return;
+    }
+
+    if (strcmp(verb, "label") == 0)
+        g_raid_ctl_label(req, mp);
+    else if (strcmp(verb, "stop") == 0)
+        g_raid_ctl_stop(req, mp);
+    else
+        g_raid_ctl_other(req, mp);
+}
diff --git a/sys/geom/raid/g_raid_md_if.m b/sys/geom/raid/g_raid_md_if.m
new file mode 100644
index 0000000..05e9f66
--- /dev/null
+++ b/sys/geom/raid/g_raid_md_if.m
@@ -0,0 +1,156 @@
+#-
+# Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# $FreeBSD$
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+# The G_RAID metadata class interface.
+
+INTERFACE g_raid_md;
+
+HEADER {
+#define G_RAID_MD_TASTE_FAIL -1
+#define G_RAID_MD_TASTE_EXISTING 0
+#define G_RAID_MD_TASTE_NEW 1
+};
+
+# Default implementations of methods.
+CODE {
+    static int
+    g_raid_md_create_default(struct g_raid_md_object *md)
+    {
+
+        return (G_RAID_MD_TASTE_FAIL);
+    }
+
+    static int
+    g_raid_md_ctl_default(struct g_raid_md_object *md,
+        struct gctl_req *req)
+    {
+
+        return (-1);
+    }
+
+    static int
+    g_raid_md_volume_event_default(struct g_raid_md_object *md,
+        struct g_raid_volume *vol, u_int event)
+    {
+
+        return (-1);
+    }
+
+    static int
+    g_raid_md_free_disk_default(struct g_raid_md_object *md,
+        struct g_raid_disk *disk)
+    {
+
+        return (0);
+    }
+
+    static int
+    g_raid_md_free_volume_default(struct g_raid_md_object *md,
+        struct g_raid_volume *vol)
+    {
+
+        return (0);
+    }
+};
+
+# create() - create new node from scratch.
+METHOD int create {
+    struct g_raid_md_object *md;
+    struct g_class *mp;
+    struct g_geom **gp;
+} DEFAULT g_raid_md_create_default;
+
+# taste() - taste disk and, if needed, create new node.
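+# Editor's note: taste() returns one of the G_RAID_MD_TASTE_* codes from
+# the HEADER block above; g_raid_taste() keeps the kobj instance only on
+# G_RAID_MD_TASTE_NEW and goes on probing further classes only after
+# G_RAID_MD_TASTE_FAIL.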
+METHOD int taste { + struct g_raid_md_object *md; + struct g_class *mp; + struct g_consumer *cp; + struct g_geom **gp; +}; + +# ctl() - user-level control commands handling method. +METHOD int ctl { + struct g_raid_md_object *md; + struct gctl_req *req; +} DEFAULT g_raid_md_ctl_default; + +# event() - events handling method. +METHOD int event { + struct g_raid_md_object *md; + struct g_raid_disk *disk; + u_int event; +}; + +# volume_event() - events handling method. +METHOD int volume_event { + struct g_raid_md_object *md; + struct g_raid_volume *vol; + u_int event; +} DEFAULT g_raid_md_volume_event_default; + +# write() - metadata write method. +METHOD int write { + struct g_raid_md_object *md; + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; +}; + +# fail_disk() - mark disk as failed and remove it from use. +METHOD int fail_disk { + struct g_raid_md_object *md; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; +}; + +# free_disk() - disk destructor. +METHOD int free_disk { + struct g_raid_md_object *md; + struct g_raid_disk *disk; +} DEFAULT g_raid_md_free_disk_default; + +# free_volume() - volume destructor. +METHOD int free_volume { + struct g_raid_md_object *md; + struct g_raid_volume *vol; +} DEFAULT g_raid_md_free_volume_default; + +# free() - destructor. +METHOD int free { + struct g_raid_md_object *md; +}; diff --git a/sys/geom/raid/g_raid_tr_if.m b/sys/geom/raid/g_raid_tr_if.m new file mode 100644 index 0000000..193b429 --- /dev/null +++ b/sys/geom/raid/g_raid_tr_if.m @@ -0,0 +1,118 @@ +#- +# Copyright (c) 2010 Alexander Motin +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# $FreeBSD$ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +# The G_RAID transformation class interface. + +INTERFACE g_raid_tr; + +# Default implementations of methods. +CODE { + static int + g_raid_tr_locked_default(struct g_raid_tr_object *tr, void *argp) + { + + return (0); + } +}; + +HEADER { +#define G_RAID_TR_TASTE_FAIL -1 +#define G_RAID_TR_TASTE_SUCCEED 0 +}; + +# taste() - volume taste method. +METHOD int taste { + struct g_raid_tr_object *tr; + struct g_raid_volume *volume; +}; + +# event() - events handling method. 
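+# Editor's note: the events delivered here are the G_RAID_SUBDISK_E_*
+# codes from g_raid.h; values at or above
+# G_RAID_SUBDISK_E_FIRST_TR_PRIVATE are reserved for a transformation
+# module's private use.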
+METHOD int event {
+    struct g_raid_tr_object *tr;
+    struct g_raid_subdisk *sd;
+    u_int event;
+};
+
+# start() - begin operation.
+METHOD int start {
+    struct g_raid_tr_object *tr;
+};
+
+# stop() - stop operation.
+METHOD int stop {
+    struct g_raid_tr_object *tr;
+};
+
+# iostart() - manages forward transformation and generates requests to disks.
+METHOD void iostart {
+    struct g_raid_tr_object *tr;
+    struct bio *bp;
+};
+
+# iodone() - manages backward transformation and reports completion status.
+METHOD void iodone {
+    struct g_raid_tr_object *tr;
+    struct g_raid_subdisk *sd;
+    struct bio *bp;
+};
+
+# kerneldump() - optimized for robustness (simplified) kernel dumping routine.
+METHOD int kerneldump {
+    struct g_raid_tr_object *tr;
+    void *virtual;
+    vm_offset_t physical;
+    off_t offset;
+    size_t length;
+} DEFAULT g_raid_tr_kerneldump_common;
+
+# locked() - callback method for lock().
+METHOD int locked {
+    struct g_raid_tr_object *tr;
+    void *argp;
+} DEFAULT g_raid_tr_locked_default;
+
+# free() - destructor.
+METHOD int free {
+    struct g_raid_tr_object *tr;
+};
+
+# idle() - callback when the volume is idle for a while and the TR wants
+# to schedule some work for that idle period.
+METHOD int idle {
+    struct g_raid_tr_object *tr;
+};
diff --git a/sys/geom/raid/md_intel.c b/sys/geom/raid/md_intel.c
new file mode 100644
index 0000000..32dc8f0
--- /dev/null
+++ b/sys/geom/raid/md_intel.c
@@ -0,0 +1,2323 @@
+/*-
+ * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "geom/raid/g_raid.h" +#include "g_raid_md_if.h" + +static MALLOC_DEFINE(M_MD_INTEL, "md_intel_data", "GEOM_RAID Intel metadata"); + +struct intel_raid_map { + uint32_t offset; + uint32_t disk_sectors; + uint32_t stripe_count; + uint16_t strip_sectors; + uint8_t status; +#define INTEL_S_READY 0x00 +#define INTEL_S_UNINITIALIZED 0x01 +#define INTEL_S_DEGRADED 0x02 +#define INTEL_S_FAILURE 0x03 + + uint8_t type; +#define INTEL_T_RAID0 0x00 +#define INTEL_T_RAID1 0x01 +#define INTEL_T_RAID5 0x05 + + uint8_t total_disks; + uint8_t total_domains; + uint8_t failed_disk_num; + uint8_t ddf; + uint32_t filler_2[7]; + uint32_t disk_idx[1]; /* total_disks entries. */ +#define INTEL_DI_IDX 0x00ffffff +#define INTEL_DI_RBLD 0x01000000 +} __packed; + +struct intel_raid_vol { + uint8_t name[16]; + u_int64_t total_sectors __packed; + uint32_t state; +#define INTEL_ST_BOOTABLE 0x00000001 +#define INTEL_ST_BOOT_DEVICE 0x00000002 +#define INTEL_ST_READ_COALESCING 0x00000004 +#define INTEL_ST_WRITE_COALESCING 0x00000008 +#define INTEL_ST_LAST_SHUTDOWN_DIRTY 0x00000010 +#define INTEL_ST_HIDDEN_AT_BOOT 0x00000020 +#define INTEL_ST_CURRENTLY_HIDDEN 0x00000040 +#define INTEL_ST_VERIFY_AND_FIX 0x00000080 +#define INTEL_ST_MAP_STATE_UNINIT 0x00000100 +#define INTEL_ST_NO_AUTO_RECOVERY 0x00000200 +#define INTEL_ST_CLONE_N_GO 0x00000400 +#define INTEL_ST_CLONE_MAN_SYNC 0x00000800 +#define INTEL_ST_CNG_MASTER_DISK_NUM 0x00001000 + uint32_t reserved; + uint8_t migr_priority; + uint8_t num_sub_vols; + uint8_t tid; + uint8_t cng_master_disk; + uint16_t cache_policy; + uint8_t cng_state; + uint8_t cng_sub_state; + uint32_t filler_0[10]; + + uint32_t curr_migr_unit; + uint32_t checkpoint_id; + uint8_t migr_state; + uint8_t migr_type; +#define INTEL_MT_INIT 0 +#define INTEL_MT_REBUILD 1 +#define INTEL_MT_VERIFY 2 +#define INTEL_MT_GEN_MIGR 3 +#define INTEL_MT_STATE_CHANGE 4 +#define INTEL_MT_REPAIR 5 + uint8_t dirty; + uint8_t fs_state; + uint16_t verify_errors; + uint16_t bad_blocks; + uint32_t filler_1[4]; + struct intel_raid_map map[1]; /* 2 entries if migr_state != 0. */ +} __packed; + +struct intel_raid_disk { +#define INTEL_SERIAL_LEN 16 + uint8_t serial[INTEL_SERIAL_LEN]; + uint32_t sectors; + uint32_t id; + uint32_t flags; +#define INTEL_F_SPARE 0x01 +#define INTEL_F_ASSIGNED 0x02 +#define INTEL_F_FAILED 0x04 +#define INTEL_F_ONLINE 0x08 + + uint32_t filler[5]; +} __packed; + +struct intel_raid_conf { + uint8_t intel_id[24]; +#define INTEL_MAGIC "Intel Raid ISM Cfg Sig. 
" + + uint8_t version[6]; +#define INTEL_VERSION_1000 "1.0.00" /* RAID0 */ +#define INTEL_VERSION_1100 "1.1.00" /* RAID1 */ +#define INTEL_VERSION_1200 "1.2.00" /* Many volumes */ +#define INTEL_VERSION_1201 "1.2.01" /* 3 or 4 disks */ +#define INTEL_VERSION_1202 "1.2.02" /* RAID5 */ +#define INTEL_VERSION_1204 "1.2.04" /* 5 or 6 disks */ +#define INTEL_VERSION_1206 "1.2.06" /* CNG */ +#define INTEL_VERSION_1300 "1.3.00" /* Attributes */ + + uint8_t dummy_0[2]; + uint32_t checksum; + uint32_t config_size; + uint32_t config_id; + uint32_t generation; + uint32_t error_log_size; + uint32_t attributes; +#define INTEL_ATTR_RAID0 0x00000001 +#define INTEL_ATTR_RAID1 0x00000002 +#define INTEL_ATTR_RAID10 0x00000004 +#define INTEL_ATTR_RAID1E 0x00000008 +#define INTEL_ATTR_RAID5 0x00000010 +#define INTEL_ATTR_RAIDCNG 0x00000020 +#define INTEL_ATTR_2TB 0x20000000 +#define INTEL_ATTR_PM 0x40000000 +#define INTEL_ATTR_CHECKSUM 0x80000000 + + uint8_t total_disks; + uint8_t total_volumes; + uint8_t dummy_2[2]; + uint32_t filler_0[39]; + struct intel_raid_disk disk[1]; /* total_disks entries. */ + /* Here goes total_volumes of struct intel_raid_vol. */ +} __packed; + +#define INTEL_MAX_MD_SIZE(ndisks) \ + (sizeof(struct intel_raid_conf) + \ + sizeof(struct intel_raid_disk) * (ndisks - 1) + \ + sizeof(struct intel_raid_vol) * 2 + \ + sizeof(struct intel_raid_map) * 2 + \ + sizeof(uint32_t) * (ndisks - 1) * 4) + +struct g_raid_md_intel_perdisk { + struct intel_raid_conf *pd_meta; + int pd_disk_pos; + struct intel_raid_disk pd_disk_meta; +}; + +struct g_raid_md_intel_object { + struct g_raid_md_object mdio_base; + uint32_t mdio_config_id; + uint32_t mdio_generation; + struct intel_raid_conf *mdio_meta; + struct callout mdio_start_co; /* STARTING state timer. */ + int mdio_disks_present; + int mdio_started; + int mdio_incomplete; + struct root_hold_token *mdio_rootmount; /* Root mount delay token. */ +}; + +static g_raid_md_create_t g_raid_md_create_intel; +static g_raid_md_taste_t g_raid_md_taste_intel; +static g_raid_md_event_t g_raid_md_event_intel; +static g_raid_md_ctl_t g_raid_md_ctl_intel; +static g_raid_md_write_t g_raid_md_write_intel; +static g_raid_md_fail_disk_t g_raid_md_fail_disk_intel; +static g_raid_md_free_disk_t g_raid_md_free_disk_intel; +static g_raid_md_free_t g_raid_md_free_intel; + +static kobj_method_t g_raid_md_intel_methods[] = { + KOBJMETHOD(g_raid_md_create, g_raid_md_create_intel), + KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_intel), + KOBJMETHOD(g_raid_md_event, g_raid_md_event_intel), + KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_intel), + KOBJMETHOD(g_raid_md_write, g_raid_md_write_intel), + KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_intel), + KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_intel), + KOBJMETHOD(g_raid_md_free, g_raid_md_free_intel), + { 0, 0 } +}; + +static struct g_raid_md_class g_raid_md_intel_class = { + "Intel", + g_raid_md_intel_methods, + sizeof(struct g_raid_md_intel_object), + .mdc_priority = 100 +}; + + +static struct intel_raid_map * +intel_get_map(struct intel_raid_vol *mvol, int i) +{ + struct intel_raid_map *mmap; + + if (i > (mvol->migr_state ? 
1 : 0)) + return (NULL); + mmap = &mvol->map[0]; + for (; i > 0; i--) { + mmap = (struct intel_raid_map *) + &mmap->disk_idx[mmap->total_disks]; + } + return ((struct intel_raid_map *)mmap); +} + +static struct intel_raid_vol * +intel_get_volume(struct intel_raid_conf *meta, int i) +{ + struct intel_raid_vol *mvol; + struct intel_raid_map *mmap; + + if (i > 1) + return (NULL); + mvol = (struct intel_raid_vol *)&meta->disk[meta->total_disks]; + for (; i > 0; i--) { + mmap = intel_get_map(mvol, mvol->migr_state ? 1 : 0); + mvol = (struct intel_raid_vol *) + &mmap->disk_idx[mmap->total_disks]; + } + return (mvol); +} + +static void +g_raid_md_intel_print(struct intel_raid_conf *meta) +{ + struct intel_raid_vol *mvol; + struct intel_raid_map *mmap; + int i, j, k; + + if (g_raid_debug < 1) + return; + + printf("********* ATA Intel MatrixRAID Metadata *********\n"); + printf("intel_id <%.24s>\n", meta->intel_id); + printf("version <%.6s>\n", meta->version); + printf("checksum 0x%08x\n", meta->checksum); + printf("config_size 0x%08x\n", meta->config_size); + printf("config_id 0x%08x\n", meta->config_id); + printf("generation 0x%08x\n", meta->generation); + printf("attributes 0x%08x\n", meta->attributes); + printf("total_disks %u\n", meta->total_disks); + printf("total_volumes %u\n", meta->total_volumes); + printf("DISK# serial disk_sectors disk_id flags\n"); + for (i = 0; i < meta->total_disks; i++ ) { + printf(" %d <%.16s> %u 0x%08x 0x%08x\n", i, + meta->disk[i].serial, meta->disk[i].sectors, + meta->disk[i].id, meta->disk[i].flags); + } + for (i = 0; i < meta->total_volumes; i++) { + mvol = intel_get_volume(meta, i); + printf(" ****** Volume %d ******\n", i); + printf(" name %.16s\n", mvol->name); + printf(" total_sectors %ju\n", mvol->total_sectors); + printf(" state %u\n", mvol->state); + printf(" reserved %u\n", mvol->reserved); + printf(" curr_migr_unit %u\n", mvol->curr_migr_unit); + printf(" checkpoint_id %u\n", mvol->checkpoint_id); + printf(" migr_state %u\n", mvol->migr_state); + printf(" migr_type %u\n", mvol->migr_type); + printf(" dirty %u\n", mvol->dirty); + + for (j = 0; j < (mvol->migr_state ? 
2 : 1); j++) { + printf(" *** Map %d ***\n", j); + mmap = intel_get_map(mvol, j); + printf(" offset %u\n", mmap->offset); + printf(" disk_sectors %u\n", mmap->disk_sectors); + printf(" stripe_count %u\n", mmap->stripe_count); + printf(" strip_sectors %u\n", mmap->strip_sectors); + printf(" status %u\n", mmap->status); + printf(" type %u\n", mmap->type); + printf(" total_disks %u\n", mmap->total_disks); + printf(" total_domains %u\n", mmap->total_domains); + printf(" failed_disk_num %u\n", mmap->failed_disk_num); + printf(" ddf %u\n", mmap->ddf); + printf(" disk_idx "); + for (k = 0; k < mmap->total_disks; k++) + printf(" 0x%08x", mmap->disk_idx[k]); + printf("\n"); + } + } + printf("=================================================\n"); +} + +static struct intel_raid_conf * +intel_meta_copy(struct intel_raid_conf *meta) +{ + struct intel_raid_conf *nmeta; + + nmeta = malloc(meta->config_size, M_MD_INTEL, M_WAITOK); + memcpy(nmeta, meta, meta->config_size); + return (nmeta); +} + +static int +intel_meta_find_disk(struct intel_raid_conf *meta, char *serial) +{ + int pos; + + for (pos = 0; pos < meta->total_disks; pos++) { + if (strncmp(meta->disk[pos].serial, + serial, INTEL_SERIAL_LEN) == 0) + return (pos); + } + return (-1); +} + +static struct intel_raid_conf * +intel_meta_read(struct g_consumer *cp) +{ + struct g_provider *pp; + struct intel_raid_conf *meta; + struct intel_raid_vol *mvol; + struct intel_raid_map *mmap; + char *buf; + int error, i, j, k, left, size; + uint32_t checksum, *ptr; + + pp = cp->provider; + + /* Read the anchor sector. */ + buf = g_read_data(cp, + pp->mediasize - pp->sectorsize * 2, pp->sectorsize, &error); + if (buf == NULL) { + G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", + pp->name, error); + return (NULL); + } + meta = (struct intel_raid_conf *)buf; + + /* Check if this is an Intel RAID struct */ + if (strncmp(meta->intel_id, INTEL_MAGIC, strlen(INTEL_MAGIC))) { + G_RAID_DEBUG(1, "Intel signature check failed on %s", pp->name); + g_free(buf); + return (NULL); + } + if (meta->config_size > 65536 || + meta->config_size < sizeof(struct intel_raid_conf)) { + G_RAID_DEBUG(1, "Intel metadata size looks wrong: %d", + meta->config_size); + g_free(buf); + return (NULL); + } + size = meta->config_size; + meta = malloc(size, M_MD_INTEL, M_WAITOK); + memcpy(meta, buf, min(size, pp->sectorsize)); + g_free(buf); + + /* Read all the rest, if needed. */ + if (meta->config_size > pp->sectorsize) { + left = (meta->config_size - 1) / pp->sectorsize; + buf = g_read_data(cp, + pp->mediasize - pp->sectorsize * (2 + left), + pp->sectorsize * left, &error); + if (buf == NULL) { + G_RAID_DEBUG(1, "Cannot read remaining metadata" + " part from %s (error=%d).", + pp->name, error); + free(meta, M_MD_INTEL); + return (NULL); + } + memcpy(((char *)meta) + pp->sectorsize, buf, + pp->sectorsize * left); + g_free(buf); + } + + /* Check metadata checksum. */ + for (checksum = 0, ptr = (uint32_t *)meta, i = 0; + i < (meta->config_size / sizeof(uint32_t)); i++) { + checksum += *ptr++; + } + checksum -= meta->checksum; + if (checksum != meta->checksum) { + G_RAID_DEBUG(1, "Intel checksum check failed on %s", pp->name); + free(meta, M_MD_INTEL); + return (NULL); + } + + /* Validate metadata size. 
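(editor's note: the expected size is grown volume by volume and map by map below, so every variable-length field is bounds-checked against config_size before it is walked)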
*/ + size = sizeof(struct intel_raid_conf) + + sizeof(struct intel_raid_disk) * (meta->total_disks - 1) + + sizeof(struct intel_raid_vol) * meta->total_volumes; + if (size > meta->config_size) { +badsize: + G_RAID_DEBUG(1, "Intel metadata size incorrect %d < %d", + meta->config_size, size); + free(meta, M_MD_INTEL); + return (NULL); + } + for (i = 0; i < meta->total_volumes; i++) { + mvol = intel_get_volume(meta, i); + mmap = intel_get_map(mvol, 0); + size += 4 * (mmap->total_disks - 1); + if (size > meta->config_size) + goto badsize; + if (mvol->migr_state) { + size += sizeof(struct intel_raid_map); + if (size > meta->config_size) + goto badsize; + mmap = intel_get_map(mvol, 1); + size += 4 * (mmap->total_disks - 1); + if (size > meta->config_size) + goto badsize; + } + } + + /* Validate disk indexes. */ + for (i = 0; i < meta->total_volumes; i++) { + mvol = intel_get_volume(meta, i); + for (j = 0; j < (mvol->migr_state ? 2 : 1); j++) { + mmap = intel_get_map(mvol, j); + for (k = 0; k < mmap->total_disks; k++) { + if ((mmap->disk_idx[k] & INTEL_DI_IDX) > + meta->total_disks) { + G_RAID_DEBUG(1, "Intel metadata disk" + " index %d too big (>%d)", + mmap->disk_idx[k] & INTEL_DI_IDX, + meta->total_disks); + free(meta, M_MD_INTEL); + return (NULL); + } + } + } + } + + /* Validate migration types. */ + for (i = 0; i < meta->total_volumes; i++) { + mvol = intel_get_volume(meta, i); + if (mvol->migr_state && + mvol->migr_type != INTEL_MT_INIT && + mvol->migr_type != INTEL_MT_REBUILD && + mvol->migr_type != INTEL_MT_VERIFY && + mvol->migr_type != INTEL_MT_REPAIR) { + G_RAID_DEBUG(1, "Intel metadata has unsupported" + " migration type %d", mvol->migr_type); + free(meta, M_MD_INTEL); + return (NULL); + } + } + + return (meta); +} + +static int +intel_meta_write(struct g_consumer *cp, struct intel_raid_conf *meta) +{ + struct g_provider *pp; + char *buf; + int error, i, sectors; + uint32_t checksum, *ptr; + + pp = cp->provider; + + /* Recalculate checksum for case if metadata were changed. */ + meta->checksum = 0; + for (checksum = 0, ptr = (uint32_t *)meta, i = 0; + i < (meta->config_size / sizeof(uint32_t)); i++) { + checksum += *ptr++; + } + meta->checksum = checksum; + + /* Create and fill buffer. */ + sectors = (meta->config_size + pp->sectorsize - 1) / pp->sectorsize; + buf = malloc(sectors * pp->sectorsize, M_MD_INTEL, M_WAITOK | M_ZERO); + if (sectors > 1) { + memcpy(buf, ((char *)meta) + pp->sectorsize, + (sectors - 1) * pp->sectorsize); + } + memcpy(buf + (sectors - 1) * pp->sectorsize, meta, pp->sectorsize); + + error = g_write_data(cp, + pp->mediasize - pp->sectorsize * (1 + sectors), + buf, pp->sectorsize * sectors); + if (error != 0) { + G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", + pp->name, error); + } + + free(buf, M_MD_INTEL); + return (error); +} + +static int +intel_meta_erase(struct g_consumer *cp) +{ + struct g_provider *pp; + char *buf; + int error; + + pp = cp->provider; + buf = malloc(pp->sectorsize, M_MD_INTEL, M_WAITOK | M_ZERO); + error = g_write_data(cp, + pp->mediasize - 2 * pp->sectorsize, + buf, pp->sectorsize); + if (error != 0) { + G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", + pp->name, error); + } + free(buf, M_MD_INTEL); + return (error); +} + +static int +intel_meta_write_spare(struct g_consumer *cp, struct intel_raid_disk *d) +{ + struct intel_raid_conf *meta; + int error; + + /* Fill anchor and single disk. 
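
Both intel_meta_read() earlier and intel_meta_write() above use the same checksum convention: the writer zeroes the checksum field and stores the 32-bit sum of all config words, so the reader subtracts the stored value once before comparing. An illustrative restatement, not part of the patch:

static uint32_t
intel_meta_checksum_sketch(struct intel_raid_conf *meta)
{
    uint32_t *ptr, sum;
    u_int i;

    sum = 0;
    ptr = (uint32_t *)meta;
    /* Sum every 32-bit word of the config, checksum field included. */
    for (i = 0; i < meta->config_size / sizeof(uint32_t); i++)
        sum += ptr[i];
    /* The writer summed with the field zeroed; take it back out. */
    return (sum - meta->checksum);
}
/* The metadata is valid when this sketch returns meta->checksum. */
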
*/ + meta = malloc(INTEL_MAX_MD_SIZE(1), M_MD_INTEL, M_WAITOK | M_ZERO); + memcpy(&meta->intel_id[0], INTEL_MAGIC, sizeof(INTEL_MAGIC)); + memcpy(&meta->version[0], INTEL_VERSION_1000, + sizeof(INTEL_VERSION_1000)); + meta->config_size = INTEL_MAX_MD_SIZE(1); + meta->config_id = arc4random(); + meta->generation = 1; + meta->total_disks = 1; + meta->disk[0] = *d; + error = intel_meta_write(cp, meta); + free(meta, M_MD_INTEL); + return (error); +} + +static struct g_raid_disk * +g_raid_md_intel_get_disk(struct g_raid_softc *sc, int id) +{ + struct g_raid_disk *disk; + struct g_raid_md_intel_perdisk *pd; + + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; + if (pd->pd_disk_pos == id) + break; + } + return (disk); +} + +static int +g_raid_md_intel_supported(int level, int qual, int disks, int force) +{ + + switch (level) { + case G_RAID_VOLUME_RL_RAID0: + if (disks < 1) + return (0); + if (!force && (disks < 2 || disks > 6)) + return (0); + break; + case G_RAID_VOLUME_RL_RAID1: + if (disks < 1) + return (0); + if (!force && (disks != 2)) + return (0); + break; + case G_RAID_VOLUME_RL_RAID1E: + if (disks < 2) + return (0); + if (!force && (disks != 4)) + return (0); + break; + case G_RAID_VOLUME_RL_RAID5: + if (disks < 3) + return (0); + if (!force && disks > 6) + return (0); + break; + default: + return (0); + } + if (qual != G_RAID_VOLUME_RLQ_NONE) + return (0); + return (1); +} + +static struct g_raid_volume * +g_raid_md_intel_get_volume(struct g_raid_softc *sc, int id) +{ + struct g_raid_volume *mvol; + + TAILQ_FOREACH(mvol, &sc->sc_volumes, v_next) { + if ((intptr_t)(mvol->v_md_data) == id) + break; + } + return (mvol); +} + +static int +g_raid_md_intel_start_disk(struct g_raid_disk *disk) +{ + struct g_raid_softc *sc; + struct g_raid_subdisk *sd, *tmpsd; + struct g_raid_disk *olddisk, *tmpdisk; + struct g_raid_md_object *md; + struct g_raid_md_intel_object *mdi; + struct g_raid_md_intel_perdisk *pd, *oldpd; + struct intel_raid_conf *meta; + struct intel_raid_vol *mvol; + struct intel_raid_map *mmap0, *mmap1; + int disk_pos, resurrection = 0; + + sc = disk->d_softc; + md = sc->sc_md; + mdi = (struct g_raid_md_intel_object *)md; + meta = mdi->mdio_meta; + pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; + olddisk = NULL; + + /* Find disk position in metadata by it's serial. */ + disk_pos = intel_meta_find_disk(meta, pd->pd_disk_meta.serial); + if (disk_pos < 0) { + G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk"); + /* Failed stale disk is useless for us. */ + if (pd->pd_disk_meta.flags & INTEL_F_FAILED) { + g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED); + return (0); + } + /* If we are in the start process, that's all for now. */ + if (!mdi->mdio_started) + goto nofit; + /* + * If we have already started - try to get use of the disk. + * Try to replace OFFLINE disks first, then FAILED. + */ + TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) { + if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE && + tmpdisk->d_state != G_RAID_DISK_S_FAILED) + continue; + /* Make sure this disk is big enough. 
*/ + TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) { + if (sd->sd_offset + sd->sd_size + 4096 > + (off_t)pd->pd_disk_meta.sectors * 512) { + G_RAID_DEBUG1(1, sc, + "Disk too small (%llu < %llu)", + ((unsigned long long) + pd->pd_disk_meta.sectors) * 512, + (unsigned long long) + sd->sd_offset + sd->sd_size + 4096); + break; + } + } + if (sd != NULL) + continue; + if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) { + olddisk = tmpdisk; + break; + } else if (olddisk == NULL) + olddisk = tmpdisk; + } + if (olddisk == NULL) { +nofit: + if (pd->pd_disk_meta.flags & INTEL_F_SPARE) { + g_raid_change_disk_state(disk, + G_RAID_DISK_S_SPARE); + return (1); + } else { + g_raid_change_disk_state(disk, + G_RAID_DISK_S_STALE); + return (0); + } + } + oldpd = (struct g_raid_md_intel_perdisk *)olddisk->d_md_data; + disk_pos = oldpd->pd_disk_pos; + resurrection = 1; + } + + if (olddisk == NULL) { + /* Find placeholder by position. */ + olddisk = g_raid_md_intel_get_disk(sc, disk_pos); + if (olddisk == NULL) + panic("No disk at position %d!", disk_pos); + if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) { + G_RAID_DEBUG1(1, sc, "More then one disk for pos %d", + disk_pos); + g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); + return (0); + } + oldpd = (struct g_raid_md_intel_perdisk *)olddisk->d_md_data; + } + + /* Replace failed disk or placeholder with new disk. */ + TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) { + TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next); + TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); + sd->sd_disk = disk; + } + oldpd->pd_disk_pos = -2; + pd->pd_disk_pos = disk_pos; + + /* If it was placeholder -- destroy it. */ + if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) { + g_raid_destroy_disk(olddisk); + } else { + /* Otherwise, make it STALE_FAILED. */ + g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED); + /* Update global metadata just in case. */ + memcpy(&meta->disk[disk_pos], &pd->pd_disk_meta, + sizeof(struct intel_raid_disk)); + } + + /* Welcome the new disk. */ + if (resurrection) + g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); + else if (meta->disk[disk_pos].flags & INTEL_F_FAILED) + g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); + else if (meta->disk[disk_pos].flags & INTEL_F_SPARE) + g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); + else + g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); + TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { + mvol = intel_get_volume(meta, + (uintptr_t)(sd->sd_volume->v_md_data)); + mmap0 = intel_get_map(mvol, 0); + if (mvol->migr_state) + mmap1 = intel_get_map(mvol, 1); + else + mmap1 = mmap0; + + if (resurrection) { + /* Stale disk, almost same as new. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NEW); + } else if (meta->disk[disk_pos].flags & INTEL_F_FAILED) { + /* Failed disk, almost useless. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_FAILED); + } else if (mvol->migr_state == 0) { + if (mmap0->status == INTEL_S_UNINITIALIZED) { + /* Freshly created uninitialized volume. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_UNINITIALIZED); + } else if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) { + /* Freshly inserted disk. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NEW); + } else if (mvol->dirty) { + /* Dirty volume (unclean shutdown). */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_STALE); + } else { + /* Up to date disk. 
*/
+ g_raid_change_subdisk_state(sd,
+ G_RAID_SUBDISK_S_ACTIVE);
+ }
+ } else if (mvol->migr_type == INTEL_MT_INIT ||
+ mvol->migr_type == INTEL_MT_REBUILD) {
+ if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) {
+ /* Freshly inserted disk. */
+ g_raid_change_subdisk_state(sd,
+ G_RAID_SUBDISK_S_NEW);
+ } else if (mmap1->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) {
+ /* Rebuilding disk. */
+ g_raid_change_subdisk_state(sd,
+ G_RAID_SUBDISK_S_REBUILD);
+ if (mvol->dirty) {
+ sd->sd_rebuild_pos = 0;
+ } else {
+ sd->sd_rebuild_pos =
+ (off_t)mvol->curr_migr_unit *
+ sd->sd_volume->v_strip_size *
+ mmap0->total_domains;
+ }
+ } else if (mvol->dirty) {
+ /* Dirty volume (unclean shutdown). */
+ g_raid_change_subdisk_state(sd,
+ G_RAID_SUBDISK_S_STALE);
+ } else {
+ /* Up to date disk. */
+ g_raid_change_subdisk_state(sd,
+ G_RAID_SUBDISK_S_ACTIVE);
+ }
+ } else if (mvol->migr_type == INTEL_MT_VERIFY ||
+ mvol->migr_type == INTEL_MT_REPAIR) {
+ if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) {
+ /* Freshly inserted disk. */
+ g_raid_change_subdisk_state(sd,
+ G_RAID_SUBDISK_S_NEW);
+ } else if (mmap1->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) {
+ /* Resyncing disk. */
+ g_raid_change_subdisk_state(sd,
+ G_RAID_SUBDISK_S_RESYNC);
+ if (mvol->dirty) {
+ sd->sd_rebuild_pos = 0;
+ } else {
+ sd->sd_rebuild_pos =
+ (off_t)mvol->curr_migr_unit *
+ sd->sd_volume->v_strip_size *
+ mmap0->total_domains;
+ }
+ } else if (mvol->dirty) {
+ /* Dirty volume (unclean shutdown). */
+ g_raid_change_subdisk_state(sd,
+ G_RAID_SUBDISK_S_STALE);
+ } else {
+ /* Up to date disk. */
+ g_raid_change_subdisk_state(sd,
+ G_RAID_SUBDISK_S_ACTIVE);
+ }
+ }
+ g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
+ G_RAID_EVENT_SUBDISK);
+ }
+
+ /* Update status of our need for spare. */
+ if (mdi->mdio_started) {
+ mdi->mdio_incomplete =
+ (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) <
+ meta->total_disks);
+ }
+
+ return (resurrection);
+}
+
+static void
+g_disk_md_intel_retaste(void *arg, int pending)
+{
+
+ G_RAID_DEBUG(1, "Array is not complete, trying to retaste.");
+ g_retaste(&g_raid_class);
+ free(arg, M_MD_INTEL);
+}
+
+static void
+g_raid_md_intel_refill(struct g_raid_softc *sc)
+{
+ struct g_raid_md_object *md;
+ struct g_raid_md_intel_object *mdi;
+ struct intel_raid_conf *meta;
+ struct g_raid_disk *disk;
+ struct task *task;
+ int update, na;
+
+ md = sc->sc_md;
+ mdi = (struct g_raid_md_intel_object *)md;
+ meta = mdi->mdio_meta;
+ update = 0;
+ do {
+ /* Make sure we don't miss anything. */
+ na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE);
+ if (na == meta->total_disks)
+ break;
+
+ G_RAID_DEBUG1(1, md->mdo_softc,
+ "Array is not complete (%d of %d), "
+ "trying to refill.", na, meta->total_disks);
+
+ /* Try to make use of some of the STALE disks. */
+ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
+ if (disk->d_state == G_RAID_DISK_S_STALE) {
+ update += g_raid_md_intel_start_disk(disk);
+ if (disk->d_state == G_RAID_DISK_S_ACTIVE)
+ break;
+ }
+ }
+ if (disk != NULL)
+ continue;
+
+ /* Try to make use of some of the SPARE disks. */
+ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
+ if (disk->d_state == G_RAID_DISK_S_SPARE) {
+ update += g_raid_md_intel_start_disk(disk);
+ if (disk->d_state == G_RAID_DISK_S_ACTIVE)
+ break;
+ }
+ }
+ } while (disk != NULL);
+
+ /* Write new metadata if we changed something. */
+ if (update) {
+ g_raid_md_write_intel(md, NULL, NULL, NULL);
+ meta = mdi->mdio_meta;
+ }
+
+ /* Update status of our need for spare.
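
The sd_rebuild_pos expressions above expand the metadata's coarse checkpoint into a byte offset: curr_migr_unit counts groups of strips, so one unit covers strip_size * total_domains bytes of each subdisk. A worked example with hypothetical values, illustration only:

static off_t
intel_rebuild_pos_sketch(void)
{
    /* Hypothetical values, for illustration only. */
    off_t strip_size = 131072;       /* 128 KiB strip */
    u_int total_domains = 2;         /* e.g. a two-domain RAID1E map */
    uint32_t curr_migr_unit = 1000;  /* checkpoint read from metadata */

    /* 1000 * 131072 * 2 = 262144000 bytes: the rebuild resumes
       exactly 250 MiB into each subdisk. */
    return ((off_t)curr_migr_unit * strip_size * total_domains);
}
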
*/ + mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < + meta->total_disks); + + /* Request retaste hoping to find spare. */ + if (mdi->mdio_incomplete) { + task = malloc(sizeof(struct task), + M_MD_INTEL, M_WAITOK | M_ZERO); + TASK_INIT(task, 0, g_disk_md_intel_retaste, task); + taskqueue_enqueue(taskqueue_swi, task); + } +} + +static void +g_raid_md_intel_start(struct g_raid_softc *sc) +{ + struct g_raid_md_object *md; + struct g_raid_md_intel_object *mdi; + struct g_raid_md_intel_perdisk *pd; + struct intel_raid_conf *meta; + struct intel_raid_vol *mvol; + struct intel_raid_map *mmap; + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; + int i, j, disk_pos; + + md = sc->sc_md; + mdi = (struct g_raid_md_intel_object *)md; + meta = mdi->mdio_meta; + + /* Create volumes and subdisks. */ + for (i = 0; i < meta->total_volumes; i++) { + mvol = intel_get_volume(meta, i); + mmap = intel_get_map(mvol, 0); + vol = g_raid_create_volume(sc, mvol->name, -1); + vol->v_md_data = (void *)(intptr_t)i; + if (mmap->type == INTEL_T_RAID0) + vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; + else if (mmap->type == INTEL_T_RAID1 && + mmap->total_domains >= 2 && + mmap->total_domains <= mmap->total_disks) { + /* Assume total_domains is correct. */ + if (mmap->total_domains == mmap->total_disks) + vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; + else + vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; + } else if (mmap->type == INTEL_T_RAID1) { + /* total_domains looks wrong. */ + if (mmap->total_disks <= 2) + vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; + else + vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; + } else if (mmap->type == INTEL_T_RAID5) + vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; + else + vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; + vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; + vol->v_strip_size = (u_int)mmap->strip_sectors * 512; //ZZZ + vol->v_disks_count = mmap->total_disks; + vol->v_mediasize = (off_t)mvol->total_sectors * 512; //ZZZ + vol->v_sectorsize = 512; //ZZZ + for (j = 0; j < vol->v_disks_count; j++) { + sd = &vol->v_subdisks[j]; + sd->sd_offset = (off_t)mmap->offset * 512; //ZZZ + sd->sd_size = (off_t)mmap->disk_sectors * 512; //ZZZ + } + g_raid_start_volume(vol); + } + + /* Create disk placeholders to store data for later writing. */ + for (disk_pos = 0; disk_pos < meta->total_disks; disk_pos++) { + pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); + pd->pd_disk_pos = disk_pos; + pd->pd_disk_meta = meta->disk[disk_pos]; + disk = g_raid_create_disk(sc); + disk->d_md_data = (void *)pd; + disk->d_state = G_RAID_DISK_S_OFFLINE; + for (i = 0; i < meta->total_volumes; i++) { + mvol = intel_get_volume(meta, i); + mmap = intel_get_map(mvol, 0); + for (j = 0; j < mmap->total_disks; j++) { + if ((mmap->disk_idx[j] & INTEL_DI_IDX) == disk_pos) + break; + } + if (j == mmap->total_disks) + continue; + vol = g_raid_md_intel_get_volume(sc, i); + sd = &vol->v_subdisks[j]; + sd->sd_disk = disk; + TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); + } + } + + /* Make all disks found till the moment take their places. */ + do { + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_state == G_RAID_DISK_S_NONE) { + g_raid_md_intel_start_disk(disk); + break; + } + } + } while (disk != NULL); + + mdi->mdio_started = 1; + G_RAID_DEBUG1(0, sc, "Array started."); + g_raid_md_write_intel(md, NULL, NULL, NULL); + + /* Pickup any STALE/SPARE disks to refill array if needed. 
*/
+ g_raid_md_intel_refill(sc);
+
+ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
+ g_raid_event_send(vol, G_RAID_VOLUME_E_START,
+ G_RAID_EVENT_VOLUME);
+ }
+
+ callout_stop(&mdi->mdio_start_co);
+ G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount);
+ root_mount_rel(mdi->mdio_rootmount);
+ mdi->mdio_rootmount = NULL;
+}
+
+static void
+g_raid_md_intel_new_disk(struct g_raid_disk *disk)
+{
+ struct g_raid_softc *sc;
+ struct g_raid_md_object *md;
+ struct g_raid_md_intel_object *mdi;
+ struct intel_raid_conf *pdmeta;
+ struct g_raid_md_intel_perdisk *pd;
+
+ sc = disk->d_softc;
+ md = sc->sc_md;
+ mdi = (struct g_raid_md_intel_object *)md;
+ pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
+ pdmeta = pd->pd_meta;
+
+ if (mdi->mdio_started) {
+ if (g_raid_md_intel_start_disk(disk))
+ g_raid_md_write_intel(md, NULL, NULL, NULL);
+ } else {
+ /* If we haven't started yet - check metadata freshness. */
+ if (mdi->mdio_meta == NULL ||
+ ((int32_t)(pdmeta->generation - mdi->mdio_generation)) > 0) {
+ G_RAID_DEBUG1(1, sc, "Newer disk");
+ if (mdi->mdio_meta != NULL)
+ free(mdi->mdio_meta, M_MD_INTEL);
+ mdi->mdio_meta = intel_meta_copy(pdmeta);
+ mdi->mdio_generation = mdi->mdio_meta->generation;
+ mdi->mdio_disks_present = 1;
+ } else if (pdmeta->generation == mdi->mdio_generation) {
+ mdi->mdio_disks_present++;
+ G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)",
+ mdi->mdio_disks_present,
+ mdi->mdio_meta->total_disks);
+ } else {
+ G_RAID_DEBUG1(1, sc, "Older disk");
+ }
+ /* If we have collected all the needed disks - start the array. */
+ if (mdi->mdio_disks_present == mdi->mdio_meta->total_disks)
+ g_raid_md_intel_start(sc);
+ }
+}
+
+static void
+g_raid_intel_go(void *arg)
+{
+ struct g_raid_softc *sc;
+ struct g_raid_md_object *md;
+ struct g_raid_md_intel_object *mdi;
+
+ sc = arg;
+ md = sc->sc_md;
+ mdi = (struct g_raid_md_intel_object *)md;
+ if (!mdi->mdio_started) {
+ G_RAID_DEBUG1(0, sc, "Force array start due to timeout.");
+ g_raid_event_send(sc, G_RAID_NODE_E_START, 0);
+ }
+}
+
+static int
+g_raid_md_create_intel(struct g_raid_md_object *md, struct g_class *mp,
+ struct g_geom **gp)
+{
+ struct g_raid_softc *sc;
+ struct g_raid_md_intel_object *mdi;
+ char name[16];
+
+ mdi = (struct g_raid_md_intel_object *)md;
+ mdi->mdio_config_id = arc4random();
+ mdi->mdio_generation = 0;
+ snprintf(name, sizeof(name), "Intel-%08x", mdi->mdio_config_id);
+ sc = g_raid_create_node(mp, name, md);
+ if (sc == NULL)
+ return (G_RAID_MD_TASTE_FAIL);
+ md->mdo_softc = sc;
+ *gp = sc->sc_geom;
+ return (G_RAID_MD_TASTE_NEW);
+}
+
+/*
+ * Return the last N characters of the serial label. The Linux and
+ * ataraid(7) code always uses the last 16 characters of the label to
+ * store into the Intel meta format. Generalize this to N characters
+ * since that's easy. Labels can be up to 20 characters for SATA drives
+ * and up to 251 characters for SAS drives. Since Intel controllers don't
+ * support SAS drives, just stick with the SATA limits for stack friendliness.
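
A worked example of the truncation described above and implemented in g_raid_md_get_label() below, with a hypothetical 20-character SATA ident and serlen = INTEL_SERIAL_LEN (16):

    /* Hypothetical GEOM::ident value, 20 characters long. */
    const char *ident = "3NF0ABCD2345EFGH6789";

    /* len = 20 > 16, so the copy starts at offset 20 - 16 = 4 and
       the serial stored in the metadata is "ABCD2345EFGH6789". */
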
+ */ +static int +g_raid_md_get_label(struct g_consumer *cp, char *serial, int serlen) +{ + char serial_buffer[24]; + int len, error; + + len = sizeof(serial_buffer); + error = g_io_getattr("GEOM::ident", cp, &len, serial_buffer); + if (error != 0) + return (error); + len = strlen(serial_buffer); + if (len > serlen) + len -= serlen; + else + len = 0; + strncpy(serial, serial_buffer + len, serlen); + return (0); +} + +static int +g_raid_md_taste_intel(struct g_raid_md_object *md, struct g_class *mp, + struct g_consumer *cp, struct g_geom **gp) +{ + struct g_consumer *rcp; + struct g_provider *pp; + struct g_raid_md_intel_object *mdi, *mdi1; + struct g_raid_softc *sc; + struct g_raid_disk *disk; + struct intel_raid_conf *meta; + struct g_raid_md_intel_perdisk *pd; + struct g_geom *geom; + int error, disk_pos, result, spare, len; + char serial[INTEL_SERIAL_LEN]; + char name[16]; + uint16_t vendor; + + G_RAID_DEBUG(1, "Tasting Intel on %s", cp->provider->name); + mdi = (struct g_raid_md_intel_object *)md; + pp = cp->provider; + + /* Read metadata from device. */ + meta = NULL; + spare = 0; + vendor = 0xffff; + disk_pos = 0; + if (g_access(cp, 1, 0, 0) != 0) + return (G_RAID_MD_TASTE_FAIL); + g_topology_unlock(); + error = g_raid_md_get_label(cp, serial, sizeof(serial)); + if (error != 0) { + G_RAID_DEBUG(1, "Cannot get serial number from %s (error=%d).", + pp->name, error); + goto fail2; + } + len = 2; + if (pp->geom->rank == 1) + g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); + meta = intel_meta_read(cp); + g_topology_lock(); + g_access(cp, -1, 0, 0); + if (meta == NULL) { + if (g_raid_aggressive_spare) { + if (vendor == 0x8086) { + G_RAID_DEBUG(1, + "No Intel metadata, forcing spare."); + spare = 2; + goto search; + } else { + G_RAID_DEBUG(1, + "Intel vendor mismatch 0x%04x != 0x8086", + vendor); + } + } + return (G_RAID_MD_TASTE_FAIL); + } + + /* Check this disk position in obtained metadata. */ + disk_pos = intel_meta_find_disk(meta, serial); + if (disk_pos < 0) { + G_RAID_DEBUG(1, "Intel serial '%s' not found", serial); + goto fail1; + } + if (meta->disk[disk_pos].sectors != + (pp->mediasize / pp->sectorsize)) { + G_RAID_DEBUG(1, "Intel size mismatch %u != %u", + meta->disk[disk_pos].sectors, + (u_int)(pp->mediasize / pp->sectorsize)); + goto fail1; + } + + /* Metadata valid. Print it. */ + g_raid_md_intel_print(meta); + G_RAID_DEBUG(1, "Intel disk position %d", disk_pos); + spare = meta->disk[disk_pos].flags & INTEL_F_SPARE; + +search: + /* Search for matching node. */ + sc = NULL; + mdi1 = NULL; + LIST_FOREACH(geom, &mp->geom, geom) { + sc = geom->softc; + if (sc == NULL) + continue; + if (sc->sc_stopping != 0) + continue; + if (sc->sc_md->mdo_class != md->mdo_class) + continue; + mdi1 = (struct g_raid_md_intel_object *)sc->sc_md; + if (spare) { + if (mdi1->mdio_incomplete) + break; + } else { + if (mdi1->mdio_config_id == meta->config_id) + break; + } + } + + /* Found matching node. */ + if (geom != NULL) { + G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); + result = G_RAID_MD_TASTE_EXISTING; + + } else if (spare) { /* Not found needy node -- left for later. */ + G_RAID_DEBUG(1, "Spare is not needed at this time"); + goto fail1; + + } else { /* Not found matching node -- create one. 
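
One detail of g_raid_md_intel_new_disk() above worth calling out: the freshness test casts the unsigned difference of generation counters to int32_t, which is serial-number arithmetic and stays correct even if the counter ever wraps around. A minimal restatement, not part of the patch:

static int
intel_generation_newer_sketch(uint32_t a, uint32_t b)
{
    /* True when a is ahead of b, modulo 2^32. */
    return ((int32_t)(a - b) > 0);
}
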
*/ + result = G_RAID_MD_TASTE_NEW; + mdi->mdio_config_id = meta->config_id; + snprintf(name, sizeof(name), "Intel-%08x", meta->config_id); + sc = g_raid_create_node(mp, name, md); + md->mdo_softc = sc; + geom = sc->sc_geom; + callout_init(&mdi->mdio_start_co, 1); + callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, + g_raid_intel_go, sc); + mdi->mdio_rootmount = root_mount_hold("GRAID-Intel"); + G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount); + } + + rcp = g_new_consumer(geom); + g_attach(rcp, pp); + if (g_access(rcp, 1, 1, 1) != 0) + ; //goto fail1; + + g_topology_unlock(); + sx_xlock(&sc->sc_lock); + + pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); + pd->pd_meta = meta; + pd->pd_disk_pos = -1; + if (spare == 2) { + memcpy(&pd->pd_disk_meta.serial[0], serial, INTEL_SERIAL_LEN); + pd->pd_disk_meta.sectors = pp->mediasize / pp->sectorsize; + pd->pd_disk_meta.id = 0; + pd->pd_disk_meta.flags = INTEL_F_SPARE; + } else { + pd->pd_disk_meta = meta->disk[disk_pos]; + } + disk = g_raid_create_disk(sc); + disk->d_md_data = (void *)pd; + disk->d_consumer = rcp; + rcp->private = disk; + + /* Read kernel dumping information. */ + disk->d_kd.offset = 0; + disk->d_kd.length = OFF_MAX; + len = sizeof(disk->d_kd); + error = g_io_getattr("GEOM::kerneldump", rcp, &len, &disk->d_kd); + if (disk->d_kd.di.dumper == NULL) + G_RAID_DEBUG1(2, sc, "Dumping not supported by %s: %d.", + rcp->provider->name, error); + + g_raid_md_intel_new_disk(disk); + + sx_xunlock(&sc->sc_lock); + g_topology_lock(); + *gp = geom; + return (result); +fail2: + g_topology_lock(); + g_access(cp, -1, 0, 0); +fail1: + free(meta, M_MD_INTEL); + return (G_RAID_MD_TASTE_FAIL); +} + +static int +g_raid_md_event_intel(struct g_raid_md_object *md, + struct g_raid_disk *disk, u_int event) +{ + struct g_raid_softc *sc; + struct g_raid_subdisk *sd; + struct g_raid_md_intel_object *mdi; + struct g_raid_md_intel_perdisk *pd; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_intel_object *)md; + if (disk == NULL) { + switch (event) { + case G_RAID_NODE_E_START: + if (!mdi->mdio_started) + g_raid_md_intel_start(sc); + return (0); + } + return (-1); + } + pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; + switch (event) { + case G_RAID_DISK_E_DISCONNECTED: + /* If disk was assigned, just update statuses. */ + if (pd->pd_disk_pos >= 0) { + g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); + if (disk->d_consumer) { + g_raid_kill_consumer(sc, disk->d_consumer); + disk->d_consumer = NULL; + } + TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NONE); + g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, + G_RAID_EVENT_SUBDISK); + } + } else { + /* Otherwise -- delete. */ + g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); + g_raid_destroy_disk(disk); + } + + /* Write updated metadata to all disks. */ + g_raid_md_write_intel(md, NULL, NULL, NULL); + + /* Check if anything left except placeholders. 
*/ + if (g_raid_ndisks(sc, -1) == + g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) + g_raid_destroy_node(sc, 0); + else + g_raid_md_intel_refill(sc); + return (0); + } + return (-2); +} + +static int +g_raid_md_ctl_intel(struct g_raid_md_object *md, + struct gctl_req *req) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol, *vol1; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; + struct g_raid_md_intel_object *mdi; + struct g_raid_md_intel_perdisk *pd; + struct g_consumer *cp; + struct g_provider *pp; + char arg[16], serial[INTEL_SERIAL_LEN]; + const char *verb, *volname, *levelname, *diskname; + char *tmp; + int *nargs, *force; + off_t off, size, sectorsize, strip; + intmax_t *sizearg, *striparg; + int numdisks, i, len, level, qual, update; + int error; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_intel_object *)md; + verb = gctl_get_param(req, "verb", NULL); + nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); + error = 0; + if (strcmp(verb, "label") == 0) { + + if (*nargs < 4) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + volname = gctl_get_asciiparam(req, "arg1"); + if (volname == NULL) { + gctl_error(req, "No volume name."); + return (-2); + } + levelname = gctl_get_asciiparam(req, "arg2"); + if (levelname == NULL) { + gctl_error(req, "No RAID level."); + return (-3); + } + if (g_raid_volume_str2level(levelname, &level, &qual)) { + gctl_error(req, "Unknown RAID level '%s'.", levelname); + return (-4); + } + numdisks = *nargs - 3; + force = gctl_get_paraml(req, "force", sizeof(*force)); + if (!g_raid_md_intel_supported(level, qual, numdisks, + force ? *force : 0)) { + gctl_error(req, "Unsupported RAID level " + "(0x%02x/0x%02x), or number of disks (%d).", + level, qual, numdisks); + return (-5); + } + + /* Search for disks, connect them and probe. */ + size = 0x7fffffffffffffffllu; + sectorsize = 0; + for (i = 0; i < numdisks; i++) { + snprintf(arg, sizeof(arg), "arg%d", i + 3); + diskname = gctl_get_asciiparam(req, arg); + if (diskname == NULL) { + gctl_error(req, "No disk name (%s).", arg); + error = -6; + break; + } + if (strcmp(diskname, "NONE") == 0) { + cp = NULL; + pp = NULL; + } else { + g_topology_lock(); + cp = g_raid_open_consumer(sc, diskname); + if (cp == NULL) { + gctl_error(req, "Can't open disk '%s'.", + diskname); + g_topology_unlock(); + error = -4; + break; + } + pp = cp->provider; + } + pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); + pd->pd_disk_pos = i; + disk = g_raid_create_disk(sc); + disk->d_md_data = (void *)pd; + disk->d_consumer = cp; + if (cp == NULL) { + strcpy(&pd->pd_disk_meta.serial[0], "NONE"); + pd->pd_disk_meta.id = 0xffffffff; + pd->pd_disk_meta.flags = INTEL_F_ASSIGNED; + continue; + } + cp->private = disk; + g_topology_unlock(); + + error = g_raid_md_get_label(cp, + &pd->pd_disk_meta.serial[0], INTEL_SERIAL_LEN); + if (error != 0) { + gctl_error(req, + "Can't get serial for provider '%s'.", + diskname); + error = -8; + break; + } + + /* Read kernel dumping information. 
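
The "label" verb above consumes a gctl request in which arg1 is the volume name, arg2 the RAID level, and arg3 onward the providers, with optional "size" and "strip" parameters handled further down. A hypothetical invocation from userland (device names illustrative) would be:

    graid label Intel data RAID1 ada0 ada1

which arrives here with nargs = 5, volname = "data", levelname = "RAID1" and numdisks = nargs - 3 = 2.
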
*/ + disk->d_kd.offset = 0; + disk->d_kd.length = OFF_MAX; + len = sizeof(disk->d_kd); + g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); + if (disk->d_kd.di.dumper == NULL) + G_RAID_DEBUG1(2, sc, + "Dumping not supported by %s.", + cp->provider->name); + + pd->pd_disk_meta.sectors = pp->mediasize / pp->sectorsize; + if (size > pp->mediasize) + size = pp->mediasize; + if (sectorsize < pp->sectorsize) + sectorsize = pp->sectorsize; + pd->pd_disk_meta.id = 0; + pd->pd_disk_meta.flags = INTEL_F_ASSIGNED | INTEL_F_ONLINE; + } + if (error != 0) + return (error); + + /* Reserve some space for metadata. */ + size -= ((4096 + sectorsize - 1) / sectorsize) * sectorsize; + + /* Handle size argument. */ + len = sizeof(*sizearg); + sizearg = gctl_get_param(req, "size", &len); + if (sizearg != NULL && len == sizeof(*sizearg) && + *sizearg > 0) { + if (*sizearg > size) { + gctl_error(req, "Size too big %lld > %lld.", + (long long)*sizearg, (long long)size); + return (-9); + } + size = *sizearg; + } + + /* Handle strip argument. */ + strip = 131072; + len = sizeof(*striparg); + striparg = gctl_get_param(req, "strip", &len); + if (striparg != NULL && len == sizeof(*striparg) && + *striparg > 0) { + if (*striparg < sectorsize) { + gctl_error(req, "Strip size too small."); + return (-10); + } + if (*striparg % sectorsize != 0) { + gctl_error(req, "Incorrect strip size."); + return (-11); + } + if (strip > 65535 * sectorsize) { + gctl_error(req, "Strip size too big."); + return (-12); + } + strip = *striparg; + } + + /* Round size down to strip or sector. */ + if (level == G_RAID_VOLUME_RL_RAID1) + size -= (size % sectorsize); + else if (level == G_RAID_VOLUME_RL_RAID1E && + (numdisks & 1) != 0) + size -= (size % (2 * strip)); + else + size -= (size % strip); + if (size <= 0) { + gctl_error(req, "Size too small."); + return (-13); + } + if (size > 0xffffffffllu * sectorsize) { + gctl_error(req, "Size too big."); + return (-14); + } + + /* We have all we need, create things: volume, ... */ + mdi->mdio_started = 1; + vol = g_raid_create_volume(sc, volname, -1); + vol->v_md_data = (void *)(intptr_t)0; + vol->v_raid_level = level; + vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; + vol->v_strip_size = strip; + vol->v_disks_count = numdisks; + if (level == G_RAID_VOLUME_RL_RAID0) + vol->v_mediasize = size * numdisks; + else if (level == G_RAID_VOLUME_RL_RAID1) + vol->v_mediasize = size; + else if (level == G_RAID_VOLUME_RL_RAID5) + vol->v_mediasize = size * (numdisks - 1); + else { /* RAID1E */ + vol->v_mediasize = ((size * numdisks) / strip / 2) * + strip; + } + vol->v_sectorsize = sectorsize; + g_raid_start_volume(vol); + + /* , and subdisks. */ + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; + sd = &vol->v_subdisks[pd->pd_disk_pos]; + sd->sd_disk = disk; + sd->sd_offset = 0; + sd->sd_size = size; + TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); + if (sd->sd_disk->d_consumer != NULL) { + g_raid_change_disk_state(disk, + G_RAID_DISK_S_ACTIVE); + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_ACTIVE); + g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, + G_RAID_EVENT_SUBDISK); + } else { + g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); + } + } + + /* Write metadata based on created entities. */ + G_RAID_DEBUG1(0, sc, "Array started."); + g_raid_md_write_intel(md, NULL, NULL, NULL); + + /* Pickup any STALE/SPARE disks to refill array if needed. 
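
To make the mediasize arithmetic above concrete, take hypothetical numbers: four disks with 500 GiB usable each after the metadata reservation, and a 128 KiB strip. RAID0 then exports size * numdisks = 2000 GiB, RAID5 size * (numdisks - 1) = 1500 GiB, RAID1 just size, and RAID1E ((size * numdisks) / strip / 2) * strip, i.e. half of the raw capacity rounded down to a whole strip, here 1000 GiB. The rounding performed just before guarantees these divisions come out even in the common cases.
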
*/ + g_raid_md_intel_refill(sc); + + g_raid_event_send(vol, G_RAID_VOLUME_E_START, + G_RAID_EVENT_VOLUME); + return (0); + } + if (strcmp(verb, "add") == 0) { + + if (*nargs != 3) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + volname = gctl_get_asciiparam(req, "arg1"); + if (volname == NULL) { + gctl_error(req, "No volume name."); + return (-2); + } + levelname = gctl_get_asciiparam(req, "arg2"); + if (levelname == NULL) { + gctl_error(req, "No RAID level."); + return (-3); + } + if (g_raid_volume_str2level(levelname, &level, &qual)) { + gctl_error(req, "Unknown RAID level '%s'.", levelname); + return (-4); + } + + /* Look for existing volumes. */ + i = 0; + vol1 = NULL; + TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { + vol1 = vol; + i++; + } + if (i > 1) { + gctl_error(req, "Maximum two volumes supported."); + return (-6); + } + if (vol1 == NULL) { + gctl_error(req, "At least one volume must exist."); + return (-7); + } + + numdisks = vol1->v_disks_count; + force = gctl_get_paraml(req, "force", sizeof(*force)); + if (!g_raid_md_intel_supported(level, qual, numdisks, + force ? *force : 0)) { + gctl_error(req, "Unsupported RAID level " + "(0x%02x/0x%02x), or number of disks (%d).", + level, qual, numdisks); + return (-5); + } + + /* Collect info about present disks. */ + size = 0x7fffffffffffffffllu; + sectorsize = 512; + for (i = 0; i < numdisks; i++) { + disk = vol1->v_subdisks[i].sd_disk; + pd = (struct g_raid_md_intel_perdisk *) + disk->d_md_data; + if ((off_t)pd->pd_disk_meta.sectors * 512 < size) + size = (off_t)pd->pd_disk_meta.sectors * 512; + if (disk->d_consumer != NULL && + disk->d_consumer->provider != NULL && + disk->d_consumer->provider->sectorsize > + sectorsize) { + sectorsize = + disk->d_consumer->provider->sectorsize; + } + } + + /* Reserve some space for metadata. */ + size -= ((4096 + sectorsize - 1) / sectorsize) * sectorsize; + + /* Decide insert before or after. */ + sd = &vol1->v_subdisks[0]; + if (sd->sd_offset > + size - (sd->sd_offset + sd->sd_size)) { + off = 0; + size = sd->sd_offset; + } else { + off = sd->sd_offset + sd->sd_size; + size = size - (sd->sd_offset + sd->sd_size); + } + + /* Handle strip argument. */ + strip = 131072; + len = sizeof(*striparg); + striparg = gctl_get_param(req, "strip", &len); + if (striparg != NULL && len == sizeof(*striparg) && + *striparg > 0) { + if (*striparg < sectorsize) { + gctl_error(req, "Strip size too small."); + return (-10); + } + if (*striparg % sectorsize != 0) { + gctl_error(req, "Incorrect strip size."); + return (-11); + } + if (strip > 65535 * sectorsize) { + gctl_error(req, "Strip size too big."); + return (-12); + } + strip = *striparg; + } + + /* Round offset up to strip. */ + if (off % strip != 0) { + size -= strip - off % strip; + off += strip - off % strip; + } + + /* Handle size argument. */ + len = sizeof(*sizearg); + sizearg = gctl_get_param(req, "size", &len); + if (sizearg != NULL && len == sizeof(*sizearg) && + *sizearg > 0) { + if (*sizearg > size) { + gctl_error(req, "Size too big %lld > %lld.", + (long long)*sizearg, (long long)size); + return (-9); + } + size = *sizearg; + } + + /* Round size down to strip or sector. */ + if (level == G_RAID_VOLUME_RL_RAID1) + size -= (size % sectorsize); + else + size -= (size % strip); + if (size <= 0) { + gctl_error(req, "Size too small."); + return (-13); + } + if (size > 0xffffffffllu * sectorsize) { + gctl_error(req, "Size too big."); + return (-14); + } + + /* We have all we need, create things: volume, ... 
*/ + vol = g_raid_create_volume(sc, volname, -1); + vol->v_md_data = (void *)(intptr_t)i; + vol->v_raid_level = level; + vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; + vol->v_strip_size = strip; + vol->v_disks_count = numdisks; + if (level == G_RAID_VOLUME_RL_RAID0) + vol->v_mediasize = size * numdisks; + else if (level == G_RAID_VOLUME_RL_RAID1) + vol->v_mediasize = size; + else if (level == G_RAID_VOLUME_RL_RAID5) + vol->v_mediasize = size * (numdisks - 1); + else { /* RAID1E */ + vol->v_mediasize = ((size * numdisks) / strip / 2) * + strip; + } + vol->v_sectorsize = sectorsize; + g_raid_start_volume(vol); + + /* , and subdisks. */ + for (i = 0; i < numdisks; i++) { + disk = vol1->v_subdisks[i].sd_disk; + sd = &vol->v_subdisks[i]; + sd->sd_disk = disk; + sd->sd_offset = off; + sd->sd_size = size; + TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); + if (disk->d_state == G_RAID_DISK_S_ACTIVE) { + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_ACTIVE); + g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, + G_RAID_EVENT_SUBDISK); + } + } + + /* Write metadata based on created entities. */ + g_raid_md_write_intel(md, NULL, NULL, NULL); + + g_raid_event_send(vol, G_RAID_VOLUME_E_START, + G_RAID_EVENT_VOLUME); + return (0); + } + if (strcmp(verb, "delete") == 0) { + + /* Full node destruction. */ + if (*nargs == 1) { + /* Check if some volume is still open. */ + force = gctl_get_paraml(req, "force", sizeof(*force)); + if (force != NULL && *force == 0 && + g_raid_nopens(sc) != 0) { + gctl_error(req, "Some volume is still open."); + return (-4); + } + + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_consumer) + intel_meta_erase(disk->d_consumer); + } + g_raid_destroy_node(sc, 0); + return (0); + } + + /* Destroy specified volume. If it was last - all node. */ + if (*nargs != 2) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + volname = gctl_get_asciiparam(req, "arg1"); + if (volname == NULL) { + gctl_error(req, "No volume name."); + return (-2); + } + + /* Search for volume. */ + TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { + if (strcmp(vol->v_name, volname) == 0) + break; + } + if (vol == NULL) { + i = strtol(volname, &tmp, 10); + if (verb != volname && tmp[0] == 0) { + TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { + if (vol->v_global_id == i) + break; + } + } + } + if (vol == NULL) { + gctl_error(req, "Volume '%s' not found.", volname); + return (-3); + } + + /* Check if volume is still open. */ + force = gctl_get_paraml(req, "force", sizeof(*force)); + if (force != NULL && *force == 0 && + vol->v_provider_open != 0) { + gctl_error(req, "Volume is still open."); + return (-4); + } + + /* Destroy volume and potentially node. 
*/ + i = 0; + TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next) + i++; + if (i >= 2) { + g_raid_destroy_volume(vol); + g_raid_md_write_intel(md, NULL, NULL, NULL); + } else { + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_consumer) + intel_meta_erase(disk->d_consumer); + } + g_raid_destroy_node(sc, 0); + } + return (0); + } + if (strcmp(verb, "remove") == 0 || + strcmp(verb, "fail") == 0) { + if (*nargs < 2) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + for (i = 1; i < *nargs; i++) { + snprintf(arg, sizeof(arg), "arg%d", i); + diskname = gctl_get_asciiparam(req, arg); + if (diskname == NULL) { + gctl_error(req, "No disk name (%s).", arg); + error = -2; + break; + } + if (strncmp(diskname, "/dev/", 5) == 0) + diskname += 5; + + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_consumer != NULL && + disk->d_consumer->provider != NULL && + strcmp(disk->d_consumer->provider->name, + diskname) == 0) + break; + } + if (disk == NULL) { + gctl_error(req, "Disk '%s' not found.", + diskname); + error = -3; + break; + } + + if (strcmp(verb, "fail") == 0) { + g_raid_md_fail_disk_intel(md, NULL, disk); + continue; + } + + pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; + + /* Erase metadata on deleting disk. */ + intel_meta_erase(disk->d_consumer); + + /* If disk was assigned, just update statuses. */ + if (pd->pd_disk_pos >= 0) { + g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); + if (disk->d_consumer) { + g_raid_kill_consumer(sc, disk->d_consumer); + disk->d_consumer = NULL; + } + TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NONE); + g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, + G_RAID_EVENT_SUBDISK); + } + } else { + /* Otherwise -- delete. */ + g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); + g_raid_destroy_disk(disk); + } + } + + /* Write updated metadata to remaining disks. */ + g_raid_md_write_intel(md, NULL, NULL, NULL); + + /* Check if anything left except placeholders. */ + if (g_raid_ndisks(sc, -1) == + g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) + g_raid_destroy_node(sc, 0); + else + g_raid_md_intel_refill(sc); + return (error); + } + if (strcmp(verb, "insert") == 0) { + if (*nargs < 2) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + update = 0; + for (i = 1; i < *nargs; i++) { + /* Get disk name. */ + snprintf(arg, sizeof(arg), "arg%d", i); + diskname = gctl_get_asciiparam(req, arg); + if (diskname == NULL) { + gctl_error(req, "No disk name (%s).", arg); + error = -3; + break; + } + + /* Try to find provider with specified name. */ + g_topology_lock(); + cp = g_raid_open_consumer(sc, diskname); + if (cp == NULL) { + gctl_error(req, "Can't open disk '%s'.", + diskname); + g_topology_unlock(); + error = -4; + break; + } + pp = cp->provider; + g_topology_unlock(); + + /* Read disk serial. */ + error = g_raid_md_get_label(cp, + &serial[0], INTEL_SERIAL_LEN); + if (error != 0) { + gctl_error(req, + "Can't get serial for provider '%s'.", + diskname); + g_raid_kill_consumer(sc, cp); + error = -7; + break; + } + + pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); + pd->pd_disk_pos = -1; + + disk = g_raid_create_disk(sc); + disk->d_consumer = cp; + disk->d_consumer->private = disk; + disk->d_md_data = (void *)pd; + cp->private = disk; + + /* Read kernel dumping information. 
*/ + disk->d_kd.offset = 0; + disk->d_kd.length = OFF_MAX; + len = sizeof(disk->d_kd); + g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); + if (disk->d_kd.di.dumper == NULL) + G_RAID_DEBUG1(2, sc, + "Dumping not supported by %s.", + cp->provider->name); + + memcpy(&pd->pd_disk_meta.serial[0], &serial[0], + INTEL_SERIAL_LEN); + pd->pd_disk_meta.sectors = pp->mediasize / pp->sectorsize; + pd->pd_disk_meta.id = 0; + pd->pd_disk_meta.flags = INTEL_F_SPARE; + + /* Welcome the "new" disk. */ + update += g_raid_md_intel_start_disk(disk); + if (disk->d_state == G_RAID_DISK_S_SPARE) { + intel_meta_write_spare(cp, &pd->pd_disk_meta); + g_raid_destroy_disk(disk); + } else if (disk->d_state != G_RAID_DISK_S_ACTIVE) { + gctl_error(req, "Disk '%s' doesn't fit.", + diskname); + g_raid_destroy_disk(disk); + error = -8; + break; + } + } + + /* Write new metadata if we changed something. */ + if (update) + g_raid_md_write_intel(md, NULL, NULL, NULL); + return (error); + } + return (-100); +} + +static int +g_raid_md_write_intel(struct g_raid_md_object *md, struct g_raid_volume *tvol, + struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; + struct g_raid_md_intel_object *mdi; + struct g_raid_md_intel_perdisk *pd; + struct intel_raid_conf *meta; + struct intel_raid_vol *mvol; + struct intel_raid_map *mmap0, *mmap1; + off_t sectorsize = 512, pos; + const char *version, *cv; + int vi, sdi, numdisks, len, state, stale; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_intel_object *)md; + + if (sc->sc_stopping == G_RAID_DESTROY_HARD) + return (0); + + /* Bump generation. Newly written metadata may differ from previous. */ + mdi->mdio_generation++; + + /* Count number of disks. */ + numdisks = 0; + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; + if (pd->pd_disk_pos < 0) + continue; + numdisks++; + if (disk->d_state == G_RAID_DISK_S_ACTIVE) { + pd->pd_disk_meta.flags = + INTEL_F_ONLINE | INTEL_F_ASSIGNED; + } else if (disk->d_state == G_RAID_DISK_S_FAILED) { + pd->pd_disk_meta.flags = INTEL_F_FAILED | INTEL_F_ASSIGNED; + } else { + pd->pd_disk_meta.flags = INTEL_F_ASSIGNED; + if (pd->pd_disk_meta.id != 0xffffffff) { + pd->pd_disk_meta.id = 0xffffffff; + len = strlen(pd->pd_disk_meta.serial); + len = min(len, INTEL_SERIAL_LEN - 3); + strcpy(pd->pd_disk_meta.serial + len, ":0"); + } + } + } + + /* Fill anchor and disks. */ + meta = malloc(INTEL_MAX_MD_SIZE(numdisks), + M_MD_INTEL, M_WAITOK | M_ZERO); + memcpy(&meta->intel_id[0], INTEL_MAGIC, sizeof(INTEL_MAGIC)); + meta->config_size = INTEL_MAX_MD_SIZE(numdisks); + meta->config_id = mdi->mdio_config_id; + meta->generation = mdi->mdio_generation; + meta->attributes = INTEL_ATTR_CHECKSUM; + meta->total_disks = numdisks; + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; + if (pd->pd_disk_pos < 0) + continue; + meta->disk[pd->pd_disk_pos] = pd->pd_disk_meta; + } + + /* Fill volumes and maps. */ + vi = 0; + version = INTEL_VERSION_1000; + TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { + if (vol->v_stopping) + continue; + mvol = intel_get_volume(meta, vi); + + /* New metadata may have different volumes order. 
*/ + vol->v_md_data = (void *)(intptr_t)vi; + + for (sdi = 0; sdi < vol->v_disks_count; sdi++) { + sd = &vol->v_subdisks[sdi]; + if (sd->sd_disk != NULL) + break; + } + if (sdi >= vol->v_disks_count) + panic("No any filled subdisk in volume"); + if (vol->v_mediasize >= 0x20000000000llu) + meta->attributes |= INTEL_ATTR_2TB; + if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) + meta->attributes |= INTEL_ATTR_RAID0; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) + meta->attributes |= INTEL_ATTR_RAID1; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) + meta->attributes |= INTEL_ATTR_RAID5; + else + meta->attributes |= INTEL_ATTR_RAID10; + + if (meta->attributes & INTEL_ATTR_2TB) + cv = INTEL_VERSION_1300; +// else if (dev->status == DEV_CLONE_N_GO) +// cv = INTEL_VERSION_1206; + else if (vol->v_disks_count > 4) + cv = INTEL_VERSION_1204; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) + cv = INTEL_VERSION_1202; + else if (vol->v_disks_count > 2) + cv = INTEL_VERSION_1201; + else if (vi > 0) + cv = INTEL_VERSION_1200; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) + cv = INTEL_VERSION_1100; + else + cv = INTEL_VERSION_1000; + if (strcmp(cv, version) > 0) + version = cv; + + strlcpy(&mvol->name[0], vol->v_name, sizeof(mvol->name)); + mvol->total_sectors = vol->v_mediasize / sectorsize; + + /* Check for any recovery in progress. */ + state = G_RAID_SUBDISK_S_ACTIVE; + pos = 0x7fffffffffffffffllu; + stale = 0; + for (sdi = 0; sdi < vol->v_disks_count; sdi++) { + sd = &vol->v_subdisks[sdi]; + if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) + state = G_RAID_SUBDISK_S_REBUILD; + else if (sd->sd_state == G_RAID_SUBDISK_S_RESYNC && + state != G_RAID_SUBDISK_S_REBUILD) + state = G_RAID_SUBDISK_S_RESYNC; + else if (sd->sd_state == G_RAID_SUBDISK_S_STALE) + stale = 1; + if ((sd->sd_state == G_RAID_SUBDISK_S_REBUILD || + sd->sd_state == G_RAID_SUBDISK_S_RESYNC) && + sd->sd_rebuild_pos < pos) + pos = sd->sd_rebuild_pos; + } + if (state == G_RAID_SUBDISK_S_REBUILD) { + mvol->migr_state = 1; + mvol->migr_type = INTEL_MT_REBUILD; + } else if (state == G_RAID_SUBDISK_S_RESYNC) { + mvol->migr_state = 1; + /* mvol->migr_type = INTEL_MT_REPAIR; */ + mvol->migr_type = INTEL_MT_VERIFY; + mvol->state |= INTEL_ST_VERIFY_AND_FIX; + } else + mvol->migr_state = 0; + mvol->dirty = (vol->v_dirty || stale); + + mmap0 = intel_get_map(mvol, 0); + + /* Write map / common part of two maps. */ + mmap0->offset = sd->sd_offset / sectorsize; + mmap0->disk_sectors = sd->sd_size / sectorsize; + mmap0->strip_sectors = vol->v_strip_size / sectorsize; + if (vol->v_state == G_RAID_VOLUME_S_BROKEN) + mmap0->status = INTEL_S_FAILURE; + else if (vol->v_state == G_RAID_VOLUME_S_DEGRADED) + mmap0->status = INTEL_S_DEGRADED; + else + mmap0->status = INTEL_S_READY; + if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) + mmap0->type = INTEL_T_RAID0; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || + vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) + mmap0->type = INTEL_T_RAID1; + else + mmap0->type = INTEL_T_RAID5; + mmap0->total_disks = vol->v_disks_count; + if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) + mmap0->total_domains = vol->v_disks_count; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) + mmap0->total_domains = 2; + else + mmap0->total_domains = 1; + mmap0->stripe_count = sd->sd_size / vol->v_strip_size / + mmap0->total_domains; + mmap0->failed_disk_num = 0xff; + mmap0->ddf = 1; + + /* If there are two maps - copy common and update. 
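
The dual-map scheme serialized next: while a volume is migrating, map 0 is copied to map 1 and then forced to INTEL_S_READY, so map 0 describes the target state while map 1 keeps the in-migration status and rebuild flags. The checkpoint is stored coarsely as curr_migr_unit = pos / v_strip_size / total_domains, where pos is the minimum rebuild position across the recovering subdisks; this is the exact inverse of the sd_rebuild_pos expansion used when the array is started, so a restart may repeat a little work but never skips any.
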
*/
+ if (mvol->migr_state) {
+ mvol->curr_migr_unit = pos /
+ vol->v_strip_size / mmap0->total_domains;
+ mmap1 = intel_get_map(mvol, 1);
+ memcpy(mmap1, mmap0, sizeof(struct intel_raid_map));
+ mmap0->status = INTEL_S_READY;
+ } else
+ mmap1 = NULL;
+
+ /* Write disk indexes and put rebuild flags. */
+ for (sdi = 0; sdi < vol->v_disks_count; sdi++) {
+ sd = &vol->v_subdisks[sdi];
+ pd = (struct g_raid_md_intel_perdisk *)
+ sd->sd_disk->d_md_data;
+ mmap0->disk_idx[sdi] = pd->pd_disk_pos;
+ if (mvol->migr_state)
+ mmap1->disk_idx[sdi] = pd->pd_disk_pos;
+ if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
+ sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
+ mmap1->disk_idx[sdi] |= INTEL_DI_RBLD;
+ } else if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE &&
+ sd->sd_state != G_RAID_SUBDISK_S_STALE) {
+ mmap0->disk_idx[sdi] |= INTEL_DI_RBLD;
+ if (mvol->migr_state)
+ mmap1->disk_idx[sdi] |= INTEL_DI_RBLD;
+ }
+ if ((sd->sd_state == G_RAID_SUBDISK_S_NONE ||
+ sd->sd_state == G_RAID_SUBDISK_S_FAILED) &&
+ mmap0->failed_disk_num == 0xff) {
+ mmap0->failed_disk_num = sdi;
+ if (mvol->migr_state)
+ mmap1->failed_disk_num = sdi;
+ }
+ }
+ vi++;
+ }
+ meta->total_volumes = vi;
+ if (strcmp(version, INTEL_VERSION_1300) != 0)
+ meta->attributes &= INTEL_ATTR_CHECKSUM;
+ memcpy(&meta->version[0], version, sizeof(INTEL_VERSION_1000));
+
+ /* We are done. Print the metadata and store it to the disks. */
+ g_raid_md_intel_print(meta);
+ if (mdi->mdio_meta != NULL)
+ free(mdi->mdio_meta, M_MD_INTEL);
+ mdi->mdio_meta = meta;
+ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
+ pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
+ if (disk->d_state != G_RAID_DISK_S_ACTIVE)
+ continue;
+ if (pd->pd_meta != NULL) {
+ free(pd->pd_meta, M_MD_INTEL);
+ pd->pd_meta = NULL;
+ }
+ pd->pd_meta = intel_meta_copy(meta);
+ intel_meta_write(disk->d_consumer, meta);
+ }
+ return (0);
+}
+
+static int
+g_raid_md_fail_disk_intel(struct g_raid_md_object *md,
+ struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
+{
+ struct g_raid_softc *sc;
+ struct g_raid_md_intel_object *mdi;
+ struct g_raid_md_intel_perdisk *pd;
+ struct g_raid_subdisk *sd;
+
+ sc = md->mdo_softc;
+ mdi = (struct g_raid_md_intel_object *)md;
+ pd = (struct g_raid_md_intel_perdisk *)tdisk->d_md_data;
+
+ /* We can't fail a disk that is not part of the array now. */
+ if (pd->pd_disk_pos < 0)
+ return (-1);
+
+ /*
+ * Mark disk as failed in metadata and try to write that metadata
+ * to the disk itself to prevent its later resurrection as STALE.
+ */
+ mdi->mdio_meta->disk[pd->pd_disk_pos].flags = INTEL_F_FAILED;
+ pd->pd_disk_meta.flags = INTEL_F_FAILED;
+ g_raid_md_intel_print(mdi->mdio_meta);
+ if (tdisk->d_consumer)
+ intel_meta_write(tdisk->d_consumer, mdi->mdio_meta);
+
+ /* Change states. */
+ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED);
+ TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) {
+ g_raid_change_subdisk_state(sd,
+ G_RAID_SUBDISK_S_FAILED);
+ g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED,
+ G_RAID_EVENT_SUBDISK);
+ }
+
+ /* Write updated metadata to remaining disks. */
+ g_raid_md_write_intel(md, NULL, NULL, tdisk);
+
+ /* Check if anything left except placeholders.
*/ + if (g_raid_ndisks(sc, -1) == + g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) + g_raid_destroy_node(sc, 0); + else + g_raid_md_intel_refill(sc); + return (0); +} + +static int +g_raid_md_free_disk_intel(struct g_raid_md_object *md, + struct g_raid_disk *disk) +{ + struct g_raid_md_intel_perdisk *pd; + + pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; + if (pd->pd_meta != NULL) { + free(pd->pd_meta, M_MD_INTEL); + pd->pd_meta = NULL; + } + free(pd, M_MD_INTEL); + disk->d_md_data = NULL; + return (0); +} + +static int +g_raid_md_free_intel(struct g_raid_md_object *md) +{ + struct g_raid_md_intel_object *mdi; + + mdi = (struct g_raid_md_intel_object *)md; + if (!mdi->mdio_started) { + mdi->mdio_started = 0; + callout_stop(&mdi->mdio_start_co); + G_RAID_DEBUG1(1, md->mdo_softc, + "root_mount_rel %p", mdi->mdio_rootmount); + root_mount_rel(mdi->mdio_rootmount); + mdi->mdio_rootmount = NULL; + } + if (mdi->mdio_meta != NULL) { + free(mdi->mdio_meta, M_MD_INTEL); + mdi->mdio_meta = NULL; + } + return (0); +} + +G_RAID_MD_DECLARE(g_raid_md_intel); diff --git a/sys/geom/raid/md_jmicron.c b/sys/geom/raid/md_jmicron.c new file mode 100644 index 0000000..a56c543 --- /dev/null +++ b/sys/geom/raid/md_jmicron.c @@ -0,0 +1,1582 @@ +/*- + * Copyright (c) 2010 Alexander Motin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "geom/raid/g_raid.h" +#include "g_raid_md_if.h" + +static MALLOC_DEFINE(M_MD_JMICRON, "md_jmicron_data", "GEOM_RAID JMicron metadata"); + +#define JMICRON_MAX_DISKS 8 +#define JMICRON_MAX_SPARE 2 + +struct jmicron_raid_conf { + u_int8_t signature[2]; +#define JMICRON_MAGIC "JM" + + u_int16_t version; +#define JMICRON_VERSION 0x0001 + + u_int16_t checksum; + u_int8_t filler_1[10]; + u_int32_t disk_id; + u_int32_t offset; + u_int32_t disk_sectors_high; + u_int16_t disk_sectors_low; + u_int8_t filler_2[2]; + u_int8_t name[16]; + u_int8_t type; +#define JMICRON_T_RAID0 0 +#define JMICRON_T_RAID1 1 +#define JMICRON_T_RAID01 2 +#define JMICRON_T_CONCAT 3 +#define JMICRON_T_RAID5 5 + + u_int8_t stripe_shift; + u_int16_t flags; +#define JMICRON_F_READY 0x0001 +#define JMICRON_F_BOOTABLE 0x0002 +#define JMICRON_F_BADSEC 0x0004 +#define JMICRON_F_ACTIVE 0x0010 +#define JMICRON_F_UNSYNC 0x0020 +#define JMICRON_F_NEWEST 0x0040 + + u_int8_t filler_3[4]; + u_int32_t spare[JMICRON_MAX_SPARE]; + u_int32_t disks[JMICRON_MAX_DISKS]; +#define JMICRON_DISK_MASK 0xFFFFFFF0 +#define JMICRON_SEG_MASK 0x0000000F + u_int8_t filler_4[32]; + u_int8_t filler_5[384]; +}; + +struct g_raid_md_jmicron_perdisk { + struct jmicron_raid_conf *pd_meta; + int pd_disk_pos; + int pd_disk_id; + off_t pd_disk_size; +}; + +struct g_raid_md_jmicron_object { + struct g_raid_md_object mdio_base; + uint32_t mdio_config_id; + struct jmicron_raid_conf *mdio_meta; + struct callout mdio_start_co; /* STARTING state timer. */ + int mdio_total_disks; + int mdio_disks_present; + int mdio_started; + int mdio_incomplete; + struct root_hold_token *mdio_rootmount; /* Root mount delay token. 
*/ +}; + +static g_raid_md_create_t g_raid_md_create_jmicron; +static g_raid_md_taste_t g_raid_md_taste_jmicron; +static g_raid_md_event_t g_raid_md_event_jmicron; +static g_raid_md_ctl_t g_raid_md_ctl_jmicron; +static g_raid_md_write_t g_raid_md_write_jmicron; +static g_raid_md_fail_disk_t g_raid_md_fail_disk_jmicron; +static g_raid_md_free_disk_t g_raid_md_free_disk_jmicron; +static g_raid_md_free_t g_raid_md_free_jmicron; + +static kobj_method_t g_raid_md_jmicron_methods[] = { + KOBJMETHOD(g_raid_md_create, g_raid_md_create_jmicron), + KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_jmicron), + KOBJMETHOD(g_raid_md_event, g_raid_md_event_jmicron), + KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_jmicron), + KOBJMETHOD(g_raid_md_write, g_raid_md_write_jmicron), + KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_jmicron), + KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_jmicron), + KOBJMETHOD(g_raid_md_free, g_raid_md_free_jmicron), + { 0, 0 } +}; + +static struct g_raid_md_class g_raid_md_jmicron_class = { + "JMicron", + g_raid_md_jmicron_methods, + sizeof(struct g_raid_md_jmicron_object), + .mdc_priority = 100 +}; + +static void +g_raid_md_jmicron_print(struct jmicron_raid_conf *meta) +{ + int k; + + if (g_raid_debug < 1) + return; + + printf("********* ATA JMicron RAID Metadata *********\n"); + printf("signature <%c%c>\n", meta->signature[0], meta->signature[1]); + printf("version %04x\n", meta->version); + printf("checksum 0x%04x\n", meta->checksum); + printf("disk_id 0x%08x\n", meta->disk_id); + printf("offset 0x%08x\n", meta->offset); + printf("disk_sectors_high 0x%08x\n", meta->disk_sectors_high); + printf("disk_sectors_low 0x%04x\n", meta->disk_sectors_low); + printf("name <%.16s>\n", meta->name); + printf("type %d\n", meta->type); + printf("stripe_shift %d\n", meta->stripe_shift); + printf("flags %04x\n", meta->flags); + printf("spare "); + for (k = 0; k < JMICRON_MAX_SPARE; k++) + printf(" 0x%08x", meta->spare[k]); + printf("\n"); + printf("disks "); + for (k = 0; k < JMICRON_MAX_DISKS; k++) + printf(" 0x%08x", meta->disks[k]); + printf("\n"); + printf("=================================================\n"); +} + +static struct jmicron_raid_conf * +jmicron_meta_copy(struct jmicron_raid_conf *meta) +{ + struct jmicron_raid_conf *nmeta; + + nmeta = malloc(sizeof(*meta), M_MD_JMICRON, M_WAITOK); + memcpy(nmeta, meta, sizeof(*meta)); + return (nmeta); +} + +static int +jmicron_meta_total_disks(struct jmicron_raid_conf *meta) +{ + int pos; + + for (pos = 0; pos < JMICRON_MAX_DISKS; pos++) { + if (meta->disks[pos] == 0) + break; + } + return (pos); +} + +static int +jmicron_meta_total_spare(struct jmicron_raid_conf *meta) +{ + int pos, n; + + n = 0; + for (pos = 0; pos < JMICRON_MAX_SPARE; pos++) { + if (meta->spare[pos] != 0) + n++; + } + return (n); +} + +/* + * Generate fake Configuration ID based on disk IDs. + * Note: it will change after each disk set change. 
+ */ +static uint32_t +jmicron_meta_config_id(struct jmicron_raid_conf *meta) +{ + int pos; + uint32_t config_id; + + config_id = 0; + for (pos = 0; pos < JMICRON_MAX_DISKS; pos++) + config_id += meta->disks[pos] << pos; + return (config_id); +} + +static void +jmicron_meta_get_name(struct jmicron_raid_conf *meta, char *buf) +{ + int i; + + strncpy(buf, meta->name, 16); + buf[16] = 0; + for (i = 15; i >= 0; i--) { + if (buf[i] > 0x20) + break; + buf[i] = 0; + } +} + +static void +jmicron_meta_put_name(struct jmicron_raid_conf *meta, char *buf) +{ + + memset(meta->name, 0x20, 16); + memcpy(meta->name, buf, MIN(strlen(buf), 16)); +} + +static int +jmicron_meta_find_disk(struct jmicron_raid_conf *meta, uint32_t id) +{ + int pos; + + id &= JMICRON_DISK_MASK; + for (pos = 0; pos < JMICRON_MAX_DISKS; pos++) { + if ((meta->disks[pos] & JMICRON_DISK_MASK) == id) + return (pos); + } + for (pos = 0; pos < JMICRON_MAX_SPARE; pos++) { + if ((meta->spare[pos] & JMICRON_DISK_MASK) == id) + return (-3); + } + return (-1); +} + +static struct jmicron_raid_conf * +jmicron_meta_read(struct g_consumer *cp) +{ + struct g_provider *pp; + struct jmicron_raid_conf *meta; + char *buf; + int error, i; + uint16_t checksum, *ptr; + + pp = cp->provider; + + /* Read the anchor sector. */ + buf = g_read_data(cp, + pp->mediasize - pp->sectorsize, pp->sectorsize, &error); + if (buf == NULL) { + G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", + pp->name, error); + return (NULL); + } + meta = (struct jmicron_raid_conf *)buf; + + /* Check if this is an JMicron RAID struct */ + if (strncmp(meta->signature, JMICRON_MAGIC, strlen(JMICRON_MAGIC))) { + G_RAID_DEBUG(1, "JMicron signature check failed on %s", pp->name); + g_free(buf); + return (NULL); + } + meta = malloc(sizeof(*meta), M_MD_JMICRON, M_WAITOK); + memcpy(meta, buf, min(sizeof(*meta), pp->sectorsize)); + g_free(buf); + + /* Check metadata checksum. */ + for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i < 64; i++) + checksum += *ptr++; + if (checksum != 0) { + G_RAID_DEBUG(1, "JMicron checksum check failed on %s", pp->name); + free(meta, M_MD_JMICRON); + return (NULL); + } + + return (meta); +} + +static int +jmicron_meta_write(struct g_consumer *cp, struct jmicron_raid_conf *meta) +{ + struct g_provider *pp; + char *buf; + int error, i; + uint16_t checksum, *ptr; + + pp = cp->provider; + + /* Recalculate checksum for case if metadata were changed. */ + meta->checksum = 0; + for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i < 64; i++) + checksum += *ptr++; + meta->checksum -= checksum; + + /* Create and fill buffer. 
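+ * The buffer is zero-padded to a full sector, so the write
+ * below completely overwrites the anchor sector at the very
+ * end of the provider (mediasize - sectorsize).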
*/ + buf = malloc(pp->sectorsize, M_MD_JMICRON, M_WAITOK | M_ZERO); + memcpy(buf, meta, sizeof(*meta)); + + error = g_write_data(cp, + pp->mediasize - pp->sectorsize, buf, pp->sectorsize); + if (error != 0) { + G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", + pp->name, error); + } + + free(buf, M_MD_JMICRON); + return (error); +} + +static int +jmicron_meta_erase(struct g_consumer *cp) +{ + struct g_provider *pp; + char *buf; + int error; + + pp = cp->provider; + buf = malloc(pp->sectorsize, M_MD_JMICRON, M_WAITOK | M_ZERO); + error = g_write_data(cp, + pp->mediasize - pp->sectorsize, buf, pp->sectorsize); + if (error != 0) { + G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", + pp->name, error); + } + free(buf, M_MD_JMICRON); + return (error); +} + +static struct g_raid_disk * +g_raid_md_jmicron_get_disk(struct g_raid_softc *sc, int id) +{ + struct g_raid_disk *disk; + struct g_raid_md_jmicron_perdisk *pd; + + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; + if (pd->pd_disk_pos == id) + break; + } + return (disk); +} + +static int +g_raid_md_jmicron_supported(int level, int qual, int disks, int force) +{ + + if (disks > 8) + return (0); + switch (level) { + case G_RAID_VOLUME_RL_RAID0: + if (disks < 1) + return (0); + if (!force && (disks < 2 || disks > 6)) + return (0); + break; + case G_RAID_VOLUME_RL_RAID1: + if (disks < 1) + return (0); + if (!force && (disks != 2)) + return (0); + break; + case G_RAID_VOLUME_RL_RAID1E: + if (disks < 2) + return (0); + if (!force && (disks != 4)) + return (0); + break; + case G_RAID_VOLUME_RL_SINGLE: + if (disks != 1) + return (0); + if (!force) + return (0); + break; + case G_RAID_VOLUME_RL_CONCAT: + if (disks < 2) + return (0); + break; + case G_RAID_VOLUME_RL_RAID5: + if (disks < 3) + return (0); + if (!force) + return (0); + break; + default: + return (0); + } + if (qual != G_RAID_VOLUME_RLQ_NONE) + return (0); + return (1); +} + +static int +g_raid_md_jmicron_start_disk(struct g_raid_disk *disk) +{ + struct g_raid_softc *sc; + struct g_raid_subdisk *sd, *tmpsd; + struct g_raid_disk *olddisk, *tmpdisk; + struct g_raid_md_object *md; + struct g_raid_md_jmicron_object *mdi; + struct g_raid_md_jmicron_perdisk *pd, *oldpd; + struct jmicron_raid_conf *meta; + int disk_pos, resurrection = 0; + + sc = disk->d_softc; + md = sc->sc_md; + mdi = (struct g_raid_md_jmicron_object *)md; + meta = mdi->mdio_meta; + pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; + olddisk = NULL; + + /* Find disk position in metadata by it's serial. */ + disk_pos = jmicron_meta_find_disk(meta, pd->pd_disk_id); + if (disk_pos < 0) { + G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk"); + /* If we are in the start process, that's all for now. */ + if (!mdi->mdio_started) + goto nofit; + /* + * If we have already started - try to get use of the disk. + * Try to replace OFFLINE disks first, then FAILED. + */ + TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) { + if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE && + tmpdisk->d_state != G_RAID_DISK_S_FAILED) + continue; + /* Make sure this disk is big enough. 
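+ * Every subdisk, together with the trailing metadata
+ * sector, must fit into the new disk:
+ * sd_offset + sd_size + 512 <= pd_disk_size.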
*/ + TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) { + if (sd->sd_offset + sd->sd_size + 512 > + pd->pd_disk_size) { + G_RAID_DEBUG1(1, sc, + "Disk too small (%ju < %ju)", + pd->pd_disk_size, + sd->sd_offset + sd->sd_size + 512); + break; + } + } + if (sd != NULL) + continue; + if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) { + olddisk = tmpdisk; + break; + } else if (olddisk == NULL) + olddisk = tmpdisk; + } + if (olddisk == NULL) { +nofit: + if (disk_pos == -3 || pd->pd_disk_pos == -3) { + g_raid_change_disk_state(disk, + G_RAID_DISK_S_SPARE); + return (1); + } else { + g_raid_change_disk_state(disk, + G_RAID_DISK_S_STALE); + return (0); + } + } + oldpd = (struct g_raid_md_jmicron_perdisk *)olddisk->d_md_data; + disk_pos = oldpd->pd_disk_pos; + resurrection = 1; + } + + if (olddisk == NULL) { + /* Find placeholder by position. */ + olddisk = g_raid_md_jmicron_get_disk(sc, disk_pos); + if (olddisk == NULL) + panic("No disk at position %d!", disk_pos); + if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) { + G_RAID_DEBUG1(1, sc, "More then one disk for pos %d", + disk_pos); + g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); + return (0); + } + oldpd = (struct g_raid_md_jmicron_perdisk *)olddisk->d_md_data; + } + + /* Replace failed disk or placeholder with new disk. */ + TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) { + TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next); + TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); + sd->sd_disk = disk; + } + oldpd->pd_disk_pos = -2; + pd->pd_disk_pos = disk_pos; + /* Update global metadata just in case. */ + meta->disks[disk_pos] = pd->pd_disk_id; + + /* If it was placeholder -- destroy it. */ + if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) { + g_raid_destroy_disk(olddisk); + } else { + /* Otherwise, make it STALE_FAILED. */ + g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED); + } + + /* Welcome the new disk. */ + g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); + TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { + + /* + * Different disks may have different sizes/offsets, + * especially in concat mode. Update. + */ + if (pd->pd_meta != NULL && !resurrection) { + sd->sd_offset = + (off_t)pd->pd_meta->offset * 16 * 512; //ZZZ + sd->sd_size = + (((off_t)pd->pd_meta->disk_sectors_high << 16) + + pd->pd_meta->disk_sectors_low) * 512; + } + + if (resurrection) { + /* Stale disk, almost same as new. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NEW); + } else if ((meta->flags & JMICRON_F_BADSEC) != 0 && + (pd->pd_meta->flags & JMICRON_F_BADSEC) == 0) { + /* Cold-inserted or rebuilding disk. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NEW); + } else if (pd->pd_meta->flags & JMICRON_F_UNSYNC) { + /* Dirty or resyncing disk.. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_STALE); + } else { + /* Up to date disk. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_ACTIVE); + } + g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, + G_RAID_EVENT_SUBDISK); + } + + /* Update status of our need for spare. 
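+ * The array counts as incomplete while fewer than
+ * mdio_total_disks disks are ACTIVE; the refill code uses
+ * this flag to request a retaste in hope to find a spare.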
*/ + if (mdi->mdio_started) { + mdi->mdio_incomplete = + (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < + mdi->mdio_total_disks); + } + + return (resurrection); +} + +static void +g_disk_md_jmicron_retaste(void *arg, int pending) +{ + + G_RAID_DEBUG(1, "Array is not complete, trying to retaste."); + g_retaste(&g_raid_class); + free(arg, M_MD_JMICRON); +} + +static void +g_raid_md_jmicron_refill(struct g_raid_softc *sc) +{ + struct g_raid_md_object *md; + struct g_raid_md_jmicron_object *mdi; + struct jmicron_raid_conf *meta; + struct g_raid_disk *disk; + struct task *task; + int update, na; + + md = sc->sc_md; + mdi = (struct g_raid_md_jmicron_object *)md; + meta = mdi->mdio_meta; + update = 0; + do { + /* Make sure we miss anything. */ + na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE); + if (na == mdi->mdio_total_disks) + break; + + G_RAID_DEBUG1(1, md->mdo_softc, + "Array is not complete (%d of %d), " + "trying to refill.", na, mdi->mdio_total_disks); + + /* Try to get use some of STALE disks. */ + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_state == G_RAID_DISK_S_STALE) { + update += g_raid_md_jmicron_start_disk(disk); + if (disk->d_state == G_RAID_DISK_S_ACTIVE) + break; + } + } + if (disk != NULL) + continue; + + /* Try to get use some of SPARE disks. */ + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_state == G_RAID_DISK_S_SPARE) { + update += g_raid_md_jmicron_start_disk(disk); + if (disk->d_state == G_RAID_DISK_S_ACTIVE) + break; + } + } + } while (disk != NULL); + + /* Write new metadata if we changed something. */ + if (update) { + g_raid_md_write_jmicron(md, NULL, NULL, NULL); + meta = mdi->mdio_meta; + } + + /* Update status of our need for spare. */ + mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < + mdi->mdio_total_disks); + + /* Request retaste hoping to find spare. */ + if (mdi->mdio_incomplete) { + task = malloc(sizeof(struct task), + M_MD_JMICRON, M_WAITOK | M_ZERO); + TASK_INIT(task, 0, g_disk_md_jmicron_retaste, task); + taskqueue_enqueue(taskqueue_swi, task); + } +} + +static void +g_raid_md_jmicron_start(struct g_raid_softc *sc) +{ + struct g_raid_md_object *md; + struct g_raid_md_jmicron_object *mdi; + struct g_raid_md_jmicron_perdisk *pd; + struct jmicron_raid_conf *meta; + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; + off_t size; + int j, disk_pos; + char buf[17]; + + md = sc->sc_md; + mdi = (struct g_raid_md_jmicron_object *)md; + meta = mdi->mdio_meta; + + /* Create volumes and subdisks. 
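+ * The per-disk size is kept in the metadata as a 48-bit
+ * sector count split into 32 high and 16 low bits:
+ *
+ *	size = (((off_t)high << 16) + low) * 512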
*/ + jmicron_meta_get_name(meta, buf); + vol = g_raid_create_volume(sc, buf, -1); + size = ((off_t)meta->disk_sectors_high << 16) + meta->disk_sectors_low; + size *= 512; //ZZZ + if (meta->type == JMICRON_T_RAID0) { + vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; + vol->v_mediasize = size * mdi->mdio_total_disks; + } else if (meta->type == JMICRON_T_RAID1) { + vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; + vol->v_mediasize = size; + } else if (meta->type == JMICRON_T_RAID01) { + vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; + vol->v_mediasize = size * mdi->mdio_total_disks / 2; + } else if (meta->type == JMICRON_T_CONCAT) { + if (mdi->mdio_total_disks == 1) + vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; + else + vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; + vol->v_mediasize = 0; + } else if (meta->type == JMICRON_T_RAID5) { + vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; + vol->v_mediasize = size * (mdi->mdio_total_disks - 1); + } else { + vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; + vol->v_mediasize = 0; + } + vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; + vol->v_strip_size = 1024 << meta->stripe_shift; //ZZZ + vol->v_disks_count = mdi->mdio_total_disks; + vol->v_sectorsize = 512; //ZZZ + for (j = 0; j < vol->v_disks_count; j++) { + sd = &vol->v_subdisks[j]; + sd->sd_offset = (off_t)meta->offset * 16 * 512; //ZZZ + sd->sd_size = size; + } + g_raid_start_volume(vol); + + /* Create disk placeholders to store data for later writing. */ + for (disk_pos = 0; disk_pos < mdi->mdio_total_disks; disk_pos++) { + pd = malloc(sizeof(*pd), M_MD_JMICRON, M_WAITOK | M_ZERO); + pd->pd_disk_pos = disk_pos; + pd->pd_disk_id = meta->disks[disk_pos]; + disk = g_raid_create_disk(sc); + disk->d_md_data = (void *)pd; + disk->d_state = G_RAID_DISK_S_OFFLINE; + sd = &vol->v_subdisks[disk_pos]; + sd->sd_disk = disk; + TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); + } + + /* Make all disks found till the moment take their places. */ + do { + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_state == G_RAID_DISK_S_NONE) { + g_raid_md_jmicron_start_disk(disk); + break; + } + } + } while (disk != NULL); + + mdi->mdio_started = 1; + G_RAID_DEBUG1(0, sc, "Array started."); + g_raid_md_write_jmicron(md, NULL, NULL, NULL); + + /* Pickup any STALE/SPARE disks to refill array if needed. */ + g_raid_md_jmicron_refill(sc); + + g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); + + callout_stop(&mdi->mdio_start_co); + G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount); + root_mount_rel(mdi->mdio_rootmount); + mdi->mdio_rootmount = NULL; +} + +static void +g_raid_md_jmicron_new_disk(struct g_raid_disk *disk) +{ + struct g_raid_softc *sc; + struct g_raid_md_object *md; + struct g_raid_md_jmicron_object *mdi; + struct jmicron_raid_conf *pdmeta; + struct g_raid_md_jmicron_perdisk *pd; + + sc = disk->d_softc; + md = sc->sc_md; + mdi = (struct g_raid_md_jmicron_object *)md; + pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; + pdmeta = pd->pd_meta; + + if (mdi->mdio_started) { + if (g_raid_md_jmicron_start_disk(disk)) + g_raid_md_write_jmicron(md, NULL, NULL, NULL); + } else { + /* + * If we haven't started yet - update common metadata + * to get subdisks details, avoiding data from spare disks. 
+ */ + if (mdi->mdio_meta == NULL || + jmicron_meta_find_disk(mdi->mdio_meta, + mdi->mdio_meta->disk_id) == -3) { + if (mdi->mdio_meta != NULL) + free(mdi->mdio_meta, M_MD_JMICRON); + mdi->mdio_meta = jmicron_meta_copy(pdmeta); + mdi->mdio_total_disks = jmicron_meta_total_disks(pdmeta); + } + mdi->mdio_meta->flags |= pdmeta->flags & JMICRON_F_BADSEC; + + mdi->mdio_disks_present++; + G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d+%d up)", + mdi->mdio_disks_present, + mdi->mdio_total_disks, + jmicron_meta_total_spare(mdi->mdio_meta)); + + /* If we collected all needed disks - start array. */ + if (mdi->mdio_disks_present == mdi->mdio_total_disks + + jmicron_meta_total_spare(mdi->mdio_meta)) + g_raid_md_jmicron_start(sc); + } +} + +static void +g_raid_jmicron_go(void *arg) +{ + struct g_raid_softc *sc; + struct g_raid_md_object *md; + struct g_raid_md_jmicron_object *mdi; + + sc = arg; + md = sc->sc_md; + mdi = (struct g_raid_md_jmicron_object *)md; + if (!mdi->mdio_started) { + G_RAID_DEBUG1(0, sc, "Force array start due to timeout."); + g_raid_event_send(sc, G_RAID_NODE_E_START, 0); + } +} + +static int +g_raid_md_create_jmicron(struct g_raid_md_object *md, struct g_class *mp, + struct g_geom **gp) +{ + struct g_raid_softc *sc; + struct g_raid_md_jmicron_object *mdi; + char name[16]; + + mdi = (struct g_raid_md_jmicron_object *)md; + mdi->mdio_config_id = arc4random(); + snprintf(name, sizeof(name), "JMicron-%08x", mdi->mdio_config_id); + sc = g_raid_create_node(mp, name, md); + if (sc == NULL) + return (G_RAID_MD_TASTE_FAIL); + md->mdo_softc = sc; + *gp = sc->sc_geom; + return (G_RAID_MD_TASTE_NEW); +} + +static int +g_raid_md_taste_jmicron(struct g_raid_md_object *md, struct g_class *mp, + struct g_consumer *cp, struct g_geom **gp) +{ + struct g_consumer *rcp; + struct g_provider *pp; + struct g_raid_md_jmicron_object *mdi, *mdi1; + struct g_raid_softc *sc; + struct g_raid_disk *disk; + struct jmicron_raid_conf *meta; + struct g_raid_md_jmicron_perdisk *pd; + struct g_geom *geom; + int error, disk_pos, result, spare, len; + char name[16]; + uint16_t vendor; + + G_RAID_DEBUG(1, "Tasting JMicron on %s", cp->provider->name); + mdi = (struct g_raid_md_jmicron_object *)md; + pp = cp->provider; + + /* Read metadata from device. */ + meta = NULL; + spare = 0; + vendor = 0xffff; + disk_pos = 0; + if (g_access(cp, 1, 0, 0) != 0) + return (G_RAID_MD_TASTE_FAIL); + g_topology_unlock(); + len = 2; + if (pp->geom->rank == 1) + g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); + meta = jmicron_meta_read(cp); + g_topology_lock(); + g_access(cp, -1, 0, 0); + if (meta == NULL) { + if (g_raid_aggressive_spare) { + if (vendor == 0x197b) { + G_RAID_DEBUG(1, + "No JMicron metadata, forcing spare."); + spare = 2; + goto search; + } else { + G_RAID_DEBUG(1, + "JMicron vendor mismatch 0x%04x != 0x197b", + vendor); + } + } + return (G_RAID_MD_TASTE_FAIL); + } + + /* Check this disk position in obtained metadata. */ + disk_pos = jmicron_meta_find_disk(meta, meta->disk_id); + if (disk_pos == -1) { + G_RAID_DEBUG(1, "JMicron disk_id %08x not found", + meta->disk_id); + goto fail1; + } + + /* Metadata valid. Print it. */ + g_raid_md_jmicron_print(meta); + G_RAID_DEBUG(1, "JMicron disk position %d", disk_pos); + spare = (disk_pos == -2) ? 1 : 0; + +search: + /* Search for matching node. 
*/ + sc = NULL; + mdi1 = NULL; + LIST_FOREACH(geom, &mp->geom, geom) { + sc = geom->softc; + if (sc == NULL) + continue; + if (sc->sc_stopping != 0) + continue; + if (sc->sc_md->mdo_class != md->mdo_class) + continue; + mdi1 = (struct g_raid_md_jmicron_object *)sc->sc_md; + if (spare == 2) { + if (mdi1->mdio_incomplete) + break; + } else { + if (mdi1->mdio_config_id == + jmicron_meta_config_id(meta)) + break; + } + } + + /* Found matching node. */ + if (geom != NULL) { + G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); + result = G_RAID_MD_TASTE_EXISTING; + + } else if (spare) { /* Not found needy node -- left for later. */ + G_RAID_DEBUG(1, "Spare is not needed at this time"); + goto fail1; + + } else { /* Not found matching node -- create one. */ + result = G_RAID_MD_TASTE_NEW; + mdi->mdio_config_id = jmicron_meta_config_id(meta); + snprintf(name, sizeof(name), "JMicron-%08x", + mdi->mdio_config_id); + sc = g_raid_create_node(mp, name, md); + md->mdo_softc = sc; + geom = sc->sc_geom; + callout_init(&mdi->mdio_start_co, 1); + callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, + g_raid_jmicron_go, sc); + mdi->mdio_rootmount = root_mount_hold("GRAID-JMicron"); + G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount); + } + + rcp = g_new_consumer(geom); + g_attach(rcp, pp); + if (g_access(rcp, 1, 1, 1) != 0) + ; //goto fail1; + + g_topology_unlock(); + sx_xlock(&sc->sc_lock); + + pd = malloc(sizeof(*pd), M_MD_JMICRON, M_WAITOK | M_ZERO); + pd->pd_meta = meta; + if (spare == 2) { + pd->pd_disk_pos = -3; + pd->pd_disk_id = arc4random() & JMICRON_DISK_MASK; + } else { + pd->pd_disk_pos = -1; + pd->pd_disk_id = meta->disk_id; + } + pd->pd_disk_size = pp->mediasize; + disk = g_raid_create_disk(sc); + disk->d_md_data = (void *)pd; + disk->d_consumer = rcp; + rcp->private = disk; + + /* Read kernel dumping information. */ + disk->d_kd.offset = 0; + disk->d_kd.length = OFF_MAX; + len = sizeof(disk->d_kd); + error = g_io_getattr("GEOM::kerneldump", rcp, &len, &disk->d_kd); + if (disk->d_kd.di.dumper == NULL) + G_RAID_DEBUG1(2, sc, "Dumping not supported by %s: %d.", + rcp->provider->name, error); + + g_raid_md_jmicron_new_disk(disk); + + sx_xunlock(&sc->sc_lock); + g_topology_lock(); + *gp = geom; + return (result); +fail1: + free(meta, M_MD_JMICRON); + return (G_RAID_MD_TASTE_FAIL); +} + +static int +g_raid_md_event_jmicron(struct g_raid_md_object *md, + struct g_raid_disk *disk, u_int event) +{ + struct g_raid_softc *sc; + struct g_raid_subdisk *sd; + struct g_raid_md_jmicron_object *mdi; + struct g_raid_md_jmicron_perdisk *pd; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_jmicron_object *)md; + if (disk == NULL) { + switch (event) { + case G_RAID_NODE_E_START: + if (!mdi->mdio_started) + g_raid_md_jmicron_start(sc); + return (0); + } + return (-1); + } + pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; + switch (event) { + case G_RAID_DISK_E_DISCONNECTED: + /* If disk was assigned, just update statuses. */ + if (pd->pd_disk_pos >= 0) { + g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); + if (disk->d_consumer) { + g_raid_kill_consumer(sc, disk->d_consumer); + disk->d_consumer = NULL; + } + TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NONE); + g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, + G_RAID_EVENT_SUBDISK); + } + } else { + /* Otherwise -- delete. */ + g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); + g_raid_destroy_disk(disk); + } + + /* Write updated metadata to all disks. 
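+ * After that either destroy the node, if nothing but
+ * OFFLINE placeholders is left, or try to refill the array
+ * from STALE/SPARE disks.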
*/ + g_raid_md_write_jmicron(md, NULL, NULL, NULL); + + /* Check if anything left except placeholders. */ + if (g_raid_ndisks(sc, -1) == + g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) + g_raid_destroy_node(sc, 0); + else + g_raid_md_jmicron_refill(sc); + return (0); + } + return (-2); +} + +static int +g_raid_md_ctl_jmicron(struct g_raid_md_object *md, + struct gctl_req *req) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; + struct g_raid_md_jmicron_object *mdi; + struct g_raid_md_jmicron_perdisk *pd; + struct g_consumer *cp; + struct g_provider *pp; + char arg[16]; + const char *verb, *volname, *levelname, *diskname; + int *nargs, *force; + off_t size, sectorsize, strip; + intmax_t *sizearg, *striparg; + int numdisks, i, len, level, qual, update; + int error; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_jmicron_object *)md; + verb = gctl_get_param(req, "verb", NULL); + nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); + error = 0; + if (strcmp(verb, "label") == 0) { + + if (*nargs < 4) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + volname = gctl_get_asciiparam(req, "arg1"); + if (volname == NULL) { + gctl_error(req, "No volume name."); + return (-2); + } + levelname = gctl_get_asciiparam(req, "arg2"); + if (levelname == NULL) { + gctl_error(req, "No RAID level."); + return (-3); + } + if (g_raid_volume_str2level(levelname, &level, &qual)) { + gctl_error(req, "Unknown RAID level '%s'.", levelname); + return (-4); + } + numdisks = *nargs - 3; + force = gctl_get_paraml(req, "force", sizeof(*force)); + if (!g_raid_md_jmicron_supported(level, qual, numdisks, + force ? *force : 0)) { + gctl_error(req, "Unsupported RAID level " + "(0x%02x/0x%02x), or number of disks (%d).", + level, qual, numdisks); + return (-5); + } + + /* Search for disks, connect them and probe. */ + size = 0x7fffffffffffffffllu; + sectorsize = 0; + for (i = 0; i < numdisks; i++) { + snprintf(arg, sizeof(arg), "arg%d", i + 3); + diskname = gctl_get_asciiparam(req, arg); + if (diskname == NULL) { + gctl_error(req, "No disk name (%s).", arg); + error = -6; + break; + } + if (strcmp(diskname, "NONE") == 0) { + cp = NULL; + pp = NULL; + } else { + g_topology_lock(); + cp = g_raid_open_consumer(sc, diskname); + if (cp == NULL) { + gctl_error(req, "Can't open '%s'.", + diskname); + g_topology_unlock(); + error = -7; + break; + } + pp = cp->provider; + } + pd = malloc(sizeof(*pd), M_MD_JMICRON, M_WAITOK | M_ZERO); + pd->pd_disk_pos = i; + pd->pd_disk_id = arc4random() & JMICRON_DISK_MASK; + disk = g_raid_create_disk(sc); + disk->d_md_data = (void *)pd; + disk->d_consumer = cp; + if (cp == NULL) + continue; + cp->private = disk; + g_topology_unlock(); + + /* Read kernel dumping information. */ + disk->d_kd.offset = 0; + disk->d_kd.length = OFF_MAX; + len = sizeof(disk->d_kd); + g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); + if (disk->d_kd.di.dumper == NULL) + G_RAID_DEBUG1(2, sc, + "Dumping not supported by %s.", + cp->provider->name); + + pd->pd_disk_size = pp->mediasize; + if (size > pp->mediasize) + size = pp->mediasize; + if (sectorsize < pp->sectorsize) + sectorsize = pp->sectorsize; + } + if (error != 0) + return (error); + + /* Reserve space for metadata. */ + size -= sectorsize; + + /* Handle size argument. 
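+ * An explicit -S size may only shrink the automatically
+ * computed common size (the smallest disk minus the
+ * reserved metadata sector), never grow it.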
*/ + len = sizeof(*sizearg); + sizearg = gctl_get_param(req, "size", &len); + if (sizearg != NULL && len == sizeof(*sizearg) && + *sizearg > 0) { + if (*sizearg > size) { + gctl_error(req, "Size too big %lld > %lld.", + (long long)*sizearg, (long long)size); + return (-9); + } + size = *sizearg; + } + + /* Handle strip argument. */ + strip = 131072; + len = sizeof(*striparg); + striparg = gctl_get_param(req, "strip", &len); + if (striparg != NULL && len == sizeof(*striparg) && + *striparg > 0) { + if (*striparg < sectorsize) { + gctl_error(req, "Strip size too small."); + return (-10); + } + if (*striparg % sectorsize != 0) { + gctl_error(req, "Incorrect strip size."); + return (-11); + } + if (strip > 65535 * sectorsize) { + gctl_error(req, "Strip size too big."); + return (-12); + } + strip = *striparg; + } + + /* Round size down to strip or sector. */ + if (level == G_RAID_VOLUME_RL_RAID1) + size -= (size % sectorsize); + else if (level == G_RAID_VOLUME_RL_RAID1E && + (numdisks & 1) != 0) + size -= (size % (2 * strip)); + else + size -= (size % strip); + if (size <= 0) { + gctl_error(req, "Size too small."); + return (-13); + } + if (size > 0xffffffffffffllu * sectorsize) { + gctl_error(req, "Size too big."); + return (-14); + } + + /* We have all we need, create things: volume, ... */ + mdi->mdio_total_disks = numdisks; + mdi->mdio_started = 1; + vol = g_raid_create_volume(sc, volname, -1); + vol->v_md_data = (void *)(intptr_t)0; + vol->v_raid_level = level; + vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; + vol->v_strip_size = strip; + vol->v_disks_count = numdisks; + if (level == G_RAID_VOLUME_RL_RAID0 || + level == G_RAID_VOLUME_RL_CONCAT || + level == G_RAID_VOLUME_RL_SINGLE) + vol->v_mediasize = size * numdisks; + else if (level == G_RAID_VOLUME_RL_RAID1) + vol->v_mediasize = size; + else if (level == G_RAID_VOLUME_RL_RAID5) + vol->v_mediasize = size * (numdisks - 1); + else { /* RAID1E */ + vol->v_mediasize = ((size * numdisks) / strip / 2) * + strip; + } + vol->v_sectorsize = sectorsize; + g_raid_start_volume(vol); + + /* , and subdisks. */ + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; + sd = &vol->v_subdisks[pd->pd_disk_pos]; + sd->sd_disk = disk; + sd->sd_offset = 0; + sd->sd_size = size; + TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); + if (sd->sd_disk->d_consumer != NULL) { + g_raid_change_disk_state(disk, + G_RAID_DISK_S_ACTIVE); + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_ACTIVE); + g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, + G_RAID_EVENT_SUBDISK); + } else { + g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); + } + } + + /* Write metadata based on created entities. */ + G_RAID_DEBUG1(0, sc, "Array started."); + g_raid_md_write_jmicron(md, NULL, NULL, NULL); + + /* Pickup any STALE/SPARE disks to refill array if needed. */ + g_raid_md_jmicron_refill(sc); + + g_raid_event_send(vol, G_RAID_VOLUME_E_START, + G_RAID_EVENT_VOLUME); + return (0); + } + if (strcmp(verb, "delete") == 0) { + + /* Check if some volume is still open. 
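+ * Deletion of an open volume is refused unless force (-f)
+ * was given; otherwise metadata is erased on all disks and
+ * the node is destroyed.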
*/ + force = gctl_get_paraml(req, "force", sizeof(*force)); + if (force != NULL && *force == 0 && + g_raid_nopens(sc) != 0) { + gctl_error(req, "Some volume is still open."); + return (-4); + } + + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_consumer) + jmicron_meta_erase(disk->d_consumer); + } + g_raid_destroy_node(sc, 0); + return (0); + } + if (strcmp(verb, "remove") == 0 || + strcmp(verb, "fail") == 0) { + if (*nargs < 2) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + for (i = 1; i < *nargs; i++) { + snprintf(arg, sizeof(arg), "arg%d", i); + diskname = gctl_get_asciiparam(req, arg); + if (diskname == NULL) { + gctl_error(req, "No disk name (%s).", arg); + error = -2; + break; + } + if (strncmp(diskname, "/dev/", 5) == 0) + diskname += 5; + + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_consumer != NULL && + disk->d_consumer->provider != NULL && + strcmp(disk->d_consumer->provider->name, + diskname) == 0) + break; + } + if (disk == NULL) { + gctl_error(req, "Disk '%s' not found.", + diskname); + error = -3; + break; + } + + if (strcmp(verb, "fail") == 0) { + g_raid_md_fail_disk_jmicron(md, NULL, disk); + continue; + } + + pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; + + /* Erase metadata on deleting disk. */ + jmicron_meta_erase(disk->d_consumer); + + /* If disk was assigned, just update statuses. */ + if (pd->pd_disk_pos >= 0) { + g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); + if (disk->d_consumer) { + g_raid_kill_consumer(sc, disk->d_consumer); + disk->d_consumer = NULL; + } + TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NONE); + g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, + G_RAID_EVENT_SUBDISK); + } + } else { + /* Otherwise -- delete. */ + g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); + g_raid_destroy_disk(disk); + } + } + + /* Write updated metadata to remaining disks. */ + g_raid_md_write_jmicron(md, NULL, NULL, NULL); + + /* Check if anything left except placeholders. */ + if (g_raid_ndisks(sc, -1) == + g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) + g_raid_destroy_node(sc, 0); + else + g_raid_md_jmicron_refill(sc); + return (error); + } + if (strcmp(verb, "insert") == 0) { + if (*nargs < 2) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + update = 0; + for (i = 1; i < *nargs; i++) { + /* Get disk name. */ + snprintf(arg, sizeof(arg), "arg%d", i); + diskname = gctl_get_asciiparam(req, arg); + if (diskname == NULL) { + gctl_error(req, "No disk name (%s).", arg); + error = -3; + break; + } + + /* Try to find provider with specified name. */ + g_topology_lock(); + cp = g_raid_open_consumer(sc, diskname); + if (cp == NULL) { + gctl_error(req, "Can't open disk '%s'.", + diskname); + g_topology_unlock(); + error = -4; + break; + } + pp = cp->provider; + + pd = malloc(sizeof(*pd), M_MD_JMICRON, M_WAITOK | M_ZERO); + pd->pd_disk_pos = -3; + pd->pd_disk_id = arc4random() & JMICRON_DISK_MASK; + pd->pd_disk_size = pp->mediasize; + + disk = g_raid_create_disk(sc); + disk->d_consumer = cp; + disk->d_consumer->private = disk; + disk->d_md_data = (void *)pd; + cp->private = disk; + g_topology_unlock(); + + /* Read kernel dumping information. 
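+ * The GEOM::kerneldump attribute reports whether the disk
+ * can take a kernel crash dump; d_kd.di.dumper stays NULL
+ * if it can not.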
*/ + disk->d_kd.offset = 0; + disk->d_kd.length = OFF_MAX; + len = sizeof(disk->d_kd); + g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); + if (disk->d_kd.di.dumper == NULL) + G_RAID_DEBUG1(2, sc, + "Dumping not supported by %s.", + cp->provider->name); + + /* Welcome the "new" disk. */ + update += g_raid_md_jmicron_start_disk(disk); + if (disk->d_state != G_RAID_DISK_S_ACTIVE && + disk->d_state != G_RAID_DISK_S_SPARE) { + gctl_error(req, "Disk '%s' doesn't fit.", + diskname); + g_raid_destroy_disk(disk); + error = -8; + break; + } + } + + /* Write new metadata if we changed something. */ + if (update) + g_raid_md_write_jmicron(md, NULL, NULL, NULL); + return (error); + } + gctl_error(req, "Command '%s' is not supported.", verb); + return (-100); +} + +static int +g_raid_md_write_jmicron(struct g_raid_md_object *md, struct g_raid_volume *tvol, + struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; + struct g_raid_md_jmicron_object *mdi; + struct g_raid_md_jmicron_perdisk *pd; + struct jmicron_raid_conf *meta; + int i, spares; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_jmicron_object *)md; + + if (sc->sc_stopping == G_RAID_DESTROY_HARD) + return (0); + + /* There is only one volume. */ + vol = TAILQ_FIRST(&sc->sc_volumes); + + /* Fill global fields. */ + meta = malloc(sizeof(*meta), M_MD_JMICRON, M_WAITOK | M_ZERO); + strncpy(meta->signature, JMICRON_MAGIC, 2); + meta->version = JMICRON_VERSION; + jmicron_meta_put_name(meta, vol->v_name); + if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) + meta->type = JMICRON_T_RAID0; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) + meta->type = JMICRON_T_RAID1; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) + meta->type = JMICRON_T_RAID01; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT || + vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) + meta->type = JMICRON_T_CONCAT; + else + meta->type = JMICRON_T_RAID5; + meta->stripe_shift = fls(vol->v_strip_size / 2048); + meta->flags = JMICRON_F_READY | JMICRON_F_BOOTABLE; + for (i = 0; i < vol->v_disks_count; i++) { + sd = &vol->v_subdisks[i]; + if (sd->sd_disk == NULL || sd->sd_disk->d_md_data == NULL) + meta->disks[i] = 0xffffffff; + else { + pd = (struct g_raid_md_jmicron_perdisk *) + sd->sd_disk->d_md_data; + meta->disks[i] = pd->pd_disk_id; + } + if (sd->sd_state < G_RAID_SUBDISK_S_STALE) + meta->flags |= JMICRON_F_BADSEC; + if (vol->v_dirty) + meta->flags |= JMICRON_F_UNSYNC; + } + + /* Put spares to their slots. */ + spares = 0; + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; + if (disk->d_state != G_RAID_DISK_S_SPARE) + continue; + meta->spare[spares] = pd->pd_disk_id; + if (++spares >= 2) + break; + } + + /* We are done. Print meta data and store them to disks. 
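+ * Each ACTIVE or SPARE disk gets a personalized copy:
+ * its own disk_id, the offset in 16-sector units and the
+ * sector count split back into 32-bit high and 16-bit low
+ * parts, reversing the math done at start time.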
*/ + if (mdi->mdio_meta != NULL) + free(mdi->mdio_meta, M_MD_JMICRON); + mdi->mdio_meta = meta; + i = 0; + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; + if (disk->d_state != G_RAID_DISK_S_ACTIVE && + disk->d_state != G_RAID_DISK_S_SPARE) + continue; + if (pd->pd_meta != NULL) { + free(pd->pd_meta, M_MD_JMICRON); + pd->pd_meta = NULL; + } + pd->pd_meta = jmicron_meta_copy(meta); + pd->pd_meta->disk_id = pd->pd_disk_id; + if ((sd = TAILQ_FIRST(&disk->d_subdisks)) != NULL) { + pd->pd_meta->offset = + (sd->sd_offset / 512) / 16; + pd->pd_meta->disk_sectors_high = + (sd->sd_size / 512) >> 16; + pd->pd_meta->disk_sectors_low = + (sd->sd_size / 512) & 0xffff; + if (sd->sd_state < G_RAID_SUBDISK_S_STALE) + pd->pd_meta->flags &= ~JMICRON_F_BADSEC; + else if (sd->sd_state < G_RAID_SUBDISK_S_ACTIVE) + pd->pd_meta->flags |= JMICRON_F_UNSYNC; + } + G_RAID_DEBUG(1, "Writing JMicron metadata to %s", + g_raid_get_diskname(disk)); + g_raid_md_jmicron_print(pd->pd_meta); + jmicron_meta_write(disk->d_consumer, pd->pd_meta); + } + return (0); +} + +static int +g_raid_md_fail_disk_jmicron(struct g_raid_md_object *md, + struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) +{ + struct g_raid_softc *sc; + struct g_raid_md_jmicron_object *mdi; + struct g_raid_md_jmicron_perdisk *pd; + struct g_raid_subdisk *sd; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_jmicron_object *)md; + pd = (struct g_raid_md_jmicron_perdisk *)tdisk->d_md_data; + + /* We can't fail disk that is not a part of array now. */ + if (pd->pd_disk_pos < 0) + return (-1); + + if (tdisk->d_consumer) + jmicron_meta_erase(tdisk->d_consumer); + + /* Change states. */ + g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); + TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_FAILED); + g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, + G_RAID_EVENT_SUBDISK); + } + + /* Write updated metadata to remaining disks. */ + g_raid_md_write_jmicron(md, NULL, NULL, tdisk); + + /* Check if anything left except placeholders. */ + if (g_raid_ndisks(sc, -1) == + g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) + g_raid_destroy_node(sc, 0); + else + g_raid_md_jmicron_refill(sc); + return (0); +} + +static int +g_raid_md_free_disk_jmicron(struct g_raid_md_object *md, + struct g_raid_disk *disk) +{ + struct g_raid_md_jmicron_perdisk *pd; + + pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; + if (pd->pd_meta != NULL) { + free(pd->pd_meta, M_MD_JMICRON); + pd->pd_meta = NULL; + } + free(pd, M_MD_JMICRON); + disk->d_md_data = NULL; + return (0); +} + +static int +g_raid_md_free_jmicron(struct g_raid_md_object *md) +{ + struct g_raid_md_jmicron_object *mdi; + + mdi = (struct g_raid_md_jmicron_object *)md; + if (!mdi->mdio_started) { + mdi->mdio_started = 0; + callout_stop(&mdi->mdio_start_co); + G_RAID_DEBUG1(1, md->mdo_softc, + "root_mount_rel %p", mdi->mdio_rootmount); + root_mount_rel(mdi->mdio_rootmount); + mdi->mdio_rootmount = NULL; + } + if (mdi->mdio_meta != NULL) { + free(mdi->mdio_meta, M_MD_JMICRON); + mdi->mdio_meta = NULL; + } + return (0); +} + +G_RAID_MD_DECLARE(g_raid_md_jmicron); diff --git a/sys/geom/raid/md_nvidia.c b/sys/geom/raid/md_nvidia.c new file mode 100644 index 0000000..dbaee0a --- /dev/null +++ b/sys/geom/raid/md_nvidia.c @@ -0,0 +1,1607 @@ +/*- + * Copyright (c) 2011 Alexander Motin + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "geom/raid/g_raid.h" +#include "g_raid_md_if.h" + +static MALLOC_DEFINE(M_MD_NVIDIA, "md_nvidia_data", "GEOM_RAID NVIDIA metadata"); + +struct nvidia_raid_conf { + uint8_t nvidia_id[8]; +#define NVIDIA_MAGIC "NVIDIA " + + uint32_t config_size; + uint32_t checksum; + uint16_t version; + uint8_t disk_number; + uint8_t dummy_0; + uint32_t total_sectors; + uint32_t sector_size; + uint8_t name[16]; + uint8_t revision[4]; + uint32_t disk_status; + + uint32_t magic_0; +#define NVIDIA_MAGIC0 0x00640044 + + uint64_t volume_id[2]; + uint8_t state; +#define NVIDIA_S_IDLE 0 +#define NVIDIA_S_INIT 2 +#define NVIDIA_S_REBUILD 3 +#define NVIDIA_S_UPGRADE 4 +#define NVIDIA_S_SYNC 5 + uint8_t array_width; + uint8_t total_disks; + uint8_t orig_array_width; + uint16_t type; +#define NVIDIA_T_RAID0 0x0080 +#define NVIDIA_T_RAID1 0x0081 +#define NVIDIA_T_RAID3 0x0083 +#define NVIDIA_T_RAID5 0x0085 /* RLQ = 00/02? */ +#define NVIDIA_T_RAID5_SYM 0x0095 /* RLQ = 03 */ +#define NVIDIA_T_RAID10 0x008a +#define NVIDIA_T_RAID01 0x8180 +#define NVIDIA_T_CONCAT 0x00ff + + uint16_t dummy_3; + uint32_t strip_sectors; + uint32_t strip_bytes; + uint32_t strip_shift; + uint32_t strip_mask; + uint32_t stripe_sectors; + uint32_t stripe_bytes; + uint32_t rebuild_lba; + uint32_t orig_type; + uint32_t orig_total_sectors; + uint32_t status; +#define NVIDIA_S_BOOTABLE 0x00000001 +#define NVIDIA_S_DEGRADED 0x00000002 + + uint32_t filler[98]; +} __packed; + +struct g_raid_md_nvidia_perdisk { + struct nvidia_raid_conf *pd_meta; + int pd_disk_pos; + off_t pd_disk_size; +}; + +struct g_raid_md_nvidia_object { + struct g_raid_md_object mdio_base; + uint64_t mdio_volume_id[2]; + struct nvidia_raid_conf *mdio_meta; + struct callout mdio_start_co; /* STARTING state timer. */ + int mdio_total_disks; + int mdio_disks_present; + int mdio_started; + int mdio_incomplete; + struct root_hold_token *mdio_rootmount; /* Root mount delay token. 
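+ * Same role as in the JMicron module: delays root file
+ * system mounting until the array starts. NVIDIA nodes are
+ * matched by the 128-bit mdio_volume_id above rather than
+ * by a computed configuration ID.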
*/ +}; + +static g_raid_md_create_t g_raid_md_create_nvidia; +static g_raid_md_taste_t g_raid_md_taste_nvidia; +static g_raid_md_event_t g_raid_md_event_nvidia; +static g_raid_md_ctl_t g_raid_md_ctl_nvidia; +static g_raid_md_write_t g_raid_md_write_nvidia; +static g_raid_md_fail_disk_t g_raid_md_fail_disk_nvidia; +static g_raid_md_free_disk_t g_raid_md_free_disk_nvidia; +static g_raid_md_free_t g_raid_md_free_nvidia; + +static kobj_method_t g_raid_md_nvidia_methods[] = { + KOBJMETHOD(g_raid_md_create, g_raid_md_create_nvidia), + KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_nvidia), + KOBJMETHOD(g_raid_md_event, g_raid_md_event_nvidia), + KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_nvidia), + KOBJMETHOD(g_raid_md_write, g_raid_md_write_nvidia), + KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_nvidia), + KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_nvidia), + KOBJMETHOD(g_raid_md_free, g_raid_md_free_nvidia), + { 0, 0 } +}; + +static struct g_raid_md_class g_raid_md_nvidia_class = { + "NVIDIA", + g_raid_md_nvidia_methods, + sizeof(struct g_raid_md_nvidia_object), + .mdc_priority = 100 +}; + +static int NVIDIANodeID = 1; + +static void +g_raid_md_nvidia_print(struct nvidia_raid_conf *meta) +{ + + if (g_raid_debug < 1) + return; + + printf("********* ATA NVIDIA RAID Metadata *********\n"); + printf("nvidia_id <%.8s>\n", meta->nvidia_id); + printf("config_size %u\n", meta->config_size); + printf("checksum 0x%08x\n", meta->checksum); + printf("version 0x%04x\n", meta->version); + printf("disk_number %d\n", meta->disk_number); + printf("dummy_0 0x%02x\n", meta->dummy_0); + printf("total_sectors %u\n", meta->total_sectors); + printf("sector_size %u\n", meta->sector_size); + printf("name <%.16s>\n", meta->name); + printf("revision 0x%02x%02x%02x%02x\n", + meta->revision[0], meta->revision[1], + meta->revision[2], meta->revision[3]); + printf("disk_status 0x%08x\n", meta->disk_status); + printf("magic_0 0x%08x\n", meta->magic_0); + printf("volume_id 0x%016jx%016jx\n", + meta->volume_id[1], meta->volume_id[0]); + printf("state 0x%02x\n", meta->state); + printf("array_width %u\n", meta->array_width); + printf("total_disks %u\n", meta->total_disks); + printf("orig_array_width %u\n", meta->orig_array_width); + printf("type 0x%04x\n", meta->type); + printf("dummy_3 0x%04x\n", meta->dummy_3); + printf("strip_sectors %u\n", meta->strip_sectors); + printf("strip_bytes %u\n", meta->strip_bytes); + printf("strip_shift %u\n", meta->strip_shift); + printf("strip_mask 0x%08x\n", meta->strip_mask); + printf("stripe_sectors %u\n", meta->stripe_sectors); + printf("stripe_bytes %u\n", meta->stripe_bytes); + printf("rebuild_lba %u\n", meta->rebuild_lba); + printf("orig_type 0x%04x\n", meta->orig_type); + printf("orig_total_sectors %u\n", meta->orig_total_sectors); + printf("status 0x%08x\n", meta->status); + printf("=================================================\n"); +} + +static struct nvidia_raid_conf * +nvidia_meta_copy(struct nvidia_raid_conf *meta) +{ + struct nvidia_raid_conf *nmeta; + + nmeta = malloc(sizeof(*meta), M_MD_NVIDIA, M_WAITOK); + memcpy(nmeta, meta, sizeof(*meta)); + return (nmeta); +} + +static int +nvidia_meta_translate_disk(struct nvidia_raid_conf *meta, int md_disk_pos) +{ + int disk_pos; + + if (md_disk_pos >= 0 && meta->type == NVIDIA_T_RAID01) { + disk_pos = (md_disk_pos / meta->array_width) + + (md_disk_pos % meta->array_width) * meta->array_width; + } else + disk_pos = md_disk_pos; + return (disk_pos); +} + +static void +nvidia_meta_get_name(struct nvidia_raid_conf *meta, char 
*buf) +{ + int i; + + strncpy(buf, meta->name, 16); + buf[16] = 0; + for (i = 15; i >= 0; i--) { + if (buf[i] > 0x20) + break; + buf[i] = 0; + } +} + +static void +nvidia_meta_put_name(struct nvidia_raid_conf *meta, char *buf) +{ + + memset(meta->name, 0x20, 16); + memcpy(meta->name, buf, MIN(strlen(buf), 16)); +} + +static struct nvidia_raid_conf * +nvidia_meta_read(struct g_consumer *cp) +{ + struct g_provider *pp; + struct nvidia_raid_conf *meta; + char *buf; + int error, i; + uint32_t checksum, *ptr; + + pp = cp->provider; + + /* Read the anchor sector. */ + buf = g_read_data(cp, + pp->mediasize - 2 * pp->sectorsize, pp->sectorsize, &error); + if (buf == NULL) { + G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", + pp->name, error); + return (NULL); + } + meta = malloc(sizeof(*meta), M_MD_NVIDIA, M_WAITOK); + memcpy(meta, buf, min(sizeof(*meta), pp->sectorsize)); + g_free(buf); + + /* Check if this is an NVIDIA RAID struct */ + if (strncmp(meta->nvidia_id, NVIDIA_MAGIC, strlen(NVIDIA_MAGIC))) { + G_RAID_DEBUG(1, "NVIDIA signature check failed on %s", pp->name); + free(meta, M_MD_NVIDIA); + return (NULL); + } + if (meta->config_size > 128 || + meta->config_size < 30) { + G_RAID_DEBUG(1, "NVIDIA metadata size looks wrong: %d", + meta->config_size); + free(meta, M_MD_NVIDIA); + return (NULL); + } + + /* Check metadata checksum. */ + for (checksum = 0, ptr = (uint32_t *)meta, + i = 0; i < meta->config_size; i++) + checksum += *ptr++; + if (checksum != 0) { + G_RAID_DEBUG(1, "NVIDIA checksum check failed on %s", pp->name); + free(meta, M_MD_NVIDIA); + return (NULL); + } + + /* Check volume state. */ + if (meta->state != NVIDIA_S_IDLE && meta->state != NVIDIA_S_INIT && + meta->state != NVIDIA_S_REBUILD && meta->state != NVIDIA_S_SYNC) { + G_RAID_DEBUG(1, "NVIDIA unknown state on %s (0x%02x)", + pp->name, meta->state); + free(meta, M_MD_NVIDIA); + return (NULL); + } + + /* Check raid type. */ + if (meta->type != NVIDIA_T_RAID0 && meta->type != NVIDIA_T_RAID1 && + meta->type != NVIDIA_T_RAID3 && meta->type != NVIDIA_T_RAID5 && + meta->type != NVIDIA_T_RAID5_SYM && + meta->type != NVIDIA_T_RAID01 && meta->type != NVIDIA_T_CONCAT) { + G_RAID_DEBUG(1, "NVIDIA unknown RAID level on %s (0x%02x)", + pp->name, meta->type); + free(meta, M_MD_NVIDIA); + return (NULL); + } + + return (meta); +} + +static int +nvidia_meta_write(struct g_consumer *cp, struct nvidia_raid_conf *meta) +{ + struct g_provider *pp; + char *buf; + int error, i; + uint32_t checksum, *ptr; + + pp = cp->provider; + + /* Recalculate checksum for case if metadata were changed. */ + meta->checksum = 0; + for (checksum = 0, ptr = (uint32_t *)meta, + i = 0; i < meta->config_size; i++) + checksum += *ptr++; + meta->checksum -= checksum; + + /* Create and fill buffer. */ + buf = malloc(pp->sectorsize, M_MD_NVIDIA, M_WAITOK | M_ZERO); + memcpy(buf, meta, sizeof(*meta)); + + /* Write metadata. 
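+ * Unlike the JMicron case, the NVIDIA anchor sector lives
+ * one sector deeper, at mediasize - 2 * sectorsize.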
*/ + error = g_write_data(cp, + pp->mediasize - 2 * pp->sectorsize, buf, pp->sectorsize); + if (error != 0) { + G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", + pp->name, error); + } + + free(buf, M_MD_NVIDIA); + return (error); +} + +static int +nvidia_meta_erase(struct g_consumer *cp) +{ + struct g_provider *pp; + char *buf; + int error; + + pp = cp->provider; + buf = malloc(pp->sectorsize, M_MD_NVIDIA, M_WAITOK | M_ZERO); + error = g_write_data(cp, + pp->mediasize - 2 * pp->sectorsize, buf, pp->sectorsize); + if (error != 0) { + G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", + pp->name, error); + } + free(buf, M_MD_NVIDIA); + return (error); +} + +static struct g_raid_disk * +g_raid_md_nvidia_get_disk(struct g_raid_softc *sc, int id) +{ + struct g_raid_disk *disk; + struct g_raid_md_nvidia_perdisk *pd; + + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; + if (pd->pd_disk_pos == id) + break; + } + return (disk); +} + +static int +g_raid_md_nvidia_supported(int level, int qual, int disks, int force) +{ + + switch (level) { + case G_RAID_VOLUME_RL_RAID0: + if (disks < 1) + return (0); + if (!force && (disks < 2 || disks > 6)) + return (0); + break; + case G_RAID_VOLUME_RL_RAID1: + if (disks < 1) + return (0); + if (!force && (disks != 2)) + return (0); + break; + case G_RAID_VOLUME_RL_RAID1E: + if (disks < 2) + return (0); + if (disks % 2 != 0) + return (0); + if (!force && (disks < 4)) + return (0); + break; + case G_RAID_VOLUME_RL_SINGLE: + if (disks != 1) + return (0); + break; + case G_RAID_VOLUME_RL_CONCAT: + if (disks < 2) + return (0); + break; + case G_RAID_VOLUME_RL_RAID5: + if (disks < 3) + return (0); + break; + default: + return (0); + } + if (qual != G_RAID_VOLUME_RLQ_NONE) + return (0); + return (1); +} + +static int +g_raid_md_nvidia_start_disk(struct g_raid_disk *disk) +{ + struct g_raid_softc *sc; + struct g_raid_subdisk *sd, *tmpsd; + struct g_raid_disk *olddisk, *tmpdisk; + struct g_raid_md_object *md; + struct g_raid_md_nvidia_object *mdi; + struct g_raid_md_nvidia_perdisk *pd, *oldpd; + struct nvidia_raid_conf *meta; + int disk_pos, resurrection = 0; + + sc = disk->d_softc; + md = sc->sc_md; + mdi = (struct g_raid_md_nvidia_object *)md; + meta = mdi->mdio_meta; + pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; + olddisk = NULL; + + /* Find disk position in metadata by it's serial. */ + if (pd->pd_meta != NULL) { + disk_pos = pd->pd_meta->disk_number; + if (disk_pos >= meta->total_disks || mdi->mdio_started) + disk_pos = -3; + } else + disk_pos = -3; + /* For RAID0+1 we need to translate order. */ + disk_pos = nvidia_meta_translate_disk(meta, disk_pos); + if (disk_pos < 0) { + G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk"); + /* If we are in the start process, that's all for now. */ + if (!mdi->mdio_started) + goto nofit; + /* + * If we have already started - try to get use of the disk. + * Try to replace OFFLINE disks first, then FAILED. + */ + TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) { + if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE && + tmpdisk->d_state != G_RAID_DISK_S_FAILED) + continue; + /* Make sure this disk is big enough. 
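+ * Same check as in the JMicron module, but two trailing
+ * sectors are reserved for the deeper-placed metadata:
+ * sd_offset + sd_size + 2 * 512 <= pd_disk_size.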
*/ + TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) { + if (sd->sd_offset + sd->sd_size + 2 * 512 > + pd->pd_disk_size) { + G_RAID_DEBUG1(1, sc, + "Disk too small (%ju < %ju)", + pd->pd_disk_size, + sd->sd_offset + sd->sd_size + 512); + break; + } + } + if (sd != NULL) + continue; + if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) { + olddisk = tmpdisk; + break; + } else if (olddisk == NULL) + olddisk = tmpdisk; + } + if (olddisk == NULL) { +nofit: + g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); + return (1); + } + oldpd = (struct g_raid_md_nvidia_perdisk *)olddisk->d_md_data; + disk_pos = oldpd->pd_disk_pos; + resurrection = 1; + } + + if (olddisk == NULL) { + /* Find placeholder by position. */ + olddisk = g_raid_md_nvidia_get_disk(sc, disk_pos); + if (olddisk == NULL) + panic("No disk at position %d!", disk_pos); + if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) { + G_RAID_DEBUG1(1, sc, "More then one disk for pos %d", + disk_pos); + g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); + return (0); + } + oldpd = (struct g_raid_md_nvidia_perdisk *)olddisk->d_md_data; + } + + /* Replace failed disk or placeholder with new disk. */ + TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) { + TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next); + TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); + sd->sd_disk = disk; + } + oldpd->pd_disk_pos = -2; + pd->pd_disk_pos = disk_pos; + + /* If it was placeholder -- destroy it. */ + if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) { + g_raid_destroy_disk(olddisk); + } else { + /* Otherwise, make it STALE_FAILED. */ + g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED); + } + + /* Welcome the new disk. */ + if (resurrection) + g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); + else// if (pd->pd_meta->disk_status == NVIDIA_S_CURRENT || + //pd->pd_meta->disk_status == NVIDIA_S_REBUILD) + g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); +// else +// g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); + TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { + + /* + * Different disks may have different sizes, + * in concat mode. Update from real disk size. + */ + if (meta->type == NVIDIA_T_CONCAT) + sd->sd_size = pd->pd_disk_size - 0x800 * 512; + + if (resurrection) { + /* New or ex-spare disk. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NEW); + } else if (meta->state == NVIDIA_S_REBUILD && + (pd->pd_meta->disk_status & 0x100)) { + /* Rebuilding disk. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_REBUILD); + sd->sd_rebuild_pos = (off_t)pd->pd_meta->rebuild_lba / + meta->array_width * pd->pd_meta->sector_size; + } else if (meta->state == NVIDIA_S_SYNC) { + /* Resyncing/dirty disk. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_RESYNC); + sd->sd_rebuild_pos = (off_t)pd->pd_meta->rebuild_lba / + meta->array_width * pd->pd_meta->sector_size; + } else { + /* Up to date disk. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_ACTIVE); + } + g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, + G_RAID_EVENT_SUBDISK); + } + + /* Update status of our need for spare. 
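+ * (The rebuild_lba stored above appears to be a volume-wide
+ * sector number, hence the division by array_width to get a
+ * per-disk byte position.)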
*/ + if (mdi->mdio_started) { + mdi->mdio_incomplete = + (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < + mdi->mdio_total_disks); + } + + return (resurrection); +} + +static void +g_disk_md_nvidia_retaste(void *arg, int pending) +{ + + G_RAID_DEBUG(1, "Array is not complete, trying to retaste."); + g_retaste(&g_raid_class); + free(arg, M_MD_NVIDIA); +} + +static void +g_raid_md_nvidia_refill(struct g_raid_softc *sc) +{ + struct g_raid_md_object *md; + struct g_raid_md_nvidia_object *mdi; + struct nvidia_raid_conf *meta; + struct g_raid_disk *disk; + struct task *task; + int update, na; + + md = sc->sc_md; + mdi = (struct g_raid_md_nvidia_object *)md; + meta = mdi->mdio_meta; + update = 0; + do { + /* Make sure we miss anything. */ + na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE); + if (na == mdi->mdio_total_disks) + break; + + G_RAID_DEBUG1(1, md->mdo_softc, + "Array is not complete (%d of %d), " + "trying to refill.", na, mdi->mdio_total_disks); + + /* Try to get use some of STALE disks. */ + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_state == G_RAID_DISK_S_STALE) { + update += g_raid_md_nvidia_start_disk(disk); + if (disk->d_state == G_RAID_DISK_S_ACTIVE) + break; + } + } + if (disk != NULL) + continue; + + /* Try to get use some of SPARE disks. */ + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_state == G_RAID_DISK_S_SPARE) { + update += g_raid_md_nvidia_start_disk(disk); + if (disk->d_state == G_RAID_DISK_S_ACTIVE) + break; + } + } + } while (disk != NULL); + + /* Write new metadata if we changed something. */ + if (update) { + g_raid_md_write_nvidia(md, NULL, NULL, NULL); + meta = mdi->mdio_meta; + } + + /* Update status of our need for spare. */ + mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < + mdi->mdio_total_disks); + + /* Request retaste hoping to find spare. */ + if (mdi->mdio_incomplete) { + task = malloc(sizeof(struct task), + M_MD_NVIDIA, M_WAITOK | M_ZERO); + TASK_INIT(task, 0, g_disk_md_nvidia_retaste, task); + taskqueue_enqueue(taskqueue_swi, task); + } +} + +static void +g_raid_md_nvidia_start(struct g_raid_softc *sc) +{ + struct g_raid_md_object *md; + struct g_raid_md_nvidia_object *mdi; + struct g_raid_md_nvidia_perdisk *pd; + struct nvidia_raid_conf *meta; + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; + off_t size; + int j, disk_pos; + char buf[17]; + + md = sc->sc_md; + mdi = (struct g_raid_md_nvidia_object *)md; + meta = mdi->mdio_meta; + + /* Create volumes and subdisks. 
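+ * NVIDIA metadata stores the total volume size, so the
+ * per-disk size is derived from it: divided by the disk
+ * count for RAID0, by half of it for RAID01 and by
+ * count - 1 for RAID5.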
*/ + nvidia_meta_get_name(meta, buf); + vol = g_raid_create_volume(sc, buf, -1); + vol->v_mediasize = (off_t)meta->total_sectors * 512; + vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; + if (meta->type == NVIDIA_T_RAID0) { + vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; + size = vol->v_mediasize / mdi->mdio_total_disks; + } else if (meta->type == NVIDIA_T_RAID1) { + vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; + size = vol->v_mediasize; + } else if (meta->type == NVIDIA_T_RAID01) { + vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; + size = vol->v_mediasize / (mdi->mdio_total_disks / 2); + } else if (meta->type == NVIDIA_T_CONCAT) { + if (mdi->mdio_total_disks == 1) + vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; + else + vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; + size = 0; + } else if (meta->type == NVIDIA_T_RAID5) { + vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; + size = vol->v_mediasize / (mdi->mdio_total_disks - 1); + } else if (meta->type == NVIDIA_T_RAID5_SYM) { + vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; +// vol->v_raid_level_qualifier = 0x03; + size = vol->v_mediasize / (mdi->mdio_total_disks - 1); + } else { + vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; + size = 0; + } + vol->v_strip_size = meta->strip_sectors * 512; //ZZZ + vol->v_disks_count = mdi->mdio_total_disks; + vol->v_sectorsize = 512; //ZZZ + for (j = 0; j < vol->v_disks_count; j++) { + sd = &vol->v_subdisks[j]; + sd->sd_offset = 0; + sd->sd_size = size; + } + g_raid_start_volume(vol); + + /* Create disk placeholders to store data for later writing. */ + for (disk_pos = 0; disk_pos < mdi->mdio_total_disks; disk_pos++) { + pd = malloc(sizeof(*pd), M_MD_NVIDIA, M_WAITOK | M_ZERO); + pd->pd_disk_pos = disk_pos; + disk = g_raid_create_disk(sc); + disk->d_md_data = (void *)pd; + disk->d_state = G_RAID_DISK_S_OFFLINE; + sd = &vol->v_subdisks[disk_pos]; + sd->sd_disk = disk; + TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); + } + + /* Make all disks found till the moment take their places. */ + do { + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_state == G_RAID_DISK_S_NONE) { + g_raid_md_nvidia_start_disk(disk); + break; + } + } + } while (disk != NULL); + + mdi->mdio_started = 1; + G_RAID_DEBUG1(0, sc, "Array started."); + g_raid_md_write_nvidia(md, NULL, NULL, NULL); + + /* Pickup any STALE/SPARE disks to refill array if needed. 
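+ * The refill loop retries start_disk() on STALE disks
+ * first, then on SPARE ones, until no more of them can be
+ * turned ACTIVE.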
*/ + g_raid_md_nvidia_refill(sc); + + g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); + + callout_stop(&mdi->mdio_start_co); + G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount); + root_mount_rel(mdi->mdio_rootmount); + mdi->mdio_rootmount = NULL; +} + +static void +g_raid_md_nvidia_new_disk(struct g_raid_disk *disk) +{ + struct g_raid_softc *sc; + struct g_raid_md_object *md; + struct g_raid_md_nvidia_object *mdi; + struct nvidia_raid_conf *pdmeta; + struct g_raid_md_nvidia_perdisk *pd; + + sc = disk->d_softc; + md = sc->sc_md; + mdi = (struct g_raid_md_nvidia_object *)md; + pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; + pdmeta = pd->pd_meta; + + if (mdi->mdio_started) { + if (g_raid_md_nvidia_start_disk(disk)) + g_raid_md_write_nvidia(md, NULL, NULL, NULL); + } else { + if (mdi->mdio_meta == NULL || + mdi->mdio_meta->disk_number >= mdi->mdio_meta->total_disks) { + G_RAID_DEBUG1(1, sc, "Newer disk"); + if (mdi->mdio_meta != NULL) + free(mdi->mdio_meta, M_MD_NVIDIA); + mdi->mdio_meta = nvidia_meta_copy(pdmeta); + mdi->mdio_total_disks = pdmeta->total_disks; + mdi->mdio_disks_present = 1; + } else if (pdmeta->disk_number < mdi->mdio_meta->total_disks) { + mdi->mdio_disks_present++; + G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)", + mdi->mdio_disks_present, + mdi->mdio_total_disks); + } else + G_RAID_DEBUG1(1, sc, "Spare disk"); + + /* If we collected all needed disks - start array. */ + if (mdi->mdio_disks_present == mdi->mdio_total_disks) + g_raid_md_nvidia_start(sc); + } +} + +static void +g_raid_nvidia_go(void *arg) +{ + struct g_raid_softc *sc; + struct g_raid_md_object *md; + struct g_raid_md_nvidia_object *mdi; + + sc = arg; + md = sc->sc_md; + mdi = (struct g_raid_md_nvidia_object *)md; + if (!mdi->mdio_started) { + G_RAID_DEBUG1(0, sc, "Force array start due to timeout."); + g_raid_event_send(sc, G_RAID_NODE_E_START, 0); + } +} + +static int +g_raid_md_create_nvidia(struct g_raid_md_object *md, struct g_class *mp, + struct g_geom **gp) +{ + struct g_raid_softc *sc; + struct g_raid_md_nvidia_object *mdi; + char name[32]; + + mdi = (struct g_raid_md_nvidia_object *)md; + arc4rand(&mdi->mdio_volume_id, 16, 0); + snprintf(name, sizeof(name), "NVIDIA-%d", + atomic_fetchadd_int(&NVIDIANodeID, 1)); + sc = g_raid_create_node(mp, name, md); + if (sc == NULL) + return (G_RAID_MD_TASTE_FAIL); + md->mdo_softc = sc; + *gp = sc->sc_geom; + return (G_RAID_MD_TASTE_NEW); +} + +static int +g_raid_md_taste_nvidia(struct g_raid_md_object *md, struct g_class *mp, + struct g_consumer *cp, struct g_geom **gp) +{ + struct g_consumer *rcp; + struct g_provider *pp; + struct g_raid_md_nvidia_object *mdi, *mdi1; + struct g_raid_softc *sc; + struct g_raid_disk *disk; + struct nvidia_raid_conf *meta; + struct g_raid_md_nvidia_perdisk *pd; + struct g_geom *geom; + int error, disk_pos, result, spare, len; + char name[32]; + uint16_t vendor; + + G_RAID_DEBUG(1, "Tasting NVIDIA on %s", cp->provider->name); + mdi = (struct g_raid_md_nvidia_object *)md; + pp = cp->provider; + + /* Read metadata from device. 
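+	 * The HBA vendor ID is queried as well: with aggressive spare mode
+	 * enabled, a disk without metadata is accepted as a spare only when
+	 * it sits on an NVIDIA controller (vendor 0x10de).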
*/ + meta = NULL; + spare = 0; + vendor = 0xffff; + disk_pos = 0; + if (g_access(cp, 1, 0, 0) != 0) + return (G_RAID_MD_TASTE_FAIL); + g_topology_unlock(); + len = 2; + if (pp->geom->rank == 1) + g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); + meta = nvidia_meta_read(cp); + g_topology_lock(); + g_access(cp, -1, 0, 0); + if (meta == NULL) { + if (g_raid_aggressive_spare) { + if (vendor == 0x10de) { + G_RAID_DEBUG(1, + "No NVIDIA metadata, forcing spare."); + spare = 2; + goto search; + } else { + G_RAID_DEBUG(1, + "NVIDIA vendor mismatch 0x%04x != 0x10de", + vendor); + } + } + return (G_RAID_MD_TASTE_FAIL); + } + + /* Check this disk position in obtained metadata. */ + disk_pos = meta->disk_number; + if (disk_pos == -1) { + G_RAID_DEBUG(1, "NVIDIA disk position not found"); + goto fail1; + } + + /* Metadata valid. Print it. */ + g_raid_md_nvidia_print(meta); + G_RAID_DEBUG(1, "NVIDIA disk position %d", disk_pos); + spare = 0;//(meta->type == NVIDIA_T_SPARE) ? 1 : 0; + +search: + /* Search for matching node. */ + sc = NULL; + mdi1 = NULL; + LIST_FOREACH(geom, &mp->geom, geom) { + sc = geom->softc; + if (sc == NULL) + continue; + if (sc->sc_stopping != 0) + continue; + if (sc->sc_md->mdo_class != md->mdo_class) + continue; + mdi1 = (struct g_raid_md_nvidia_object *)sc->sc_md; + if (spare) { + if (mdi1->mdio_incomplete) + break; + } else { + if (memcmp(&mdi1->mdio_volume_id, + &meta->volume_id, 16) == 0) + break; + } + } + + /* Found matching node. */ + if (geom != NULL) { + G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); + result = G_RAID_MD_TASTE_EXISTING; + + } else if (spare) { /* Not found needy node -- left for later. */ + G_RAID_DEBUG(1, "Spare is not needed at this time"); + goto fail1; + + } else { /* Not found matching node -- create one. */ + result = G_RAID_MD_TASTE_NEW; + memcpy(&mdi->mdio_volume_id, &meta->volume_id, 16); + snprintf(name, sizeof(name), "NVIDIA-%d", + atomic_fetchadd_int(&NVIDIANodeID, 1)); + sc = g_raid_create_node(mp, name, md); + md->mdo_softc = sc; + geom = sc->sc_geom; + callout_init(&mdi->mdio_start_co, 1); + callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, + g_raid_nvidia_go, sc); + mdi->mdio_rootmount = root_mount_hold("GRAID-NVIDIA"); + G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount); + } + + rcp = g_new_consumer(geom); + g_attach(rcp, pp); + if (g_access(rcp, 1, 1, 1) != 0) + ; //goto fail1; + + g_topology_unlock(); + sx_xlock(&sc->sc_lock); + + pd = malloc(sizeof(*pd), M_MD_NVIDIA, M_WAITOK | M_ZERO); + pd->pd_meta = meta; + if (spare == 2) { + pd->pd_disk_pos = -3; + } else { + pd->pd_disk_pos = -1; + } + pd->pd_disk_size = pp->mediasize; + disk = g_raid_create_disk(sc); + disk->d_md_data = (void *)pd; + disk->d_consumer = rcp; + rcp->private = disk; + + /* Read kernel dumping information. 
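+	 * The GEOM::kerneldump attribute reports whether kernel crash dumps
+	 * can be directed at this consumer.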
*/ + disk->d_kd.offset = 0; + disk->d_kd.length = OFF_MAX; + len = sizeof(disk->d_kd); + error = g_io_getattr("GEOM::kerneldump", rcp, &len, &disk->d_kd); + if (disk->d_kd.di.dumper == NULL) + G_RAID_DEBUG1(2, sc, "Dumping not supported by %s: %d.", + rcp->provider->name, error); + + g_raid_md_nvidia_new_disk(disk); + + sx_xunlock(&sc->sc_lock); + g_topology_lock(); + *gp = geom; + return (result); +fail1: + free(meta, M_MD_NVIDIA); + return (G_RAID_MD_TASTE_FAIL); +} + +static int +g_raid_md_event_nvidia(struct g_raid_md_object *md, + struct g_raid_disk *disk, u_int event) +{ + struct g_raid_softc *sc; + struct g_raid_subdisk *sd; + struct g_raid_md_nvidia_object *mdi; + struct g_raid_md_nvidia_perdisk *pd; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_nvidia_object *)md; + if (disk == NULL) { + switch (event) { + case G_RAID_NODE_E_START: + if (!mdi->mdio_started) { + /* Bump volume ID to drop missing disks. */ + arc4rand(&mdi->mdio_volume_id, 16, 0); + g_raid_md_nvidia_start(sc); + } + return (0); + } + return (-1); + } + pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; + switch (event) { + case G_RAID_DISK_E_DISCONNECTED: + /* If disk was assigned, just update statuses. */ + if (pd->pd_disk_pos >= 0) { + g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); + if (disk->d_consumer) { + g_raid_kill_consumer(sc, disk->d_consumer); + disk->d_consumer = NULL; + } + TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NONE); + g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, + G_RAID_EVENT_SUBDISK); + } + } else { + /* Otherwise -- delete. */ + g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); + g_raid_destroy_disk(disk); + } + + if (mdi->mdio_started) { + /* Bump volume ID to prevent disk resurrection. */ + if (pd->pd_disk_pos >= 0) + arc4rand(&mdi->mdio_volume_id, 16, 0); + + /* Write updated metadata to all disks. */ + g_raid_md_write_nvidia(md, NULL, NULL, NULL); + } + + /* Check if anything left except placeholders. 
*/ + if (g_raid_ndisks(sc, -1) == + g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) + g_raid_destroy_node(sc, 0); + else + g_raid_md_nvidia_refill(sc); + return (0); + } + return (-2); +} + +static int +g_raid_md_ctl_nvidia(struct g_raid_md_object *md, + struct gctl_req *req) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; + struct g_raid_md_nvidia_object *mdi; + struct g_raid_md_nvidia_perdisk *pd; + struct g_consumer *cp; + struct g_provider *pp; + char arg[16]; + const char *verb, *volname, *levelname, *diskname; + int *nargs, *force; + off_t size, sectorsize, strip; + intmax_t *sizearg, *striparg; + int numdisks, i, len, level, qual, update; + int error; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_nvidia_object *)md; + verb = gctl_get_param(req, "verb", NULL); + nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); + error = 0; + if (strcmp(verb, "label") == 0) { + + if (*nargs < 4) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + volname = gctl_get_asciiparam(req, "arg1"); + if (volname == NULL) { + gctl_error(req, "No volume name."); + return (-2); + } + levelname = gctl_get_asciiparam(req, "arg2"); + if (levelname == NULL) { + gctl_error(req, "No RAID level."); + return (-3); + } + if (g_raid_volume_str2level(levelname, &level, &qual)) { + gctl_error(req, "Unknown RAID level '%s'.", levelname); + return (-4); + } + numdisks = *nargs - 3; + force = gctl_get_paraml(req, "force", sizeof(*force)); + if (!g_raid_md_nvidia_supported(level, qual, numdisks, + force ? *force : 0)) { + gctl_error(req, "Unsupported RAID level " + "(0x%02x/0x%02x), or number of disks (%d).", + level, qual, numdisks); + return (-5); + } + + /* Search for disks, connect them and probe. */ + size = 0x7fffffffffffffffllu; + sectorsize = 0; + for (i = 0; i < numdisks; i++) { + snprintf(arg, sizeof(arg), "arg%d", i + 3); + diskname = gctl_get_asciiparam(req, arg); + if (diskname == NULL) { + gctl_error(req, "No disk name (%s).", arg); + error = -6; + break; + } + if (strcmp(diskname, "NONE") == 0) { + cp = NULL; + pp = NULL; + } else { + g_topology_lock(); + cp = g_raid_open_consumer(sc, diskname); + if (cp == NULL) { + gctl_error(req, "Can't open '%s'.", + diskname); + g_topology_unlock(); + error = -7; + break; + } + pp = cp->provider; + } + pd = malloc(sizeof(*pd), M_MD_NVIDIA, M_WAITOK | M_ZERO); + pd->pd_disk_pos = i; + disk = g_raid_create_disk(sc); + disk->d_md_data = (void *)pd; + disk->d_consumer = cp; + if (cp == NULL) + continue; + cp->private = disk; + g_topology_unlock(); + + /* Read kernel dumping information. */ + disk->d_kd.offset = 0; + disk->d_kd.length = OFF_MAX; + len = sizeof(disk->d_kd); + g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); + if (disk->d_kd.di.dumper == NULL) + G_RAID_DEBUG1(2, sc, + "Dumping not supported by %s.", + cp->provider->name); + + pd->pd_disk_size = pp->mediasize; + if (size > pp->mediasize) + size = pp->mediasize; + if (sectorsize < pp->sectorsize) + sectorsize = pp->sectorsize; + } + if (error != 0) + return (error); + + /* Reserve space for metadata. */ + size -= 2 * sectorsize; + + /* Handle size argument. */ + len = sizeof(*sizearg); + sizearg = gctl_get_param(req, "size", &len); + if (sizearg != NULL && len == sizeof(*sizearg) && + *sizearg > 0) { + if (*sizearg > size) { + gctl_error(req, "Size too big %lld > %lld.", + (long long)*sizearg, (long long)size); + return (-9); + } + size = *sizearg; + } + + /* Handle strip argument. 
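+		 * The default strip size is 131072 bytes (128KB); a
+		 * user-supplied value must be a positive multiple of the
+		 * sector size and no larger than 65535 sectors, presumably
+		 * because the on-disk strip_sectors field is 16 bits wide.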
 */
+		strip = 131072;
+		len = sizeof(*striparg);
+		striparg = gctl_get_param(req, "strip", &len);
+		if (striparg != NULL && len == sizeof(*striparg) &&
+		    *striparg > 0) {
+			if (*striparg < sectorsize) {
+				gctl_error(req, "Strip size too small.");
+				return (-10);
+			}
+			if (*striparg % sectorsize != 0) {
+				gctl_error(req, "Incorrect strip size.");
+				return (-11);
+			}
+			if (*striparg > 65535 * sectorsize) {
+				gctl_error(req, "Strip size too big.");
+				return (-12);
+			}
+			strip = *striparg;
+		}
+
+		/* Round size down to strip or sector. */
+		if (level == G_RAID_VOLUME_RL_RAID1)
+			size -= (size % sectorsize);
+		else if (level == G_RAID_VOLUME_RL_RAID1E &&
+		    (numdisks & 1) != 0)
+			size -= (size % (2 * strip));
+		else
+			size -= (size % strip);
+		if (size <= 0) {
+			gctl_error(req, "Size too small.");
+			return (-13);
+		}
+		if (size > 0xffffffffffffllu * sectorsize) {
+			gctl_error(req, "Size too big.");
+			return (-14);
+		}
+
+		/* We have all we need, create things: volume, ... */
+		mdi->mdio_total_disks = numdisks;
+		mdi->mdio_started = 1;
+		vol = g_raid_create_volume(sc, volname, -1);
+		vol->v_md_data = (void *)(intptr_t)0;
+		vol->v_raid_level = level;
+		vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE;
+		vol->v_strip_size = strip;
+		vol->v_disks_count = numdisks;
+		if (level == G_RAID_VOLUME_RL_RAID0 ||
+		    level == G_RAID_VOLUME_RL_CONCAT ||
+		    level == G_RAID_VOLUME_RL_SINGLE)
+			vol->v_mediasize = size * numdisks;
+		else if (level == G_RAID_VOLUME_RL_RAID1)
+			vol->v_mediasize = size;
+		else if (level == G_RAID_VOLUME_RL_RAID5)
+			vol->v_mediasize = size * (numdisks - 1);
+		else { /* RAID1E */
+			vol->v_mediasize = ((size * numdisks) / strip / 2) *
+			    strip;
+		}
+		vol->v_sectorsize = sectorsize;
+		g_raid_start_volume(vol);
+
+		/* , and subdisks. */
+		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
+			pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data;
+			sd = &vol->v_subdisks[pd->pd_disk_pos];
+			sd->sd_disk = disk;
+			sd->sd_offset = 0;
+			sd->sd_size = size;
+			TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
+			if (sd->sd_disk->d_consumer != NULL) {
+				g_raid_change_disk_state(disk,
+				    G_RAID_DISK_S_ACTIVE);
+				g_raid_change_subdisk_state(sd,
+				    G_RAID_SUBDISK_S_ACTIVE);
+				g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
+				    G_RAID_EVENT_SUBDISK);
+			} else {
+				g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE);
+			}
+		}
+
+		/* Write metadata based on created entities. */
+		G_RAID_DEBUG1(0, sc, "Array started.");
+		g_raid_md_write_nvidia(md, NULL, NULL, NULL);
+
+		/* Pickup any STALE/SPARE disks to refill array if needed. */
+		g_raid_md_nvidia_refill(sc);
+
+		g_raid_event_send(vol, G_RAID_VOLUME_E_START,
+		    G_RAID_EVENT_VOLUME);
+		return (0);
+	}
+	if (strcmp(verb, "delete") == 0) {
+
+		/* Check if some volume is still open. 
*/ + force = gctl_get_paraml(req, "force", sizeof(*force)); + if (force != NULL && *force == 0 && + g_raid_nopens(sc) != 0) { + gctl_error(req, "Some volume is still open."); + return (-4); + } + + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_consumer) + nvidia_meta_erase(disk->d_consumer); + } + g_raid_destroy_node(sc, 0); + return (0); + } + if (strcmp(verb, "remove") == 0 || + strcmp(verb, "fail") == 0) { + if (*nargs < 2) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + for (i = 1; i < *nargs; i++) { + snprintf(arg, sizeof(arg), "arg%d", i); + diskname = gctl_get_asciiparam(req, arg); + if (diskname == NULL) { + gctl_error(req, "No disk name (%s).", arg); + error = -2; + break; + } + if (strncmp(diskname, "/dev/", 5) == 0) + diskname += 5; + + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_consumer != NULL && + disk->d_consumer->provider != NULL && + strcmp(disk->d_consumer->provider->name, + diskname) == 0) + break; + } + if (disk == NULL) { + gctl_error(req, "Disk '%s' not found.", + diskname); + error = -3; + break; + } + + if (strcmp(verb, "fail") == 0) { + g_raid_md_fail_disk_nvidia(md, NULL, disk); + continue; + } + + pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; + + /* Erase metadata on deleting disk. */ + nvidia_meta_erase(disk->d_consumer); + + /* If disk was assigned, just update statuses. */ + if (pd->pd_disk_pos >= 0) { + g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); + if (disk->d_consumer) { + g_raid_kill_consumer(sc, disk->d_consumer); + disk->d_consumer = NULL; + } + TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NONE); + g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, + G_RAID_EVENT_SUBDISK); + } + } else { + /* Otherwise -- delete. */ + g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); + g_raid_destroy_disk(disk); + } + } + + /* Write updated metadata to remaining disks. */ + g_raid_md_write_nvidia(md, NULL, NULL, NULL); + + /* Check if anything left except placeholders. */ + if (g_raid_ndisks(sc, -1) == + g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) + g_raid_destroy_node(sc, 0); + else + g_raid_md_nvidia_refill(sc); + return (error); + } + if (strcmp(verb, "insert") == 0) { + if (*nargs < 2) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + update = 0; + for (i = 1; i < *nargs; i++) { + /* Get disk name. */ + snprintf(arg, sizeof(arg), "arg%d", i); + diskname = gctl_get_asciiparam(req, arg); + if (diskname == NULL) { + gctl_error(req, "No disk name (%s).", arg); + error = -3; + break; + } + + /* Try to find provider with specified name. */ + g_topology_lock(); + cp = g_raid_open_consumer(sc, diskname); + if (cp == NULL) { + gctl_error(req, "Can't open disk '%s'.", + diskname); + g_topology_unlock(); + error = -4; + break; + } + pp = cp->provider; + + pd = malloc(sizeof(*pd), M_MD_NVIDIA, M_WAITOK | M_ZERO); + pd->pd_disk_pos = -3; + pd->pd_disk_size = pp->mediasize; + + disk = g_raid_create_disk(sc); + disk->d_consumer = cp; + disk->d_consumer->private = disk; + disk->d_md_data = (void *)pd; + cp->private = disk; + g_topology_unlock(); + + /* Read kernel dumping information. */ + disk->d_kd.offset = 0; + disk->d_kd.length = OFF_MAX; + len = sizeof(disk->d_kd); + g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); + if (disk->d_kd.di.dumper == NULL) + G_RAID_DEBUG1(2, sc, + "Dumping not supported by %s.", + cp->provider->name); + + /* Welcome the "new" disk. 
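+			 * start_disk() reports whether the disk took a slot;
+			 * a disk that ends up neither ACTIVE nor SPARE does
+			 * not fit anywhere and is destroyed again.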
*/ + update += g_raid_md_nvidia_start_disk(disk); + if (disk->d_state != G_RAID_DISK_S_SPARE && + disk->d_state != G_RAID_DISK_S_ACTIVE) { + gctl_error(req, "Disk '%s' doesn't fit.", + diskname); + g_raid_destroy_disk(disk); + error = -8; + break; + } + } + + /* Write new metadata if we changed something. */ + if (update) + g_raid_md_write_nvidia(md, NULL, NULL, NULL); + return (error); + } + gctl_error(req, "Command '%s' is not supported.", verb); + return (-100); +} + +static int +g_raid_md_write_nvidia(struct g_raid_md_object *md, struct g_raid_volume *tvol, + struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; + struct g_raid_md_nvidia_object *mdi; + struct g_raid_md_nvidia_perdisk *pd; + struct nvidia_raid_conf *meta; + int i, spares; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_nvidia_object *)md; + + if (sc->sc_stopping == G_RAID_DESTROY_HARD) + return (0); + + /* There is only one volume. */ + vol = TAILQ_FIRST(&sc->sc_volumes); + + /* Fill global fields. */ + meta = malloc(sizeof(*meta), M_MD_NVIDIA, M_WAITOK | M_ZERO); + if (mdi->mdio_meta) + memcpy(meta, mdi->mdio_meta, sizeof(*meta)); + memcpy(meta->nvidia_id, NVIDIA_MAGIC, sizeof(NVIDIA_MAGIC)); + meta->config_size = 30; + meta->version = 0x0064; + meta->total_sectors = vol->v_mediasize / vol->v_sectorsize; + meta->sector_size = vol->v_sectorsize; + nvidia_meta_put_name(meta, vol->v_name); + meta->magic_0 = NVIDIA_MAGIC0; + memcpy(&meta->volume_id, &mdi->mdio_volume_id, 16); + meta->state = NVIDIA_S_IDLE; + if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) + meta->array_width = 1; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) + meta->array_width = vol->v_disks_count / 2; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) + meta->array_width = vol->v_disks_count - 1; + else + meta->array_width = vol->v_disks_count; + meta->total_disks = vol->v_disks_count; + meta->orig_array_width = meta->array_width; + if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) + meta->type = NVIDIA_T_RAID0; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) + meta->type = NVIDIA_T_RAID1; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) + meta->type = NVIDIA_T_RAID01; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT || + vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) + meta->type = NVIDIA_T_CONCAT; +// else if (vol->v_raid_level_qualifier == 0) +// meta->type = NVIDIA_T_RAID5; + else + meta->type = NVIDIA_T_RAID5_SYM; + meta->strip_sectors = vol->v_strip_size / vol->v_sectorsize; + meta->strip_bytes = vol->v_strip_size; + meta->strip_shift = ffs(meta->strip_sectors) - 1; + meta->strip_mask = meta->strip_sectors - 1; + meta->stripe_sectors = meta->strip_sectors * meta->orig_array_width; + meta->stripe_bytes = meta->stripe_sectors * vol->v_sectorsize; + meta->rebuild_lba = 0; + meta->orig_type = meta->type; + meta->orig_total_sectors = meta->total_sectors; + meta->status = 0; + + for (i = 0; i < vol->v_disks_count; i++) { + sd = &vol->v_subdisks[i]; + if ((sd->sd_state == G_RAID_SUBDISK_S_STALE || + sd->sd_state == G_RAID_SUBDISK_S_RESYNC || + vol->v_dirty) && + meta->state != NVIDIA_S_REBUILD) + meta->state = NVIDIA_S_SYNC; + else if (sd->sd_state == G_RAID_SUBDISK_S_NEW || + sd->sd_state == G_RAID_SUBDISK_S_REBUILD) + meta->state = NVIDIA_S_REBUILD; + } + + /* We are done. Print meta data and store them to disks. 
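+	 * Each ACTIVE or SPARE disk gets a private copy with its own
+	 * disk_number; for RAID0+1 the order is translated, and spares are
+	 * numbered after total_disks.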
*/ + if (mdi->mdio_meta != NULL) + free(mdi->mdio_meta, M_MD_NVIDIA); + mdi->mdio_meta = meta; + spares = 0; + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; + if (disk->d_state != G_RAID_DISK_S_ACTIVE && + disk->d_state != G_RAID_DISK_S_SPARE) + continue; + if (pd->pd_meta != NULL) { + free(pd->pd_meta, M_MD_NVIDIA); + pd->pd_meta = NULL; + } + pd->pd_meta = nvidia_meta_copy(meta); + if ((sd = TAILQ_FIRST(&disk->d_subdisks)) != NULL) { + /* For RAID0+1 we need to translate order. */ + pd->pd_meta->disk_number = + nvidia_meta_translate_disk(meta, sd->sd_pos); + if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE) { + pd->pd_meta->disk_status = 0x100; + pd->pd_meta->rebuild_lba = + sd->sd_rebuild_pos / vol->v_sectorsize * + meta->array_width; + } + } else + pd->pd_meta->disk_number = meta->total_disks + spares++; + G_RAID_DEBUG(1, "Writing NVIDIA metadata to %s", + g_raid_get_diskname(disk)); + g_raid_md_nvidia_print(pd->pd_meta); + nvidia_meta_write(disk->d_consumer, pd->pd_meta); + } + return (0); +} + +static int +g_raid_md_fail_disk_nvidia(struct g_raid_md_object *md, + struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) +{ + struct g_raid_softc *sc; + struct g_raid_md_nvidia_object *mdi; + struct g_raid_md_nvidia_perdisk *pd; + struct g_raid_subdisk *sd; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_nvidia_object *)md; + pd = (struct g_raid_md_nvidia_perdisk *)tdisk->d_md_data; + + /* We can't fail disk that is not a part of array now. */ + if (pd->pd_disk_pos < 0) + return (-1); + + /* Erase metadata to prevent disks's later resurrection. */ + if (tdisk->d_consumer) + nvidia_meta_erase(tdisk->d_consumer); + + /* Change states. */ + g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); + TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_FAILED); + g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, + G_RAID_EVENT_SUBDISK); + } + + /* Write updated metadata to remaining disks. */ + g_raid_md_write_nvidia(md, NULL, NULL, tdisk); + + /* Check if anything left except placeholders. */ + if (g_raid_ndisks(sc, -1) == + g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) + g_raid_destroy_node(sc, 0); + else + g_raid_md_nvidia_refill(sc); + return (0); +} + +static int +g_raid_md_free_disk_nvidia(struct g_raid_md_object *md, + struct g_raid_disk *disk) +{ + struct g_raid_md_nvidia_perdisk *pd; + + pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; + if (pd->pd_meta != NULL) { + free(pd->pd_meta, M_MD_NVIDIA); + pd->pd_meta = NULL; + } + free(pd, M_MD_NVIDIA); + disk->d_md_data = NULL; + return (0); +} + +static int +g_raid_md_free_nvidia(struct g_raid_md_object *md) +{ + struct g_raid_md_nvidia_object *mdi; + + mdi = (struct g_raid_md_nvidia_object *)md; + if (!mdi->mdio_started) { + mdi->mdio_started = 0; + callout_stop(&mdi->mdio_start_co); + G_RAID_DEBUG1(1, md->mdo_softc, + "root_mount_rel %p", mdi->mdio_rootmount); + root_mount_rel(mdi->mdio_rootmount); + mdi->mdio_rootmount = NULL; + } + if (mdi->mdio_meta != NULL) { + free(mdi->mdio_meta, M_MD_NVIDIA); + mdi->mdio_meta = NULL; + } + return (0); +} + +G_RAID_MD_DECLARE(g_raid_md_nvidia); diff --git a/sys/geom/raid/md_promise.c b/sys/geom/raid/md_promise.c new file mode 100644 index 0000000..b7bf070 --- /dev/null +++ b/sys/geom/raid/md_promise.c @@ -0,0 +1,1940 @@ +/*- + * Copyright (c) 2011 Alexander Motin + * All rights reserved. 
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include 
+__FBSDID("$FreeBSD$");
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "geom/raid/g_raid.h"
+#include "g_raid_md_if.h"
+
+static MALLOC_DEFINE(M_MD_PROMISE, "md_promise_data", "GEOM_RAID Promise metadata");
+
+#define	PROMISE_MAX_DISKS	8
+#define	PROMISE_MAX_SUBDISKS	2
+#define	PROMISE_META_OFFSET	14
+
+struct promise_raid_disk {
+	uint8_t		flags;			/* Subdisk status. */
+#define PROMISE_F_VALID		0x01
+#define PROMISE_F_ONLINE	0x02
+#define PROMISE_F_ASSIGNED	0x04
+#define PROMISE_F_SPARE		0x08
+#define PROMISE_F_DUPLICATE	0x10
+#define PROMISE_F_REDIR		0x20
+#define PROMISE_F_DOWN		0x40
+#define PROMISE_F_READY		0x80
+
+	uint8_t		number;			/* Position in a volume. */
+	uint8_t		channel;		/* ATA channel number. */
+	uint8_t		device;			/* ATA device number. */
+	uint64_t	id __packed;		/* Subdisk ID. */
+} __packed;
+
+struct promise_raid_conf {
+	char		promise_id[24];
+#define PROMISE_MAGIC		"Promise Technology, Inc."
+#define FREEBSD_MAGIC		"FreeBSD ATA driver RAID "
+
+	uint32_t	dummy_0;
+	uint64_t	magic_0;
+#define PROMISE_MAGIC0(x)	(((uint64_t)(x.channel) << 48) | \
+				((uint64_t)(x.device != 0) << 56))
+	uint16_t	magic_1;
+	uint32_t	magic_2;
+	uint8_t		filler1[470];
+
+	uint32_t	integrity;
+#define PROMISE_I_VALID		0x00000080
+
+	struct promise_raid_disk	disk;	/* This subdisk info. */
+	uint32_t	disk_offset;		/* Subdisk offset. */
+	uint32_t	disk_sectors;		/* Subdisk size. */
+	uint32_t	rebuild_lba;		/* Rebuild position. */
+	uint16_t	generation;		/* Generation number. */
+	uint8_t		status;			/* Volume status. */
+#define PROMISE_S_VALID		0x01
+#define PROMISE_S_ONLINE	0x02
+#define PROMISE_S_INITED	0x04
+#define PROMISE_S_READY		0x08
+#define PROMISE_S_DEGRADED	0x10
+#define PROMISE_S_MARKED	0x20
+#define PROMISE_S_MIGRATING	0x40
+#define PROMISE_S_FUNCTIONAL	0x80
+
+	uint8_t		type;			/* Volume type. */
+#define PROMISE_T_RAID0		0x00
+#define PROMISE_T_RAID1		0x01
+#define PROMISE_T_RAID3		0x02
+#define PROMISE_T_RAID5		0x04
+#define PROMISE_T_SPAN		0x08
+#define PROMISE_T_JBOD		0x10
+
+	uint8_t		total_disks;		/* Disks in this volume. */
+	uint8_t		stripe_shift;		/* Strip size. */
+	uint8_t		array_width;		/* Number of RAID0 stripes. 
*/ + uint8_t array_number; /* Global volume number. */ + uint32_t total_sectors; /* Volume size. */ + uint16_t cylinders; /* Volume geometry: C. */ + uint8_t heads; /* Volume geometry: H. */ + uint8_t sectors; /* Volume geometry: S. */ + uint64_t volume_id __packed; /* Volume ID, */ + struct promise_raid_disk disks[PROMISE_MAX_DISKS]; + /* Subdisks in this volume. */ + char name[32]; /* Volume label. */ + + uint32_t filler2[8]; + uint32_t magic_3; /* Something related to rebuild. */ + uint64_t rebuild_lba64; /* Per-volume rebuild position. */ + uint32_t magic_4; + uint32_t magic_5; + uint32_t filler3[325]; + uint32_t checksum; +} __packed; + +struct g_raid_md_promise_perdisk { + int pd_updated; + int pd_subdisks; + struct promise_raid_conf *pd_meta[PROMISE_MAX_SUBDISKS]; +}; + +struct g_raid_md_promise_pervolume { + struct promise_raid_conf *pv_meta; + uint64_t pv_id; + uint16_t pv_generation; + int pv_disks_present; + int pv_started; + struct callout pv_start_co; /* STARTING state timer. */ +}; + +static g_raid_md_create_t g_raid_md_create_promise; +static g_raid_md_taste_t g_raid_md_taste_promise; +static g_raid_md_event_t g_raid_md_event_promise; +static g_raid_md_volume_event_t g_raid_md_volume_event_promise; +static g_raid_md_ctl_t g_raid_md_ctl_promise; +static g_raid_md_write_t g_raid_md_write_promise; +static g_raid_md_fail_disk_t g_raid_md_fail_disk_promise; +static g_raid_md_free_disk_t g_raid_md_free_disk_promise; +static g_raid_md_free_volume_t g_raid_md_free_volume_promise; +static g_raid_md_free_t g_raid_md_free_promise; + +static kobj_method_t g_raid_md_promise_methods[] = { + KOBJMETHOD(g_raid_md_create, g_raid_md_create_promise), + KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_promise), + KOBJMETHOD(g_raid_md_event, g_raid_md_event_promise), + KOBJMETHOD(g_raid_md_volume_event, g_raid_md_volume_event_promise), + KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_promise), + KOBJMETHOD(g_raid_md_write, g_raid_md_write_promise), + KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_promise), + KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_promise), + KOBJMETHOD(g_raid_md_free_volume, g_raid_md_free_volume_promise), + KOBJMETHOD(g_raid_md_free, g_raid_md_free_promise), + { 0, 0 } +}; + +static struct g_raid_md_class g_raid_md_promise_class = { + "Promise", + g_raid_md_promise_methods, + sizeof(struct g_raid_md_object), + .mdc_priority = 100 +}; + + +static void +g_raid_md_promise_print(struct promise_raid_conf *meta) +{ + int i; + + if (g_raid_debug < 1) + return; + + printf("********* ATA Promise Metadata *********\n"); + printf("promise_id <%.24s>\n", meta->promise_id); + printf("disk %02x %02x %02x %02x %016jx\n", + meta->disk.flags, meta->disk.number, meta->disk.channel, + meta->disk.device, meta->disk.id); + printf("disk_offset %u\n", meta->disk_offset); + printf("disk_sectors %u\n", meta->disk_sectors); + printf("rebuild_lba %u\n", meta->rebuild_lba); + printf("generation %u\n", meta->generation); + printf("status 0x%02x\n", meta->status); + printf("type %u\n", meta->type); + printf("total_disks %u\n", meta->total_disks); + printf("stripe_shift %u\n", meta->stripe_shift); + printf("array_width %u\n", meta->array_width); + printf("array_number %u\n", meta->array_number); + printf("total_sectors %u\n", meta->total_sectors); + printf("cylinders %u\n", meta->cylinders); + printf("heads %u\n", meta->heads); + printf("sectors %u\n", meta->sectors); + printf("volume_id 0x%016jx\n", meta->volume_id); + printf("disks:\n"); + for (i = 0; i < PROMISE_MAX_DISKS; i++ ) { + printf(" %02x 
%02x %02x %02x %016jx\n",
+		    meta->disks[i].flags, meta->disks[i].number,
+		    meta->disks[i].channel, meta->disks[i].device,
+		    meta->disks[i].id);
+	}
+	printf("name <%.32s>\n", meta->name);
+	printf("magic_3 0x%08x\n", meta->magic_3);
+	printf("rebuild_lba64 %ju\n", meta->rebuild_lba64);
+	printf("magic_4 0x%08x\n", meta->magic_4);
+	printf("magic_5 0x%08x\n", meta->magic_5);
+	printf("=================================================\n");
+}
+
+static struct promise_raid_conf *
+promise_meta_copy(struct promise_raid_conf *meta)
+{
+	struct promise_raid_conf *nmeta;
+
+	nmeta = malloc(sizeof(*nmeta), M_MD_PROMISE, M_WAITOK);
+	memcpy(nmeta, meta, sizeof(*nmeta));
+	return (nmeta);
+}
+
+static int
+promise_meta_find_disk(struct promise_raid_conf *meta, uint64_t id)
+{
+	int pos;
+
+	for (pos = 0; pos < meta->total_disks; pos++) {
+		if (meta->disks[pos].id == id)
+			return (pos);
+	}
+	return (-1);
+}
+
+static int
+promise_meta_unused_range(struct promise_raid_conf **metaarr, int nsd,
+    uint32_t sectors, uint32_t *off, uint32_t *size)
+{
+	uint32_t coff, csize;
+	int i, j;
+
+	sectors -= 131072;
+	*off = 0;
+	*size = 0;
+	coff = 0;
+	csize = sectors;
+	i = 0;
+	while (1) {
+		for (j = 0; j < nsd; j++) {
+			if (metaarr[j]->disk_offset >= coff) {
+				csize = MIN(csize,
+				    metaarr[j]->disk_offset - coff);
+			}
+		}
+		if (csize > *size) {
+			*off = coff;
+			*size = csize;
+		}
+		if (i >= nsd)
+			break;
+		coff = metaarr[i]->disk_offset + metaarr[i]->disk_sectors;
+		csize = sectors - coff;
+		i++;
+	}
+	return ((*size > 0) ? 1 : 0);
+}
+
+static int
+promise_meta_translate_disk(struct g_raid_volume *vol, int md_disk_pos)
+{
+	int disk_pos, width;
+
+	if (md_disk_pos >= 0 && vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) {
+		width = vol->v_disks_count / 2;
+		disk_pos = (md_disk_pos / width) +
+		    (md_disk_pos % width) * width;
+	} else
+		disk_pos = md_disk_pos;
+	return (disk_pos);
+}
+
+static void
+promise_meta_get_name(struct promise_raid_conf *meta, char *buf)
+{
+	int i;
+
+	strncpy(buf, meta->name, 32);
+	buf[32] = 0;
+	for (i = 31; i >= 0; i--) {
+		if (buf[i] > 0x20)
+			break;
+		buf[i] = 0;
+	}
+}
+
+static void
+promise_meta_put_name(struct promise_raid_conf *meta, char *buf)
+{
+
+	memset(meta->name, 0x20, 32);
+	memcpy(meta->name, buf, MIN(strlen(buf), 32));
+}
+
+static int
+promise_meta_read(struct g_consumer *cp, struct promise_raid_conf **metaarr)
+{
+	struct g_provider *pp;
+	struct promise_raid_conf *meta;
+	char *buf;
+	int error, i, subdisks;
+	uint32_t checksum, *ptr;
+
+	pp = cp->provider;
+	subdisks = 0;
+next:
+	/* Read metadata block. */
+	buf = g_read_data(cp, pp->mediasize - pp->sectorsize *
+	    (63 - subdisks * PROMISE_META_OFFSET),
+	    pp->sectorsize * 4, &error);
+	if (buf == NULL) {
+		G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).",
+		    pp->name, error);
+		return (subdisks);
+	}
+	meta = (struct promise_raid_conf *)buf;
+
+	/* Check if this is a Promise RAID struct. */
+	if (strncmp(meta->promise_id, PROMISE_MAGIC, strlen(PROMISE_MAGIC)) &&
+	    strncmp(meta->promise_id, FREEBSD_MAGIC, strlen(FREEBSD_MAGIC))) {
+		if (subdisks == 0)
+			G_RAID_DEBUG(1,
+			    "Promise signature check failed on %s", pp->name);
+		g_free(buf);
+		return (subdisks);
+	}
+	meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK);
+	memcpy(meta, buf, MIN(sizeof(*meta), pp->sectorsize * 4));
+	g_free(buf);
+
+	/* Check metadata checksum. 
 */
+	for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++)
+		checksum += *ptr++;
+	if (checksum != meta->checksum) {
+		G_RAID_DEBUG(1, "Promise checksum check failed on %s", pp->name);
+		free(meta, M_MD_PROMISE);
+		return (subdisks);
+	}
+
+	if ((meta->integrity & PROMISE_I_VALID) == 0) {
+		G_RAID_DEBUG(1, "Promise metadata is invalid on %s", pp->name);
+		free(meta, M_MD_PROMISE);
+		return (subdisks);
+	}
+
+	if (meta->total_disks > PROMISE_MAX_DISKS) {
+		G_RAID_DEBUG(1, "Wrong number of disks on %s (%d)",
+		    pp->name, meta->total_disks);
+		free(meta, M_MD_PROMISE);
+		return (subdisks);
+	}
+
+	/* Save this part and look for next. */
+	*metaarr = meta;
+	metaarr++;
+	subdisks++;
+	if (subdisks < PROMISE_MAX_SUBDISKS)
+		goto next;
+
+	return (subdisks);
+}
+
+static int
+promise_meta_write(struct g_consumer *cp,
+    struct promise_raid_conf **metaarr, int nsd)
+{
+	struct g_provider *pp;
+	struct promise_raid_conf *meta;
+	char *buf;
+	int error, i, subdisk, fake;
+	uint32_t checksum, *ptr, off, size;
+
+	pp = cp->provider;
+	subdisk = 0;
+	fake = 0;
+next:
+	buf = malloc(pp->sectorsize * 4, M_MD_PROMISE, M_WAITOK | M_ZERO);
+	meta = NULL;
+	if (subdisk < nsd) {
+		meta = metaarr[subdisk];
+	} else if (!fake && promise_meta_unused_range(metaarr, nsd,
+	    cp->provider->mediasize / cp->provider->sectorsize,
+	    &off, &size)) {
+		/* Optionally add record for unused space. */
+		meta = (struct promise_raid_conf *)buf;
+		memcpy(&meta->promise_id[0], PROMISE_MAGIC, sizeof(PROMISE_MAGIC));
+		meta->dummy_0 = 0x00020000;
+		meta->integrity = PROMISE_I_VALID;
+		meta->disk.flags = PROMISE_F_ONLINE | PROMISE_F_VALID;
+		meta->disk.number = 0xff;
+		arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0);
+		meta->disk_offset = off;
+		meta->disk_sectors = size;
+		meta->rebuild_lba = UINT32_MAX;
+		fake = 1;
+	}
+	if (meta != NULL) {
+		/* Recalculate checksum in case the metadata has changed. 
*/ + meta->checksum = 0; + for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++) + checksum += *ptr++; + meta->checksum = checksum; + memcpy(buf, meta, MIN(pp->sectorsize * 4, sizeof(*meta))); + } + error = g_write_data(cp, pp->mediasize - pp->sectorsize * + (63 - subdisk * PROMISE_META_OFFSET), + buf, pp->sectorsize * 4); + if (error != 0) { + G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", + pp->name, error); + } + free(buf, M_MD_PROMISE); + + subdisk++; + if (subdisk < PROMISE_MAX_SUBDISKS) + goto next; + + return (error); +} + +static int +promise_meta_erase(struct g_consumer *cp) +{ + struct g_provider *pp; + char *buf; + int error, subdisk; + + pp = cp->provider; + buf = malloc(4 * pp->sectorsize, M_MD_PROMISE, M_WAITOK | M_ZERO); + for (subdisk = 0; subdisk < PROMISE_MAX_SUBDISKS; subdisk++) { + error = g_write_data(cp, pp->mediasize - pp->sectorsize * + (63 - subdisk * PROMISE_META_OFFSET), + buf, 4 * pp->sectorsize); + if (error != 0) { + G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", + pp->name, error); + } + } + free(buf, M_MD_PROMISE); + return (error); +} + +static int +promise_meta_write_spare(struct g_consumer *cp) +{ + struct promise_raid_conf *meta; + int error; + + meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO); + memcpy(&meta->promise_id[0], PROMISE_MAGIC, sizeof(PROMISE_MAGIC)); + meta->dummy_0 = 0x00020000; + meta->integrity = PROMISE_I_VALID; + meta->disk.flags = PROMISE_F_SPARE | PROMISE_F_ONLINE | PROMISE_F_VALID; + meta->disk.number = 0xff; + arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0); + meta->disk_sectors = cp->provider->mediasize / cp->provider->sectorsize; + meta->disk_sectors -= 131072; + meta->rebuild_lba = UINT32_MAX; + error = promise_meta_write(cp, &meta, 1); + free(meta, M_MD_PROMISE); + return (error); +} + +static struct g_raid_volume * +g_raid_md_promise_get_volume(struct g_raid_softc *sc, uint64_t id) +{ + struct g_raid_volume *vol; + struct g_raid_md_promise_pervolume *pv; + + TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { + pv = vol->v_md_data; + if (pv->pv_id == id) + break; + } + return (vol); +} + +static int +g_raid_md_promise_purge_volumes(struct g_raid_softc *sc) +{ + struct g_raid_volume *vol, *tvol; + struct g_raid_md_promise_pervolume *pv; + int i, res; + + res = 0; + TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tvol) { + pv = vol->v_md_data; + if (!pv->pv_started || vol->v_stopping) + continue; + for (i = 0; i < vol->v_disks_count; i++) { + if (vol->v_subdisks[i].sd_state != G_RAID_SUBDISK_S_NONE) + break; + } + if (i >= vol->v_disks_count) { + g_raid_destroy_volume(vol); + res = 1; + } + } + return (res); +} + +static int +g_raid_md_promise_purge_disks(struct g_raid_softc *sc) +{ + struct g_raid_disk *disk, *tdisk; + struct g_raid_volume *vol; + struct g_raid_md_promise_perdisk *pd; + int i, j, res; + + res = 0; + TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) { + if (disk->d_state == G_RAID_DISK_S_SPARE) + continue; + pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; + + /* Scan for deleted volumes. */ + for (i = 0; i < pd->pd_subdisks; ) { + vol = g_raid_md_promise_get_volume(sc, + pd->pd_meta[i]->volume_id); + if (vol != NULL && !vol->v_stopping) { + i++; + continue; + } + free(pd->pd_meta[i], M_MD_PROMISE); + for (j = i; j < pd->pd_subdisks - 1; j++) + pd->pd_meta[j] = pd->pd_meta[j + 1]; + pd->pd_meta[PROMISE_MAX_SUBDISKS - 1] = NULL; + pd->pd_subdisks--; + pd->pd_updated = 1; + } + + /* If there is no metadata left - erase and delete disk. 
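+		 * A disk whose subdisk records all referenced deleted
+		 * volumes carries no useful state any more, so its metadata
+		 * area is erased and the disk object is destroyed.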
 */
+		if (pd->pd_subdisks == 0) {
+			promise_meta_erase(disk->d_consumer);
+			g_raid_destroy_disk(disk);
+			res = 1;
+		}
+	}
+	return (res);
+}
+
+static int
+g_raid_md_promise_supported(int level, int qual, int disks, int force)
+{
+
+	if (disks > PROMISE_MAX_DISKS)
+		return (0);
+	switch (level) {
+	case G_RAID_VOLUME_RL_RAID0:
+		if (disks < 1)
+			return (0);
+		if (!force && disks < 2)
+			return (0);
+		break;
+	case G_RAID_VOLUME_RL_RAID1:
+		if (disks < 1)
+			return (0);
+		if (!force && (disks != 2))
+			return (0);
+		break;
+	case G_RAID_VOLUME_RL_RAID1E:
+		if (disks < 2)
+			return (0);
+		if (disks % 2 != 0)
+			return (0);
+		if (!force && (disks != 4))
+			return (0);
+		break;
+	case G_RAID_VOLUME_RL_SINGLE:
+		if (disks != 1)
+			return (0);
+		break;
+	case G_RAID_VOLUME_RL_CONCAT:
+		if (disks < 2)
+			return (0);
+		break;
+	case G_RAID_VOLUME_RL_RAID5:
+		if (disks < 3)
+			return (0);
+		break;
+	default:
+		return (0);
+	}
+	if (qual != G_RAID_VOLUME_RLQ_NONE)
+		return (0);
+	return (1);
+}
+
+static int
+g_raid_md_promise_start_disk(struct g_raid_disk *disk, int sdn,
+    struct g_raid_volume *vol)
+{
+	struct g_raid_softc *sc;
+	struct g_raid_subdisk *sd;
+	struct g_raid_md_promise_perdisk *pd;
+	struct g_raid_md_promise_pervolume *pv;
+	struct promise_raid_conf *meta;
+	off_t size;
+	int disk_pos, md_disk_pos, i, resurrection = 0;
+	uint32_t eoff, esize;
+
+	sc = disk->d_softc;
+	pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
+
+	pv = vol->v_md_data;
+	meta = pv->pv_meta;
+
+	if (sdn >= 0) {
+		/* Find disk position in metadata by its serial. */
+		md_disk_pos = promise_meta_find_disk(meta, pd->pd_meta[sdn]->disk.id);
+		/* For RAID0+1 we need to translate order. */
+		disk_pos = promise_meta_translate_disk(vol, md_disk_pos);
+	} else {
+		md_disk_pos = -1;
+		disk_pos = -1;
+	}
+	if (disk_pos < 0) {
+		G_RAID_DEBUG1(1, sc, "Disk %s is not part of the volume %s",
+		    g_raid_get_diskname(disk), vol->v_name);
+		/* Failed stale disk is useless for us. */
+		if (sdn >= 0 &&
+		    pd->pd_meta[sdn]->disk.flags & PROMISE_F_DOWN) {
+			g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED);
+			return (0);
+		}
+		/* If we were given specific metadata subdisk - erase it. */
+		if (sdn >= 0) {
+			free(pd->pd_meta[sdn], M_MD_PROMISE);
+			for (i = sdn; i < pd->pd_subdisks - 1; i++)
+				pd->pd_meta[i] = pd->pd_meta[i + 1];
+			pd->pd_meta[PROMISE_MAX_SUBDISKS - 1] = NULL;
+			pd->pd_subdisks--;
+		}
+		/* If we are in the start process, that's all for now. */
+		if (!pv->pv_started)
+			goto nofit;
+		/*
+		 * If we have already started - try to make use of the disk.
+		 * Try to replace OFFLINE disks first, then FAILED. 
+ */ + promise_meta_unused_range(pd->pd_meta, pd->pd_subdisks, + disk->d_consumer->provider->mediasize / + disk->d_consumer->provider->sectorsize, + &eoff, &esize); + if (esize == 0) { + G_RAID_DEBUG1(1, sc, "No free space on disk %s", + g_raid_get_diskname(disk)); + goto nofit; + } + size = INT64_MAX; + for (i = 0; i < vol->v_disks_count; i++) { + sd = &vol->v_subdisks[i]; + if (sd->sd_state != G_RAID_SUBDISK_S_NONE) + size = sd->sd_size; + if (sd->sd_state <= G_RAID_SUBDISK_S_FAILED && + (disk_pos < 0 || + vol->v_subdisks[i].sd_state < sd->sd_state)) + disk_pos = i; + } + if (disk_pos >= 0 && + vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT && + (off_t)esize * 512 < size) { + G_RAID_DEBUG1(1, sc, "Disk %s free space " + "is too small (%ju < %ju)", + g_raid_get_diskname(disk), + (off_t)esize * 512, size); + disk_pos = -1; + } + if (disk_pos >= 0) { + if (vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT) + esize = size / 512; + /* For RAID0+1 we need to translate order. */ + md_disk_pos = promise_meta_translate_disk(vol, disk_pos); + } else { +nofit: + if (pd->pd_subdisks == 0) { + g_raid_change_disk_state(disk, + G_RAID_DISK_S_SPARE); + } + return (0); + } + G_RAID_DEBUG1(1, sc, "Disk %s takes pos %d in the volume %s", + g_raid_get_diskname(disk), disk_pos, vol->v_name); + resurrection = 1; + } + + sd = &vol->v_subdisks[disk_pos]; + + if (resurrection && sd->sd_disk != NULL) { + g_raid_change_disk_state(sd->sd_disk, + G_RAID_DISK_S_STALE_FAILED); + TAILQ_REMOVE(&sd->sd_disk->d_subdisks, + sd, sd_next); + } + vol->v_subdisks[disk_pos].sd_disk = disk; + TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); + + /* Welcome the new disk. */ + if (resurrection) + g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); + else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN) + g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); + else + g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); + + if (resurrection) { + sd->sd_offset = (off_t)eoff * 512; + sd->sd_size = (off_t)esize * 512; + } else { + sd->sd_offset = (off_t)pd->pd_meta[sdn]->disk_offset * 512; + sd->sd_size = (off_t)pd->pd_meta[sdn]->disk_sectors * 512; + } + + if (resurrection) { + /* Stale disk, almost same as new. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NEW); + } else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN) { + /* Failed disk. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_FAILED); + } else if (meta->disks[md_disk_pos].flags & PROMISE_F_REDIR) { + /* Rebuilding disk. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_REBUILD); + if (pd->pd_meta[sdn]->generation != meta->generation) + sd->sd_rebuild_pos = 0; + else { + sd->sd_rebuild_pos = + (off_t)pd->pd_meta[sdn]->rebuild_lba * 512; + } + } else if (!(meta->disks[md_disk_pos].flags & PROMISE_F_ONLINE)) { + /* Rebuilding disk. */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NEW); + } else if (pd->pd_meta[sdn]->generation != meta->generation || + (meta->status & PROMISE_S_MARKED)) { + /* Stale disk or dirty volume (unclean shutdown). */ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_STALE); + } else { + /* Up to date disk. 
*/ + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_ACTIVE); + } + g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, + G_RAID_EVENT_SUBDISK); + + return (resurrection); +} + +static void +g_raid_md_promise_refill(struct g_raid_softc *sc) +{ + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; + struct g_raid_md_object *md; + struct g_raid_md_promise_perdisk *pd; + struct g_raid_md_promise_pervolume *pv; + int update, updated, i, bad; + + md = sc->sc_md; +restart: + updated = 0; + TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { + pv = vol->v_md_data; + if (!pv->pv_started || vol->v_stopping) + continue; + + /* Search for subdisk that needs replacement. */ + bad = 0; + for (i = 0; i < vol->v_disks_count; i++) { + sd = &vol->v_subdisks[i]; + if (sd->sd_state == G_RAID_SUBDISK_S_NONE || + sd->sd_state == G_RAID_SUBDISK_S_FAILED) + bad = 1; + } + if (!bad) + continue; + + G_RAID_DEBUG1(1, sc, "Volume %s is not complete, " + "trying to refill.", vol->v_name); + + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + /* Skip failed. */ + if (disk->d_state < G_RAID_DISK_S_SPARE) + continue; + /* Skip already used by this volume. */ + for (i = 0; i < vol->v_disks_count; i++) { + sd = &vol->v_subdisks[i]; + if (sd->sd_disk == disk) + break; + } + if (i < vol->v_disks_count) + continue; + + /* Try to use disk if it has empty extents. */ + pd = disk->d_md_data; + if (pd->pd_subdisks < PROMISE_MAX_SUBDISKS) { + update = + g_raid_md_promise_start_disk(disk, -1, vol); + } else + update = 0; + if (update) { + g_raid_md_write_promise(md, vol, NULL, disk); + break; + } + updated += update; + } + } + if (updated) + goto restart; +} + +static void +g_raid_md_promise_start(struct g_raid_volume *vol) +{ + struct g_raid_softc *sc; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; + struct g_raid_md_object *md; + struct g_raid_md_promise_perdisk *pd; + struct g_raid_md_promise_pervolume *pv; + struct promise_raid_conf *meta; + int i; + + sc = vol->v_softc; + md = sc->sc_md; + pv = vol->v_md_data; + meta = pv->pv_meta; + + if (meta->type == PROMISE_T_RAID0) + vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; + else if (meta->type == PROMISE_T_RAID1) { + if (meta->array_width == 1) + vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; + else + vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; + } else if (meta->type == PROMISE_T_RAID3) + vol->v_raid_level = G_RAID_VOLUME_RL_RAID3; + else if (meta->type == PROMISE_T_RAID5) + vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; + else if (meta->type == PROMISE_T_SPAN) + vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; + else if (meta->type == PROMISE_T_JBOD) + vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; + else + vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; + vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; + vol->v_strip_size = 512 << meta->stripe_shift; //ZZZ + vol->v_disks_count = meta->total_disks; + vol->v_mediasize = (off_t)meta->total_sectors * 512; //ZZZ + vol->v_sectorsize = 512; //ZZZ + for (i = 0; i < vol->v_disks_count; i++) { + sd = &vol->v_subdisks[i]; + sd->sd_offset = (off_t)meta->disk_offset * 512; //ZZZ + sd->sd_size = (off_t)meta->disk_sectors * 512; //ZZZ + } + g_raid_start_volume(vol); + + /* Make all disks found till the moment take their places. 
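+	 * A disk may carry up to PROMISE_MAX_SUBDISKS metadata records; only
+	 * the records whose volume_id matches this volume are started here.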
*/ + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + pd = disk->d_md_data; + for (i = 0; i < pd->pd_subdisks; i++) { + if (pd->pd_meta[i]->volume_id == meta->volume_id) + g_raid_md_promise_start_disk(disk, i, vol); + } + } + + pv->pv_started = 1; + callout_stop(&pv->pv_start_co); + G_RAID_DEBUG1(0, sc, "Volume started."); + g_raid_md_write_promise(md, vol, NULL, NULL); + + /* Pickup any STALE/SPARE disks to refill array if needed. */ + g_raid_md_promise_refill(sc); + + g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); +} + +static void +g_raid_promise_go(void *arg) +{ + struct g_raid_volume *vol; + struct g_raid_softc *sc; + struct g_raid_md_promise_pervolume *pv; + + vol = arg; + pv = vol->v_md_data; + sc = vol->v_softc; + if (!pv->pv_started) { + G_RAID_DEBUG1(0, sc, "Force volume start due to timeout."); + g_raid_event_send(vol, G_RAID_VOLUME_E_STARTMD, + G_RAID_EVENT_VOLUME); + } +} + +static void +g_raid_md_promise_new_disk(struct g_raid_disk *disk) +{ + struct g_raid_softc *sc; + struct g_raid_md_object *md; + struct promise_raid_conf *pdmeta; + struct g_raid_md_promise_perdisk *pd; + struct g_raid_md_promise_pervolume *pv; + struct g_raid_volume *vol; + int i; + char buf[33]; + + sc = disk->d_softc; + md = sc->sc_md; + pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; + + if (pd->pd_subdisks == 0) { + g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); + g_raid_md_promise_refill(sc); + return; + } + + for (i = 0; i < pd->pd_subdisks; i++) { + pdmeta = pd->pd_meta[i]; + + /* Look for volume with matching ID. */ + vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id); + if (vol == NULL) { + promise_meta_get_name(pdmeta, buf); + vol = g_raid_create_volume(sc, buf, pdmeta->array_number); + pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO); + pv->pv_id = pdmeta->volume_id; + vol->v_md_data = pv; + callout_init(&pv->pv_start_co, 1); + callout_reset(&pv->pv_start_co, + g_raid_start_timeout * hz, + g_raid_promise_go, vol); + } else + pv = vol->v_md_data; + + /* If we haven't started yet - check metadata freshness. */ + if (pv->pv_meta == NULL || !pv->pv_started) { + if (pv->pv_meta == NULL || + ((int16_t)(pdmeta->generation - pv->pv_generation)) > 0) { + G_RAID_DEBUG1(1, sc, "Newer disk"); + if (pv->pv_meta != NULL) + free(pv->pv_meta, M_MD_PROMISE); + pv->pv_meta = promise_meta_copy(pdmeta); + pv->pv_generation = pv->pv_meta->generation; + pv->pv_disks_present = 1; + } else if (pdmeta->generation == pv->pv_generation) { + pv->pv_disks_present++; + G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)", + pv->pv_disks_present, + pv->pv_meta->total_disks); + } else { + G_RAID_DEBUG1(1, sc, "Older disk"); + } + } + } + + for (i = 0; i < pd->pd_subdisks; i++) { + pdmeta = pd->pd_meta[i]; + + /* Look for volume with matching ID. */ + vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id); + if (vol == NULL) + continue; + pv = vol->v_md_data; + + if (pv->pv_started) { + if (g_raid_md_promise_start_disk(disk, i, vol)) + g_raid_md_write_promise(md, vol, NULL, NULL); + } else { + /* If we collected all needed disks - start array. */ + if (pv->pv_disks_present == pv->pv_meta->total_disks) + g_raid_md_promise_start(vol); + } + } +} + +static int +g_raid_md_create_promise(struct g_raid_md_object *md, struct g_class *mp, + struct g_geom **gp) +{ + struct g_geom *geom; + struct g_raid_softc *sc; + + /* Search for existing node. 
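+	 * The Promise class keeps a single GEOM node for all arrays, unlike
+	 * the per-array nodes of the other formats, so an existing node is
+	 * reused whenever possible.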
 */
+	LIST_FOREACH(geom, &mp->geom, geom) {
+		sc = geom->softc;
+		if (sc == NULL)
+			continue;
+		if (sc->sc_stopping != 0)
+			continue;
+		if (sc->sc_md->mdo_class != md->mdo_class)
+			continue;
+		break;
+	}
+	if (geom != NULL) {
+		*gp = geom;
+		return (G_RAID_MD_TASTE_EXISTING);
+	}
+
+	/* Create new one if not found. */
+	sc = g_raid_create_node(mp, "Promise", md);
+	if (sc == NULL)
+		return (G_RAID_MD_TASTE_FAIL);
+	md->mdo_softc = sc;
+	*gp = sc->sc_geom;
+	return (G_RAID_MD_TASTE_NEW);
+}
+
+static int
+g_raid_md_taste_promise(struct g_raid_md_object *md, struct g_class *mp,
+    struct g_consumer *cp, struct g_geom **gp)
+{
+	struct g_consumer *rcp;
+	struct g_provider *pp;
+	struct g_raid_softc *sc;
+	struct g_raid_disk *disk;
+	struct promise_raid_conf *meta, *metaarr[4];
+	struct g_raid_md_promise_perdisk *pd;
+	struct g_geom *geom;
+	int error, i, j, result, len, subdisks;
+	char name[16];
+	uint16_t vendor;
+
+	G_RAID_DEBUG(1, "Tasting Promise on %s", cp->provider->name);
+	pp = cp->provider;
+
+	/* Read metadata from device. */
+	meta = NULL;
+	vendor = 0xffff;
+	if (g_access(cp, 1, 0, 0) != 0)
+		return (G_RAID_MD_TASTE_FAIL);
+	g_topology_unlock();
+	len = 2;
+	if (pp->geom->rank == 1)
+		g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor);
+	subdisks = promise_meta_read(cp, metaarr);
+	g_topology_lock();
+	g_access(cp, -1, 0, 0);
+	if (subdisks == 0) {
+		if (g_raid_aggressive_spare) {
+			if (vendor == 0x105a || vendor == 0x1002) {
+				G_RAID_DEBUG(1,
+				    "No Promise metadata, forcing spare.");
+				goto search;
+			} else {
+				G_RAID_DEBUG(1,
+				    "Promise/ATI vendor mismatch "
+				    "0x%04x != 0x105a/0x1002",
+				    vendor);
+			}
+		}
+		return (G_RAID_MD_TASTE_FAIL);
+	}
+
+	/* Metadata valid. Print it. */
+	for (i = 0; i < subdisks; i++)
+		g_raid_md_promise_print(metaarr[i]);
+
+	/* Purge meaningless (empty/spare) records. */
+	for (i = 0; i < subdisks; ) {
+		if (metaarr[i]->disk.flags & PROMISE_F_ASSIGNED) {
+			i++;
+			continue;
+		}
+		free(metaarr[i], M_MD_PROMISE);
+		for (j = i; j < subdisks - 1; j++)
+			metaarr[j] = metaarr[j + 1];
+		metaarr[PROMISE_MAX_SUBDISKS - 1] = NULL;
+		subdisks--;
+	}
+
+search:
+	/* Search for matching node. */
+	sc = NULL;
+	LIST_FOREACH(geom, &mp->geom, geom) {
+		sc = geom->softc;
+		if (sc == NULL)
+			continue;
+		if (sc->sc_stopping != 0)
+			continue;
+		if (sc->sc_md->mdo_class != md->mdo_class)
+			continue;
+		break;
+	}
+
+	/* Found matching node. */
+	if (geom != NULL) {
+		G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name);
+		result = G_RAID_MD_TASTE_EXISTING;
+
+	} else { /* Not found matching node -- create one. */
+		result = G_RAID_MD_TASTE_NEW;
+		snprintf(name, sizeof(name), "Promise");
+		sc = g_raid_create_node(mp, name, md);
+		md->mdo_softc = sc;
+		geom = sc->sc_geom;
+	}
+
+	rcp = g_new_consumer(geom);
+	g_attach(rcp, pp);
+	if (g_access(rcp, 1, 1, 1) != 0)
+		; //goto fail1;
+
+	g_topology_unlock();
+	sx_xlock(&sc->sc_lock);
+
+	pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO);
+	pd->pd_subdisks = subdisks;
+	for (i = 0; i < subdisks; i++)
+		pd->pd_meta[i] = metaarr[i];
+	disk = g_raid_create_disk(sc);
+	disk->d_md_data = (void *)pd;
+	disk->d_consumer = rcp;
+	rcp->private = disk;
+
+	/* Read kernel dumping information. 
*/ + disk->d_kd.offset = 0; + disk->d_kd.length = OFF_MAX; + len = sizeof(disk->d_kd); + error = g_io_getattr("GEOM::kerneldump", rcp, &len, &disk->d_kd); + if (disk->d_kd.di.dumper == NULL) + G_RAID_DEBUG1(2, sc, "Dumping not supported by %s: %d.", + rcp->provider->name, error); + + g_raid_md_promise_new_disk(disk); + + sx_xunlock(&sc->sc_lock); + g_topology_lock(); + *gp = geom; + return (result); +} + +static int +g_raid_md_event_promise(struct g_raid_md_object *md, + struct g_raid_disk *disk, u_int event) +{ + struct g_raid_softc *sc; + struct g_raid_md_promise_perdisk *pd; + + sc = md->mdo_softc; + if (disk == NULL) + return (-1); + pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; + switch (event) { + case G_RAID_DISK_E_DISCONNECTED: + /* Delete disk. */ + g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); + g_raid_destroy_disk(disk); + g_raid_md_promise_purge_volumes(sc); + + /* Write updated metadata to all disks. */ + g_raid_md_write_promise(md, NULL, NULL, NULL); + + /* Check if anything left. */ + if (g_raid_ndisks(sc, -1) == 0) + g_raid_destroy_node(sc, 0); + else + g_raid_md_promise_refill(sc); + return (0); + } + return (-2); +} + +static int +g_raid_md_volume_event_promise(struct g_raid_md_object *md, + struct g_raid_volume *vol, u_int event) +{ + struct g_raid_softc *sc; + struct g_raid_md_promise_pervolume *pv; + + sc = md->mdo_softc; + pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data; + switch (event) { + case G_RAID_VOLUME_E_STARTMD: + if (!pv->pv_started) + g_raid_md_promise_start(vol); + return (0); + } + return (-2); +} + +static int +g_raid_md_ctl_promise(struct g_raid_md_object *md, + struct gctl_req *req) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol, *vol1; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk, *disks[PROMISE_MAX_DISKS]; + struct g_raid_md_promise_perdisk *pd; + struct g_raid_md_promise_pervolume *pv; + struct g_consumer *cp; + struct g_provider *pp; + char arg[16]; + const char *verb, *volname, *levelname, *diskname; + char *tmp; + int *nargs, *force; + off_t size, sectorsize, strip; + intmax_t *sizearg, *striparg; + uint32_t offs[PROMISE_MAX_DISKS], esize; + int numdisks, i, len, level, qual; + int error; + + sc = md->mdo_softc; + verb = gctl_get_param(req, "verb", NULL); + nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); + error = 0; + if (strcmp(verb, "label") == 0) { + + if (*nargs < 4) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + volname = gctl_get_asciiparam(req, "arg1"); + if (volname == NULL) { + gctl_error(req, "No volume name."); + return (-2); + } + levelname = gctl_get_asciiparam(req, "arg2"); + if (levelname == NULL) { + gctl_error(req, "No RAID level."); + return (-3); + } + if (g_raid_volume_str2level(levelname, &level, &qual)) { + gctl_error(req, "Unknown RAID level '%s'.", levelname); + return (-4); + } + numdisks = *nargs - 3; + force = gctl_get_paraml(req, "force", sizeof(*force)); + if (!g_raid_md_promise_supported(level, qual, numdisks, + force ? *force : 0)) { + gctl_error(req, "Unsupported RAID level " + "(0x%02x/0x%02x), or number of disks (%d).", + level, qual, numdisks); + return (-5); + } + + /* Search for disks, connect them and probe. 
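+		 * A named disk may already be attached to the node, since
+		 * Promise supports several volumes per disk; in that case a
+		 * free metadata slot and an unused extent are looked up
+		 * instead of opening a new consumer.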
*/ + size = INT64_MAX; + sectorsize = 0; + bzero(disks, sizeof(disks)); + bzero(offs, sizeof(offs)); + for (i = 0; i < numdisks; i++) { + snprintf(arg, sizeof(arg), "arg%d", i + 3); + diskname = gctl_get_asciiparam(req, arg); + if (diskname == NULL) { + gctl_error(req, "No disk name (%s).", arg); + error = -6; + break; + } + if (strcmp(diskname, "NONE") == 0) + continue; + + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_consumer != NULL && + disk->d_consumer->provider != NULL && + strcmp(disk->d_consumer->provider->name, + diskname) == 0) + break; + } + if (disk != NULL) { + if (disk->d_state != G_RAID_DISK_S_ACTIVE) { + gctl_error(req, "Disk '%s' is in a " + "wrong state (%s).", diskname, + g_raid_disk_state2str(disk->d_state)); + error = -7; + break; + } + pd = disk->d_md_data; + if (pd->pd_subdisks >= PROMISE_MAX_SUBDISKS) { + gctl_error(req, "Disk '%s' already " + "used by %d volumes.", + diskname, pd->pd_subdisks); + error = -7; + break; + } + pp = disk->d_consumer->provider; + disks[i] = disk; + promise_meta_unused_range(pd->pd_meta, + pd->pd_subdisks, + pp->mediasize / pp->sectorsize, + &offs[i], &esize); + size = MIN(size, (off_t)esize * pp->sectorsize); + sectorsize = MAX(sectorsize, pp->sectorsize); + continue; + } + + g_topology_lock(); + cp = g_raid_open_consumer(sc, diskname); + if (cp == NULL) { + gctl_error(req, "Can't open disk '%s'.", + diskname); + g_topology_unlock(); + error = -8; + break; + } + pp = cp->provider; + pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO); + disk = g_raid_create_disk(sc); + disk->d_md_data = (void *)pd; + disk->d_consumer = cp; + disks[i] = disk; + cp->private = disk; + g_topology_unlock(); + + /* Read kernel dumping information. */ + disk->d_kd.offset = 0; + disk->d_kd.length = OFF_MAX; + len = sizeof(disk->d_kd); + g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); + if (disk->d_kd.di.dumper == NULL) + G_RAID_DEBUG1(2, sc, + "Dumping not supported by %s.", + cp->provider->name); + + /* Reserve some space for metadata. */ + size = MIN(size, pp->mediasize - 131072llu * pp->sectorsize); + sectorsize = MAX(sectorsize, pp->sectorsize); + } + if (error != 0) { + for (i = 0; i < numdisks; i++) { + if (disks[i] != NULL && + disks[i]->d_state == G_RAID_DISK_S_NONE) + g_raid_destroy_disk(disks[i]); + } + return (error); + } + + /* Handle size argument. */ + len = sizeof(*sizearg); + sizearg = gctl_get_param(req, "size", &len); + if (sizearg != NULL && len == sizeof(*sizearg) && + *sizearg > 0) { + if (*sizearg > size) { + gctl_error(req, "Size too big %lld > %lld.", + (long long)*sizearg, (long long)size); + return (-9); + } + size = *sizearg; + } + + /* Handle strip argument. */ + strip = 131072; + len = sizeof(*striparg); + striparg = gctl_get_param(req, "strip", &len); + if (striparg != NULL && len == sizeof(*striparg) && + *striparg > 0) { + if (*striparg < sectorsize) { + gctl_error(req, "Strip size too small."); + return (-10); + } + if (*striparg % sectorsize != 0) { + gctl_error(req, "Incorrect strip size."); + return (-11); + } + strip = *striparg; + } + + /* Round size down to strip or sector. 
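+		 * Mirrored, single and concatenated volumes only need sector
+		 * granularity; striped levels must be a multiple of the strip
+		 * (two strips for RAID1E with an odd disk count).  E.g. with a
+		 * 128KB strip, a 1000000KB subdisk is trimmed to 999936KB.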
*/ + if (level == G_RAID_VOLUME_RL_RAID1 || + level == G_RAID_VOLUME_RL_SINGLE || + level == G_RAID_VOLUME_RL_CONCAT) + size -= (size % sectorsize); + else if (level == G_RAID_VOLUME_RL_RAID1E && + (numdisks & 1) != 0) + size -= (size % (2 * strip)); + else + size -= (size % strip); + if (size <= 0) { + gctl_error(req, "Size too small."); + return (-13); + } + if (size > 0xffffffffllu * sectorsize) { + gctl_error(req, "Size too big."); + return (-14); + } + + /* We have all we need, create things: volume, ... */ + pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO); + arc4rand(&pv->pv_id, sizeof(pv->pv_id), 0); + pv->pv_generation = 0; + pv->pv_started = 1; + vol = g_raid_create_volume(sc, volname, -1); + vol->v_md_data = pv; + vol->v_raid_level = level; + vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; + vol->v_strip_size = strip; + vol->v_disks_count = numdisks; + if (level == G_RAID_VOLUME_RL_RAID0 || + level == G_RAID_VOLUME_RL_CONCAT || + level == G_RAID_VOLUME_RL_SINGLE) + vol->v_mediasize = size * numdisks; + else if (level == G_RAID_VOLUME_RL_RAID1) + vol->v_mediasize = size; + else if (level == G_RAID_VOLUME_RL_RAID3 || + level == G_RAID_VOLUME_RL_RAID5) + vol->v_mediasize = size * (numdisks - 1); + else { /* RAID1E */ + vol->v_mediasize = ((size * numdisks) / strip / 2) * + strip; + } + vol->v_sectorsize = sectorsize; + g_raid_start_volume(vol); + + /* , and subdisks. */ + for (i = 0; i < numdisks; i++) { + disk = disks[i]; + sd = &vol->v_subdisks[i]; + sd->sd_disk = disk; + sd->sd_offset = (off_t)offs[i] * 512; + sd->sd_size = size; + if (disk == NULL) + continue; + TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); + g_raid_change_disk_state(disk, + G_RAID_DISK_S_ACTIVE); + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_ACTIVE); + g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, + G_RAID_EVENT_SUBDISK); + } + + /* Write metadata based on created entities. */ + G_RAID_DEBUG1(0, sc, "Array started."); + g_raid_md_write_promise(md, vol, NULL, NULL); + + /* Pickup any STALE/SPARE disks to refill array if needed. */ + g_raid_md_promise_refill(sc); + + g_raid_event_send(vol, G_RAID_VOLUME_E_START, + G_RAID_EVENT_VOLUME); + return (0); + } + if (strcmp(verb, "add") == 0) { + + gctl_error(req, "`add` command is not applicable, " + "use `label` instead."); + return (-99); + } + if (strcmp(verb, "delete") == 0) { + + /* Full node destruction. */ + if (*nargs == 1) { + /* Check if some volume is still open. */ + force = gctl_get_paraml(req, "force", sizeof(*force)); + if (force != NULL && *force == 0 && + g_raid_nopens(sc) != 0) { + gctl_error(req, "Some volume is still open."); + return (-4); + } + + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_consumer) + promise_meta_erase(disk->d_consumer); + } + g_raid_destroy_node(sc, 0); + return (0); + } + + /* Destroy specified volume. If it was last - all node. */ + if (*nargs != 2) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + volname = gctl_get_asciiparam(req, "arg1"); + if (volname == NULL) { + gctl_error(req, "No volume name."); + return (-2); + } + + /* Search for volume. 
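+		 * The argument may be either the volume name or its numeric
+		 * global ID, so when no name matches, retry the lookup
+		 * parsing the argument as a number.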
*/
+		TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
+			if (strcmp(vol->v_name, volname) == 0)
+				break;
+		}
+		if (vol == NULL) {
+			i = strtol(volname, &tmp, 10);
+			if (tmp != volname && tmp[0] == 0) {
+				TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
+					if (vol->v_global_id == i)
+						break;
+				}
+			}
+		}
+		if (vol == NULL) {
+			gctl_error(req, "Volume '%s' not found.", volname);
+			return (-3);
+		}
+
+		/* Check if volume is still open. */
+		force = gctl_get_paraml(req, "force", sizeof(*force));
+		if (force != NULL && *force == 0 &&
+		    vol->v_provider_open != 0) {
+			gctl_error(req, "Volume is still open.");
+			return (-4);
+		}
+
+		/* Destroy volume and potentially node. */
+		i = 0;
+		TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next)
+			i++;
+		if (i >= 2) {
+			g_raid_destroy_volume(vol);
+			g_raid_md_promise_purge_disks(sc);
+			g_raid_md_write_promise(md, NULL, NULL, NULL);
+		} else {
+			TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
+				if (disk->d_consumer)
+					promise_meta_erase(disk->d_consumer);
+			}
+			g_raid_destroy_node(sc, 0);
+		}
+		return (0);
+	}
+	if (strcmp(verb, "remove") == 0 ||
+	    strcmp(verb, "fail") == 0) {
+		if (*nargs < 2) {
+			gctl_error(req, "Invalid number of arguments.");
+			return (-1);
+		}
+		for (i = 1; i < *nargs; i++) {
+			snprintf(arg, sizeof(arg), "arg%d", i);
+			diskname = gctl_get_asciiparam(req, arg);
+			if (diskname == NULL) {
+				gctl_error(req, "No disk name (%s).", arg);
+				error = -2;
+				break;
+			}
+			if (strncmp(diskname, "/dev/", 5) == 0)
+				diskname += 5;
+
+			TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
+				if (disk->d_consumer != NULL &&
+				    disk->d_consumer->provider != NULL &&
+				    strcmp(disk->d_consumer->provider->name,
+				     diskname) == 0)
+					break;
+			}
+			if (disk == NULL) {
+				gctl_error(req, "Disk '%s' not found.",
+				    diskname);
+				error = -3;
+				break;
+			}
+
+			if (strcmp(verb, "fail") == 0) {
+				g_raid_md_fail_disk_promise(md, NULL, disk);
+				continue;
+			}
+
+			pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
+
+			/* Erase metadata on deleting disk and destroy it. */
+			promise_meta_erase(disk->d_consumer);
+			g_raid_destroy_disk(disk);
+		}
+		g_raid_md_promise_purge_volumes(sc);
+
+		/* Write updated metadata to remaining disks. */
+		g_raid_md_write_promise(md, NULL, NULL, NULL);
+
+		/* Check if anything left. */
+		if (g_raid_ndisks(sc, -1) == 0)
+			g_raid_destroy_node(sc, 0);
+		else
+			g_raid_md_promise_refill(sc);
+		return (error);
+	}
+	if (strcmp(verb, "insert") == 0) {
+		if (*nargs < 2) {
+			gctl_error(req, "Invalid number of arguments.");
+			return (-1);
+		}
+		for (i = 1; i < *nargs; i++) {
+			/* Get disk name. */
+			snprintf(arg, sizeof(arg), "arg%d", i);
+			diskname = gctl_get_asciiparam(req, arg);
+			if (diskname == NULL) {
+				gctl_error(req, "No disk name (%s).", arg);
+				error = -3;
+				break;
+			}
+
+			/* Try to find provider with specified name. */
+			g_topology_lock();
+			cp = g_raid_open_consumer(sc, diskname);
+			if (cp == NULL) {
+				gctl_error(req, "Can't open disk '%s'.",
+				    diskname);
+				g_topology_unlock();
+				error = -4;
+				break;
+			}
+			pp = cp->provider;
+			g_topology_unlock();
+
+			pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO);
+
+			disk = g_raid_create_disk(sc);
+			disk->d_consumer = cp;
+			disk->d_consumer->private = disk;
+			disk->d_md_data = (void *)pd;
+			cp->private = disk;
+
+			/* Read kernel dumping information. 
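+			 * Probe the new consumer for dump support the same
+			 * way taste does, before handing the disk over to
+			 * the node as a spare.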
*/ + disk->d_kd.offset = 0; + disk->d_kd.length = OFF_MAX; + len = sizeof(disk->d_kd); + g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); + if (disk->d_kd.di.dumper == NULL) + G_RAID_DEBUG1(2, sc, + "Dumping not supported by %s.", + cp->provider->name); + + /* Welcome the "new" disk. */ + g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); + promise_meta_write_spare(cp); + g_raid_md_promise_refill(sc); + } + return (error); + } + return (-100); +} + +static int +g_raid_md_write_promise(struct g_raid_md_object *md, struct g_raid_volume *tvol, + struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; + struct g_raid_md_promise_perdisk *pd; + struct g_raid_md_promise_pervolume *pv; + struct promise_raid_conf *meta; + off_t rebuild_lba64; + int i, j, pos, rebuild; + + sc = md->mdo_softc; + + if (sc->sc_stopping == G_RAID_DESTROY_HARD) + return (0); + + /* Generate new per-volume metadata for affected volumes. */ + TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { + if (vol->v_stopping) + continue; + + /* Skip volumes not related to specified targets. */ + if (tvol != NULL && vol != tvol) + continue; + if (tsd != NULL && vol != tsd->sd_volume) + continue; + if (tdisk != NULL) { + for (i = 0; i < vol->v_disks_count; i++) { + if (vol->v_subdisks[i].sd_disk == tdisk) + break; + } + if (i >= vol->v_disks_count) + continue; + } + + pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data; + pv->pv_generation++; + + meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO); + if (pv->pv_meta != NULL) + memcpy(meta, pv->pv_meta, sizeof(*meta)); + memcpy(meta->promise_id, PROMISE_MAGIC, sizeof(PROMISE_MAGIC)); + meta->dummy_0 = 0x00020000; + meta->integrity = PROMISE_I_VALID; + + meta->generation = pv->pv_generation; + meta->status = PROMISE_S_VALID | PROMISE_S_ONLINE | + PROMISE_S_INITED | PROMISE_S_READY; + if (vol->v_state <= G_RAID_VOLUME_S_DEGRADED) + meta->status |= PROMISE_S_DEGRADED; + if (vol->v_dirty) + meta->status |= PROMISE_S_MARKED; /* XXX: INVENTED! */ + if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0 || + vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) + meta->type = PROMISE_T_RAID0; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || + vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) + meta->type = PROMISE_T_RAID1; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3) + meta->type = PROMISE_T_RAID3; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) + meta->type = PROMISE_T_RAID5; + else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT) + meta->type = PROMISE_T_SPAN; + else + meta->type = PROMISE_T_JBOD; + meta->total_disks = vol->v_disks_count; + meta->stripe_shift = ffs(vol->v_strip_size / 1024); + meta->array_width = vol->v_disks_count; + if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || + vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) + meta->array_width /= 2; + meta->array_number = vol->v_global_id; + meta->total_sectors = vol->v_mediasize / vol->v_sectorsize; + meta->cylinders = meta->total_sectors / (255 * 63) - 1; + meta->heads = 254; + meta->sectors = 63; + meta->volume_id = pv->pv_id; + rebuild_lba64 = UINT64_MAX; + rebuild = 0; + for (i = 0; i < vol->v_disks_count; i++) { + sd = &vol->v_subdisks[i]; + /* For RAID0+1 we need to translate order. 
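+			 * The on-disk disk numbering differs from g_raid's
+			 * plain subdisk order for RAID0+1 layouts, so the
+			 * index is remapped before meta->disks[] is touched.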
*/ + pos = promise_meta_translate_disk(vol, i); + meta->disks[pos].flags = PROMISE_F_VALID | + PROMISE_F_ASSIGNED; + if (sd->sd_state == G_RAID_SUBDISK_S_NONE) { + meta->disks[pos].flags |= 0; + } else if (sd->sd_state == G_RAID_SUBDISK_S_FAILED) { + meta->disks[pos].flags |= + PROMISE_F_DOWN | PROMISE_F_REDIR; + } else if (sd->sd_state <= G_RAID_SUBDISK_S_REBUILD) { + meta->disks[pos].flags |= + PROMISE_F_ONLINE | PROMISE_F_REDIR; + if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) { + rebuild_lba64 = MIN(rebuild_lba64, + sd->sd_rebuild_pos / 512); + } else + rebuild_lba64 = 0; + rebuild = 1; + } else { + meta->disks[pos].flags |= PROMISE_F_ONLINE; + if (sd->sd_state < G_RAID_SUBDISK_S_ACTIVE) { + meta->status |= PROMISE_S_MARKED; + if (sd->sd_state == G_RAID_SUBDISK_S_RESYNC) { + rebuild_lba64 = MIN(rebuild_lba64, + sd->sd_rebuild_pos / 512); + } else + rebuild_lba64 = 0; + } + } + if (pv->pv_meta != NULL) { + meta->disks[pos].id = pv->pv_meta->disks[pos].id; + } else { + meta->disks[pos].number = i * 2; + arc4rand(&meta->disks[pos].id, + sizeof(meta->disks[pos].id), 0); + } + } + promise_meta_put_name(meta, vol->v_name); + + /* Try to mimic AMD BIOS rebuild/resync behavior. */ + if (rebuild_lba64 != UINT64_MAX) { + if (rebuild) + meta->magic_3 = 0x03040010UL; /* Rebuild? */ + else + meta->magic_3 = 0x03040008UL; /* Resync? */ + /* Translate from per-disk to per-volume LBA. */ + if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || + vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { + rebuild_lba64 *= meta->array_width; + } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 || + vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) { + rebuild_lba64 *= meta->array_width - 1; + } else + rebuild_lba64 = 0; + } else + meta->magic_3 = 0x03000000UL; + meta->rebuild_lba64 = rebuild_lba64; + meta->magic_4 = 0x04010101UL; + + /* Replace per-volume metadata with new. */ + if (pv->pv_meta != NULL) + free(pv->pv_meta, M_MD_PROMISE); + pv->pv_meta = meta; + + /* Copy new metadata to the disks, adding or replacing old. */ + for (i = 0; i < vol->v_disks_count; i++) { + sd = &vol->v_subdisks[i]; + disk = sd->sd_disk; + if (disk == NULL) + continue; + /* For RAID0+1 we need to translate order. 
*/
+			pos = promise_meta_translate_disk(vol, i);
+			pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
+			for (j = 0; j < pd->pd_subdisks; j++) {
+				if (pd->pd_meta[j]->volume_id == meta->volume_id)
+					break;
+			}
+			if (j == pd->pd_subdisks)
+				pd->pd_subdisks++;
+			if (pd->pd_meta[j] != NULL)
+				free(pd->pd_meta[j], M_MD_PROMISE);
+			pd->pd_meta[j] = promise_meta_copy(meta);
+			pd->pd_meta[j]->disk = meta->disks[pos];
+			pd->pd_meta[j]->disk.number = pos;
+			pd->pd_meta[j]->disk_offset = sd->sd_offset / 512;
+			pd->pd_meta[j]->disk_sectors = sd->sd_size / 512;
+			if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) {
+				pd->pd_meta[j]->rebuild_lba =
+				    sd->sd_rebuild_pos / 512;
+			} else if (sd->sd_state < G_RAID_SUBDISK_S_REBUILD)
+				pd->pd_meta[j]->rebuild_lba = 0;
+			else
+				pd->pd_meta[j]->rebuild_lba = UINT32_MAX;
+			pd->pd_updated = 1;
+		}
+	}
+
+	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
+		pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data;
+		if (disk->d_state != G_RAID_DISK_S_ACTIVE)
+			continue;
+		if (!pd->pd_updated)
+			continue;
+		G_RAID_DEBUG(1, "Writing Promise metadata to %s",
+		    g_raid_get_diskname(disk));
+		for (i = 0; i < pd->pd_subdisks; i++)
+			g_raid_md_promise_print(pd->pd_meta[i]);
+		promise_meta_write(disk->d_consumer,
+		    pd->pd_meta, pd->pd_subdisks);
+		pd->pd_updated = 0;
+	}
+
+	return (0);
+}
+
+static int
+g_raid_md_fail_disk_promise(struct g_raid_md_object *md,
+    struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
+{
+	struct g_raid_softc *sc;
+	struct g_raid_md_promise_perdisk *pd;
+	struct g_raid_subdisk *sd;
+	int i, pos;
+
+	sc = md->mdo_softc;
+	pd = (struct g_raid_md_promise_perdisk *)tdisk->d_md_data;
+
+	/* We can't fail disk that is not a part of array now. */
+	if (tdisk->d_state != G_RAID_DISK_S_ACTIVE)
+		return (-1);
+
+	/*
+	 * Mark disk as failed in metadata and try to write that metadata
+	 * to the disk itself to prevent its later resurrection as STALE.
+	 */
+	if (pd->pd_subdisks > 0 && tdisk->d_consumer != NULL)
+		G_RAID_DEBUG(1, "Writing Promise metadata to %s",
+		    g_raid_get_diskname(tdisk));
+	for (i = 0; i < pd->pd_subdisks; i++) {
+		pd->pd_meta[i]->disk.flags |=
+		    PROMISE_F_DOWN | PROMISE_F_REDIR;
+		pos = pd->pd_meta[i]->disk.number;
+		if (pos >= 0 && pos < PROMISE_MAX_DISKS) {
+			pd->pd_meta[i]->disks[pos].flags |=
+			    PROMISE_F_DOWN | PROMISE_F_REDIR;
+		}
+		g_raid_md_promise_print(pd->pd_meta[i]);
+	}
+	if (tdisk->d_consumer != NULL)
+		promise_meta_write(tdisk->d_consumer,
+		    pd->pd_meta, pd->pd_subdisks);
+
+	/* Change states. */
+	g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED);
+	TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) {
+		g_raid_change_subdisk_state(sd,
+		    G_RAID_SUBDISK_S_FAILED);
+		g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED,
+		    G_RAID_EVENT_SUBDISK);
+	}
+
+	/* Write updated metadata to remaining disks. 
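+	 * The failed disk already received its last-gasp copy above and is
+	 * no longer ACTIVE, so the write below only touches its siblings.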
*/ + g_raid_md_write_promise(md, NULL, NULL, tdisk); + + g_raid_md_promise_refill(sc); + return (0); +} + +static int +g_raid_md_free_disk_promise(struct g_raid_md_object *md, + struct g_raid_disk *disk) +{ + struct g_raid_md_promise_perdisk *pd; + int i; + + pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; + for (i = 0; i < pd->pd_subdisks; i++) { + if (pd->pd_meta[i] != NULL) { + free(pd->pd_meta[i], M_MD_PROMISE); + pd->pd_meta[i] = NULL; + } + } + free(pd, M_MD_PROMISE); + disk->d_md_data = NULL; + return (0); +} + +static int +g_raid_md_free_volume_promise(struct g_raid_md_object *md, + struct g_raid_volume *vol) +{ + struct g_raid_md_promise_pervolume *pv; + + pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data; + if (pv && pv->pv_meta != NULL) { + free(pv->pv_meta, M_MD_PROMISE); + pv->pv_meta = NULL; + } + if (pv && !pv->pv_started) { + pv->pv_started = 1; + callout_stop(&pv->pv_start_co); + } + return (0); +} + +static int +g_raid_md_free_promise(struct g_raid_md_object *md) +{ + + return (0); +} + +G_RAID_MD_DECLARE(g_raid_md_promise); diff --git a/sys/geom/raid/md_sii.c b/sys/geom/raid/md_sii.c new file mode 100644 index 0000000..305accd --- /dev/null +++ b/sys/geom/raid/md_sii.c @@ -0,0 +1,1692 @@ +/*- + * Copyright (c) 2011 Alexander Motin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "geom/raid/g_raid.h" +#include "g_raid_md_if.h" + +static MALLOC_DEFINE(M_MD_SII, "md_sii_data", "GEOM_RAID SiI metadata"); + +struct sii_raid_conf { + uint16_t ata_params_00_53[54]; + uint64_t total_sectors; /* 54 - 57 */ + uint16_t ata_params_58_81[72]; + uint16_t product_id; /* 130 */ + uint16_t vendor_id; /* 131 */ + uint16_t version_minor; /* 132 */ + uint16_t version_major; /* 133 */ + uint8_t timestamp[6]; /* 134 - 136 */ + uint16_t strip_sectors; /* 137 */ + uint16_t dummy_2; + uint8_t disk_number; /* 139 */ + uint8_t type; +#define SII_T_RAID0 0x00 +#define SII_T_RAID1 0x01 +#define SII_T_RAID01 0x02 +#define SII_T_SPARE 0x03 +#define SII_T_CONCAT 0x04 +#define SII_T_RAID5 0x10 +#define SII_T_RESERVED 0xfd +#define SII_T_JBOD 0xff + + uint8_t raid0_disks; /* 140 */ + uint8_t raid0_ident; + uint8_t raid1_disks; /* 141 */ + uint8_t raid1_ident; + uint64_t rebuild_lba; /* 142 - 145 */ + uint32_t generation; /* 146 - 147 */ + uint8_t disk_status; /* 148 */ +#define SII_S_CURRENT 0x01 +#define SII_S_REBUILD 0x02 +#define SII_S_DROPPED 0x03 +#define SII_S_REMOVED 0x04 + + uint8_t raid_status; +#define SII_S_ONLINE 0x01 +#define SII_S_AVAILABLE 0x02 + + uint8_t raid_location; /* 149 */ + uint8_t disk_location; + uint8_t auto_rebuild; /* 150 */ +#define SII_R_REBUILD 0x00 +#define SII_R_NOREBUILD 0xff + + uint8_t dummy_3; + uint8_t name[16]; /* 151 - 158 */ + uint16_t checksum; /* 159 */ + uint16_t ata_params_160_255[96]; +} __packed; + +struct g_raid_md_sii_perdisk { + struct sii_raid_conf *pd_meta; + int pd_disk_pos; + off_t pd_disk_size; +}; + +struct g_raid_md_sii_object { + struct g_raid_md_object mdio_base; + uint8_t mdio_timestamp[6]; + uint8_t mdio_location; + uint32_t mdio_generation; + struct sii_raid_conf *mdio_meta; + struct callout mdio_start_co; /* STARTING state timer. */ + int mdio_total_disks; + int mdio_disks_present; + int mdio_started; + int mdio_incomplete; + struct root_hold_token *mdio_rootmount; /* Root mount delay token. 
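+	 * Held while the array is assembling so that root-on-RAID can
+	 * wait for all components; released once the volume starts or
+	 * the start timer forces a start.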
*/ +}; + +static g_raid_md_create_t g_raid_md_create_sii; +static g_raid_md_taste_t g_raid_md_taste_sii; +static g_raid_md_event_t g_raid_md_event_sii; +static g_raid_md_ctl_t g_raid_md_ctl_sii; +static g_raid_md_write_t g_raid_md_write_sii; +static g_raid_md_fail_disk_t g_raid_md_fail_disk_sii; +static g_raid_md_free_disk_t g_raid_md_free_disk_sii; +static g_raid_md_free_t g_raid_md_free_sii; + +static kobj_method_t g_raid_md_sii_methods[] = { + KOBJMETHOD(g_raid_md_create, g_raid_md_create_sii), + KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_sii), + KOBJMETHOD(g_raid_md_event, g_raid_md_event_sii), + KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_sii), + KOBJMETHOD(g_raid_md_write, g_raid_md_write_sii), + KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_sii), + KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_sii), + KOBJMETHOD(g_raid_md_free, g_raid_md_free_sii), + { 0, 0 } +}; + +static struct g_raid_md_class g_raid_md_sii_class = { + "SiI", + g_raid_md_sii_methods, + sizeof(struct g_raid_md_sii_object), + .mdc_priority = 100 +}; + +static void +g_raid_md_sii_print(struct sii_raid_conf *meta) +{ + + if (g_raid_debug < 1) + return; + + printf("********* ATA SiI RAID Metadata *********\n"); + printf("total_sectors %llu\n", + (long long unsigned)meta->total_sectors); + printf("product_id 0x%04x\n", meta->product_id); + printf("vendor_id 0x%04x\n", meta->vendor_id); + printf("version_minor 0x%04x\n", meta->version_minor); + printf("version_major 0x%04x\n", meta->version_major); + printf("timestamp 0x%02x%02x%02x%02x%02x%02x\n", + meta->timestamp[5], meta->timestamp[4], meta->timestamp[3], + meta->timestamp[2], meta->timestamp[1], meta->timestamp[0]); + printf("strip_sectors %d\n", meta->strip_sectors); + printf("disk_number %d\n", meta->disk_number); + printf("type 0x%02x\n", meta->type); + printf("raid0_disks %d\n", meta->raid0_disks); + printf("raid0_ident %d\n", meta->raid0_ident); + printf("raid1_disks %d\n", meta->raid1_disks); + printf("raid1_ident %d\n", meta->raid1_ident); + printf("rebuild_lba %llu\n", + (long long unsigned)meta->rebuild_lba); + printf("generation %d\n", meta->generation); + printf("disk_status %d\n", meta->disk_status); + printf("raid_status %d\n", meta->raid_status); + printf("raid_location %d\n", meta->raid_location); + printf("disk_location %d\n", meta->disk_location); + printf("auto_rebuild %d\n", meta->auto_rebuild); + printf("name <%.16s>\n", meta->name); + printf("checksum 0x%04x\n", meta->checksum); + printf("=================================================\n"); +} + +static struct sii_raid_conf * +sii_meta_copy(struct sii_raid_conf *meta) +{ + struct sii_raid_conf *nmeta; + + nmeta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK); + memcpy(nmeta, meta, sizeof(*meta)); + return (nmeta); +} + +static int +sii_meta_total_disks(struct sii_raid_conf *meta) +{ + + switch (meta->type) { + case SII_T_RAID0: + case SII_T_RAID5: + case SII_T_CONCAT: + return (meta->raid0_disks); + case SII_T_RAID1: + return (meta->raid1_disks); + case SII_T_RAID01: + return (meta->raid0_disks * meta->raid1_disks); + case SII_T_SPARE: + case SII_T_JBOD: + return (1); + } + return (0); +} + +static int +sii_meta_disk_pos(struct sii_raid_conf *meta, struct sii_raid_conf *pdmeta) +{ + + if (pdmeta->type == SII_T_SPARE) + return (-3); + + if (memcmp(&meta->timestamp, &pdmeta->timestamp, 6) != 0) + return (-1); + + switch (pdmeta->type) { + case SII_T_RAID0: + case SII_T_RAID1: + case SII_T_RAID5: + case SII_T_CONCAT: + return (pdmeta->disk_number); + case SII_T_RAID01: + return 
(pdmeta->raid1_ident * pdmeta->raid1_disks + + pdmeta->raid0_ident); + case SII_T_JBOD: + return (0); + } + return (-1); +} + +static void +sii_meta_get_name(struct sii_raid_conf *meta, char *buf) +{ + int i; + + strncpy(buf, meta->name, 16); + buf[16] = 0; + for (i = 15; i >= 0; i--) { + if (buf[i] > 0x20) + break; + buf[i] = 0; + } +} + +static void +sii_meta_put_name(struct sii_raid_conf *meta, char *buf) +{ + + memset(meta->name, 0x20, 16); + memcpy(meta->name, buf, MIN(strlen(buf), 16)); +} + +static struct sii_raid_conf * +sii_meta_read(struct g_consumer *cp) +{ + struct g_provider *pp; + struct sii_raid_conf *meta; + char *buf; + int error, i; + uint16_t checksum, *ptr; + + pp = cp->provider; + + /* Read the anchor sector. */ + buf = g_read_data(cp, + pp->mediasize - pp->sectorsize, pp->sectorsize, &error); + if (buf == NULL) { + G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", + pp->name, error); + return (NULL); + } + meta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK); + memcpy(meta, buf, min(sizeof(*meta), pp->sectorsize)); + g_free(buf); + + /* Check vendor ID. */ + if (meta->vendor_id != 0x1095) { + G_RAID_DEBUG(1, "SiI vendor ID check failed on %s (0x%04x)", + pp->name, meta->vendor_id); + free(meta, M_MD_SII); + return (NULL); + } + + /* Check metadata major version. */ + if (meta->version_major != 2) { + G_RAID_DEBUG(1, "SiI version check failed on %s (%d.%d)", + pp->name, meta->version_major, meta->version_minor); + free(meta, M_MD_SII); + return (NULL); + } + + /* Check metadata checksum. */ + for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i <= 159; i++) + checksum += *ptr++; + if (checksum != 0) { + G_RAID_DEBUG(1, "SiI checksum check failed on %s", pp->name); + free(meta, M_MD_SII); + return (NULL); + } + + /* Check raid type. */ + if (meta->type != SII_T_RAID0 && meta->type != SII_T_RAID1 && + meta->type != SII_T_RAID01 && meta->type != SII_T_SPARE && + meta->type != SII_T_RAID5 && meta->type != SII_T_CONCAT && + meta->type != SII_T_JBOD) { + G_RAID_DEBUG(1, "SiI unknown RAID level on %s (0x%02x)", + pp->name, meta->type); + free(meta, M_MD_SII); + return (NULL); + } + + return (meta); +} + +static int +sii_meta_write(struct g_consumer *cp, struct sii_raid_conf *meta) +{ + struct g_provider *pp; + char *buf; + int error, i; + uint16_t checksum, *ptr; + + pp = cp->provider; + + /* Recalculate checksum for case if metadata were changed. */ + meta->checksum = 0; + for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i < 159; i++) + checksum += *ptr++; + meta->checksum -= checksum; + + /* Create and fill buffer. */ + buf = malloc(pp->sectorsize, M_MD_SII, M_WAITOK | M_ZERO); + memcpy(buf, meta, sizeof(*meta)); + + /* Write 4 copies of metadata. */ + for (i = 0; i < 4; i++) { + error = g_write_data(cp, + pp->mediasize - (pp->sectorsize * (1 + 0x200 * i)), + buf, pp->sectorsize); + if (error != 0) { + G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", + pp->name, error); + break; + } + } + + free(buf, M_MD_SII); + return (error); +} + +static int +sii_meta_erase(struct g_consumer *cp) +{ + struct g_provider *pp; + char *buf; + int error, i; + + pp = cp->provider; + buf = malloc(pp->sectorsize, M_MD_SII, M_WAITOK | M_ZERO); + /* Write 4 copies of metadata. 
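+	 * All copies sit near the end of the device, 0x200 sectors apart;
+	 * sii_meta_read() above only looks at the first (anchor) copy in
+	 * the very last sector.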
*/
+	for (i = 0; i < 4; i++) {
+		error = g_write_data(cp,
+		    pp->mediasize - (pp->sectorsize * (1 + 0x200 * i)),
+		    buf, pp->sectorsize);
+		if (error != 0) {
+			G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).",
+			    pp->name, error);
+		}
+	}
+	free(buf, M_MD_SII);
+	return (error);
+}
+
+static int
+sii_meta_write_spare(struct g_consumer *cp)
+{
+	struct sii_raid_conf *meta;
+	int error;
+
+	meta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK | M_ZERO);
+	meta->total_sectors = cp->provider->mediasize /
+	    cp->provider->sectorsize - 0x800;
+	meta->vendor_id = 0x1095;
+	meta->version_minor = 0;
+	meta->version_major = 2;
+	meta->timestamp[0] = arc4random();
+	meta->timestamp[1] = arc4random();
+	meta->timestamp[2] = arc4random();
+	meta->timestamp[3] = arc4random();
+	meta->timestamp[4] = arc4random();
+	meta->timestamp[5] = arc4random();
+	meta->type = SII_T_SPARE;
+	meta->generation = 1;
+	meta->raid1_ident = 0xff;
+	meta->raid_location = arc4random();
+	error = sii_meta_write(cp, meta);
+	free(meta, M_MD_SII);
+	return (error);
+}
+
+static struct g_raid_disk *
+g_raid_md_sii_get_disk(struct g_raid_softc *sc, int id)
+{
+	struct g_raid_disk *disk;
+	struct g_raid_md_sii_perdisk *pd;
+
+	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
+		pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data;
+		if (pd->pd_disk_pos == id)
+			break;
+	}
+	return (disk);
+}
+
+static int
+g_raid_md_sii_supported(int level, int qual, int disks, int force)
+{
+
+	if (disks > 8)
+		return (0);
+	switch (level) {
+	case G_RAID_VOLUME_RL_RAID0:
+		if (disks < 1)
+			return (0);
+		if (!force && (disks < 2 || disks > 6))
+			return (0);
+		break;
+	case G_RAID_VOLUME_RL_RAID1:
+		if (disks < 1)
+			return (0);
+		if (!force && (disks != 2))
+			return (0);
+		break;
+	case G_RAID_VOLUME_RL_RAID1E:
+		if (disks < 2)
+			return (0);
+		if (disks % 2 != 0)
+			return (0);
+		if (!force && (disks < 4))
+			return (0);
+		break;
+	case G_RAID_VOLUME_RL_SINGLE:
+		if (disks != 1)
+			return (0);
+		break;
+	case G_RAID_VOLUME_RL_CONCAT:
+		if (disks < 2)
+			return (0);
+		break;
+	case G_RAID_VOLUME_RL_RAID5:
+		if (disks < 3)
+			return (0);
+		break;
+	default:
+		return (0);
+	}
+	if (qual != G_RAID_VOLUME_RLQ_NONE)
+		return (0);
+	return (1);
+}
+
+static int
+g_raid_md_sii_start_disk(struct g_raid_disk *disk)
+{
+	struct g_raid_softc *sc;
+	struct g_raid_subdisk *sd, *tmpsd;
+	struct g_raid_disk *olddisk, *tmpdisk;
+	struct g_raid_md_object *md;
+	struct g_raid_md_sii_object *mdi;
+	struct g_raid_md_sii_perdisk *pd, *oldpd;
+	struct sii_raid_conf *meta;
+	int disk_pos, resurrection = 0;
+
+	sc = disk->d_softc;
+	md = sc->sc_md;
+	mdi = (struct g_raid_md_sii_object *)md;
+	meta = mdi->mdio_meta;
+	pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data;
+	olddisk = NULL;
+
+	/* Find disk position in metadata by its serial. */
+	if (pd->pd_meta != NULL)
+		disk_pos = sii_meta_disk_pos(meta, pd->pd_meta);
+	else
+		disk_pos = -3;
+	if (disk_pos < 0) {
+		G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk");
+		/* If we are in the start process, that's all for now. */
+		if (!mdi->mdio_started)
+			goto nofit;
+		/*
+		 * If we have already started - try to get use of the disk.
+		 * Try to replace OFFLINE disks first, then FAILED.
+		 */
+		TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) {
+			if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE &&
+			    tmpdisk->d_state != G_RAID_DISK_S_FAILED)
+				continue;
+			/* Make sure this disk is big enough. 
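+			 * Every subdisk that lived on the old slot must fit,
+			 * keeping at least one trailing sector free for the
+			 * metadata anchor.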
*/
+			TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) {
+				if (sd->sd_offset + sd->sd_size + 512 >
+				    pd->pd_disk_size) {
+					G_RAID_DEBUG1(1, sc,
+					    "Disk too small (%ju < %ju)",
+					    pd->pd_disk_size,
+					    sd->sd_offset + sd->sd_size + 512);
+					break;
+				}
+			}
+			if (sd != NULL)
+				continue;
+			if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) {
+				olddisk = tmpdisk;
+				break;
+			} else if (olddisk == NULL)
+				olddisk = tmpdisk;
+		}
+		if (olddisk == NULL) {
+nofit:
+			if (disk_pos == -3 || pd->pd_disk_pos == -3) {
+				g_raid_change_disk_state(disk,
+				    G_RAID_DISK_S_SPARE);
+				return (1);
+			} else {
+				g_raid_change_disk_state(disk,
+				    G_RAID_DISK_S_STALE);
+				return (0);
+			}
+		}
+		oldpd = (struct g_raid_md_sii_perdisk *)olddisk->d_md_data;
+		disk_pos = oldpd->pd_disk_pos;
+		resurrection = 1;
+	}
+
+	if (olddisk == NULL) {
+		/* Find placeholder by position. */
+		olddisk = g_raid_md_sii_get_disk(sc, disk_pos);
+		if (olddisk == NULL)
+			panic("No disk at position %d!", disk_pos);
+		if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) {
+			G_RAID_DEBUG1(1, sc, "More than one disk for pos %d",
+			    disk_pos);
+			g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE);
+			return (0);
+		}
+		oldpd = (struct g_raid_md_sii_perdisk *)olddisk->d_md_data;
+	}
+
+	/* Replace failed disk or placeholder with new disk. */
+	TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) {
+		TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next);
+		TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
+		sd->sd_disk = disk;
+	}
+	oldpd->pd_disk_pos = -2;
+	pd->pd_disk_pos = disk_pos;
+
+	/* If it was placeholder -- destroy it. */
+	if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) {
+		g_raid_destroy_disk(olddisk);
+	} else {
+		/* Otherwise, make it STALE_FAILED. */
+		g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED);
+	}
+
+	/* Welcome the new disk. */
+	if (resurrection)
+		g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
+	else if (pd->pd_meta->disk_status == SII_S_CURRENT ||
+	    pd->pd_meta->disk_status == SII_S_REBUILD)
+		g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
+	else
+		g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED);
+	TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
+
+		/*
+		 * Different disks may have different sizes in concat
+		 * mode. Update from real disk size.
+		 */
+		if (meta->type == SII_T_CONCAT || meta->type == SII_T_JBOD)
+			sd->sd_size = pd->pd_disk_size - 0x800 * 512;
+
+		if (resurrection) {
+			/* New or ex-spare disk. */
+			g_raid_change_subdisk_state(sd,
+			    G_RAID_SUBDISK_S_NEW);
+		} else if (pd->pd_meta->disk_status == SII_S_REBUILD) {
+			/* Rebuilding disk. */
+			g_raid_change_subdisk_state(sd,
+			    G_RAID_SUBDISK_S_REBUILD);
+			if (pd->pd_meta->generation == meta->generation)
+				sd->sd_rebuild_pos = pd->pd_meta->rebuild_lba * 512;
+			else
+				sd->sd_rebuild_pos = 0;
+		} else if (pd->pd_meta->disk_status == SII_S_CURRENT) {
+			if (pd->pd_meta->raid_status == SII_S_ONLINE ||
+			    pd->pd_meta->generation != meta->generation) {
+				/* Dirty or resyncing disk. */
+				g_raid_change_subdisk_state(sd,
+				    G_RAID_SUBDISK_S_STALE);
+			} else {
+				/* Up to date disk. */
+				g_raid_change_subdisk_state(sd,
+				    G_RAID_SUBDISK_S_ACTIVE);
+			}
+		} else {
+			g_raid_change_subdisk_state(sd,
+			    G_RAID_SUBDISK_S_FAILED);
+		}
+		g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
+		    G_RAID_EVENT_SUBDISK);
+	}
+
+	/* Update status of our need for spare. 
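+	 * The mdio_incomplete flag set here is what later lets a tasted
+	 * metadata-less disk attach as a spare (see the spare path in
+	 * taste below).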
*/
+	if (mdi->mdio_started) {
+		mdi->mdio_incomplete =
+		    (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) <
+		     mdi->mdio_total_disks);
+	}
+
+	return (resurrection);
+}
+
+static void
+g_disk_md_sii_retaste(void *arg, int pending)
+{
+
+	G_RAID_DEBUG(1, "Array is not complete, trying to retaste.");
+	g_retaste(&g_raid_class);
+	free(arg, M_MD_SII);
+}
+
+static void
+g_raid_md_sii_refill(struct g_raid_softc *sc)
+{
+	struct g_raid_md_object *md;
+	struct g_raid_md_sii_object *mdi;
+	struct sii_raid_conf *meta;
+	struct g_raid_disk *disk;
+	struct task *task;
+	int update, na;
+
+	md = sc->sc_md;
+	mdi = (struct g_raid_md_sii_object *)md;
+	meta = mdi->mdio_meta;
+	update = 0;
+	do {
+		/* Make sure we don't miss anything. */
+		na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE);
+		if (na == mdi->mdio_total_disks)
+			break;
+
+		G_RAID_DEBUG1(1, md->mdo_softc,
+		    "Array is not complete (%d of %d), "
+		    "trying to refill.", na, mdi->mdio_total_disks);
+
+		/* Try to get use of some of the STALE disks. */
+		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
+			if (disk->d_state == G_RAID_DISK_S_STALE) {
+				update += g_raid_md_sii_start_disk(disk);
+				if (disk->d_state == G_RAID_DISK_S_ACTIVE)
+					break;
+			}
+		}
+		if (disk != NULL)
+			continue;
+
+		/* Try to get use of some of the SPARE disks. */
+		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
+			if (disk->d_state == G_RAID_DISK_S_SPARE) {
+				update += g_raid_md_sii_start_disk(disk);
+				if (disk->d_state == G_RAID_DISK_S_ACTIVE)
+					break;
+			}
+		}
+	} while (disk != NULL);
+
+	/* Write new metadata if we changed something. */
+	if (update) {
+		g_raid_md_write_sii(md, NULL, NULL, NULL);
+		meta = mdi->mdio_meta;
+	}
+
+	/* Update status of our need for spare. */
+	mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) <
+	    mdi->mdio_total_disks);
+
+	/* Request retaste hoping to find spare. */
+	if (mdi->mdio_incomplete) {
+		task = malloc(sizeof(struct task),
+		    M_MD_SII, M_WAITOK | M_ZERO);
+		TASK_INIT(task, 0, g_disk_md_sii_retaste, task);
+		taskqueue_enqueue(taskqueue_swi, task);
+	}
+}
+
+static void
+g_raid_md_sii_start(struct g_raid_softc *sc)
+{
+	struct g_raid_md_object *md;
+	struct g_raid_md_sii_object *mdi;
+	struct g_raid_md_sii_perdisk *pd;
+	struct sii_raid_conf *meta;
+	struct g_raid_volume *vol;
+	struct g_raid_subdisk *sd;
+	struct g_raid_disk *disk, *best;
+	off_t size;
+	int j, disk_pos;
+	uint32_t gendiff, bestgendiff;
+	char buf[17];
+
+	md = sc->sc_md;
+	mdi = (struct g_raid_md_sii_object *)md;
+	meta = mdi->mdio_meta;
+
+	/* Create volumes and subdisks. 
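+	 * One volume only: the SiI metadata format describes a single
+	 * volume per disk set, its RAID level encoded in meta->type and
+	 * mapped below onto g_raid's generic levels.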
*/ + sii_meta_get_name(meta, buf); + vol = g_raid_create_volume(sc, buf, -1); + vol->v_mediasize = (off_t)meta->total_sectors * 512; + if (meta->type == SII_T_RAID0) { + vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; + size = vol->v_mediasize / mdi->mdio_total_disks; + } else if (meta->type == SII_T_RAID1) { + vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; + size = vol->v_mediasize; + } else if (meta->type == SII_T_RAID01) { + vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; + size = vol->v_mediasize / (mdi->mdio_total_disks / 2); + } else if (meta->type == SII_T_CONCAT) { + if (mdi->mdio_total_disks == 1) + vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; + else + vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; + size = 0; + } else if (meta->type == SII_T_RAID5) { + vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; + size = vol->v_mediasize / (mdi->mdio_total_disks - 1); + } else if (meta->type == SII_T_JBOD) { + vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; + size = 0; + } else { + vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; + size = 0; + } + vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; + vol->v_strip_size = meta->strip_sectors * 512; //ZZZ + vol->v_disks_count = mdi->mdio_total_disks; + vol->v_sectorsize = 512; //ZZZ + for (j = 0; j < vol->v_disks_count; j++) { + sd = &vol->v_subdisks[j]; + sd->sd_offset = 0; + sd->sd_size = size; + } + g_raid_start_volume(vol); + + /* Create disk placeholders to store data for later writing. */ + for (disk_pos = 0; disk_pos < mdi->mdio_total_disks; disk_pos++) { + pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); + pd->pd_disk_pos = disk_pos; + disk = g_raid_create_disk(sc); + disk->d_md_data = (void *)pd; + disk->d_state = G_RAID_DISK_S_OFFLINE; + sd = &vol->v_subdisks[disk_pos]; + sd->sd_disk = disk; + TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); + } + + /* + * Make all disks found till the moment take their places + * in order of their generation numbers. + */ + do { + best = NULL; + bestgendiff = 0xffffffff; + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_state != G_RAID_DISK_S_NONE) + continue; + pd = disk->d_md_data; + if (pd->pd_meta == NULL) + gendiff = 0xfffffffe; + else + gendiff = meta->generation - + pd->pd_meta->generation; + if (gendiff < bestgendiff) { + best = disk; + bestgendiff = gendiff; + } + } + if (best != NULL) + g_raid_md_sii_start_disk(best); + } while (best != NULL); + + mdi->mdio_started = 1; + G_RAID_DEBUG1(0, sc, "Array started."); + g_raid_md_write_sii(md, NULL, NULL, NULL); + + /* Pickup any STALE/SPARE disks to refill array if needed. 
*/ + g_raid_md_sii_refill(sc); + + g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); + + callout_stop(&mdi->mdio_start_co); + G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount); + root_mount_rel(mdi->mdio_rootmount); + mdi->mdio_rootmount = NULL; +} + +static void +g_raid_md_sii_new_disk(struct g_raid_disk *disk) +{ + struct g_raid_softc *sc; + struct g_raid_md_object *md; + struct g_raid_md_sii_object *mdi; + struct sii_raid_conf *pdmeta; + struct g_raid_md_sii_perdisk *pd; + + sc = disk->d_softc; + md = sc->sc_md; + mdi = (struct g_raid_md_sii_object *)md; + pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; + pdmeta = pd->pd_meta; + + if (mdi->mdio_started) { + if (g_raid_md_sii_start_disk(disk)) + g_raid_md_write_sii(md, NULL, NULL, NULL); + } else { + if (mdi->mdio_meta == NULL || + ((int32_t)(pdmeta->generation - mdi->mdio_generation)) > 0) { + G_RAID_DEBUG1(1, sc, "Newer disk"); + if (mdi->mdio_meta != NULL) + free(mdi->mdio_meta, M_MD_SII); + mdi->mdio_meta = sii_meta_copy(pdmeta); + mdi->mdio_generation = mdi->mdio_meta->generation; + mdi->mdio_total_disks = sii_meta_total_disks(pdmeta); + mdi->mdio_disks_present = 1; + } else if (pdmeta->generation == mdi->mdio_generation) { + mdi->mdio_disks_present++; + G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)", + mdi->mdio_disks_present, + mdi->mdio_total_disks); + } else { + G_RAID_DEBUG1(1, sc, "Older disk"); + } + + /* If we collected all needed disks - start array. */ + if (mdi->mdio_disks_present == mdi->mdio_total_disks) + g_raid_md_sii_start(sc); + } +} + +static void +g_raid_sii_go(void *arg) +{ + struct g_raid_softc *sc; + struct g_raid_md_object *md; + struct g_raid_md_sii_object *mdi; + + sc = arg; + md = sc->sc_md; + mdi = (struct g_raid_md_sii_object *)md; + if (!mdi->mdio_started) { + G_RAID_DEBUG1(0, sc, "Force array start due to timeout."); + g_raid_event_send(sc, G_RAID_NODE_E_START, 0); + } +} + +static int +g_raid_md_create_sii(struct g_raid_md_object *md, struct g_class *mp, + struct g_geom **gp) +{ + struct g_raid_softc *sc; + struct g_raid_md_sii_object *mdi; + char name[32]; + + mdi = (struct g_raid_md_sii_object *)md; + mdi->mdio_timestamp[5] = arc4random(); + mdi->mdio_timestamp[4] = arc4random(); + mdi->mdio_timestamp[3] = arc4random(); + mdi->mdio_timestamp[2] = arc4random(); + mdi->mdio_timestamp[1] = arc4random(); + mdi->mdio_timestamp[0] = arc4random(); + mdi->mdio_location = arc4random(); + mdi->mdio_generation = 0; + snprintf(name, sizeof(name), "SiI-%02x%02x%02x%02x%02x%02x", + mdi->mdio_timestamp[5], mdi->mdio_timestamp[4], + mdi->mdio_timestamp[3], mdi->mdio_timestamp[2], + mdi->mdio_timestamp[1], mdi->mdio_timestamp[0]); + sc = g_raid_create_node(mp, name, md); + if (sc == NULL) + return (G_RAID_MD_TASTE_FAIL); + md->mdo_softc = sc; + *gp = sc->sc_geom; + return (G_RAID_MD_TASTE_NEW); +} + +static int +g_raid_md_taste_sii(struct g_raid_md_object *md, struct g_class *mp, + struct g_consumer *cp, struct g_geom **gp) +{ + struct g_consumer *rcp; + struct g_provider *pp; + struct g_raid_md_sii_object *mdi, *mdi1; + struct g_raid_softc *sc; + struct g_raid_disk *disk; + struct sii_raid_conf *meta; + struct g_raid_md_sii_perdisk *pd; + struct g_geom *geom; + int error, disk_pos, result, spare, len; + char name[32]; + uint16_t vendor; + + G_RAID_DEBUG(1, "Tasting SiI on %s", cp->provider->name); + mdi = (struct g_raid_md_sii_object *)md; + pp = cp->provider; + + /* Read metadata from device. 
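+	 * Open the consumer read-only just long enough to fetch the HBA
+	 * vendor (used by the aggressive-spare heuristic below) and the
+	 * anchor metadata sector, then drop the access again.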
*/ + meta = NULL; + spare = 0; + vendor = 0xffff; + disk_pos = 0; + if (g_access(cp, 1, 0, 0) != 0) + return (G_RAID_MD_TASTE_FAIL); + g_topology_unlock(); + len = 2; + if (pp->geom->rank == 1) + g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); + meta = sii_meta_read(cp); + g_topology_lock(); + g_access(cp, -1, 0, 0); + if (meta == NULL) { + if (g_raid_aggressive_spare) { + if (vendor == 0x1095) { + G_RAID_DEBUG(1, + "No SiI metadata, forcing spare."); + spare = 2; + goto search; + } else { + G_RAID_DEBUG(1, + "SiI vendor mismatch 0x%04x != 0x1095", + vendor); + } + } + return (G_RAID_MD_TASTE_FAIL); + } + + /* Check this disk position in obtained metadata. */ + disk_pos = sii_meta_disk_pos(meta, meta); + if (disk_pos == -1) { + G_RAID_DEBUG(1, "SiI disk position not found"); + goto fail1; + } + + /* Metadata valid. Print it. */ + g_raid_md_sii_print(meta); + G_RAID_DEBUG(1, "SiI disk position %d", disk_pos); + spare = (meta->type == SII_T_SPARE) ? 1 : 0; + +search: + /* Search for matching node. */ + sc = NULL; + mdi1 = NULL; + LIST_FOREACH(geom, &mp->geom, geom) { + sc = geom->softc; + if (sc == NULL) + continue; + if (sc->sc_stopping != 0) + continue; + if (sc->sc_md->mdo_class != md->mdo_class) + continue; + mdi1 = (struct g_raid_md_sii_object *)sc->sc_md; + if (spare) { + if (mdi1->mdio_incomplete) + break; + } else { + if (mdi1->mdio_location == meta->raid_location && + memcmp(&mdi1->mdio_timestamp, + &meta->timestamp, 6) == 0) + break; + } + } + + /* Found matching node. */ + if (geom != NULL) { + G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); + result = G_RAID_MD_TASTE_EXISTING; + + } else if (spare) { /* Not found needy node -- left for later. */ + G_RAID_DEBUG(1, "Spare is not needed at this time"); + goto fail1; + + } else { /* Not found matching node -- create one. */ + result = G_RAID_MD_TASTE_NEW; + memcpy(&mdi->mdio_timestamp, &meta->timestamp, 6); + mdi->mdio_location = meta->raid_location; + snprintf(name, sizeof(name), "SiI-%02x%02x%02x%02x%02x%02x", + mdi->mdio_timestamp[5], mdi->mdio_timestamp[4], + mdi->mdio_timestamp[3], mdi->mdio_timestamp[2], + mdi->mdio_timestamp[1], mdi->mdio_timestamp[0]); + sc = g_raid_create_node(mp, name, md); + md->mdo_softc = sc; + geom = sc->sc_geom; + callout_init(&mdi->mdio_start_co, 1); + callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, + g_raid_sii_go, sc); + mdi->mdio_rootmount = root_mount_hold("GRAID-SiI"); + G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount); + } + + rcp = g_new_consumer(geom); + g_attach(rcp, pp); + if (g_access(rcp, 1, 1, 1) != 0) + ; //goto fail1; + + g_topology_unlock(); + sx_xlock(&sc->sc_lock); + + pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); + pd->pd_meta = meta; + if (spare == 2) { + pd->pd_disk_pos = -3; + } else { + pd->pd_disk_pos = -1; + } + pd->pd_disk_size = pp->mediasize; + disk = g_raid_create_disk(sc); + disk->d_md_data = (void *)pd; + disk->d_consumer = rcp; + rcp->private = disk; + + /* Read kernel dumping information. 
*/ + disk->d_kd.offset = 0; + disk->d_kd.length = OFF_MAX; + len = sizeof(disk->d_kd); + error = g_io_getattr("GEOM::kerneldump", rcp, &len, &disk->d_kd); + if (disk->d_kd.di.dumper == NULL) + G_RAID_DEBUG1(2, sc, "Dumping not supported by %s: %d.", + rcp->provider->name, error); + + g_raid_md_sii_new_disk(disk); + + sx_xunlock(&sc->sc_lock); + g_topology_lock(); + *gp = geom; + return (result); +fail1: + free(meta, M_MD_SII); + return (G_RAID_MD_TASTE_FAIL); +} + +static int +g_raid_md_event_sii(struct g_raid_md_object *md, + struct g_raid_disk *disk, u_int event) +{ + struct g_raid_softc *sc; + struct g_raid_subdisk *sd; + struct g_raid_md_sii_object *mdi; + struct g_raid_md_sii_perdisk *pd; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_sii_object *)md; + if (disk == NULL) { + switch (event) { + case G_RAID_NODE_E_START: + if (!mdi->mdio_started) + g_raid_md_sii_start(sc); + return (0); + } + return (-1); + } + pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; + switch (event) { + case G_RAID_DISK_E_DISCONNECTED: + /* If disk was assigned, just update statuses. */ + if (pd->pd_disk_pos >= 0) { + g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); + if (disk->d_consumer) { + g_raid_kill_consumer(sc, disk->d_consumer); + disk->d_consumer = NULL; + } + TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NONE); + g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, + G_RAID_EVENT_SUBDISK); + } + } else { + /* Otherwise -- delete. */ + g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); + g_raid_destroy_disk(disk); + } + + /* Write updated metadata to all disks. */ + g_raid_md_write_sii(md, NULL, NULL, NULL); + + /* Check if anything left except placeholders. */ + if (g_raid_ndisks(sc, -1) == + g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) + g_raid_destroy_node(sc, 0); + else + g_raid_md_sii_refill(sc); + return (0); + } + return (-2); +} + +static int +g_raid_md_ctl_sii(struct g_raid_md_object *md, + struct gctl_req *req) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; + struct g_raid_md_sii_object *mdi; + struct g_raid_md_sii_perdisk *pd; + struct g_consumer *cp; + struct g_provider *pp; + char arg[16]; + const char *verb, *volname, *levelname, *diskname; + int *nargs, *force; + off_t size, sectorsize, strip; + intmax_t *sizearg, *striparg; + int numdisks, i, len, level, qual, update; + int error; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_sii_object *)md; + verb = gctl_get_param(req, "verb", NULL); + nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); + error = 0; + if (strcmp(verb, "label") == 0) { + + if (*nargs < 4) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + volname = gctl_get_asciiparam(req, "arg1"); + if (volname == NULL) { + gctl_error(req, "No volume name."); + return (-2); + } + levelname = gctl_get_asciiparam(req, "arg2"); + if (levelname == NULL) { + gctl_error(req, "No RAID level."); + return (-3); + } + if (g_raid_volume_str2level(levelname, &level, &qual)) { + gctl_error(req, "Unknown RAID level '%s'.", levelname); + return (-4); + } + numdisks = *nargs - 3; + force = gctl_get_paraml(req, "force", sizeof(*force)); + if (!g_raid_md_sii_supported(level, qual, numdisks, + force ? *force : 0)) { + gctl_error(req, "Unsupported RAID level " + "(0x%02x/0x%02x), or number of disks (%d).", + level, qual, numdisks); + return (-5); + } + + /* Search for disks, connect them and probe. 
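+		 * As in the Promise module: open each named provider, while
+		 * "NONE" creates a consumer-less placeholder that will be
+		 * left OFFLINE in the volume.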
*/
+		size = 0x7fffffffffffffffllu;
+		sectorsize = 0;
+		for (i = 0; i < numdisks; i++) {
+			snprintf(arg, sizeof(arg), "arg%d", i + 3);
+			diskname = gctl_get_asciiparam(req, arg);
+			if (diskname == NULL) {
+				gctl_error(req, "No disk name (%s).", arg);
+				error = -6;
+				break;
+			}
+			if (strcmp(diskname, "NONE") == 0) {
+				cp = NULL;
+				pp = NULL;
+			} else {
+				g_topology_lock();
+				cp = g_raid_open_consumer(sc, diskname);
+				if (cp == NULL) {
+					gctl_error(req, "Can't open '%s'.",
+					    diskname);
+					g_topology_unlock();
+					error = -7;
+					break;
+				}
+				pp = cp->provider;
+			}
+			pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO);
+			pd->pd_disk_pos = i;
+			disk = g_raid_create_disk(sc);
+			disk->d_md_data = (void *)pd;
+			disk->d_consumer = cp;
+			if (cp == NULL)
+				continue;
+			cp->private = disk;
+			g_topology_unlock();
+
+			/* Read kernel dumping information. */
+			disk->d_kd.offset = 0;
+			disk->d_kd.length = OFF_MAX;
+			len = sizeof(disk->d_kd);
+			g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd);
+			if (disk->d_kd.di.dumper == NULL)
+				G_RAID_DEBUG1(2, sc,
+				    "Dumping not supported by %s.",
+				    cp->provider->name);
+
+			pd->pd_disk_size = pp->mediasize;
+			if (size > pp->mediasize)
+				size = pp->mediasize;
+			if (sectorsize < pp->sectorsize)
+				sectorsize = pp->sectorsize;
+		}
+		if (error != 0)
+			return (error);
+
+		/* Reserve space for metadata. */
+		size -= 0x800 * sectorsize;
+
+		/* Handle size argument. */
+		len = sizeof(*sizearg);
+		sizearg = gctl_get_param(req, "size", &len);
+		if (sizearg != NULL && len == sizeof(*sizearg) &&
+		    *sizearg > 0) {
+			if (*sizearg > size) {
+				gctl_error(req, "Size too big %lld > %lld.",
+				    (long long)*sizearg, (long long)size);
+				return (-9);
+			}
+			size = *sizearg;
+		}
+
+		/* Handle strip argument. */
+		strip = 131072;
+		len = sizeof(*striparg);
+		striparg = gctl_get_param(req, "strip", &len);
+		if (striparg != NULL && len == sizeof(*striparg) &&
+		    *striparg > 0) {
+			if (*striparg < sectorsize) {
+				gctl_error(req, "Strip size too small.");
+				return (-10);
+			}
+			if (*striparg % sectorsize != 0) {
+				gctl_error(req, "Incorrect strip size.");
+				return (-11);
+			}
+			if (*striparg > 65535 * sectorsize) {
+				gctl_error(req, "Strip size too big.");
+				return (-12);
+			}
+			strip = *striparg;
+		}
+
+		/* Round size down to strip or sector. */
+		if (level == G_RAID_VOLUME_RL_RAID1)
+			size -= (size % sectorsize);
+		else if (level == G_RAID_VOLUME_RL_RAID1E &&
+		    (numdisks & 1) != 0)
+			size -= (size % (2 * strip));
+		else
+			size -= (size % strip);
+		if (size <= 0) {
+			gctl_error(req, "Size too small.");
+			return (-13);
+		}
+		if (size > 0xffffffffffffllu * sectorsize) {
+			gctl_error(req, "Size too big.");
+			return (-14);
+		}
+
+		/* We have all we need, create things: volume, ... */
+		mdi->mdio_total_disks = numdisks;
+		mdi->mdio_started = 1;
+		vol = g_raid_create_volume(sc, volname, -1);
+		vol->v_md_data = (void *)(intptr_t)0;
+		vol->v_raid_level = level;
+		vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE;
+		vol->v_strip_size = strip;
+		vol->v_disks_count = numdisks;
+		if (level == G_RAID_VOLUME_RL_RAID0 ||
+		    level == G_RAID_VOLUME_RL_CONCAT ||
+		    level == G_RAID_VOLUME_RL_SINGLE)
+			vol->v_mediasize = size * numdisks;
+		else if (level == G_RAID_VOLUME_RL_RAID1)
+			vol->v_mediasize = size;
+		else if (level == G_RAID_VOLUME_RL_RAID5)
+			vol->v_mediasize = size * (numdisks - 1);
+		else { /* RAID1E */
+			vol->v_mediasize = ((size * numdisks) / strip / 2) *
+			    strip;
+		}
+		vol->v_sectorsize = sectorsize;
+		g_raid_start_volume(vol);
+
+		/* , and subdisks. 
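+		 * Disks with a real consumer become ACTIVE subdisks right
+		 * away; the "NONE" placeholders stay OFFLINE until a disk
+		 * is inserted for them.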
*/ + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; + sd = &vol->v_subdisks[pd->pd_disk_pos]; + sd->sd_disk = disk; + sd->sd_offset = 0; + sd->sd_size = size; + TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); + if (sd->sd_disk->d_consumer != NULL) { + g_raid_change_disk_state(disk, + G_RAID_DISK_S_ACTIVE); + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_ACTIVE); + g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, + G_RAID_EVENT_SUBDISK); + } else { + g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); + } + } + + /* Write metadata based on created entities. */ + G_RAID_DEBUG1(0, sc, "Array started."); + g_raid_md_write_sii(md, NULL, NULL, NULL); + + /* Pickup any STALE/SPARE disks to refill array if needed. */ + g_raid_md_sii_refill(sc); + + g_raid_event_send(vol, G_RAID_VOLUME_E_START, + G_RAID_EVENT_VOLUME); + return (0); + } + if (strcmp(verb, "delete") == 0) { + + /* Check if some volume is still open. */ + force = gctl_get_paraml(req, "force", sizeof(*force)); + if (force != NULL && *force == 0 && + g_raid_nopens(sc) != 0) { + gctl_error(req, "Some volume is still open."); + return (-4); + } + + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_consumer) + sii_meta_erase(disk->d_consumer); + } + g_raid_destroy_node(sc, 0); + return (0); + } + if (strcmp(verb, "remove") == 0 || + strcmp(verb, "fail") == 0) { + if (*nargs < 2) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + for (i = 1; i < *nargs; i++) { + snprintf(arg, sizeof(arg), "arg%d", i); + diskname = gctl_get_asciiparam(req, arg); + if (diskname == NULL) { + gctl_error(req, "No disk name (%s).", arg); + error = -2; + break; + } + if (strncmp(diskname, "/dev/", 5) == 0) + diskname += 5; + + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + if (disk->d_consumer != NULL && + disk->d_consumer->provider != NULL && + strcmp(disk->d_consumer->provider->name, + diskname) == 0) + break; + } + if (disk == NULL) { + gctl_error(req, "Disk '%s' not found.", + diskname); + error = -3; + break; + } + + if (strcmp(verb, "fail") == 0) { + g_raid_md_fail_disk_sii(md, NULL, disk); + continue; + } + + pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; + + /* Erase metadata on deleting disk. */ + sii_meta_erase(disk->d_consumer); + + /* If disk was assigned, just update statuses. */ + if (pd->pd_disk_pos >= 0) { + g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); + if (disk->d_consumer) { + g_raid_kill_consumer(sc, disk->d_consumer); + disk->d_consumer = NULL; + } + TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { + g_raid_change_subdisk_state(sd, + G_RAID_SUBDISK_S_NONE); + g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, + G_RAID_EVENT_SUBDISK); + } + } else { + /* Otherwise -- delete. */ + g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); + g_raid_destroy_disk(disk); + } + } + + /* Write updated metadata to remaining disks. */ + g_raid_md_write_sii(md, NULL, NULL, NULL); + + /* Check if anything left except placeholders. */ + if (g_raid_ndisks(sc, -1) == + g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) + g_raid_destroy_node(sc, 0); + else + g_raid_md_sii_refill(sc); + return (error); + } + if (strcmp(verb, "insert") == 0) { + if (*nargs < 2) { + gctl_error(req, "Invalid number of arguments."); + return (-1); + } + update = 0; + for (i = 1; i < *nargs; i++) { + /* Get disk name. 
*/ + snprintf(arg, sizeof(arg), "arg%d", i); + diskname = gctl_get_asciiparam(req, arg); + if (diskname == NULL) { + gctl_error(req, "No disk name (%s).", arg); + error = -3; + break; + } + + /* Try to find provider with specified name. */ + g_topology_lock(); + cp = g_raid_open_consumer(sc, diskname); + if (cp == NULL) { + gctl_error(req, "Can't open disk '%s'.", + diskname); + g_topology_unlock(); + error = -4; + break; + } + pp = cp->provider; + + pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); + pd->pd_disk_pos = -3; + pd->pd_disk_size = pp->mediasize; + + disk = g_raid_create_disk(sc); + disk->d_consumer = cp; + disk->d_consumer->private = disk; + disk->d_md_data = (void *)pd; + cp->private = disk; + g_topology_unlock(); + + /* Read kernel dumping information. */ + disk->d_kd.offset = 0; + disk->d_kd.length = OFF_MAX; + len = sizeof(disk->d_kd); + g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); + if (disk->d_kd.di.dumper == NULL) + G_RAID_DEBUG1(2, sc, + "Dumping not supported by %s.", + cp->provider->name); + + /* Welcome the "new" disk. */ + update += g_raid_md_sii_start_disk(disk); + if (disk->d_state == G_RAID_DISK_S_SPARE) { + sii_meta_write_spare(cp); + g_raid_destroy_disk(disk); + } else if (disk->d_state != G_RAID_DISK_S_ACTIVE) { + gctl_error(req, "Disk '%s' doesn't fit.", + diskname); + g_raid_destroy_disk(disk); + error = -8; + break; + } + } + + /* Write new metadata if we changed something. */ + if (update) + g_raid_md_write_sii(md, NULL, NULL, NULL); + return (error); + } + gctl_error(req, "Command '%s' is not supported.", verb); + return (-100); +} + +static int +g_raid_md_write_sii(struct g_raid_md_object *md, struct g_raid_volume *tvol, + struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) +{ + struct g_raid_softc *sc; + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct g_raid_disk *disk; + struct g_raid_md_sii_object *mdi; + struct g_raid_md_sii_perdisk *pd; + struct sii_raid_conf *meta; + int i; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_sii_object *)md; + + if (sc->sc_stopping == G_RAID_DESTROY_HARD) + return (0); + + /* Bump generation. Newly written metadata may differ from previous. */ + mdi->mdio_generation++; + + /* There is only one volume. */ + vol = TAILQ_FIRST(&sc->sc_volumes); + + /* Fill global fields. 
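+	 * The vendor_id below, 0x1095, is Silicon Image's PCI vendor ID,
+	 * and this module always writes metadata revision 2.0.  For scale
+	 * (hypothetical numbers): a 1 TB volume with 512-byte sectors
+	 * stores total_sectors of about 1.95 billion.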
*/ + meta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK | M_ZERO); + if (mdi->mdio_meta) + memcpy(meta, mdi->mdio_meta, sizeof(*meta)); + meta->total_sectors = vol->v_mediasize / vol->v_sectorsize; + meta->vendor_id = 0x1095; + meta->version_minor = 0; + meta->version_major = 2; + memcpy(&meta->timestamp, &mdi->mdio_timestamp, 6); + meta->strip_sectors = vol->v_strip_size / vol->v_sectorsize; + if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) { + meta->type = SII_T_RAID0; + meta->raid0_disks = vol->v_disks_count; + meta->raid1_disks = 0xff; + } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) { + meta->type = SII_T_RAID1; + meta->raid0_disks = 0xff; + meta->raid1_disks = vol->v_disks_count; + } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { + meta->type = SII_T_RAID01; + meta->raid0_disks = vol->v_disks_count / 2; + meta->raid1_disks = 2; + } else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT || + vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) { + meta->type = SII_T_JBOD; + meta->raid0_disks = vol->v_disks_count; + meta->raid1_disks = 0xff; + } else { + meta->type = SII_T_RAID5; + meta->raid0_disks = vol->v_disks_count; + meta->raid1_disks = 0xff; + } + meta->generation = mdi->mdio_generation; + meta->raid_status = vol->v_dirty ? SII_S_ONLINE : SII_S_AVAILABLE; + for (i = 0; i < vol->v_disks_count; i++) { + sd = &vol->v_subdisks[i]; + if (sd->sd_state == G_RAID_SUBDISK_S_STALE || + sd->sd_state == G_RAID_SUBDISK_S_RESYNC) + meta->raid_status = SII_S_ONLINE; + } + meta->raid_location = mdi->mdio_location; + sii_meta_put_name(meta, vol->v_name); + + /* We are done. Print meta data and store them to disks. */ + if (mdi->mdio_meta != NULL) + free(mdi->mdio_meta, M_MD_SII); + mdi->mdio_meta = meta; + i = 0; + TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { + pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; + if (disk->d_state != G_RAID_DISK_S_ACTIVE) + continue; + if (pd->pd_meta != NULL) { + free(pd->pd_meta, M_MD_SII); + pd->pd_meta = NULL; + } + pd->pd_meta = sii_meta_copy(meta); + if ((sd = TAILQ_FIRST(&disk->d_subdisks)) != NULL) { + if (sd->sd_state < G_RAID_SUBDISK_S_NEW) + pd->pd_meta->disk_status = SII_S_DROPPED; + else if (sd->sd_state < G_RAID_SUBDISK_S_STALE) { + pd->pd_meta->disk_status = SII_S_REBUILD; + pd->pd_meta->rebuild_lba = + sd->sd_rebuild_pos / vol->v_sectorsize; + } else + pd->pd_meta->disk_status = SII_S_CURRENT; + if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) { + pd->pd_meta->disk_number = sd->sd_pos; + pd->pd_meta->raid0_ident = 0xff; + pd->pd_meta->raid1_ident = 0; + } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { + pd->pd_meta->disk_number = sd->sd_pos / meta->raid1_disks; + pd->pd_meta->raid0_ident = sd->sd_pos % meta->raid1_disks; + pd->pd_meta->raid1_ident = sd->sd_pos / meta->raid1_disks; + } else { + pd->pd_meta->disk_number = sd->sd_pos; + pd->pd_meta->raid0_ident = 0; + pd->pd_meta->raid1_ident = 0xff; + } + } + G_RAID_DEBUG(1, "Writing SiI metadata to %s", + g_raid_get_diskname(disk)); + g_raid_md_sii_print(pd->pd_meta); + sii_meta_write(disk->d_consumer, pd->pd_meta); + } + return (0); +} + +static int +g_raid_md_fail_disk_sii(struct g_raid_md_object *md, + struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) +{ + struct g_raid_softc *sc; + struct g_raid_md_sii_object *mdi; + struct g_raid_md_sii_perdisk *pd; + struct g_raid_subdisk *sd; + + sc = md->mdo_softc; + mdi = (struct g_raid_md_sii_object *)md; + pd = (struct g_raid_md_sii_perdisk *)tdisk->d_md_data; + + /* We can't fail disk that is not a part of array now. 
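+	 * A negative pd_disk_pos means the disk owns no slot in the array
+	 * (compare the "insert" handler above, which parks new disks at
+	 * position -3 until they are adopted), so there is nothing to fail.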
+	 */
+	if (pd->pd_disk_pos < 0)
+		return (-1);
+
+	/*
+	 * Mark disk as failed in metadata and try to write that metadata
+	 * to the disk itself to prevent its later resurrection as STALE.
+	 */
+	if (tdisk->d_consumer) {
+		if (pd->pd_meta) {
+			pd->pd_meta->disk_status = SII_S_REMOVED;
+			sii_meta_write(tdisk->d_consumer, pd->pd_meta);
+		} else
+			sii_meta_erase(tdisk->d_consumer);
+	}
+
+	/* Change states. */
+	g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED);
+	TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) {
+		g_raid_change_subdisk_state(sd,
+		    G_RAID_SUBDISK_S_FAILED);
+		g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED,
+		    G_RAID_EVENT_SUBDISK);
+	}
+
+	/* Write updated metadata to remaining disks. */
+	g_raid_md_write_sii(md, NULL, NULL, tdisk);
+
+	/* Check if anything left except placeholders. */
+	if (g_raid_ndisks(sc, -1) ==
+	    g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE))
+		g_raid_destroy_node(sc, 0);
+	else
+		g_raid_md_sii_refill(sc);
+	return (0);
+}
+
+static int
+g_raid_md_free_disk_sii(struct g_raid_md_object *md,
+    struct g_raid_disk *disk)
+{
+	struct g_raid_md_sii_perdisk *pd;
+
+	pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data;
+	if (pd->pd_meta != NULL) {
+		free(pd->pd_meta, M_MD_SII);
+		pd->pd_meta = NULL;
+	}
+	free(pd, M_MD_SII);
+	disk->d_md_data = NULL;
+	return (0);
+}
+
+static int
+g_raid_md_free_sii(struct g_raid_md_object *md)
+{
+	struct g_raid_md_sii_object *mdi;
+
+	mdi = (struct g_raid_md_sii_object *)md;
+	if (!mdi->mdio_started) {
+		mdi->mdio_started = 0;
+		callout_stop(&mdi->mdio_start_co);
+		G_RAID_DEBUG1(1, md->mdo_softc,
+		    "root_mount_rel %p", mdi->mdio_rootmount);
+		root_mount_rel(mdi->mdio_rootmount);
+		mdi->mdio_rootmount = NULL;
+	}
+	if (mdi->mdio_meta != NULL) {
+		free(mdi->mdio_meta, M_MD_SII);
+		mdi->mdio_meta = NULL;
+	}
+	return (0);
+}
+
+G_RAID_MD_DECLARE(g_raid_md_sii);
diff --git a/sys/geom/raid/tr_concat.c b/sys/geom/raid/tr_concat.c
new file mode 100644
index 0000000..c5f2913
--- /dev/null
+++ b/sys/geom/raid/tr_concat.c
@@ -0,0 +1,343 @@
+/*-
+ * Copyright (c) 2010 Alexander Motin
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "geom/raid/g_raid.h" +#include "g_raid_tr_if.h" + +static MALLOC_DEFINE(M_TR_CONCAT, "tr_concat_data", "GEOM_RAID CONCAT data"); + +struct g_raid_tr_concat_object { + struct g_raid_tr_object trso_base; + int trso_starting; + int trso_stopped; +}; + +static g_raid_tr_taste_t g_raid_tr_taste_concat; +static g_raid_tr_event_t g_raid_tr_event_concat; +static g_raid_tr_start_t g_raid_tr_start_concat; +static g_raid_tr_stop_t g_raid_tr_stop_concat; +static g_raid_tr_iostart_t g_raid_tr_iostart_concat; +static g_raid_tr_iodone_t g_raid_tr_iodone_concat; +static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_concat; +static g_raid_tr_free_t g_raid_tr_free_concat; + +static kobj_method_t g_raid_tr_concat_methods[] = { + KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_concat), + KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_concat), + KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_concat), + KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_concat), + KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_concat), + KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_concat), + KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_concat), + KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_concat), + { 0, 0 } +}; + +static struct g_raid_tr_class g_raid_tr_concat_class = { + "CONCAT", + g_raid_tr_concat_methods, + sizeof(struct g_raid_tr_concat_object), + .trc_priority = 50 +}; + +static int +g_raid_tr_taste_concat(struct g_raid_tr_object *tr, struct g_raid_volume *volume) +{ + struct g_raid_tr_concat_object *trs; + + trs = (struct g_raid_tr_concat_object *)tr; + if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_SINGLE && + tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_CONCAT && + !(tr->tro_volume->v_disks_count == 1 && + tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_UNKNOWN)) + return (G_RAID_TR_TASTE_FAIL); + trs->trso_starting = 1; + return (G_RAID_TR_TASTE_SUCCEED); +} + +static int +g_raid_tr_update_state_concat(struct g_raid_volume *vol) +{ + struct g_raid_tr_concat_object *trs; + struct g_raid_softc *sc; + off_t size; + u_int s; + int i, n, f; + + sc = vol->v_softc; + trs = (struct g_raid_tr_concat_object *)vol->v_tr; + if (trs->trso_stopped) + s = G_RAID_VOLUME_S_STOPPED; + else if (trs->trso_starting) + s = G_RAID_VOLUME_S_STARTING; + else { + n = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); + f = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_FAILED); + if (n + f == vol->v_disks_count) { + if (f == 0) + s = G_RAID_VOLUME_S_OPTIMAL; + else + s = G_RAID_VOLUME_S_SUBOPTIMAL; + } else + s = G_RAID_VOLUME_S_BROKEN; + } + if (s != vol->v_state) { + + /* + * Some metadata modules may not know CONCAT volume + * mediasize until all disks connected. Recalculate. + */ + if (G_RAID_VOLUME_S_ALIVE(s) && + !G_RAID_VOLUME_S_ALIVE(vol->v_state)) { + size = 0; + for (i = 0; i < vol->v_disks_count; i++) { + if (vol->v_subdisks[i].sd_state != + G_RAID_SUBDISK_S_NONE) + size += vol->v_subdisks[i].sd_size; + } + vol->v_mediasize = size; + } + + g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ? 
+ G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, + G_RAID_EVENT_VOLUME); + g_raid_change_volume_state(vol, s); + if (!trs->trso_starting && !trs->trso_stopped) + g_raid_write_metadata(sc, vol, NULL, NULL); + } + return (0); +} + +static int +g_raid_tr_event_concat(struct g_raid_tr_object *tr, + struct g_raid_subdisk *sd, u_int event) +{ + struct g_raid_tr_concat_object *trs; + struct g_raid_softc *sc; + struct g_raid_volume *vol; + int state; + + trs = (struct g_raid_tr_concat_object *)tr; + vol = tr->tro_volume; + sc = vol->v_softc; + + state = sd->sd_state; + if (state != G_RAID_SUBDISK_S_NONE && + state != G_RAID_SUBDISK_S_FAILED && + state != G_RAID_SUBDISK_S_ACTIVE) { + G_RAID_DEBUG1(1, sc, + "Promote subdisk %s:%d from %s to ACTIVE.", + vol->v_name, sd->sd_pos, + g_raid_subdisk_state2str(sd->sd_state)); + g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); + } + if (state != sd->sd_state && + !trs->trso_starting && !trs->trso_stopped) + g_raid_write_metadata(sc, vol, sd, NULL); + g_raid_tr_update_state_concat(vol); + return (0); +} + +static int +g_raid_tr_start_concat(struct g_raid_tr_object *tr) +{ + struct g_raid_tr_concat_object *trs; + struct g_raid_volume *vol; + + trs = (struct g_raid_tr_concat_object *)tr; + vol = tr->tro_volume; + trs->trso_starting = 0; + g_raid_tr_update_state_concat(vol); + return (0); +} + +static int +g_raid_tr_stop_concat(struct g_raid_tr_object *tr) +{ + struct g_raid_tr_concat_object *trs; + struct g_raid_volume *vol; + + trs = (struct g_raid_tr_concat_object *)tr; + vol = tr->tro_volume; + trs->trso_starting = 0; + trs->trso_stopped = 1; + g_raid_tr_update_state_concat(vol); + return (0); +} + +static void +g_raid_tr_iostart_concat(struct g_raid_tr_object *tr, struct bio *bp) +{ + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct bio_queue_head queue; + struct bio *cbp; + char *addr; + off_t offset, length, remain; + u_int no; + + vol = tr->tro_volume; + if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL && + vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL) { + g_raid_iodone(bp, EIO); + return; + } + if (bp->bio_cmd == BIO_FLUSH) { + g_raid_tr_flush_common(tr, bp); + return; + } + + offset = bp->bio_offset; + remain = bp->bio_length; + addr = bp->bio_data; + no = 0; + while (no < vol->v_disks_count && + offset >= vol->v_subdisks[no].sd_size) { + offset -= vol->v_subdisks[no].sd_size; + no++; + } + KASSERT(no < vol->v_disks_count, + ("Request starts after volume end (%ju)", bp->bio_offset)); + bioq_init(&queue); + do { + sd = &vol->v_subdisks[no]; + length = MIN(sd->sd_size - offset, remain); + cbp = g_clone_bio(bp); + if (cbp == NULL) + goto failure; + cbp->bio_offset = offset; + cbp->bio_data = addr; + cbp->bio_length = length; + cbp->bio_caller1 = sd; + bioq_insert_tail(&queue, cbp); + remain -= length; + addr += length; + offset = 0; + no++; + KASSERT(no < vol->v_disks_count || remain == 0, + ("Request ends after volume end (%ju, %ju)", + bp->bio_offset, bp->bio_length)); + } while (remain > 0); + for (cbp = bioq_first(&queue); cbp != NULL; + cbp = bioq_first(&queue)) { + bioq_remove(&queue, cbp); + sd = cbp->bio_caller1; + cbp->bio_caller1 = NULL; + g_raid_subdisk_iostart(sd, cbp); + } + return; +failure: + for (cbp = bioq_first(&queue); cbp != NULL; + cbp = bioq_first(&queue)) { + bioq_remove(&queue, cbp); + g_destroy_bio(cbp); + } + if (bp->bio_error == 0) + bp->bio_error = ENOMEM; + g_raid_iodone(bp, bp->bio_error); +} + +static int +g_raid_tr_kerneldump_concat(struct g_raid_tr_object *tr, + void *virtual, vm_offset_t physical, off_t 
boffset, size_t blength) +{ + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + char *addr; + off_t offset, length, remain; + int error, no; + + vol = tr->tro_volume; + if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL) + return (ENXIO); + + offset = boffset; + remain = blength; + addr = virtual; + no = 0; + while (no < vol->v_disks_count && + offset >= vol->v_subdisks[no].sd_size) { + offset -= vol->v_subdisks[no].sd_size; + no++; + } + KASSERT(no < vol->v_disks_count, + ("Request starts after volume end (%ju)", boffset)); + do { + sd = &vol->v_subdisks[no]; + length = MIN(sd->sd_size - offset, remain); + error = g_raid_subdisk_kerneldump(&vol->v_subdisks[no], + addr, 0, offset, length); + if (error != 0) + return (error); + remain -= length; + addr += length; + offset = 0; + no++; + KASSERT(no < vol->v_disks_count || remain == 0, + ("Request ends after volume end (%ju, %zu)", + boffset, blength)); + } while (remain > 0); + return (0); +} + +static void +g_raid_tr_iodone_concat(struct g_raid_tr_object *tr, + struct g_raid_subdisk *sd,struct bio *bp) +{ + struct bio *pbp; + + pbp = bp->bio_parent; + if (pbp->bio_error == 0) + pbp->bio_error = bp->bio_error; + g_destroy_bio(bp); + pbp->bio_inbed++; + if (pbp->bio_children == pbp->bio_inbed) { + pbp->bio_completed = pbp->bio_length; + g_raid_iodone(pbp, bp->bio_error); + } +} + +static int +g_raid_tr_free_concat(struct g_raid_tr_object *tr) +{ + + return (0); +} + +G_RAID_TR_DECLARE(g_raid_tr_concat); diff --git a/sys/geom/raid/tr_raid0.c b/sys/geom/raid/tr_raid0.c new file mode 100644 index 0000000..0fb45a6 --- /dev/null +++ b/sys/geom/raid/tr_raid0.c @@ -0,0 +1,326 @@ +/*- + * Copyright (c) 2010 Alexander Motin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "geom/raid/g_raid.h" +#include "g_raid_tr_if.h" + +static MALLOC_DEFINE(M_TR_RAID0, "tr_raid0_data", "GEOM_RAID RAID0 data"); + +struct g_raid_tr_raid0_object { + struct g_raid_tr_object trso_base; + int trso_starting; + int trso_stopped; +}; + +static g_raid_tr_taste_t g_raid_tr_taste_raid0; +static g_raid_tr_event_t g_raid_tr_event_raid0; +static g_raid_tr_start_t g_raid_tr_start_raid0; +static g_raid_tr_stop_t g_raid_tr_stop_raid0; +static g_raid_tr_iostart_t g_raid_tr_iostart_raid0; +static g_raid_tr_iodone_t g_raid_tr_iodone_raid0; +static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid0; +static g_raid_tr_free_t g_raid_tr_free_raid0; + +static kobj_method_t g_raid_tr_raid0_methods[] = { + KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid0), + KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid0), + KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid0), + KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid0), + KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid0), + KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid0), + KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid0), + KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid0), + { 0, 0 } +}; + +static struct g_raid_tr_class g_raid_tr_raid0_class = { + "RAID0", + g_raid_tr_raid0_methods, + sizeof(struct g_raid_tr_raid0_object), + .trc_priority = 100 +}; + +static int +g_raid_tr_taste_raid0(struct g_raid_tr_object *tr, struct g_raid_volume *volume) +{ + struct g_raid_tr_raid0_object *trs; + + trs = (struct g_raid_tr_raid0_object *)tr; + if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID0 || + tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_NONE) + return (G_RAID_TR_TASTE_FAIL); + trs->trso_starting = 1; + return (G_RAID_TR_TASTE_SUCCEED); +} + +static int +g_raid_tr_update_state_raid0(struct g_raid_volume *vol) +{ + struct g_raid_tr_raid0_object *trs; + struct g_raid_softc *sc; + u_int s; + int n, f; + + sc = vol->v_softc; + trs = (struct g_raid_tr_raid0_object *)vol->v_tr; + if (trs->trso_stopped) + s = G_RAID_VOLUME_S_STOPPED; + else if (trs->trso_starting) + s = G_RAID_VOLUME_S_STARTING; + else { + n = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); + f = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_FAILED); + if (n + f == vol->v_disks_count) { + if (f == 0) + s = G_RAID_VOLUME_S_OPTIMAL; + else + s = G_RAID_VOLUME_S_SUBOPTIMAL; + } else + s = G_RAID_VOLUME_S_BROKEN; + } + if (s != vol->v_state) { + g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ? 
+ G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, + G_RAID_EVENT_VOLUME); + g_raid_change_volume_state(vol, s); + if (!trs->trso_starting && !trs->trso_stopped) + g_raid_write_metadata(sc, vol, NULL, NULL); + } + return (0); +} + +static int +g_raid_tr_event_raid0(struct g_raid_tr_object *tr, + struct g_raid_subdisk *sd, u_int event) +{ + struct g_raid_tr_raid0_object *trs; + struct g_raid_softc *sc; + struct g_raid_volume *vol; + int state; + + trs = (struct g_raid_tr_raid0_object *)tr; + vol = tr->tro_volume; + sc = vol->v_softc; + + state = sd->sd_state; + if (state != G_RAID_SUBDISK_S_NONE && + state != G_RAID_SUBDISK_S_FAILED && + state != G_RAID_SUBDISK_S_ACTIVE) { + G_RAID_DEBUG1(1, sc, + "Promote subdisk %s:%d from %s to ACTIVE.", + vol->v_name, sd->sd_pos, + g_raid_subdisk_state2str(sd->sd_state)); + g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); + } + if (state != sd->sd_state && + !trs->trso_starting && !trs->trso_stopped) + g_raid_write_metadata(sc, vol, sd, NULL); + g_raid_tr_update_state_raid0(vol); + return (0); +} + +static int +g_raid_tr_start_raid0(struct g_raid_tr_object *tr) +{ + struct g_raid_tr_raid0_object *trs; + struct g_raid_volume *vol; + + trs = (struct g_raid_tr_raid0_object *)tr; + vol = tr->tro_volume; + trs->trso_starting = 0; + g_raid_tr_update_state_raid0(vol); + return (0); +} + +static int +g_raid_tr_stop_raid0(struct g_raid_tr_object *tr) +{ + struct g_raid_tr_raid0_object *trs; + struct g_raid_volume *vol; + + trs = (struct g_raid_tr_raid0_object *)tr; + vol = tr->tro_volume; + trs->trso_starting = 0; + trs->trso_stopped = 1; + g_raid_tr_update_state_raid0(vol); + return (0); +} + +static void +g_raid_tr_iostart_raid0(struct g_raid_tr_object *tr, struct bio *bp) +{ + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + struct bio_queue_head queue; + struct bio *cbp; + char *addr; + off_t offset, start, length, nstripe, remain; + u_int no, strip_size; + + vol = tr->tro_volume; + if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL && + vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL) { + g_raid_iodone(bp, EIO); + return; + } + if (bp->bio_cmd == BIO_FLUSH) { + g_raid_tr_flush_common(tr, bp); + return; + } + addr = bp->bio_data; + strip_size = vol->v_strip_size; + + /* Stripe number. */ + nstripe = bp->bio_offset / strip_size; + /* Start position in stripe. */ + start = bp->bio_offset % strip_size; + /* Disk number. */ + no = nstripe % vol->v_disks_count; + /* Stripe start position in disk. */ + offset = (nstripe / vol->v_disks_count) * strip_size; + /* Length of data to operate. 
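+	 * As a worked example with hypothetical numbers: a request at
+	 * offset 300 KB on a 3-disk volume with a 128 KB strip has
+	 * nstripe = 2 and start = 44 KB, so it begins on disk 2 % 3 = 2
+	 * at disk offset 0; a long request then walks disks 2, 0, 1, ...,
+	 * advancing the disk offset by one strip each time it wraps back
+	 * to disk 0.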
+	 */
+	remain = bp->bio_length;
+
+	bioq_init(&queue);
+	do {
+		length = MIN(strip_size - start, remain);
+		cbp = g_clone_bio(bp);
+		if (cbp == NULL)
+			goto failure;
+		cbp->bio_offset = offset + start;
+		cbp->bio_data = addr;
+		cbp->bio_length = length;
+		cbp->bio_caller1 = &vol->v_subdisks[no];
+		bioq_insert_tail(&queue, cbp);
+		if (++no >= vol->v_disks_count) {
+			no = 0;
+			offset += strip_size;
+		}
+		remain -= length;
+		addr += length;
+		start = 0;
+	} while (remain > 0);
+	for (cbp = bioq_first(&queue); cbp != NULL;
+	    cbp = bioq_first(&queue)) {
+		bioq_remove(&queue, cbp);
+		sd = cbp->bio_caller1;
+		cbp->bio_caller1 = NULL;
+		g_raid_subdisk_iostart(sd, cbp);
+	}
+	return;
+failure:
+	for (cbp = bioq_first(&queue); cbp != NULL;
+	    cbp = bioq_first(&queue)) {
+		bioq_remove(&queue, cbp);
+		g_destroy_bio(cbp);
+	}
+	if (bp->bio_error == 0)
+		bp->bio_error = ENOMEM;
+	g_raid_iodone(bp, bp->bio_error);
+}
+
+static int
+g_raid_tr_kerneldump_raid0(struct g_raid_tr_object *tr,
+    void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
+{
+	struct g_raid_volume *vol;
+	char *addr;
+	off_t offset, start, length, nstripe, remain;
+	u_int no, strip_size;
+	int error;
+
+	vol = tr->tro_volume;
+	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL)
+		return (ENXIO);
+	addr = virtual;
+	strip_size = vol->v_strip_size;
+
+	/* Stripe number. */
+	nstripe = boffset / strip_size;
+	/* Start position in stripe. */
+	start = boffset % strip_size;
+	/* Disk number. */
+	no = nstripe % vol->v_disks_count;
+	/* Stripe start position in disk. */
+	offset = (nstripe / vol->v_disks_count) * strip_size;
+	/* Length of data to operate. */
+	remain = blength;
+
+	do {
+		length = MIN(strip_size - start, remain);
+		error = g_raid_subdisk_kerneldump(&vol->v_subdisks[no],
+		    addr, 0, offset + start, length);
+		if (error != 0)
+			return (error);
+		if (++no >= vol->v_disks_count) {
+			no = 0;
+			offset += strip_size;
+		}
+		remain -= length;
+		addr += length;
+		start = 0;
+	} while (remain > 0);
+	return (0);
+}
+
+static void
+g_raid_tr_iodone_raid0(struct g_raid_tr_object *tr,
+    struct g_raid_subdisk *sd, struct bio *bp)
+{
+	struct bio *pbp;
+
+	pbp = bp->bio_parent;
+	if (pbp->bio_error == 0)
+		pbp->bio_error = bp->bio_error;
+	g_destroy_bio(bp);
+	pbp->bio_inbed++;
+	if (pbp->bio_children == pbp->bio_inbed) {
+		pbp->bio_completed = pbp->bio_length;
+		g_raid_iodone(pbp, bp->bio_error);
+	}
+}
+
+static int
+g_raid_tr_free_raid0(struct g_raid_tr_object *tr)
+{
+
+	return (0);
+}
+
+G_RAID_TR_DECLARE(g_raid_tr_raid0);
diff --git a/sys/geom/raid/tr_raid1.c b/sys/geom/raid/tr_raid1.c
new file mode 100644
index 0000000..b5e4953
--- /dev/null
+++ b/sys/geom/raid/tr_raid1.c
@@ -0,0 +1,993 @@
+/*-
+ * Copyright (c) 2010 Alexander Motin
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "geom/raid/g_raid.h"
+#include "g_raid_tr_if.h"
+
+SYSCTL_DECL(_kern_geom_raid);
+SYSCTL_NODE(_kern_geom_raid, OID_AUTO, raid1, CTLFLAG_RW, 0,
+    "RAID1 parameters");
+
+#define RAID1_REBUILD_SLAB	(1 << 20)	/* One transaction in a rebuild */
+static int g_raid1_rebuild_slab = RAID1_REBUILD_SLAB;
+TUNABLE_INT("kern.geom.raid.raid1.rebuild_slab_size",
+    &g_raid1_rebuild_slab);
+SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_slab_size, CTLFLAG_RW,
+    &g_raid1_rebuild_slab, 0,
+    "Amount of the disk to rebuild each read/write cycle of the rebuild.");
+
+#define RAID1_REBUILD_FAIR_IO	20	/* use 1/x of the available I/O */
+static int g_raid1_rebuild_fair_io = RAID1_REBUILD_FAIR_IO;
+TUNABLE_INT("kern.geom.raid.raid1.rebuild_fair_io",
+    &g_raid1_rebuild_fair_io);
+SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_fair_io, CTLFLAG_RW,
+    &g_raid1_rebuild_fair_io, 0,
+    "Fraction of the I/O bandwidth to use for rebuild when the disk is busy.");
+
+#define RAID1_REBUILD_CLUSTER_IDLE	100
+static int g_raid1_rebuild_cluster_idle = RAID1_REBUILD_CLUSTER_IDLE;
+TUNABLE_INT("kern.geom.raid.raid1.rebuild_cluster_idle",
+    &g_raid1_rebuild_cluster_idle);
+SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RW,
+    &g_raid1_rebuild_cluster_idle, 0,
+    "Number of slabs to do each time we trigger a rebuild cycle");
+
+#define RAID1_REBUILD_META_UPDATE	1024	/* update meta data every 1GB or so */
+static int g_raid1_rebuild_meta_update = RAID1_REBUILD_META_UPDATE;
+TUNABLE_INT("kern.geom.raid.raid1.rebuild_meta_update",
+    &g_raid1_rebuild_meta_update);
+SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_meta_update, CTLFLAG_RW,
+    &g_raid1_rebuild_meta_update, 0,
+    "When to update the meta data.");
+
+static MALLOC_DEFINE(M_TR_RAID1, "tr_raid1_data", "GEOM_RAID RAID1 data");
+
+#define TR_RAID1_NONE 0
+#define TR_RAID1_REBUILD 1
+#define TR_RAID1_RESYNC 2
+
+#define TR_RAID1_F_DOING_SOME	0x1
+#define TR_RAID1_F_LOCKED	0x2
+#define TR_RAID1_F_ABORT	0x4
+
+struct g_raid_tr_raid1_object {
+	struct g_raid_tr_object	 trso_base;
+	int			 trso_starting;
+	int			 trso_stopping;
+	int			 trso_type;
+	int			 trso_recover_slabs;	/* slabs before rest */
+	int			 trso_fair_io;
+	int			 trso_meta_update;
+	int			 trso_flags;
+	struct g_raid_subdisk	*trso_failed_sd;	/* like per volume */
+	void			*trso_buffer;		/* Buffer space */
+	struct bio		 trso_bio;
+};
+
+static g_raid_tr_taste_t g_raid_tr_taste_raid1;
+static g_raid_tr_event_t g_raid_tr_event_raid1;
+static g_raid_tr_start_t g_raid_tr_start_raid1;
+static g_raid_tr_stop_t g_raid_tr_stop_raid1;
+static g_raid_tr_iostart_t g_raid_tr_iostart_raid1;
+static g_raid_tr_iodone_t g_raid_tr_iodone_raid1;
+static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1;
+static g_raid_tr_locked_t g_raid_tr_locked_raid1;
+static g_raid_tr_idle_t g_raid_tr_idle_raid1;
+static g_raid_tr_free_t g_raid_tr_free_raid1;
+
+static kobj_method_t g_raid_tr_raid1_methods[] = {
+	KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid1),
+	KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid1),
+	KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid1),
+	KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid1),
+	KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid1),
+	KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid1),
+	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1),
+	KOBJMETHOD(g_raid_tr_locked, g_raid_tr_locked_raid1),
+	KOBJMETHOD(g_raid_tr_idle, g_raid_tr_idle_raid1),
+	KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid1),
+	{ 0, 0 }
+};
+
+static struct g_raid_tr_class g_raid_tr_raid1_class = {
+	"RAID1",
+	g_raid_tr_raid1_methods,
+	sizeof(struct g_raid_tr_raid1_object),
+	.trc_priority = 100
+};
+
+static void g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr);
+static void g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr,
+    struct g_raid_subdisk *sd);
+
+static int
+g_raid_tr_taste_raid1(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
+{
+	struct g_raid_tr_raid1_object *trs;
+
+	trs = (struct g_raid_tr_raid1_object *)tr;
+	if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1 ||
+	    tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_NONE)
+		return (G_RAID_TR_TASTE_FAIL);
+	trs->trso_starting = 1;
+	return (G_RAID_TR_TASTE_SUCCEED);
+}
+
+static int
+g_raid_tr_update_state_raid1(struct g_raid_volume *vol,
+    struct g_raid_subdisk *sd)
+{
+	struct g_raid_tr_raid1_object *trs;
+	struct g_raid_softc *sc;
+	struct g_raid_subdisk *tsd, *bestsd;
+	u_int s;
+	int i, na, ns;
+
+	sc = vol->v_softc;
+	trs = (struct g_raid_tr_raid1_object *)vol->v_tr;
+	if (trs->trso_stopping &&
+	    (trs->trso_flags & TR_RAID1_F_DOING_SOME) == 0)
+		s = G_RAID_VOLUME_S_STOPPED;
+	else if (trs->trso_starting)
+		s = G_RAID_VOLUME_S_STARTING;
+	else {
+		/* Make sure we have at least one ACTIVE disk. */
+		na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
+		if (na == 0) {
+			/*
+			 * Critical situation! We have no active disks!
+			 * Choose the best disk we have to make it active.
+			 */
+			bestsd = &vol->v_subdisks[0];
+			for (i = 1; i < vol->v_disks_count; i++) {
+				tsd = &vol->v_subdisks[i];
+				if (tsd->sd_state > bestsd->sd_state)
+					bestsd = tsd;
+				else if (tsd->sd_state == bestsd->sd_state &&
+				    (tsd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
+				     tsd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
+				    tsd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
+					bestsd = tsd;
+			}
+			if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED) {
+				/* We found a reasonable candidate. */
+				G_RAID_DEBUG1(1, sc,
+				    "Promote subdisk %s:%d from %s to ACTIVE.",
+				    vol->v_name, bestsd->sd_pos,
+				    g_raid_subdisk_state2str(bestsd->sd_state));
+				g_raid_change_subdisk_state(bestsd,
+				    G_RAID_SUBDISK_S_ACTIVE);
+				g_raid_write_metadata(sc,
+				    vol, bestsd, bestsd->sd_disk);
+			}
+		}
+		na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
+		ns = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
+		    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
+		if (na == vol->v_disks_count)
+			s = G_RAID_VOLUME_S_OPTIMAL;
+		else if (na + ns == vol->v_disks_count)
+			s = G_RAID_VOLUME_S_SUBOPTIMAL;
+		else if (na > 0)
+			s = G_RAID_VOLUME_S_DEGRADED;
+		else
+			s = G_RAID_VOLUME_S_BROKEN;
+		g_raid_tr_raid1_maybe_rebuild(vol->v_tr, sd);
+	}
+	if (s != vol->v_state) {
+		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
+ G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, + G_RAID_EVENT_VOLUME); + g_raid_change_volume_state(vol, s); + if (!trs->trso_starting && !trs->trso_stopping) + g_raid_write_metadata(sc, vol, NULL, NULL); + } + return (0); +} + +static void +g_raid_tr_raid1_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd, + struct g_raid_disk *disk) +{ + /* + * We don't fail the last disk in the pack, since it still has decent + * data on it and that's better than failing the disk if it is the root + * file system. + * + * XXX should this be controlled via a tunable? It makes sense for + * the volume that has / on it. I can't think of a case where we'd + * want the volume to go away on this kind of event. + */ + if (g_raid_nsubdisks(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == 1 && + g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == sd) + return; + g_raid_fail_disk(sc, sd, disk); +} + +static void +g_raid_tr_raid1_rebuild_some(struct g_raid_tr_object *tr) +{ + struct g_raid_tr_raid1_object *trs; + struct g_raid_subdisk *sd, *good_sd; + struct bio *bp; + + trs = (struct g_raid_tr_raid1_object *)tr; + if (trs->trso_flags & TR_RAID1_F_DOING_SOME) + return; + sd = trs->trso_failed_sd; + good_sd = g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE); + if (good_sd == NULL) { + g_raid_tr_raid1_rebuild_abort(tr); + return; + } + bp = &trs->trso_bio; + memset(bp, 0, sizeof(*bp)); + bp->bio_offset = sd->sd_rebuild_pos; + bp->bio_length = MIN(g_raid1_rebuild_slab, + sd->sd_size - sd->sd_rebuild_pos); + bp->bio_data = trs->trso_buffer; + bp->bio_cmd = BIO_READ; + bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; + bp->bio_caller1 = good_sd; + trs->trso_flags |= TR_RAID1_F_DOING_SOME; + trs->trso_flags |= TR_RAID1_F_LOCKED; + g_raid_lock_range(sd->sd_volume, /* Lock callback starts I/O */ + bp->bio_offset, bp->bio_length, NULL, bp); +} + +static void +g_raid_tr_raid1_rebuild_done(struct g_raid_tr_raid1_object *trs) +{ + struct g_raid_volume *vol; + struct g_raid_subdisk *sd; + + vol = trs->trso_base.tro_volume; + sd = trs->trso_failed_sd; + g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk); + free(trs->trso_buffer, M_TR_RAID1); + trs->trso_buffer = NULL; + trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; + trs->trso_type = TR_RAID1_NONE; + trs->trso_recover_slabs = 0; + trs->trso_failed_sd = NULL; + g_raid_tr_update_state_raid1(vol, NULL); +} + +static void +g_raid_tr_raid1_rebuild_finish(struct g_raid_tr_object *tr) +{ + struct g_raid_tr_raid1_object *trs; + struct g_raid_subdisk *sd; + + trs = (struct g_raid_tr_raid1_object *)tr; + sd = trs->trso_failed_sd; + G_RAID_DEBUG1(0, tr->tro_volume->v_softc, + "Subdisk %s:%d-%s rebuild completed.", + sd->sd_volume->v_name, sd->sd_pos, + sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); + g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); + sd->sd_rebuild_pos = 0; + g_raid_tr_raid1_rebuild_done(trs); +} + +static void +g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr) +{ + struct g_raid_tr_raid1_object *trs; + struct g_raid_subdisk *sd; + struct g_raid_volume *vol; + off_t len; + + vol = tr->tro_volume; + trs = (struct g_raid_tr_raid1_object *)tr; + sd = trs->trso_failed_sd; + if (trs->trso_flags & TR_RAID1_F_DOING_SOME) { + G_RAID_DEBUG1(1, vol->v_softc, + "Subdisk %s:%d-%s rebuild is aborting.", + sd->sd_volume->v_name, sd->sd_pos, + sd->sd_disk ? 
g_raid_get_diskname(sd->sd_disk) : "[none]"); + trs->trso_flags |= TR_RAID1_F_ABORT; + } else { + G_RAID_DEBUG1(0, vol->v_softc, + "Subdisk %s:%d-%s rebuild aborted.", + sd->sd_volume->v_name, sd->sd_pos, + sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); + trs->trso_flags &= ~TR_RAID1_F_ABORT; + if (trs->trso_flags & TR_RAID1_F_LOCKED) { + trs->trso_flags &= ~TR_RAID1_F_LOCKED; + len = MIN(g_raid1_rebuild_slab, + sd->sd_size - sd->sd_rebuild_pos); + g_raid_unlock_range(tr->tro_volume, + sd->sd_rebuild_pos, len); + } + g_raid_tr_raid1_rebuild_done(trs); + } +} + +static void +g_raid_tr_raid1_rebuild_start(struct g_raid_tr_object *tr) +{ + struct g_raid_volume *vol; + struct g_raid_tr_raid1_object *trs; + struct g_raid_subdisk *sd, *fsd; + + vol = tr->tro_volume; + trs = (struct g_raid_tr_raid1_object *)tr; + if (trs->trso_failed_sd) { + G_RAID_DEBUG1(1, vol->v_softc, + "Already rebuild in start rebuild. pos %jd\n", + (intmax_t)trs->trso_failed_sd->sd_rebuild_pos); + return; + } + sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_ACTIVE); + if (sd == NULL) { + G_RAID_DEBUG1(1, vol->v_softc, + "No active disk to rebuild. night night."); + return; + } + fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC); + if (fsd == NULL) + fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD); + if (fsd == NULL) { + fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE); + if (fsd != NULL) { + fsd->sd_rebuild_pos = 0; + g_raid_change_subdisk_state(fsd, + G_RAID_SUBDISK_S_RESYNC); + g_raid_write_metadata(vol->v_softc, vol, fsd, NULL); + } else { + fsd = g_raid_get_subdisk(vol, + G_RAID_SUBDISK_S_UNINITIALIZED); + if (fsd == NULL) + fsd = g_raid_get_subdisk(vol, + G_RAID_SUBDISK_S_NEW); + if (fsd != NULL) { + fsd->sd_rebuild_pos = 0; + g_raid_change_subdisk_state(fsd, + G_RAID_SUBDISK_S_REBUILD); + g_raid_write_metadata(vol->v_softc, + vol, fsd, NULL); + } + } + } + if (fsd == NULL) { + G_RAID_DEBUG1(1, vol->v_softc, + "No failed disk to rebuild. night night."); + return; + } + trs->trso_failed_sd = fsd; + G_RAID_DEBUG1(0, vol->v_softc, + "Subdisk %s:%d-%s rebuild start at %jd.", + fsd->sd_volume->v_name, fsd->sd_pos, + fsd->sd_disk ? g_raid_get_diskname(fsd->sd_disk) : "[none]", + trs->trso_failed_sd->sd_rebuild_pos); + trs->trso_type = TR_RAID1_REBUILD; + trs->trso_buffer = malloc(g_raid1_rebuild_slab, M_TR_RAID1, M_WAITOK); + trs->trso_meta_update = g_raid1_rebuild_meta_update; + g_raid_tr_raid1_rebuild_some(tr); +} + + +static void +g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr, + struct g_raid_subdisk *sd) +{ + struct g_raid_volume *vol; + struct g_raid_tr_raid1_object *trs; + int na, nr; + + /* + * If we're stopping, don't do anything. If we don't have at least one + * good disk and one bad disk, we don't do anything. And if there's a + * 'good disk' stored in the trs, then we're in progress and we punt. + * If we make it past all these checks, we need to rebuild. 
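+	 * For reference, the candidate order used by rebuild_start() above
+	 * is: a RESYNC already in progress, then REBUILD, then STALE
+	 * (restarted as RESYNC from position 0), and finally
+	 * UNINITIALIZED/NEW (started as REBUILD from position 0).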
+ */ + vol = tr->tro_volume; + trs = (struct g_raid_tr_raid1_object *)tr; + if (trs->trso_stopping) + return; + na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); + nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) + + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); + switch(trs->trso_type) { + case TR_RAID1_NONE: + if (na == 0) + return; + if (nr == 0) { + nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) + + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + + g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED); + if (nr == 0) + return; + } + g_raid_tr_raid1_rebuild_start(tr); + break; + case TR_RAID1_REBUILD: + if (na == 0 || nr == 0 || trs->trso_failed_sd == sd) + g_raid_tr_raid1_rebuild_abort(tr); + break; + case TR_RAID1_RESYNC: + break; + } +} + +static int +g_raid_tr_event_raid1(struct g_raid_tr_object *tr, + struct g_raid_subdisk *sd, u_int event) +{ + + g_raid_tr_update_state_raid1(tr->tro_volume, sd); + return (0); +} + +static int +g_raid_tr_start_raid1(struct g_raid_tr_object *tr) +{ + struct g_raid_tr_raid1_object *trs; + struct g_raid_volume *vol; + + trs = (struct g_raid_tr_raid1_object *)tr; + vol = tr->tro_volume; + trs->trso_starting = 0; + g_raid_tr_update_state_raid1(vol, NULL); + return (0); +} + +static int +g_raid_tr_stop_raid1(struct g_raid_tr_object *tr) +{ + struct g_raid_tr_raid1_object *trs; + struct g_raid_volume *vol; + + trs = (struct g_raid_tr_raid1_object *)tr; + vol = tr->tro_volume; + trs->trso_starting = 0; + trs->trso_stopping = 1; + g_raid_tr_update_state_raid1(vol, NULL); + return (0); +} + +/* + * Select the disk to read from. Take into account: subdisk state, running + * error recovery, average disk load, head position and possible cache hits. + */ +#define ABS(x) (((x) >= 0) ? (x) : (-(x))) +static struct g_raid_subdisk * +g_raid_tr_raid1_select_read_disk(struct g_raid_volume *vol, struct bio *bp, + u_int mask) +{ + struct g_raid_subdisk *sd, *best; + int i, prio, bestprio; + + best = NULL; + bestprio = INT_MAX; + for (i = 0; i < vol->v_disks_count; i++) { + sd = &vol->v_subdisks[i]; + if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE && + ((sd->sd_state != G_RAID_SUBDISK_S_REBUILD && + sd->sd_state != G_RAID_SUBDISK_S_RESYNC) || + bp->bio_offset + bp->bio_length > sd->sd_rebuild_pos)) + continue; + if ((mask & (1 << i)) != 0) + continue; + prio = G_RAID_SUBDISK_LOAD(sd); + prio += min(sd->sd_recovery, 255) << 22; + prio += (G_RAID_SUBDISK_S_ACTIVE - sd->sd_state) << 16; + /* If disk head is precisely in position - highly prefer it. */ + if (G_RAID_SUBDISK_POS(sd) == bp->bio_offset) + prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE; + else + /* If disk head is close to position - prefer it. 
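+		 * ("Close" here means within G_RAID_SUBDISK_TRACK_SIZE.)
+		 * Summing up the scoring above: per-disk load forms the low
+		 * bits, each running recovery weighs 1 << 22, every state
+		 * step below ACTIVE weighs 1 << 16, and head locality earns
+		 * a bonus of one or two G_RAID_SUBDISK_LOAD_SCALE units.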
+		 */
+		if (ABS(G_RAID_SUBDISK_POS(sd) - bp->bio_offset) <
+		    G_RAID_SUBDISK_TRACK_SIZE)
+			prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
+		if (prio < bestprio) {
+			best = sd;
+			bestprio = prio;
+		}
+	}
+	return (best);
+}
+
+static void
+g_raid_tr_iostart_raid1_read(struct g_raid_tr_object *tr, struct bio *bp)
+{
+	struct g_raid_subdisk *sd;
+	struct bio *cbp;
+
+	sd = g_raid_tr_raid1_select_read_disk(tr->tro_volume, bp, 0);
+	KASSERT(sd != NULL, ("No active disks in volume %s.",
+	    tr->tro_volume->v_name));
+
+	cbp = g_clone_bio(bp);
+	if (cbp == NULL) {
+		g_raid_iodone(bp, ENOMEM);
+		return;
+	}
+
+	g_raid_subdisk_iostart(sd, cbp);
+}
+
+static void
+g_raid_tr_iostart_raid1_write(struct g_raid_tr_object *tr, struct bio *bp)
+{
+	struct g_raid_softc *sc;
+	struct g_raid_volume *vol;
+	struct g_raid_subdisk *sd;
+	struct bio_queue_head queue;
+	struct bio *cbp;
+	int i;
+
+	vol = tr->tro_volume;
+	sc = vol->v_softc;
+
+	/*
+	 * Allocate all bios before sending any request, so we can return
+	 * ENOMEM in a nice and clean way.
+	 */
+	bioq_init(&queue);
+	for (i = 0; i < vol->v_disks_count; i++) {
+		sd = &vol->v_subdisks[i];
+		switch (sd->sd_state) {
+		case G_RAID_SUBDISK_S_ACTIVE:
+			break;
+		case G_RAID_SUBDISK_S_REBUILD:
+			/*
+			 * When rebuilding, only part of this subdisk is
+			 * writable, the rest will be written as part of
+			 * that process.
+			 */
+			if (bp->bio_offset >= sd->sd_rebuild_pos)
+				continue;
+			break;
+		case G_RAID_SUBDISK_S_STALE:
+		case G_RAID_SUBDISK_S_RESYNC:
+			/*
+			 * Resyncing still writes on the theory that the
+			 * resync'd disk is very close and writing it will
+			 * keep it that way better if we keep up while
+			 * resyncing.
+			 */
+			break;
+		default:
+			continue;
+		}
+		cbp = g_clone_bio(bp);
+		if (cbp == NULL)
+			goto failure;
+		cbp->bio_caller1 = sd;
+		bioq_insert_tail(&queue, cbp);
+	}
+	for (cbp = bioq_first(&queue); cbp != NULL;
+	    cbp = bioq_first(&queue)) {
+		bioq_remove(&queue, cbp);
+		sd = cbp->bio_caller1;
+		cbp->bio_caller1 = NULL;
+		g_raid_subdisk_iostart(sd, cbp);
+	}
+	return;
+failure:
+	for (cbp = bioq_first(&queue); cbp != NULL;
+	    cbp = bioq_first(&queue)) {
+		bioq_remove(&queue, cbp);
+		g_destroy_bio(cbp);
+	}
+	if (bp->bio_error == 0)
+		bp->bio_error = ENOMEM;
+	g_raid_iodone(bp, bp->bio_error);
+}
+
+static void
+g_raid_tr_iostart_raid1(struct g_raid_tr_object *tr, struct bio *bp)
+{
+	struct g_raid_volume *vol;
+	struct g_raid_tr_raid1_object *trs;
+
+	vol = tr->tro_volume;
+	trs = (struct g_raid_tr_raid1_object *)tr;
+	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
+	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
+	    vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
+		g_raid_iodone(bp, EIO);
+		return;
+	}
+	/*
+	 * If we're rebuilding, squeeze in rebuild activity every so often,
+	 * even when the disk is busy.  Be sure to only count real I/O
+	 * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
+	 * by this module.
+	 */
+	if (trs->trso_failed_sd != NULL &&
+	    !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
+		/* Cut the new or currently running rebuild round short.
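+		 * With the defaults this admits roughly one rebuild slab
+		 * (1 MB) per g_raid1_rebuild_fair_io = 20 regular I/Os on a
+		 * busy volume, while an idle volume instead gets
+		 * g_raid1_rebuild_cluster_idle = 100 slabs per idle call.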
*/ + trs->trso_recover_slabs = 0; + if (--trs->trso_fair_io <= 0) { + trs->trso_fair_io = g_raid1_rebuild_fair_io; + g_raid_tr_raid1_rebuild_some(tr); + } + } + switch (bp->bio_cmd) { + case BIO_READ: + g_raid_tr_iostart_raid1_read(tr, bp); + break; + case BIO_WRITE: + g_raid_tr_iostart_raid1_write(tr, bp); + break; + case BIO_DELETE: + g_raid_iodone(bp, EIO); + break; + case BIO_FLUSH: + g_raid_tr_flush_common(tr, bp); + break; + default: + KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)", + bp->bio_cmd, vol->v_name)); + break; + } +} + +static void +g_raid_tr_iodone_raid1(struct g_raid_tr_object *tr, + struct g_raid_subdisk *sd, struct bio *bp) +{ + struct bio *cbp; + struct g_raid_subdisk *nsd; + struct g_raid_volume *vol; + struct bio *pbp; + struct g_raid_tr_raid1_object *trs; + uintptr_t *mask; + int error, do_write; + + trs = (struct g_raid_tr_raid1_object *)tr; + vol = tr->tro_volume; + if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) { + /* + * This operation is part of a rebuild or resync operation. + * See what work just got done, then schedule the next bit of + * work, if any. Rebuild/resync is done a little bit at a + * time. Either when a timeout happens, or after we get a + * bunch of I/Os to the disk (to make sure an active system + * will complete in a sane amount of time). + * + * We are setup to do differing amounts of work for each of + * these cases. so long as the slabs is smallish (less than + * 50 or so, I'd guess, but that's just a WAG), we shouldn't + * have any bio starvation issues. For active disks, we do + * 5MB of data, for inactive ones, we do 50MB. + */ + if (trs->trso_type == TR_RAID1_REBUILD) { + if (bp->bio_cmd == BIO_READ) { + + /* Immediately abort rebuild, if requested. */ + if (trs->trso_flags & TR_RAID1_F_ABORT) { + trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; + g_raid_tr_raid1_rebuild_abort(tr); + return; + } + + /* On read error, skip and cross fingers. */ + if (bp->bio_error != 0) { + G_RAID_LOGREQ(0, bp, + "Read error during rebuild (%d), " + "possible data loss!", + bp->bio_error); + goto rebuild_round_done; + } + + /* + * The read operation finished, queue the + * write and get out. + */ + G_RAID_LOGREQ(4, bp, "rebuild read done. %d", + bp->bio_error); + bp->bio_cmd = BIO_WRITE; + bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; + bp->bio_offset = bp->bio_offset; + bp->bio_length = bp->bio_length; + G_RAID_LOGREQ(4, bp, "Queueing rebuild write."); + g_raid_subdisk_iostart(trs->trso_failed_sd, bp); + } else { + /* + * The write operation just finished. Do + * another. We keep cloning the master bio + * since it has the right buffers allocated to + * it. + */ + G_RAID_LOGREQ(4, bp, + "rebuild write done. 
Error %d", + bp->bio_error); + nsd = trs->trso_failed_sd; + if (bp->bio_error != 0 || + trs->trso_flags & TR_RAID1_F_ABORT) { + if ((trs->trso_flags & + TR_RAID1_F_ABORT) == 0) { + g_raid_tr_raid1_fail_disk(sd->sd_softc, + nsd, nsd->sd_disk); + } + trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; + g_raid_tr_raid1_rebuild_abort(tr); + return; + } +rebuild_round_done: + nsd = trs->trso_failed_sd; + trs->trso_flags &= ~TR_RAID1_F_LOCKED; + g_raid_unlock_range(sd->sd_volume, + bp->bio_offset, bp->bio_length); + nsd->sd_rebuild_pos += bp->bio_length; + if (nsd->sd_rebuild_pos >= nsd->sd_size) { + g_raid_tr_raid1_rebuild_finish(tr); + return; + } + + /* Abort rebuild if we are stopping */ + if (trs->trso_stopping) { + trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; + g_raid_tr_raid1_rebuild_abort(tr); + return; + } + + if (--trs->trso_meta_update <= 0) { + g_raid_write_metadata(vol->v_softc, + vol, nsd, nsd->sd_disk); + trs->trso_meta_update = + g_raid1_rebuild_meta_update; + } + trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; + if (--trs->trso_recover_slabs <= 0) + return; + g_raid_tr_raid1_rebuild_some(tr); + } + } else if (trs->trso_type == TR_RAID1_RESYNC) { + /* + * read good sd, read bad sd in parallel. when both + * done, compare the buffers. write good to the bad + * if different. do the next bit of work. + */ + panic("Somehow, we think we're doing a resync"); + } + return; + } + pbp = bp->bio_parent; + pbp->bio_inbed++; + if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) { + /* + * Read failed on first drive. Retry the read error on + * another disk drive, if available, before erroring out the + * read. + */ + sd->sd_disk->d_read_errs++; + G_RAID_LOGREQ(0, bp, + "Read error (%d), %d read errors total", + bp->bio_error, sd->sd_disk->d_read_errs); + + /* + * If there are too many read errors, we move to degraded. + * XXX Do we want to FAIL the drive (eg, make the user redo + * everything to get it back in sync), or just degrade the + * drive, which kicks off a resync? + */ + do_write = 1; + if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh) { + g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk); + if (pbp->bio_children == 1) + do_write = 0; + } + + /* + * Find the other disk, and try to do the I/O to it. + */ + mask = (uintptr_t *)(&pbp->bio_driver2); + if (pbp->bio_children == 1) { + /* Save original subdisk. */ + pbp->bio_driver1 = do_write ? sd : NULL; + *mask = 0; + } + *mask |= 1 << sd->sd_pos; + nsd = g_raid_tr_raid1_select_read_disk(vol, pbp, *mask); + if (nsd != NULL && (cbp = g_clone_bio(pbp)) != NULL) { + g_destroy_bio(bp); + G_RAID_LOGREQ(2, cbp, "Retrying read from %d", + nsd->sd_pos); + if (pbp->bio_children == 2 && do_write) { + sd->sd_recovery++; + cbp->bio_caller1 = nsd; + pbp->bio_pflags = G_RAID_BIO_FLAG_LOCKED; + /* Lock callback starts I/O */ + g_raid_lock_range(sd->sd_volume, + cbp->bio_offset, cbp->bio_length, pbp, cbp); + } else { + g_raid_subdisk_iostart(nsd, cbp); + } + return; + } + /* + * We can't retry. Return the original error by falling + * through. This will happen when there's only one good disk. + * We don't need to fail the raid, since its actual state is + * based on the state of the subdisks. + */ + G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it"); + } + if (bp->bio_cmd == BIO_READ && + bp->bio_error == 0 && + pbp->bio_children > 1 && + pbp->bio_driver1 != NULL) { + /* + * If it was a read, and bio_children is >1, then we just + * recovered the data from the second drive. 
We should try to
+		 * write that data to the first drive if sector remapping is
+		 * enabled.  A write should put the data in a new place on the
+		 * disk, remapping the bad sector.  Do we need to do that by
+		 * queueing a request to the main worker thread?  It doesn't
+		 * affect the return code of this current read, and can be
+		 * done at our leisure.  However, to make the code simpler, it
+		 * is done synchronously.
+		 */
+		G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
+		cbp = g_clone_bio(pbp);
+		if (cbp != NULL) {
+			g_destroy_bio(bp);
+			cbp->bio_cmd = BIO_WRITE;
+			cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
+			G_RAID_LOGREQ(2, cbp,
+			    "Attempting bad sector remap on failing drive.");
+			g_raid_subdisk_iostart(pbp->bio_driver1, cbp);
+			return;
+		}
+	}
+	if (pbp->bio_pflags & G_RAID_BIO_FLAG_LOCKED) {
+		/*
+		 * We're done with a recovery, mark the range as unlocked.
+		 * For any write errors, we aggressively fail the disk since
+		 * there was both a READ and a WRITE error at this location.
+		 * Both types of errors generally indicate the drive is on
+		 * the verge of total failure anyway.  Better to stop trusting
+		 * it now.  However, we need to reset error to 0 in that case
+		 * because we're not failing the original I/O which succeeded.
+		 */
+		if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
+			G_RAID_LOGREQ(0, bp, "Remap write failed: "
+			    "failing subdisk.");
+			g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk);
+			bp->bio_error = 0;
+		}
+		if (pbp->bio_driver1 != NULL) {
+			((struct g_raid_subdisk *)pbp->bio_driver1)
+			    ->sd_recovery--;
+		}
+		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
+		g_raid_unlock_range(sd->sd_volume, bp->bio_offset,
+		    bp->bio_length);
+	}
+	error = bp->bio_error;
+	g_destroy_bio(bp);
+	if (pbp->bio_children == pbp->bio_inbed) {
+		pbp->bio_completed = pbp->bio_length;
+		g_raid_iodone(pbp, error);
+	}
+}
+
+static int
+g_raid_tr_kerneldump_raid1(struct g_raid_tr_object *tr,
+    void *virtual, vm_offset_t physical, off_t offset, size_t length)
+{
+	struct g_raid_volume *vol;
+	struct g_raid_subdisk *sd;
+	int error, i, ok;
+
+	vol = tr->tro_volume;
+	error = 0;
+	ok = 0;
+	for (i = 0; i < vol->v_disks_count; i++) {
+		sd = &vol->v_subdisks[i];
+		switch (sd->sd_state) {
+		case G_RAID_SUBDISK_S_ACTIVE:
+			break;
+		case G_RAID_SUBDISK_S_REBUILD:
+			/*
+			 * When rebuilding, only part of this subdisk is
+			 * writable, the rest will be written as part of
+			 * that process.
+			 */
+			if (offset >= sd->sd_rebuild_pos)
+				continue;
+			break;
+		case G_RAID_SUBDISK_S_STALE:
+		case G_RAID_SUBDISK_S_RESYNC:
+			/*
+			 * Resyncing still writes on the theory that the
+			 * resync'd disk is very close and writing it will
+			 * keep it that way better if we keep up while
+			 * resyncing.
+			 */
+			break;
+		default:
+			continue;
+		}
+		error = g_raid_subdisk_kerneldump(sd,
+		    virtual, physical, offset, length);
+		if (error == 0)
+			ok++;
+	}
+	return (ok > 0 ?
0 : error); +} + +static int +g_raid_tr_locked_raid1(struct g_raid_tr_object *tr, void *argp) +{ + struct bio *bp; + struct g_raid_subdisk *sd; + + bp = (struct bio *)argp; + sd = (struct g_raid_subdisk *)bp->bio_caller1; + g_raid_subdisk_iostart(sd, bp); + + return (0); +} + +static int +g_raid_tr_idle_raid1(struct g_raid_tr_object *tr) +{ + struct g_raid_tr_raid1_object *trs; + + trs = (struct g_raid_tr_raid1_object *)tr; + trs->trso_fair_io = g_raid1_rebuild_fair_io; + trs->trso_recover_slabs = g_raid1_rebuild_cluster_idle; + if (trs->trso_type == TR_RAID1_REBUILD) + g_raid_tr_raid1_rebuild_some(tr); + return (0); +} + +static int +g_raid_tr_free_raid1(struct g_raid_tr_object *tr) +{ + struct g_raid_tr_raid1_object *trs; + + trs = (struct g_raid_tr_raid1_object *)tr; + + if (trs->trso_buffer != NULL) { + free(trs->trso_buffer, M_TR_RAID1); + trs->trso_buffer = NULL; + } + return (0); +} + +G_RAID_TR_DECLARE(g_raid_tr_raid1); diff --git a/sys/geom/raid/tr_raid1e.c b/sys/geom/raid/tr_raid1e.c new file mode 100644 index 0000000..9ebe218 --- /dev/null +++ b/sys/geom/raid/tr_raid1e.c @@ -0,0 +1,1227 @@ +/*- + * Copyright (c) 2010 Alexander Motin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "geom/raid/g_raid.h"
+#include "g_raid_tr_if.h"
+
+#define N	2	/* Number of copies of each data strip. */
+
+SYSCTL_DECL(_kern_geom_raid);
+SYSCTL_NODE(_kern_geom_raid, OID_AUTO, raid1e, CTLFLAG_RW, 0,
+    "RAID1E parameters");
+
+#define RAID1E_REBUILD_SLAB	(1 << 20)	/* One transaction in a rebuild */
+static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
+TUNABLE_INT("kern.geom.raid.raid1e.rebuild_slab_size",
+    &g_raid1e_rebuild_slab);
+SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RW,
+    &g_raid1e_rebuild_slab, 0,
+    "Amount of the disk to rebuild in each read/write cycle of the rebuild.");
+
+#define RAID1E_REBUILD_FAIR_IO	20	/* use 1/x of the available I/O */
+static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
+TUNABLE_INT("kern.geom.raid.raid1e.rebuild_fair_io",
+    &g_raid1e_rebuild_fair_io);
+SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RW,
+    &g_raid1e_rebuild_fair_io, 0,
+    "Fraction of the I/O bandwidth to use for rebuild when the disk is busy.");
+
+#define RAID1E_REBUILD_CLUSTER_IDLE	100
+static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
+TUNABLE_INT("kern.geom.raid.raid1e.rebuild_cluster_idle",
+    &g_raid1e_rebuild_cluster_idle);
+SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RW,
+    &g_raid1e_rebuild_cluster_idle, 0,
+    "Number of slabs to process each time we trigger a rebuild cycle.");
+
+#define RAID1E_REBUILD_META_UPDATE	1024	/* update metadata every 1GB or so */
+static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
+TUNABLE_INT("kern.geom.raid.raid1e.rebuild_meta_update",
+    &g_raid1e_rebuild_meta_update);
+SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RW,
+    &g_raid1e_rebuild_meta_update, 0,
+    "How often to update the metadata.");
+
+static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");
+
+#define TR_RAID1E_NONE		0
+#define TR_RAID1E_REBUILD	1
+#define TR_RAID1E_RESYNC	2
+
+#define TR_RAID1E_F_DOING_SOME	0x1
+#define TR_RAID1E_F_LOCKED	0x2
+#define TR_RAID1E_F_ABORT	0x4
+
+struct g_raid_tr_raid1e_object {
+	struct g_raid_tr_object	 trso_base;
+	int			 trso_starting;
+	int			 trso_stopping;
+	int			 trso_type;
+	int			 trso_recover_slabs;	/* slabs before rest */
+	int			 trso_fair_io;
+	int			 trso_meta_update;
+	int			 trso_flags;
+	struct g_raid_subdisk	*trso_failed_sd;	/* failed subdisk being rebuilt */
+	void			*trso_buffer;		/* Buffer space */
+	off_t			 trso_lock_pos;		/* Locked range start. */
+	off_t			 trso_lock_len;		/* Locked range length. */
+	struct bio		 trso_bio;
+};
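As a rough illustration of how the four tunables above interact, here is a standalone sketch (illustrative only; it simply multiplies out the defaults defined above): an idle rebuild burst covers about 100 MiB, metadata is refreshed roughly every 1 GiB of rebuilt data, and on a busy volume one slab is rebuilt per twenty user I/Os.

	#include <stdio.h>

	int
	main(void)
	{
		long long slab = 1LL << 20;	/* rebuild_slab_size: 1 MiB */
		int fair_io = 20;		/* one burst per 20 user I/Os */
		int cluster_idle = 100;		/* slabs per idle burst */
		int meta_update = 1024;		/* slabs between metadata writes */

		printf("idle burst:      %lld MiB\n", slab * cluster_idle >> 20);
		printf("metadata period: %lld MiB\n", slab * meta_update >> 20);
		printf("busy volume:     1 rebuild slab per %d user I/Os\n",
		    fair_io);
		return (0);
	}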
+
+static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
+static g_raid_tr_event_t g_raid_tr_event_raid1e;
+static g_raid_tr_start_t g_raid_tr_start_raid1e;
+static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
+static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
+static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
+static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
+static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
+static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
+static g_raid_tr_free_t g_raid_tr_free_raid1e;
+
+static kobj_method_t g_raid_tr_raid1e_methods[] = {
+	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_raid1e),
+	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_raid1e),
+	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_raid1e),
+	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_raid1e),
+	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_raid1e),
+	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_raid1e),
+	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
+	KOBJMETHOD(g_raid_tr_locked,	g_raid_tr_locked_raid1e),
+	KOBJMETHOD(g_raid_tr_idle,	g_raid_tr_idle_raid1e),
+	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_raid1e),
+	{ 0, 0 }
+};
+
+static struct g_raid_tr_class g_raid_tr_raid1e_class = {
+	"RAID1E",
+	g_raid_tr_raid1e_methods,
+	sizeof(struct g_raid_tr_raid1e_object),
+	.trc_priority = 200
+};
+
+static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
+static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
+    struct g_raid_subdisk *sd);
+static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
+    int no, off_t off, off_t len, u_int mask);
+
+/*
+ * Translate a virtual (volume) offset into the number of the disk holding
+ * the first copy, the strip base offset on that disk, and the offset
+ * within the strip.
+ */
+static inline void
+V2P(struct g_raid_volume *vol, off_t virt,
+    int *disk, off_t *offset, off_t *start)
+{
+	off_t nstrip;
+	u_int strip_size;
+
+	strip_size = vol->v_strip_size;
+	/* Strip number. */
+	nstrip = virt / strip_size;
+	/* Start position in strip. */
+	*start = virt % strip_size;
+	/* Disk number. */
+	*disk = (nstrip * N) % vol->v_disks_count;
+	/* Strip start position in disk. */
+	*offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
+}
+
+/*
+ * Translate a physical disk position back into a virtual (volume) offset
+ * and the index of the copy it belongs to; the inverse of V2P().
+ */
+static inline void
+P2V(struct g_raid_volume *vol, int disk, off_t offset,
+    off_t *virt, int *copy)
+{
+	off_t nstrip, start;
+	u_int strip_size;
+
+	strip_size = vol->v_strip_size;
+	/* Start position in strip. */
+	start = offset % strip_size;
+	/* Physical strip number. */
+	nstrip = (offset / strip_size) * vol->v_disks_count + disk;
+	/* Index of the physical strip (copy) inside the virtual strip. */
+	*copy = nstrip % N;
+	/* Offset in virtual space. */
+	*virt = (nstrip / N) * strip_size + start;
+}
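The V2P/P2V arithmetic above is the heart of the RAID1E layout, so here is a standalone userland rendition of it that can be compiled and run. The volume geometry (3 disks, 64 KiB strips) is invented for the example; the formulas are the same as in the driver, and the loop checks that p2v() round-trips what v2p() produced.

	#include <stdio.h>

	#define N 2	/* copies per strip, as in the driver */

	struct geom { int disks; long long strip; };

	/* Virtual offset -> disk of the first copy, strip base, in-strip start. */
	static void
	v2p(const struct geom *g, long long virt,
	    int *disk, long long *base, long long *start)
	{
		long long nstrip = virt / g->strip;

		*start = virt % g->strip;
		*disk = (int)((nstrip * N) % g->disks);
		*base = ((nstrip * N) / g->disks) * g->strip;
	}

	/* Disk position -> virtual offset and copy index; inverse of v2p(). */
	static void
	p2v(const struct geom *g, int disk, long long offset,
	    long long *virt, int *copy)
	{
		long long start = offset % g->strip;
		long long nstrip = (offset / g->strip) * g->disks + disk;

		*copy = (int)(nstrip % N);
		*virt = (nstrip / N) * g->strip + start;
	}

	int
	main(void)
	{
		struct geom g = { 3, 65536 };
		long long base, start, back;
		int disk, copy;

		for (long long virt = 0; virt < 4 * g.strip; virt += g.strip) {
			v2p(&g, virt, &disk, &base, &start);
			p2v(&g, disk, base + start, &back, &copy);
			printf("virt %7lld -> disk %d base %7lld "
			    "(copy %d, round trip %7lld)\n",
			    virt, disk, base, copy, back);
		}
		return (0);
	}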
+
+static int
+g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
+{
+	struct g_raid_tr_raid1e_object *trs;
+
+	trs = (struct g_raid_tr_raid1e_object *)tr;
+	if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
+	    tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_NONE)
+		return (G_RAID_TR_TASTE_FAIL);
+	trs->trso_starting = 1;
+	return (G_RAID_TR_TASTE_SUCCEED);
+}
+
+static int
+g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
+{
+	struct g_raid_tr_raid1e_object *trs;
+	struct g_raid_softc *sc;
+	struct g_raid_subdisk *sd, *bestsd, *worstsd;
+	int i, j, state, sstate;
+
+	sc = vol->v_softc;
+	trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
+	state = G_RAID_VOLUME_S_OPTIMAL;
+	for (i = 0; i < vol->v_disks_count / N; i++) {
+		bestsd = &vol->v_subdisks[i * N];
+		for (j = 1; j < N; j++) {
+			sd = &vol->v_subdisks[i * N + j];
+			if (sd->sd_state > bestsd->sd_state)
+				bestsd = sd;
+			else if (sd->sd_state == bestsd->sd_state &&
+			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
+			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
+			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
+				bestsd = sd;
+		}
+		if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
+		    bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
+			/* We found a reasonable candidate. */
+			G_RAID_DEBUG1(1, sc,
+			    "Promote subdisk %s:%d from %s to ACTIVE.",
+			    vol->v_name, bestsd->sd_pos,
+			    g_raid_subdisk_state2str(bestsd->sd_state));
+			g_raid_change_subdisk_state(bestsd,
+			    G_RAID_SUBDISK_S_ACTIVE);
+			g_raid_write_metadata(sc,
+			    vol, bestsd, bestsd->sd_disk);
+		}
+		worstsd = &vol->v_subdisks[i * N];
+		for (j = 1; j < N; j++) {
+			sd = &vol->v_subdisks[i * N + j];
+			if (sd->sd_state < worstsd->sd_state)
+				worstsd = sd;
+		}
+		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
+			sstate = G_RAID_VOLUME_S_OPTIMAL;
+		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
+			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
+		else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
+			sstate = G_RAID_VOLUME_S_DEGRADED;
+		else
+			sstate = G_RAID_VOLUME_S_BROKEN;
+		if (sstate < state)
+			state = sstate;
+	}
+	return (state);
+}
+
+static int
+g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
+{
+	struct g_raid_tr_raid1e_object *trs;
+	struct g_raid_softc *sc;
+	struct g_raid_subdisk *sd, *bestsd, *worstsd;
+	int i, j, state, sstate;
+
+	sc = vol->v_softc;
+	trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
+	if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
+	    vol->v_disks_count)
+		return (G_RAID_VOLUME_S_OPTIMAL);
+	for (i = 0; i < vol->v_disks_count; i++) {
+		sd = &vol->v_subdisks[i];
+		if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
+			/* We found a reasonable candidate. */
+			G_RAID_DEBUG1(1, sc,
+			    "Promote subdisk %s:%d from %s to STALE.",
+			    vol->v_name, sd->sd_pos,
+			    g_raid_subdisk_state2str(sd->sd_state));
+			g_raid_change_subdisk_state(sd,
+			    G_RAID_SUBDISK_S_STALE);
+			g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
+		}
+	}
+	state = G_RAID_VOLUME_S_OPTIMAL;
+	for (i = 0; i < vol->v_disks_count; i++) {
+		bestsd = &vol->v_subdisks[i];
+		worstsd = &vol->v_subdisks[i];
+		for (j = 1; j < N; j++) {
+			sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
+			if (sd->sd_state > bestsd->sd_state)
+				bestsd = sd;
+			else if (sd->sd_state == bestsd->sd_state &&
+			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
+			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
+			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
+				bestsd = sd;
+			if (sd->sd_state < worstsd->sd_state)
+				worstsd = sd;
+		}
+		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
+			sstate = G_RAID_VOLUME_S_OPTIMAL;
+		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
+			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
+		else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
+			sstate = G_RAID_VOLUME_S_DEGRADED;
+		else
+			sstate = G_RAID_VOLUME_S_BROKEN;
+		if (sstate < state)
+			state = sstate;
+	}
+	return (state);
+}
+
+static int
+g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
+    struct g_raid_subdisk *sd)
+{
+	struct g_raid_tr_raid1e_object *trs;
+	struct g_raid_softc *sc;
+	u_int s;
+
+	sc = vol->v_softc;
+	trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
+	if (trs->trso_stopping &&
+	    (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
+		s = G_RAID_VOLUME_S_STOPPED;
+	else if (trs->trso_starting)
+		s = G_RAID_VOLUME_S_STARTING;
+	else {
+		if ((vol->v_disks_count % N) == 0)
+			s = g_raid_tr_update_state_raid1e_even(vol);
+		else
+			s = g_raid_tr_update_state_raid1e_odd(vol);
+	}
+	if (s != vol->v_state) {
+		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
+		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
+		    G_RAID_EVENT_VOLUME);
+		g_raid_change_volume_state(vol, s);
+		if (!trs->trso_starting && !trs->trso_stopping)
+			g_raid_write_metadata(sc, vol, NULL, NULL);
+	}
+	if (!trs->trso_starting && !trs->trso_stopping)
+		g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
+	return (0);
+}
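The state functions above reduce each group of N copies to a best and a worst subdisk and take the worst result over all groups. A minimal userland rendition of that aggregation, with N = 2, follows; the enum orderings here are invented for the example (the real values come from g_raid.h), and only their relative order matters.

	#include <stdio.h>

	/* Invented health order: higher is healthier. */
	enum sd { SD_FAILED, SD_NEW, SD_REBUILD, SD_RESYNC, SD_STALE, SD_ACTIVE };
	enum vs { VS_BROKEN, VS_DEGRADED, VS_SUBOPTIMAL, VS_OPTIMAL };

	/* State of one two-way mirror pair, following the rules above. */
	static enum vs
	pair_state(enum sd a, enum sd b)
	{
		enum sd best = a > b ? a : b;
		enum sd worst = a < b ? a : b;

		if (worst == SD_ACTIVE)
			return (VS_OPTIMAL);	/* both copies in sync */
		if (worst >= SD_STALE)
			return (VS_SUBOPTIMAL);	/* stale but usable copy */
		if (best == SD_ACTIVE)
			return (VS_DEGRADED);	/* one good copy left */
		return (VS_BROKEN);
	}

	int
	main(void)
	{
		static const char *vn[] =
		    { "BROKEN", "DEGRADED", "SUBOPTIMAL", "OPTIMAL" };
		/* The volume takes the worst state over all of its pairs. */
		enum vs a = pair_state(SD_ACTIVE, SD_ACTIVE);
		enum vs b = pair_state(SD_ACTIVE, SD_REBUILD);

		printf("volume state: %s\n", vn[a < b ? a : b]);
		return (0);
	}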
+
+static void
+g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
+    struct g_raid_disk *disk)
+{
+	/*
+	 * We don't fail the last disk in the pack, since it still has decent
+	 * data on it and that's better than failing the disk if it is the
+	 * root file system.
+	 *
+	 * XXX should this be controlled via a tunable?  It makes sense for
+	 * the volume that has / on it.  I can't think of a case where we'd
+	 * want the volume to go away on this kind of event.
+	 */
+	if (g_raid_nsubdisks(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == 1 &&
+	    g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == sd)
+		return;
+	g_raid_fail_disk(sc, sd, disk);
+}
+
+static void
+g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
+{
+	struct g_raid_volume *vol;
+	struct g_raid_subdisk *sd;
+
+	vol = trs->trso_base.tro_volume;
+	sd = trs->trso_failed_sd;
+	g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
+	free(trs->trso_buffer, M_TR_RAID1E);
+	trs->trso_buffer = NULL;
+	trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
+	trs->trso_type = TR_RAID1E_NONE;
+	trs->trso_recover_slabs = 0;
+	trs->trso_failed_sd = NULL;
+	g_raid_tr_update_state_raid1e(vol, NULL);
+}
+
+static void
+g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
+{
+	struct g_raid_tr_raid1e_object *trs;
+	struct g_raid_subdisk *sd;
+
+	trs = (struct g_raid_tr_raid1e_object *)tr;
+	sd = trs->trso_failed_sd;
+	G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
+	    "Subdisk %s:%d-%s rebuild completed.",
+	    sd->sd_volume->v_name, sd->sd_pos,
+	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
+	g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
+	sd->sd_rebuild_pos = 0;
+	g_raid_tr_raid1e_rebuild_done(trs);
+}
+
+static void
+g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
+{
+	struct g_raid_tr_raid1e_object *trs;
+	struct g_raid_subdisk *sd;
+	struct g_raid_volume *vol;
+
+	vol = tr->tro_volume;
+	trs = (struct g_raid_tr_raid1e_object *)tr;
+	sd = trs->trso_failed_sd;
+	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
+		G_RAID_DEBUG1(1, vol->v_softc,
+		    "Subdisk %s:%d-%s rebuild is aborting.",
+		    sd->sd_volume->v_name, sd->sd_pos,
+		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
+		trs->trso_flags |= TR_RAID1E_F_ABORT;
+	} else {
+		G_RAID_DEBUG1(0, vol->v_softc,
+		    "Subdisk %s:%d-%s rebuild aborted.",
+		    sd->sd_volume->v_name, sd->sd_pos,
+		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
+		trs->trso_flags &= ~TR_RAID1E_F_ABORT;
+		if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
+			trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
+			g_raid_unlock_range(tr->tro_volume,
+			    trs->trso_lock_pos, trs->trso_lock_len);
+		}
+		g_raid_tr_raid1e_rebuild_done(trs);
+	}
+}
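The rebuild loop in g_raid_tr_raid1e_rebuild_some() below advances in slab-sized steps, clamped so a step never runs past the subdisk and, with an odd disk count, never crosses a strip boundary. A simplified sketch of that clamping (the values are invented, and the in-strip start is derived directly from the position rather than via V2P as the driver does):

	#include <stdio.h>

	static long long
	min64(long long a, long long b)
	{
		return (a < b ? a : b);
	}

	int
	main(void)
	{
		long long slab = 1LL << 20;		/* rebuild_slab_size */
		long long strip = 64 << 10;		/* hypothetical strip size */
		long long size = 3LL << 20;		/* hypothetical subdisk size */
		long long pos = size - (32 << 10);	/* rebuild near the end */
		long long start = pos % strip;		/* offset inside this strip */
		long long len;

		/* Never run past the end of the subdisk... */
		len = min64(slab, size - pos);
		/* ...and with an odd disk count, stop at the strip boundary. */
		len = min64(len, strip - start);
		printf("rebuild %lld KiB this cycle\n", len >> 10);
		return (0);
	}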
+
+static void
+g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
+{
+	struct g_raid_tr_raid1e_object *trs;
+	struct g_raid_softc *sc;
+	struct g_raid_volume *vol;
+	struct g_raid_subdisk *sd;
+	struct bio *bp;
+	off_t len, virtual, vend, offset, start;
+	int disk, copy, best;
+
+	trs = (struct g_raid_tr_raid1e_object *)tr;
+	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
+		return;
+	vol = tr->tro_volume;
+	sc = vol->v_softc;
+	sd = trs->trso_failed_sd;
+
+	while (1) {
+		if (sd->sd_rebuild_pos >= sd->sd_size) {
+			g_raid_tr_raid1e_rebuild_finish(tr);
+			return;
+		}
+		/* Get virtual offset from physical rebuild position. */
+		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
+		/* Get physical offset back to get first stripe position. */
+		V2P(vol, virtual, &disk, &offset, &start);
+		/* Calculate contiguous data length. */
+		len = MIN(g_raid1e_rebuild_slab,
+		    sd->sd_size - sd->sd_rebuild_pos);
+		if ((vol->v_disks_count % N) != 0)
+			len = MIN(len, vol->v_strip_size - start);
+		/* Find disk with most accurate data. */
+		best = g_raid_tr_raid1e_select_read_disk(vol, disk,
+		    offset + start, len, 0);
+		if (best < 0) {
+			/* There is no valid disk. */
+			g_raid_tr_raid1e_rebuild_abort(tr);
+			return;
+		} else if (best != copy) {
+			/* Some other disk has better data. */
+			break;
+		}
+		/* We have the most accurate data.  Skip the range. */
+		G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
+		    sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
+		sd->sd_rebuild_pos += len;
+	}
+
+	bp = &trs->trso_bio;
+	memset(bp, 0, sizeof(*bp));
+	bp->bio_offset = offset + start +
+	    ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
+	bp->bio_length = len;
+	bp->bio_data = trs->trso_buffer;
+	bp->bio_cmd = BIO_READ;
+	bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
+	bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
+	G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
+	/*
+	 * If we are crossing a stripe boundary, correct the affected virtual
+	 * range we should lock.
+	 */
+	if (start + len > vol->v_strip_size) {
+		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
+		len = vend - virtual;
+	}
+	trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
+	trs->trso_flags |= TR_RAID1E_F_LOCKED;
+	trs->trso_lock_pos = virtual;
+	trs->trso_lock_len = len;
+	/* Lock callback starts I/O. */
+	g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
+}
+
+static void
+g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
+{
+	struct g_raid_volume *vol;
+	struct g_raid_tr_raid1e_object *trs;
+	struct g_raid_subdisk *sd;
+
+	vol = tr->tro_volume;
+	trs = (struct g_raid_tr_raid1e_object *)tr;
+	if (trs->trso_failed_sd) {
+		G_RAID_DEBUG1(1, vol->v_softc,
+		    "Rebuild already in progress. pos %jd\n",
+		    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
+		return;
+	}
+	sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
+	if (sd == NULL)
+		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
+	if (sd == NULL) {
+		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
+		if (sd != NULL) {
+			sd->sd_rebuild_pos = 0;
+			g_raid_change_subdisk_state(sd,
+			    G_RAID_SUBDISK_S_RESYNC);
+			g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
+		} else {
+			sd = g_raid_get_subdisk(vol,
+			    G_RAID_SUBDISK_S_UNINITIALIZED);
+			if (sd == NULL)
+				sd = g_raid_get_subdisk(vol,
+				    G_RAID_SUBDISK_S_NEW);
+			if (sd != NULL) {
+				sd->sd_rebuild_pos = 0;
+				g_raid_change_subdisk_state(sd,
+				    G_RAID_SUBDISK_S_REBUILD);
+				g_raid_write_metadata(vol->v_softc,
+				    vol, sd, NULL);
+			}
+		}
+	}
+	if (sd == NULL) {
+		G_RAID_DEBUG1(1, vol->v_softc,
+		    "No failed disk to rebuild.  Night night.");
+		return;
+	}
+	trs->trso_failed_sd = sd;
+	G_RAID_DEBUG1(0, vol->v_softc,
+	    "Subdisk %s:%d-%s rebuild start at %jd.",
+	    sd->sd_volume->v_name, sd->sd_pos,
+	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
+	    trs->trso_failed_sd->sd_rebuild_pos);
+	trs->trso_type = TR_RAID1E_REBUILD;
+	trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
+	trs->trso_meta_update = g_raid1e_rebuild_meta_update;
+	g_raid_tr_raid1e_rebuild_some(tr);
+}
+
+static void
+g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
+    struct g_raid_subdisk *sd)
+{
+	struct g_raid_volume *vol;
+	struct g_raid_tr_raid1e_object *trs;
+	int nr;
+
+	vol = tr->tro_volume;
+	trs = (struct g_raid_tr_raid1e_object *)tr;
+	if (trs->trso_stopping)
+		return;
+	nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
+	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
+	switch (trs->trso_type) {
+	case TR_RAID1E_NONE:
+		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
+			return;
+		if (nr == 0) {
+			nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
+			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
+			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
+			if (nr == 0)
+				return;
+		}
+		g_raid_tr_raid1e_rebuild_start(tr);
+		break;
+	case TR_RAID1E_REBUILD:
+		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
+		    trs->trso_failed_sd == sd)
+			g_raid_tr_raid1e_rebuild_abort(tr);
+		break;
+	case TR_RAID1E_RESYNC:
+		break;
+	}
+}
+
+static int
+g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
+    struct g_raid_subdisk *sd, u_int event)
+{
+
+	g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
+	return (0);
+}
+
+static int
+g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
+{
+	struct g_raid_tr_raid1e_object *trs;
+	struct g_raid_volume *vol;
+
+	trs = (struct g_raid_tr_raid1e_object *)tr;
+	vol = tr->tro_volume;
+	trs->trso_starting = 0;
+	g_raid_tr_update_state_raid1e(vol, NULL);
+	return (0);
+}
+
+static int
+g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
+{
+	struct g_raid_tr_raid1e_object *trs;
+	struct g_raid_volume *vol;
+
+	trs = (struct g_raid_tr_raid1e_object *)tr;
+	vol = tr->tro_volume;
+	trs->trso_starting = 0;
+	trs->trso_stopping = 1;
+	g_raid_tr_update_state_raid1e(vol, NULL);
+	return (0);
+}
+
+/*
+ * Select the disk to read from.  Take into account: subdisk state, running
+ * error recovery, average disk load, head position and possible cache hits.
+ */
+#define ABS(x)		(((x) >= 0) ? (x) : (-(x)))
+static int
+g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
+    int no, off_t off, off_t len, u_int mask)
+{
+	struct g_raid_subdisk *sd;
+	off_t offset;
+	int i, best, prio, bestprio;
+
+	best = -1;
+	bestprio = INT_MAX;
+	for (i = 0; i < N; i++) {
+		sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
+		offset = off;
+		if (no + i >= vol->v_disks_count)
+			offset += vol->v_strip_size;
+
+		prio = G_RAID_SUBDISK_LOAD(sd);
+		if ((mask & (1 << sd->sd_pos)) != 0)
+			continue;
+		switch (sd->sd_state) {
+		case G_RAID_SUBDISK_S_ACTIVE:
+			break;
+		case G_RAID_SUBDISK_S_RESYNC:
+			if (offset + len < sd->sd_rebuild_pos)
+				break;
+			/* FALLTHROUGH */
+		case G_RAID_SUBDISK_S_STALE:
+			prio += i << 24;
+			break;
+		case G_RAID_SUBDISK_S_REBUILD:
+			if (offset + len < sd->sd_rebuild_pos)
+				break;
+			/* FALLTHROUGH */
+		default:
+			continue;
+		}
+		prio += min(sd->sd_recovery, 255) << 16;
+		/* If disk head is precisely in position - highly prefer it. */
+		if (G_RAID_SUBDISK_POS(sd) == offset)
+			prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
+		else
+		/* If disk head is close to position - prefer it. */
+		if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
+		    G_RAID_SUBDISK_TRACK_SIZE)
+			prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
+		if (prio < bestprio) {
+			bestprio = prio;
+			best = i;
+		}
+	}
+	return (best);
+}
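The scoring above packs several criteria into one integer: the copy index for stale copies in the top byte, the recovery count in the next, and the raw load below that, with bonuses for head locality. A standalone rendition of the same computation, with invented stand-ins for the driver's load macros:

	#include <stdio.h>
	#include <limits.h>

	#define LOAD_SCALE 256			/* stands in for G_RAID_SUBDISK_LOAD_SCALE */
	#define TRACK_SIZE (2 * 1024 * 1024)	/* stands in for G_RAID_SUBDISK_TRACK_SIZE */

	struct cand {
		int load;		/* outstanding I/O load */
		int recovery;		/* recoveries active on this subdisk */
		int stale;		/* copy not fully in sync */
		long long head, off;	/* last head position vs. request offset */
	};

	static int
	prio(const struct cand *c, int i)
	{
		long long d = c->head - c->off;
		int p = c->load;

		if (c->stale)
			p += i << 24;		/* strongly prefer in-sync copies */
		p += (c->recovery < 255 ? c->recovery : 255) << 16;
		if (d == 0)
			p -= 2 * LOAD_SCALE;	/* head exactly in position */
		else if ((d < 0 ? -d : d) < TRACK_SIZE)
			p -= 1 * LOAD_SCALE;	/* head close by */
		return (p);
	}

	int
	main(void)
	{
		struct cand c0 = { 10, 0, 0, 4096, 4096 };	/* head in place */
		struct cand c1 = { 0, 0, 0, 1 << 30, 4096 };	/* head far away */

		printf("copy 0: %d, copy 1: %d -> read copy %d\n",
		    prio(&c0, 0), prio(&c1, 1),
		    prio(&c0, 0) < prio(&c1, 1) ? 0 : 1);
		return (0);
	}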
+
+static void
+g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
+{
+	struct g_raid_volume *vol;
+	struct g_raid_subdisk *sd;
+	struct bio_queue_head queue;
+	struct bio *cbp;
+	char *addr;
+	off_t offset, start, length, remain;
+	u_int strip_size;
+	int no, best;
+
+	vol = tr->tro_volume;
+	addr = bp->bio_data;
+	strip_size = vol->v_strip_size;
+	V2P(vol, bp->bio_offset, &no, &offset, &start);
+	remain = bp->bio_length;
+	bioq_init(&queue);
+	while (remain > 0) {
+		length = MIN(strip_size - start, remain);
+		best = g_raid_tr_raid1e_select_read_disk(vol,
+		    no, offset, length, 0);
+		KASSERT(best >= 0, ("No readable disk in volume %s!",
+		    vol->v_name));
+		no += best;
+		if (no >= vol->v_disks_count) {
+			no -= vol->v_disks_count;
+			offset += strip_size;
+		}
+		cbp = g_clone_bio(bp);
+		if (cbp == NULL)
+			goto failure;
+		cbp->bio_offset = offset + start;
+		cbp->bio_data = addr;
+		cbp->bio_length = length;
+		cbp->bio_caller1 = &vol->v_subdisks[no];
+		bioq_insert_tail(&queue, cbp);
+		no += N - best;
+		if (no >= vol->v_disks_count) {
+			no -= vol->v_disks_count;
+			offset += strip_size;
+		}
+		remain -= length;
+		addr += length;
+		start = 0;
+	}
+	for (cbp = bioq_first(&queue); cbp != NULL;
+	    cbp = bioq_first(&queue)) {
+		bioq_remove(&queue, cbp);
+		sd = cbp->bio_caller1;
+		cbp->bio_caller1 = NULL;
+		g_raid_subdisk_iostart(sd, cbp);
+	}
+	return;
+failure:
+	for (cbp = bioq_first(&queue); cbp != NULL;
+	    cbp = bioq_first(&queue)) {
+		bioq_remove(&queue, cbp);
+		g_destroy_bio(cbp);
+	}
+	if (bp->bio_error == 0)
+		bp->bio_error = ENOMEM;
+	g_raid_iodone(bp, bp->bio_error);
+}
+
+static void
+g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
+{
+	struct g_raid_volume *vol;
+	struct g_raid_subdisk *sd;
+	struct bio_queue_head queue;
+	struct bio *cbp;
+	char *addr;
+	off_t offset, start, length, remain;
+	u_int strip_size;
+	int i, no;
+
+	vol = tr->tro_volume;
+	addr = bp->bio_data;
+	strip_size = vol->v_strip_size;
+	V2P(vol, bp->bio_offset, &no, &offset, &start);
+	remain = bp->bio_length;
+	bioq_init(&queue);
+	while (remain > 0) {
+		length = MIN(strip_size - start, remain);
+		for (i = 0; i < N; i++) {
+			sd = &vol->v_subdisks[no];
+			switch (sd->sd_state) {
+			case G_RAID_SUBDISK_S_ACTIVE:
+			case G_RAID_SUBDISK_S_STALE:
+			case G_RAID_SUBDISK_S_RESYNC:
+				break;
+			case G_RAID_SUBDISK_S_REBUILD:
+				if (offset + start >= sd->sd_rebuild_pos)
+					goto nextdisk;
+				break;
+			default:
+				goto nextdisk;
+			}
+			cbp = g_clone_bio(bp);
+			if (cbp == NULL)
+				goto failure;
+			cbp->bio_offset = offset + start;
+			cbp->bio_data = addr;
+			cbp->bio_length = length;
+			cbp->bio_caller1 = sd;
+			bioq_insert_tail(&queue, cbp);
+nextdisk:
+			if (++no >= vol->v_disks_count) {
+				no = 0;
+				offset += strip_size;
+			}
+		}
+		remain -= length;
+		addr += length;
+		start = 0;
+	}
+	for (cbp = bioq_first(&queue); cbp != NULL;
+	    cbp = bioq_first(&queue)) {
+		bioq_remove(&queue, cbp);
+		sd = cbp->bio_caller1;
+		cbp->bio_caller1 = NULL;
+		g_raid_subdisk_iostart(sd, cbp);
+	}
+	return;
+failure:
+	for (cbp = bioq_first(&queue); cbp != NULL;
+	    cbp = bioq_first(&queue)) {
+		bioq_remove(&queue, cbp);
+		g_destroy_bio(cbp);
+	}
+	if (bp->bio_error == 0)
+		bp->bio_error = ENOMEM;
+	g_raid_iodone(bp, bp->bio_error);
+}
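Both starters above use an all-or-nothing staging pattern: every clone is queued first, and only once all clones exist are any dispatched, so a mid-request allocation failure never leaves a half-issued I/O. The same idea, reduced to plain malloc/free for illustration (names here are invented):

	#include <stdio.h>
	#include <stdlib.h>

	/* Allocate every piece up front; on any failure, roll back and bail. */
	static int
	stage_all(void **v, int n, size_t sz)
	{
		int i, j;

		for (i = 0; i < n; i++) {
			if ((v[i] = malloc(sz)) == NULL) {
				for (j = 0; j < i; j++)
					free(v[j]);	/* undo partial staging */
				return (-1);	/* ENOMEM-style failure */
			}
		}
		return (0);	/* only now is it safe to dispatch the pieces */
	}

	int
	main(void)
	{
		void *v[4];
		int i;

		if (stage_all(v, 4, 512) == 0) {
			printf("all 4 pieces staged; dispatching\n");
			for (i = 0; i < 4; i++)
				free(v[i]);
		}
		return (0);
	}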
+
+static void
+g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
+{
+	struct g_raid_volume *vol;
+	struct g_raid_tr_raid1e_object *trs;
+
+	vol = tr->tro_volume;
+	trs = (struct g_raid_tr_raid1e_object *)tr;
+	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
+	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
+	    vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
+		g_raid_iodone(bp, EIO);
+		return;
+	}
+	/*
+	 * If we're rebuilding, squeeze in rebuild activity every so often,
+	 * even when the disk is busy.  Be sure to only count real I/O
+	 * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
+	 * by this module.
+	 */
+	if (trs->trso_failed_sd != NULL &&
+	    !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
+		/* Cut short this new or already running rebuild round. */
+		trs->trso_recover_slabs = 0;
+		if (--trs->trso_fair_io <= 0) {
+			trs->trso_fair_io = g_raid1e_rebuild_fair_io;
+			g_raid_tr_raid1e_rebuild_some(tr);
+		}
+	}
+	switch (bp->bio_cmd) {
+	case BIO_READ:
+		g_raid_tr_iostart_raid1e_read(tr, bp);
+		break;
+	case BIO_WRITE:
+		g_raid_tr_iostart_raid1e_write(tr, bp);
+		break;
+	case BIO_DELETE:
+		g_raid_iodone(bp, EIO);
+		break;
+	case BIO_FLUSH:
+		g_raid_tr_flush_common(tr, bp);
+		break;
+	default:
+		KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
+		    bp->bio_cmd, vol->v_name));
+		break;
+	}
+}
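The fair-I/O counter above guarantees the rebuild forward progress even under constant load: every rebuild_fair_io user I/Os, one rebuild slab is squeezed in. A minimal sketch of that pacing with the default value of 20:

	#include <stdio.h>

	int
	main(void)
	{
		int fair_io = 20, counter = 20, bursts = 0;

		for (int io = 1; io <= 100; io++) {	/* 100 user I/Os arrive */
			if (--counter <= 0) {
				counter = fair_io;	/* reset the budget */
				bursts++;		/* ...and rebuild one slab */
			}
		}
		printf("100 user I/Os -> %d rebuild bursts\n", bursts);
		return (0);
	}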
+
+static void
+g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
+    struct g_raid_subdisk *sd, struct bio *bp)
+{
+	struct bio *cbp;
+	struct g_raid_subdisk *nsd;
+	struct g_raid_volume *vol;
+	struct bio *pbp;
+	struct g_raid_tr_raid1e_object *trs;
+	off_t virtual, offset, start;
+	uintptr_t mask;
+	int error, do_write, copy, disk, best;
+
+	trs = (struct g_raid_tr_raid1e_object *)tr;
+	vol = tr->tro_volume;
+	if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
+		if (trs->trso_type == TR_RAID1E_REBUILD) {
+			nsd = trs->trso_failed_sd;
+			if (bp->bio_cmd == BIO_READ) {
+
+				/* Immediately abort rebuild, if requested. */
+				if (trs->trso_flags & TR_RAID1E_F_ABORT) {
+					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
+					g_raid_tr_raid1e_rebuild_abort(tr);
+					return;
+				}
+
+				/* On read error, skip and cross fingers. */
+				if (bp->bio_error != 0) {
+					G_RAID_LOGREQ(0, bp,
+					    "Read error during rebuild (%d), "
+					    "possible data loss!",
+					    bp->bio_error);
+					goto rebuild_round_done;
+				}
+
+				/*
+				 * The read operation finished, queue the
+				 * write and get out.
+				 */
+				G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
+				    bp->bio_error);
+				bp->bio_cmd = BIO_WRITE;
+				bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
+				bp->bio_offset = nsd->sd_rebuild_pos;
+				G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
+				g_raid_subdisk_iostart(nsd, bp);
+			} else {
+				/*
+				 * The write operation just finished.  Do
+				 * another.  We keep cloning the master bio
+				 * since it has the right buffers allocated to
+				 * it.
+				 */
+				G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
+				    bp->bio_error);
+				if (bp->bio_error != 0 ||
+				    trs->trso_flags & TR_RAID1E_F_ABORT) {
+					if ((trs->trso_flags &
+					    TR_RAID1E_F_ABORT) == 0) {
+						g_raid_tr_raid1e_fail_disk(sd->sd_softc,
+						    nsd, nsd->sd_disk);
+					}
+					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
+					g_raid_tr_raid1e_rebuild_abort(tr);
+					return;
+				}
+rebuild_round_done:
+				trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
+				g_raid_unlock_range(tr->tro_volume,
+				    trs->trso_lock_pos, trs->trso_lock_len);
+				nsd->sd_rebuild_pos += bp->bio_length;
+				if (nsd->sd_rebuild_pos >= nsd->sd_size) {
+					g_raid_tr_raid1e_rebuild_finish(tr);
+					return;
+				}
+
+				/* Abort rebuild if we are stopping. */
+				if (trs->trso_stopping) {
+					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
+					g_raid_tr_raid1e_rebuild_abort(tr);
+					return;
+				}
+
+				if (--trs->trso_meta_update <= 0) {
+					g_raid_write_metadata(vol->v_softc,
+					    vol, nsd, nsd->sd_disk);
+					trs->trso_meta_update =
+					    g_raid1e_rebuild_meta_update;
+					/* Compensate for short rebuild I/Os. */
+					if ((vol->v_disks_count % N) != 0 &&
+					    vol->v_strip_size <
+					     g_raid1e_rebuild_slab) {
+						trs->trso_meta_update *=
+						    g_raid1e_rebuild_slab;
+						trs->trso_meta_update /=
+						    vol->v_strip_size;
+					}
+				}
+				trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
+				if (--trs->trso_recover_slabs <= 0)
+					return;
+				/* Run next rebuild iteration. */
+				g_raid_tr_raid1e_rebuild_some(tr);
+			}
+		} else if (trs->trso_type == TR_RAID1E_RESYNC) {
+			/*
+			 * Read the good sd and the bad sd in parallel.  When
+			 * both are done, compare the buffers and write the
+			 * good data to the bad sd if they differ.  Then do
+			 * the next bit of work.
+			 */
+			panic("Somehow, we think we're doing a resync");
+		}
+		return;
+	}
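The error-recovery path below threads its state through the cloned bio's bio_caller2 as a bit mask: one bit per copy that has already been tried, plus (as the code below suggests, bit 31 appears to flag a pending recovery write) a marker that a remap write should follow a successful retry. A small sketch of that layout:

	#include <stdio.h>
	#include <stdint.h>

	#define RECOVERY_PENDING (1U << 31)	/* a remap write will follow */

	int
	main(void)
	{
		uint32_t mask = 0;

		mask |= 1U << 0;		/* copy 0 already failed the read */
		mask |= RECOVERY_PENDING;	/* recovery write-back requested */
		printf("tried copy 1: %s, recovery pending: %s\n",
		    (mask & (1U << 1)) ? "yes" : "no",
		    (mask & RECOVERY_PENDING) ? "yes" : "no");
		return (0);
	}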
+	pbp = bp->bio_parent;
+	pbp->bio_inbed++;
+	mask = (intptr_t)bp->bio_caller2;
+	if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
+		/*
+		 * Read failed on first drive.  Retry the read error on
+		 * another disk drive, if available, before erroring out the
+		 * read.
+		 */
+		sd->sd_disk->d_read_errs++;
+		G_RAID_LOGREQ(0, bp,
+		    "Read error (%d), %d read errors total",
+		    bp->bio_error, sd->sd_disk->d_read_errs);
+
+		/*
+		 * If there are too many read errors, we move to degraded.
+		 * XXX Do we want to FAIL the drive (eg, make the user redo
+		 * everything to get it back in sync), or just degrade the
+		 * drive, which kicks off a resync?
+		 */
+		do_write = 0;
+		if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
+			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
+		else if (mask == 0)
+			do_write = 1;
+
+		/* Restore what we were doing. */
+		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
+		V2P(vol, virtual, &disk, &offset, &start);
+
+		/* Find the other disk, and try to do the I/O to it. */
+		mask |= 1 << copy;
+		best = g_raid_tr_raid1e_select_read_disk(vol,
+		    disk, offset, start, mask);
+		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
+			disk += best;
+			if (disk >= vol->v_disks_count) {
+				disk -= vol->v_disks_count;
+				offset += vol->v_strip_size;
+			}
+			cbp->bio_offset = offset + start;
+			cbp->bio_length = bp->bio_length;
+			cbp->bio_data = bp->bio_data;
+			g_destroy_bio(bp);
+			nsd = &vol->v_subdisks[disk];
+			G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
+			    nsd->sd_pos);
+			if (do_write)
+				mask |= 1U << 31;
+			if ((mask & (1U << 31)) != 0)
+				sd->sd_recovery++;
+			cbp->bio_caller2 = (void *)mask;
+			if (do_write) {
+				cbp->bio_caller1 = nsd;
+				/* Lock callback starts I/O. */
+				g_raid_lock_range(sd->sd_volume,
+				    virtual, cbp->bio_length, pbp, cbp);
+			} else {
+				g_raid_subdisk_iostart(nsd, cbp);
+			}
+			return;
+		}
+		/*
+		 * We can't retry.  Return the original error by falling
+		 * through.  This will happen when there's only one good disk.
+		 * We don't need to fail the raid, since its actual state is
+		 * based on the state of the subdisks.
+		 */
+		G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
+	}
+	if (bp->bio_cmd == BIO_READ &&
+	    bp->bio_error == 0 &&
+	    (mask & (1U << 31)) != 0) {
+		G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
+
+		/* Restore what we were doing. */
+		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
+		V2P(vol, virtual, &disk, &offset, &start);
+
+		/* Find best disk to write. */
+		best = g_raid_tr_raid1e_select_read_disk(vol,
+		    disk, offset, start, ~mask);
+		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
+			disk += best;
+			if (disk >= vol->v_disks_count) {
+				disk -= vol->v_disks_count;
+				offset += vol->v_strip_size;
+			}
+			cbp->bio_offset = offset + start;
+			cbp->bio_length = bp->bio_length;
+			cbp->bio_data = bp->bio_data;
+			cbp->bio_cmd = BIO_WRITE;
+			cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
+			cbp->bio_caller2 = (void *)mask;
+			g_destroy_bio(bp);
+			G_RAID_LOGREQ(2, cbp,
+			    "Attempting bad sector remap on failing drive.");
+			g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
+			return;
+		}
+	}
+	if ((mask & (1U << 31)) != 0) {
+		/*
+		 * We're done with a recovery, mark the range as unlocked.
+		 * For any write errors, we aggressively fail the disk since
+		 * there was both a READ and a WRITE error at this location.
+		 * Both types of errors generally indicate that the drive is
+		 * on the verge of total failure anyway.  Better to stop
+		 * trusting it now.  However, we need to reset error to 0 in
+		 * that case because we're not failing the original I/O which
+		 * succeeded.
+		 */
+
+		/* Restore what we were doing. */
+		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
+		V2P(vol, virtual, &disk, &offset, &start);
+
+		for (copy = 0; copy < N; copy++) {
+			if ((mask & (1 << copy)) != 0)
+				vol->v_subdisks[(disk + copy) %
+				    vol->v_disks_count].sd_recovery--;
+		}
+
+		if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
+			G_RAID_LOGREQ(0, bp, "Remap write failed: "
+			    "failing subdisk.");
+			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
+			bp->bio_error = 0;
+		}
+		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
+		g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
+	}
+	error = bp->bio_error;
+	g_destroy_bio(bp);
+	if (pbp->bio_children == pbp->bio_inbed) {
+		pbp->bio_completed = pbp->bio_length;
+		g_raid_iodone(pbp, error);
+	}
+}
+
+static int
+g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr,
+    void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
+{
+	struct g_raid_volume *vol;
+	struct g_raid_subdisk *sd;
+	struct bio_queue_head queue;
+	char *addr;
+	off_t offset, start, length, remain;
+	u_int strip_size;
+	int i, error, no;
+
+	vol = tr->tro_volume;
+	addr = virtual;
+	strip_size = vol->v_strip_size;
+	V2P(vol, boffset, &no, &offset, &start);
+	remain = blength;
+	bioq_init(&queue);
+	while (remain > 0) {
+		length = MIN(strip_size - start, remain);
+		for (i = 0; i < N; i++) {
+			sd = &vol->v_subdisks[no];
+			switch (sd->sd_state) {
+			case G_RAID_SUBDISK_S_ACTIVE:
+			case G_RAID_SUBDISK_S_STALE:
+			case G_RAID_SUBDISK_S_RESYNC:
+				break;
+			case G_RAID_SUBDISK_S_REBUILD:
+				if (offset + start >= sd->sd_rebuild_pos)
+					goto nextdisk;
+				break;
+			default:
+				goto nextdisk;
+			}
+			error = g_raid_subdisk_kerneldump(sd,
+			    addr, 0, offset + start, length);
+			if (error != 0)
+				return (error);
+nextdisk:
+			if (++no >= vol->v_disks_count) {
+				no = 0;
+				offset += strip_size;
+			}
+		}
+		remain -= length;
+		addr += length;
+		start = 0;
+	}
+	return (0);
+}
+
+static int
+g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
+{
+	struct bio *bp;
+	struct g_raid_subdisk *sd;
+
+	bp = (struct bio *)argp;
+	sd = (struct g_raid_subdisk *)bp->bio_caller1;
+	g_raid_subdisk_iostart(sd, bp);
+
+	return (0);
+}
+
+static int
+g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
+{
+	struct g_raid_tr_raid1e_object *trs;
+	struct g_raid_volume *vol;
+
+	vol = tr->tro_volume;
+	trs = (struct g_raid_tr_raid1e_object *)tr;
+	trs->trso_fair_io = g_raid1e_rebuild_fair_io;
+	trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
+	/* Compensate for short rebuild I/Os. */
+	if ((vol->v_disks_count % N) != 0 &&
+	    vol->v_strip_size < g_raid1e_rebuild_slab) {
+		trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
+		trs->trso_recover_slabs /= vol->v_strip_size;
+	}
+	if (trs->trso_type == TR_RAID1E_REBUILD)
+		g_raid_tr_raid1e_rebuild_some(tr);
+	return (0);
+}
+
+static int
+g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
+{
+	struct g_raid_tr_raid1e_object *trs;
+
+	trs = (struct g_raid_tr_raid1e_object *)tr;
+
+	if (trs->trso_buffer != NULL) {
+		free(trs->trso_buffer, M_TR_RAID1E);
+		trs->trso_buffer = NULL;
+	}
+	return (0);
+}
+
+G_RAID_TR_DECLARE(g_raid_tr_raid1e);
diff --git a/sys/modules/geom/Makefile b/sys/modules/geom/Makefile
index 0b2e3e8..ca7d7e6 100644
--- a/sys/modules/geom/Makefile
+++ b/sys/modules/geom/Makefile
@@ -18,6 +18,7 @@ SUBDIR=	geom_bde \
 	geom_nop \
 	geom_part \
 	geom_pc98 \
+	geom_raid \
 	geom_raid3 \
 	geom_sched \
 	geom_shsec \
diff --git a/sys/modules/geom/geom_raid/Makefile b/sys/modules/geom/geom_raid/Makefile
new file mode 100644
index 0000000..4487807
--- /dev/null
+++ b/sys/modules/geom/geom_raid/Makefile
@@ -0,0 +1,19 @@
+# $FreeBSD$
+
+.PATH: ${.CURDIR}/../../../geom/raid
+
+KMOD=	geom_raid
+SRCS=	g_raid.c
+SRCS+=	g_raid_ctl.c
+SRCS+=	bus_if.h device_if.h
+SRCS+=	g_raid_md_if.h g_raid_md_if.c
+SRCS+=	g_raid_tr_if.h g_raid_tr_if.c
+
+SRCS+=	md_intel.c md_jmicron.c md_nvidia.c md_promise.c md_sii.c
+
+SRCS+=	tr_concat.c tr_raid0.c tr_raid1.c tr_raid1e.c
+
+MFILES=	kern/bus_if.m kern/device_if.m
+MFILES+= geom/raid/g_raid_md_if.m geom/raid/g_raid_tr_if.m
+
+.include
--
cgit v1.1