summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorscottl <scottl@FreeBSD.org>2002-10-20 08:17:39 +0000
committerscottl <scottl@FreeBSD.org>2002-10-20 08:17:39 +0000
commit710948de69ddeae56bda663219319f6d859aea1f (patch)
tree71c65823ba2e8591de708d5cb2e990a75135ee11
parent63bd46464d6d4587c20c1ca62fb6a6e3be132db9 (diff)
downloadFreeBSD-src-710948de69ddeae56bda663219319f6d859aea1f.zip
FreeBSD-src-710948de69ddeae56bda663219319f6d859aea1f.tar.gz
After much delay and anticipation, welcome RAIDFrame into the FreeBSD
world. This should be considered highly experimental. Approved-by: re
-rw-r--r--etc/MAKEDEV14
-rw-r--r--sbin/raidctl/Makefile14
-rw-r--r--sbin/raidctl/raidctl.81325
-rw-r--r--sbin/raidctl/raidctl.c1110
-rw-r--r--sbin/raidctl/rf_configure.c583
-rw-r--r--share/man/man4/Makefile1
-rw-r--r--share/man/man4/raid.4342
-rw-r--r--sys/conf/NOTES6
-rw-r--r--sys/conf/files60
-rw-r--r--sys/conf/majors2
-rw-r--r--sys/conf/options4
-rw-r--r--sys/dev/raidframe/rf_acctrace.c172
-rw-r--r--sys/dev/raidframe/rf_acctrace.h134
-rw-r--r--sys/dev/raidframe/rf_alloclist.c188
-rw-r--r--sys/dev/raidframe/rf_alloclist.h60
-rw-r--r--sys/dev/raidframe/rf_archs.h75
-rw-r--r--sys/dev/raidframe/rf_aselect.c494
-rw-r--r--sys/dev/raidframe/rf_aselect.h43
-rw-r--r--sys/dev/raidframe/rf_bsd.h152
-rw-r--r--sys/dev/raidframe/rf_callback.c94
-rw-r--r--sys/dev/raidframe/rf_callback.h65
-rw-r--r--sys/dev/raidframe/rf_chaindecluster.c290
-rw-r--r--sys/dev/raidframe/rf_chaindecluster.h68
-rw-r--r--sys/dev/raidframe/rf_configure.h99
-rw-r--r--sys/dev/raidframe/rf_copyback.c431
-rw-r--r--sys/dev/raidframe/rf_copyback.h61
-rw-r--r--sys/dev/raidframe/rf_cvscan.c439
-rw-r--r--sys/dev/raidframe/rf_cvscan.h85
-rw-r--r--sys/dev/raidframe/rf_dag.h239
-rw-r--r--sys/dev/raidframe/rf_dagdegrd.c1130
-rw-r--r--sys/dev/raidframe/rf_dagdegrd.h64
-rw-r--r--sys/dev/raidframe/rf_dagdegwr.c844
-rw-r--r--sys/dev/raidframe/rf_dagdegwr.h55
-rw-r--r--sys/dev/raidframe/rf_dagffrd.c439
-rw-r--r--sys/dev/raidframe/rf_dagffrd.h53
-rw-r--r--sys/dev/raidframe/rf_dagffwr.c2129
-rw-r--r--sys/dev/raidframe/rf_dagffwr.h77
-rw-r--r--sys/dev/raidframe/rf_dagflags.h68
-rw-r--r--sys/dev/raidframe/rf_dagfuncs.c904
-rw-r--r--sys/dev/raidframe/rf_dagfuncs.h90
-rw-r--r--sys/dev/raidframe/rf_dagutils.c1297
-rw-r--r--sys/dev/raidframe/rf_dagutils.h121
-rw-r--r--sys/dev/raidframe/rf_debugMem.c206
-rw-r--r--sys/dev/raidframe/rf_debugMem.h88
-rw-r--r--sys/dev/raidframe/rf_debugprint.c134
-rw-r--r--sys/dev/raidframe/rf_debugprint.h44
-rw-r--r--sys/dev/raidframe/rf_decluster.c745
-rw-r--r--sys/dev/raidframe/rf_decluster.h141
-rw-r--r--sys/dev/raidframe/rf_declusterPQ.c491
-rw-r--r--sys/dev/raidframe/rf_declusterPQ.h52
-rw-r--r--sys/dev/raidframe/rf_desc.h113
-rw-r--r--sys/dev/raidframe/rf_diskqueue.c591
-rw-r--r--sys/dev/raidframe/rf_diskqueue.h208
-rw-r--r--sys/dev/raidframe/rf_disks.c1138
-rw-r--r--sys/dev/raidframe/rf_disks.h108
-rw-r--r--sys/dev/raidframe/rf_driver.c1048
-rw-r--r--sys/dev/raidframe/rf_driver.h79
-rw-r--r--sys/dev/raidframe/rf_engine.c810
-rw-r--r--sys/dev/raidframe/rf_engine.h48
-rw-r--r--sys/dev/raidframe/rf_etimer.h95
-rw-r--r--sys/dev/raidframe/rf_evenodd.c557
-rw-r--r--sys/dev/raidframe/rf_evenodd.h55
-rw-r--r--sys/dev/raidframe/rf_evenodd_dagfuncs.c975
-rw-r--r--sys/dev/raidframe/rf_evenodd_dagfuncs.h79
-rw-r--r--sys/dev/raidframe/rf_evenodd_dags.c189
-rw-r--r--sys/dev/raidframe/rf_evenodd_dags.h64
-rw-r--r--sys/dev/raidframe/rf_fifo.c236
-rw-r--r--sys/dev/raidframe/rf_fifo.h62
-rw-r--r--sys/dev/raidframe/rf_freebsdkintf.c3294
-rw-r--r--sys/dev/raidframe/rf_freelist.h702
-rw-r--r--sys/dev/raidframe/rf_general.h107
-rw-r--r--sys/dev/raidframe/rf_geniq.c163
-rw-r--r--sys/dev/raidframe/rf_hist.h57
-rw-r--r--sys/dev/raidframe/rf_interdecluster.c283
-rw-r--r--sys/dev/raidframe/rf_interdecluster.h60
-rw-r--r--sys/dev/raidframe/rf_invertq.c32
-rw-r--r--sys/dev/raidframe/rf_invertq.h64
-rw-r--r--sys/dev/raidframe/rf_kintf.h82
-rw-r--r--sys/dev/raidframe/rf_layout.c490
-rw-r--r--sys/dev/raidframe/rf_layout.h349
-rw-r--r--sys/dev/raidframe/rf_map.c907
-rw-r--r--sys/dev/raidframe/rf_map.h94
-rw-r--r--sys/dev/raidframe/rf_mcpair.c141
-rw-r--r--sys/dev/raidframe/rf_mcpair.h54
-rw-r--r--sys/dev/raidframe/rf_memchunk.c211
-rw-r--r--sys/dev/raidframe/rf_memchunk.h48
-rw-r--r--sys/dev/raidframe/rf_nwayxor.c449
-rw-r--r--sys/dev/raidframe/rf_nwayxor.h54
-rw-r--r--sys/dev/raidframe/rf_options.c76
-rw-r--r--sys/dev/raidframe/rf_options.h58
-rw-r--r--sys/dev/raidframe/rf_optnames.h105
-rw-r--r--sys/dev/raidframe/rf_paritylog.c869
-rw-r--r--sys/dev/raidframe/rf_paritylog.h181
-rw-r--r--sys/dev/raidframe/rf_paritylogDiskMgr.c701
-rw-r--r--sys/dev/raidframe/rf_paritylogDiskMgr.h42
-rw-r--r--sys/dev/raidframe/rf_paritylogging.c1074
-rw-r--r--sys/dev/raidframe/rf_paritylogging.h70
-rw-r--r--sys/dev/raidframe/rf_parityloggingdags.c673
-rw-r--r--sys/dev/raidframe/rf_parityloggingdags.h59
-rw-r--r--sys/dev/raidframe/rf_parityscan.c443
-rw-r--r--sys/dev/raidframe/rf_parityscan.h67
-rw-r--r--sys/dev/raidframe/rf_pq.c926
-rw-r--r--sys/dev/raidframe/rf_pq.h75
-rw-r--r--sys/dev/raidframe/rf_pqdeg.c217
-rw-r--r--sys/dev/raidframe/rf_pqdeg.h75
-rw-r--r--sys/dev/raidframe/rf_pqdegdags.c430
-rw-r--r--sys/dev/raidframe/rf_pqdegdags.h49
-rw-r--r--sys/dev/raidframe/rf_psstatus.c376
-rw-r--r--sys/dev/raidframe/rf_psstatus.h132
-rw-r--r--sys/dev/raidframe/rf_raid.h299
-rw-r--r--sys/dev/raidframe/rf_raid0.c161
-rw-r--r--sys/dev/raidframe/rf_raid0.h58
-rw-r--r--sys/dev/raidframe/rf_raid1.c689
-rw-r--r--sys/dev/raidframe/rf_raid1.h63
-rw-r--r--sys/dev/raidframe/rf_raid4.c157
-rw-r--r--sys/dev/raidframe/rf_raid4.h57
-rw-r--r--sys/dev/raidframe/rf_raid5.c320
-rw-r--r--sys/dev/raidframe/rf_raid5.h57
-rw-r--r--sys/dev/raidframe/rf_raid5_rotatedspare.c175
-rw-r--r--sys/dev/raidframe/rf_raid5_rotatedspare.h53
-rw-r--r--sys/dev/raidframe/rf_raidframe.h162
-rw-r--r--sys/dev/raidframe/rf_reconbuffer.c466
-rw-r--r--sys/dev/raidframe/rf_reconbuffer.h63
-rw-r--r--sys/dev/raidframe/rf_reconmap.c394
-rw-r--r--sys/dev/raidframe/rf_reconmap.h86
-rw-r--r--sys/dev/raidframe/rf_reconstruct.c1680
-rw-r--r--sys/dev/raidframe/rf_reconstruct.h202
-rw-r--r--sys/dev/raidframe/rf_reconutil.c336
-rw-r--r--sys/dev/raidframe/rf_reconutil.h52
-rw-r--r--sys/dev/raidframe/rf_revent.c228
-rw-r--r--sys/dev/raidframe/rf_revent.h52
-rw-r--r--sys/dev/raidframe/rf_shutdown.c102
-rw-r--r--sys/dev/raidframe/rf_shutdown.h67
-rw-r--r--sys/dev/raidframe/rf_sstf.c656
-rw-r--r--sys/dev/raidframe/rf_sstf.h70
-rw-r--r--sys/dev/raidframe/rf_states.c667
-rw-r--r--sys/dev/raidframe/rf_states.h48
-rw-r--r--sys/dev/raidframe/rf_stripelocks.c667
-rw-r--r--sys/dev/raidframe/rf_stripelocks.h130
-rw-r--r--sys/dev/raidframe/rf_strutils.c56
-rw-r--r--sys/dev/raidframe/rf_threadstuff.c221
-rw-r--r--sys/dev/raidframe/rf_threadstuff.h229
-rw-r--r--sys/dev/raidframe/rf_types.h245
-rw-r--r--sys/dev/raidframe/rf_utils.c147
-rw-r--r--sys/dev/raidframe/rf_utils.h70
-rw-r--r--sys/modules/raidframe/Makefile32
-rw-r--r--sys/sys/disklabel.h7
-rw-r--r--sys/sys/kernel.h1
148 files changed, 45834 insertions, 7 deletions
diff --git a/etc/MAKEDEV b/etc/MAKEDEV
index dbc2803..fa9dd8c 100644
--- a/etc/MAKEDEV
+++ b/etc/MAKEDEV
@@ -446,8 +446,12 @@ wt*)
umask 77
;;
+raidctl)
+ mknod raidctl c 201 0 root:operator
+ ;;
+
# Individual slices.
-aacd*s*|ad*s*|ar*s*|afd*s*|amrd*s*|da*s*|fla*s*|idad*s*|md*s*|mlxd*s*|twed*s*|wd*s*|wfd*s*)
+aacd*s*|ad*s*|ar*s*|afd*s*|amrd*s*|da*s*|fla*s*|idad*s*|md*s*|mlxd*s*|twed*s*|wd*s*|wfd*s*|raid*s*)
umask $disk_umask
case $i in
aacd*s*) name=aacd; chr=151;;
@@ -463,9 +467,10 @@ aacd*s*|ad*s*|ar*s*|afd*s*|amrd*s*|da*s*|fla*s*|idad*s*|md*s*|mlxd*s*|twed*s*|wd
twed*s*) name=twed; chr=147;;
wd*s*) name=wd; chr=3;;
wfd*s*) name=wfd; chr=87;;
+ raid*s*) name=raid; chr=200;;
esac
case $i in
- aacd*s*|amrd*s*|idad*s*|mlxd*s*|twed*s*)
+ aacd*s*|amrd*s*|idad*s*|mlxd*s*|twed*s*|raid*s*)
unit=`expr $i : '....\([0-9]*\)s'`
slice=`expr $i : '....[0-9]*s\([0-9]*\)'`
part=`expr $i : '....[0-9]*s[0-9]*\(.*\)'`
@@ -552,7 +557,7 @@ ata)
;;
-aacd*|ad*|ar*|afd*|amrd*|da*|fla*|idad*|md*|mlxd*|twed*|wd*|wfd*)
+aacd*|ad*|ar*|afd*|amrd*|da*|fla*|idad*|md*|mlxd*|twed*|wd*|wfd*|raid*)
umask $disk_umask
case $i in
aacd*) name=aacd; chr=151;;
@@ -568,9 +573,10 @@ aacd*|ad*|ar*|afd*|amrd*|da*|fla*|idad*|md*|mlxd*|twed*|wd*|wfd*)
twed*) name=twed; chr=147;;
wd*) name=wd; chr=3;;
wfd*) name=wfd; chr=87;;
+ raid*) name=raid; chr=200;;
esac
case $i in
- aacd*|amrd*|idad*|mlxd*|twed*)
+ aacd*|amrd*|idad*|mlxd*|twed*|raid*)
unit=`expr $i : '....\(.*\)'`
;;
afd*|fla*|wfd*)
diff --git a/sbin/raidctl/Makefile b/sbin/raidctl/Makefile
new file mode 100644
index 0000000..0705eab
--- /dev/null
+++ b/sbin/raidctl/Makefile
@@ -0,0 +1,14 @@
+# $FreeBSD$
+# $NetBSD: Makefile,v 1.7 2000/05/23 00:46:53 thorpej Exp $
+PROG= raidctl
+SRCS= rf_configure.c raidctl.c
+MAN8= raidctl.8
+
+LOOKHERE = ${.CURDIR}/../../sys
+
+CFLAGS+= -DRF_UTILITY=1 -I${LOOKHERE}
+
+DPADD= ${LIBUTIL}
+LDADD= -lutil
+
+.include <bsd.prog.mk>
diff --git a/sbin/raidctl/raidctl.8 b/sbin/raidctl/raidctl.8
new file mode 100644
index 0000000..9aef14f
--- /dev/null
+++ b/sbin/raidctl/raidctl.8
@@ -0,0 +1,1325 @@
+.\" $FreeBSD$
+.\" $NetBSD: raidctl.8,v 1.21 2000/08/10 15:14:14 oster Exp $
+.\"
+.\" Copyright (c) 1998 The NetBSD Foundation, Inc.
+.\" All rights reserved.
+.\"
+.\" This code is derived from software contributed to The NetBSD Foundation
+.\" by Greg Oster
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the NetBSD
+.\" Foundation, Inc. and its contributors.
+.\" 4. Neither the name of The NetBSD Foundation nor the names of its
+.\" contributors may be used to endorse or promote products derived
+.\" from this software without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+.\" PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+.\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+.\" POSSIBILITY OF SUCH DAMAGE.
+.\"
+.\"
+.\" Copyright (c) 1995 Carnegie-Mellon University.
+.\" All rights reserved.
+.\"
+.\" Author: Mark Holland
+.\"
+.\" Permission to use, copy, modify and distribute this software and
+.\" its documentation is hereby granted, provided that both the copyright
+.\" notice and this permission notice appear in all copies of the
+.\" software, derivative works or modified versions, and any portions
+.\" thereof, and that both notices appear in supporting documentation.
+.\"
+.\" CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+.\" CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+.\" FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+.\"
+.\" Carnegie Mellon requests users of this software to return to
+.\"
+.\" Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+.\" School of Computer Science
+.\" Carnegie Mellon University
+.\" Pittsburgh PA 15213-3890
+.\"
+.\" any improvements or extensions that they make and grant Carnegie the
+.\" rights to redistribute these changes.
+.\"
+.Dd November 6, 1998
+.Dt RAIDCTL 8
+.Os FreeBSD
+.Sh NAME
+.Nm raidctl
+.Nd configuration utility for the RAIDframe disk driver
+.Sh SYNOPSIS
+.Nm
+.Op Fl v
+.Fl a Ar component Ar dev
+.Nm
+.Op Fl v
+.Fl A Op yes | no | root
+.Ar dev
+.Nm
+.Op Fl v
+.Fl B Ar dev
+.Nm
+.Op Fl v
+.Fl c Ar config_file
+.Nm
+.Op Fl v
+.Fl C Ar config_file
+.Nm
+.Op Fl v
+.Fl f Ar component Ar dev
+.Nm
+.Op Fl v
+.Fl F Ar component Ar dev
+.Nm
+.Op Fl v
+.Fl g Ar component Ar dev
+.Nm
+.Op Fl v
+.Fl i Ar dev
+.Nm
+.Op Fl v
+.Fl I Ar serial_number Ar dev
+.Nm
+.Op Fl v
+.Fl p Ar dev
+.Nm
+.Op Fl v
+.Fl P Ar dev
+.Nm
+.Op Fl v
+.Fl r Ar component Ar dev
+.Nm
+.Op Fl v
+.Fl R Ar component Ar dev
+.Nm
+.Op Fl v
+.Fl s Ar dev
+.Nm
+.Op Fl v
+.Fl S Ar dev
+.Nm
+.Op Fl v
+.Fl u Ar dev
+.Sh DESCRIPTION
+.Nm
+is the user-land control program for
+.Xr raid 4 ,
+the RAIDframe disk device.
+.Nm
+is primarily used to dynamically configure and unconfigure RAIDframe disk
+devices. For more information about the RAIDframe disk device, see
+.Xr raid 4 .
+.Pp
+This document assumes the reader has at least rudimentary knowledge of
+RAID and RAID concepts.
+.Pp
+The command-line options for
+.Nm
+are as follows:
+.Bl -tag -width indent
+.It Fl a Ar component Ar dev
+Add
+.Ar component
+as a hot spare for the device
+.Ar dev .
+.It Fl A Ic yes Ar dev
+Make the RAID set auto-configurable. The RAID set will be
+automatically configured at boot
+.Ar before
+the root filesystem is
+mounted. Note that all components of the set must be of type RAID in the
+disklabel.
+.It Fl A Ic no Ar dev
+Turn off auto-configuration for the RAID set.
+.It Fl A Ic root Ar dev
+Make the RAID set auto-configurable, and also mark the set as being
+eligible to be the root partition. A RAID set configured this way
+will
+.Ar override
+the use of the boot disk as the root device. All components of the
+set must be of type RAID in the disklabel. Note that the kernel being
+booted must currently reside on a non-RAID set.
+.It Fl B Ar dev
+Initiate a copyback of reconstructed data from a spare disk to
+its original disk. This is performed after a component has failed,
+and the failed drive has been reconstructed onto a spare drive.
+.It Fl c Ar config_file
+Configure a RAIDframe device
+according to the configuration given in
+.Ar config_file .
+A description of the contents of
+.Ar config_file
+is given later.
+.It Fl C Ar config_file
+As for
+.Ar -c ,
+but forces the configuration to take place. This is required the
+first time a RAID set is configured.
+.It Fl f Ar component Ar dev
+This marks the specified
+.Ar component
+as having failed, but does not initiate a reconstruction of that
+component.
+.It Fl F Ar component Ar dev
+Fails the specified
+.Ar component
+of the device, and immediately begin a reconstruction of the failed
+disk onto an available hot spare. This is one of the mechanisms used to start
+the reconstruction process if a component does have a hardware failure.
+.It Fl g Ar component Ar dev
+Get the component label for the specified component.
+.It Fl i Ar dev
+Initialize the RAID device. In particular, (re-write) the parity on
+the selected device. This
+.Ar MUST
+be done for
+.Ar all
+RAID sets before the RAID device is labeled and before
+filesystems are created on the RAID device.
+.It Fl I Ar serial_number Ar dev
+Initialize the component labels on each component of the device.
+.Ar serial_number
+is used as one of the keys in determining whether a
+particular set of components belong to the same RAID set. While not
+strictly enforced, different serial numbers should be used for
+different RAID sets. This step
+.Ar MUST
+be performed when a new RAID set is created.
+.It Fl p Ar dev
+Check the status of the parity on the RAID set. Displays a status
+message, and returns successfully if the parity is up-to-date.
+.It Fl P Ar dev
+Check the status of the parity on the RAID set, and initialize
+(re-write) the parity if the parity is not known to be up-to-date.
+This is normally used after a system crash (and before a
+.Xr fsck 8 )
+to ensure the integrity of the parity.
+.It Fl r Ar component Ar dev
+Remove the spare disk specified by
+.Ar component
+from the set of available spare components.
+.It Fl R Ar component Ar dev
+Fails the specified
+.Ar component ,
+if necessary, and immediately begins a reconstruction back to
+.Ar component .
+This is useful for reconstructing back onto a component after
+it has been replaced following a failure.
+.It Fl s Ar dev
+Display the status of the RAIDframe device for each of the components
+and spares.
+.It Fl S Ar dev
+Check the status of parity re-writing, component reconstruction, and
+component copyback. The output indicates the amount of progress
+achieved in each of these areas.
+.It Fl u Ar dev
+Unconfigure the RAIDframe device.
+.It Fl v
+Be more verbose. For operations such as reconstructions, parity
+re-writing, and copybacks, provide a progress indicator.
+.El
+.Pp
+The device used by
+.Nm
+is specified by
+.Ar dev .
+.Ar dev
+may be either the full name of the device, e.g. /dev/rraid0d,
+for the i386 architecture, and /dev/rraid0c
+for all others, or just simply raid0 (for /dev/rraid0d).
+.Pp
+The format of the configuration file is complex, and
+only an abbreviated treatment is given here. In the configuration
+files, a
+.Sq #
+indicates the beginning of a comment.
+.Pp
+There are 4 required sections of a configuration file, and 2
+optional sections. Each section begins with a
+.Sq START ,
+followed by
+the section name, and the configuration parameters associated with that
+section. The first section is the
+.Sq array
+section, and it specifies
+the number of rows, columns, and spare disks in the RAID set. For
+example:
+.Bd -unfilled -offset indent
+START array
+1 3 0
+.Ed
+.Pp
+indicates an array with 1 row, 3 columns, and 0 spare disks. Note
+that although multi-dimensional arrays may be specified, they are
+.Ar NOT
+supported in the driver.
+.Pp
+The second section, the
+.Sq disks
+section, specifies the actual
+components of the device. For example:
+.Bd -unfilled -offset indent
+START disks
+/dev/da0s1e
+/dev/da1s1e
+/dev/da2s1e
+.Ed
+.Pp
+specifies the three component disks to be used in the RAID device. If
+any of the specified drives cannot be found when the RAID device is
+configured, then they will be marked as
+.Sq failed ,
+and the system will
+operate in degraded mode. Note that it is
+.Ar imperative
+that the order of the components in the configuration file does not
+change between configurations of a RAID device. Changing the order
+of the components will result in data loss if the set is configured
+with the
+.Fl C
+option. In normal circumstances, the RAID set will not configure if
+only
+.Fl c
+is specified, and the components are out-of-order.
+.Pp
+The next section, which is the
+.Sq spare
+section, is optional, and, if
+present, specifies the devices to be used as
+.Sq hot spares
+-- devices
+which are on-line, but are not actively used by the RAID driver unless
+one of the main components fails. A simple
+.Sq spare
+section might be:
+.Bd -unfilled -offset indent
+START spare
+/dev/da3s1e
+.Ed
+.Pp
+for a configuration with a single spare component. If no spare drives
+are to be used in the configuration, then the
+.Sq spare
+section may be omitted.
+.Pp
+The next section is the
+.Sq layout
+section. This section describes the
+general layout parameters for the RAID device, and provides such
+information as sectors per stripe unit, stripe units per parity unit,
+stripe units per reconstruction unit, and the parity configuration to
+use. This section might look like:
+.Bd -unfilled -offset indent
+START layout
+# sectPerSU SUsPerParityUnit SUsPerReconUnit RAID_level
+32 1 1 5
+.Ed
+.Pp
+The sectors per stripe unit specifies, in blocks, the interleave
+factor; i.e. the number of contiguous sectors to be written to each
+component for a single stripe. Appropriate selection of this value
+(32 in this example) is the subject of much research in RAID
+architectures. The stripe units per parity unit and
+stripe units per reconstruction unit are normally each set to 1.
+While certain values above 1 are permitted, a discussion of valid
+values and the consequences of using anything other than 1 are outside
+the scope of this document. The last value in this section (5 in this
+example) indicates the parity configuration desired. Valid entries
+include:
+.Bl -tag -width inde
+.It 0
+RAID level 0. No parity, only simple striping.
+.It 1
+RAID level 1. Mirroring. The parity is the mirror.
+.It 4
+RAID level 4. Striping across components, with parity stored on the
+last component.
+.It 5
+RAID level 5. Striping across components, parity distributed across
+all components.
+.El
+.Pp
+There are other valid entries here, including those for Even-Odd
+parity, RAID level 5 with rotated sparing, Chained declustering,
+and Interleaved declustering, but as of this writing the code for
+those parity operations has not been tested with
+.Fx .
+.Pp
+The next required section is the
+.Sq queue
+section. This is most often
+specified as:
+.Bd -unfilled -offset indent
+START queue
+fifo 100
+.Ed
+.Pp
+where the queuing method is specified as fifo (first-in, first-out),
+and the size of the per-component queue is limited to 100 requests.
+Other queuing methods may also be specified, but a discussion of them
+is beyond the scope of this document.
+.Pp
+The final section, the
+.Sq debug
+section, is optional. For more details
+on this the reader is referred to the RAIDframe documentation
+discussed in the
+.Sx HISTORY
+section.
+
+See
+.Sx EXAMPLES
+for a more complete configuration file example.
+
+.Sh EXAMPLES
+
+It is highly recommended that before using the RAID driver for real
+filesystems that the system administrator(s) become quite familiar
+with the use of
+.Nm ,
+and that they understand how the component reconstruction process
+works. The examples in this section will focus on configuring a
+number of different RAID sets of varying degrees of redundancy.
+By working through these examples, administrators should be able to
+develop a good feel for how to configure a RAID set, and how to
+initiate reconstruction of failed components.
+.Pp
+In the following examples
+.Sq raid0
+will be used to denote the RAID device. Depending on the
+architecture,
+.Sq /dev/rraid0c
+or
+.Sq /dev/rraid0d
+may be used in place of
+.Sq raid0 .
+.Pp
+.Ss Initialization and Configuration
+The initial step in configuring a RAID set is to identify the components
+that will be used in the RAID set. All components should be the same
+size. Each component should have a disklabel type of
+.Dv FS_RAID ,
+and a typical disklabel entry for a RAID component
+might look like:
+.Bd -unfilled -offset indent
+f: 1800000 200495 RAID # (Cyl. 405*- 4041*)
+.Ed
+.Pp
+While
+.Dv FS_BSDFFS
+will also work as the component type, the type
+.Dv FS_RAID
+is preferred for RAIDframe use, as it is required for features such as
+auto-configuration. As part of the initial configuration of each RAID
+set, each component will be given a
+.Sq component label .
+A
+.Sq component label
+contains important information about the component, including a
+user-specified serial number, the row and column of that component in
+the RAID set, the redundancy level of the RAID set, a 'modification
+counter', and whether the parity information (if any) on that
+component is known to be correct. Component labels are an integral
+part of the RAID set, since they are used to ensure that components
+are configured in the correct order, and used to keep track of other
+vital information about the RAID set. Component labels are also
+required for the auto-detection and auto-configuration of RAID sets at
+boot time. For a component label to be considered valid, that
+particular component label must be in agreement with the other
+component labels in the set. For example, the serial number,
+.Sq modification counter ,
+number of rows and number of columns must all
+be in agreement. If any of these are different, then the component is
+not considered to be part of the set. See
+.Xr raid 4
+for more information about component labels.
+.Pp
+Once the components have been identified, and the disks have
+appropriate labels,
+.Nm
+is then used to configure the
+.Xr raid 4
+device. To configure the device, a configuration
+file which looks something like:
+.Bd -unfilled -offset indent
+START array
+# numRow numCol numSpare
+1 3 1
+
+START disks
+/dev/da1s1e
+/dev/da2s1e
+/dev/da3s1e
+
+START spare
+/dev/da4s1e
+
+START layout
+# sectPerSU SUsPerParityUnit SUsPerReconUnit RAID_level_5
+32 1 1 5
+
+START queue
+fifo 100
+.Ed
+.Pp
+is created in a file. The above configuration file specifies a RAID 5
+set consisting of the components /dev/da1s1e, /dev/da2s1e, and /dev/da3s1e,
+with /dev/da4s1e available as a
+.Sq hot spare
+in case one of
+the three main drives should fail. A RAID 0 set would be specified in
+a similar way:
+.Bd -unfilled -offset indent
+START array
+# numRow numCol numSpare
+1 4 0
+
+START disks
+/dev/da1s10e
+/dev/da1s11e
+/dev/da1s12e
+/dev/da1s13e
+
+START layout
+# sectPerSU SUsPerParityUnit SUsPerReconUnit RAID_level_0
+64 1 1 0
+
+START queue
+fifo 100
+.Ed
+.Pp
+In this case, devices /dev/da1s10e, /dev/da1s11e, /dev/da1s12e, and /dev/da1s13e
+are the components that make up this RAID set. Note that there are no
+hot spares for a RAID 0 set, since there is no way to recover data if
+any of the components fail.
+.Pp
+For a RAID 1 (mirror) set, the following configuration might be used:
+.Bd -unfilled -offset indent
+START array
+# numRow numCol numSpare
+1 2 0
+
+START disks
+/dev/da2s10e
+/dev/da2s11e
+
+START layout
+# sectPerSU SUsPerParityUnit SUsPerReconUnit RAID_level_1
+128 1 1 1
+
+START queue
+fifo 100
+.Ed
+.Pp
+In this case, /dev/da2s10e and /dev/da2s11e are the two components of the
+mirror set. While no hot spares have been specified in this
+configuration, they easily could be, just as they were specified in
+the RAID 5 case above. Note as well that RAID 1 sets are currently
+limited to only 2 components. At present, n-way mirroring is not
+possible.
+.Pp
+The first time a RAID set is configured, the
+.Fl C
+option must be used:
+.Bd -unfilled -offset indent
+raidctl -C raid0.conf
+.Ed
+.Pp
+where
+.Sq raid0.conf
+is the name of the RAID configuration file. The
+.Fl C
+forces the configuration to succeed, even if any of the component
+labels are incorrect. The
+.Fl C
+option should not be used lightly in
+situations other than initial configurations, as if
+the system is refusing to configure a RAID set, there is probably a
+very good reason for it. After the initial configuration is done (and
+appropriate component labels are added with the
+.Fl I
+option) then raid0 can be configured normally with:
+.Bd -unfilled -offset indent
+raidctl -c raid0.conf
+.Ed
+.Pp
+When the RAID set is configured for the first time, it is
+necessary to initialize the component labels, and to initialize the
+parity on the RAID set. Initializing the component labels is done with:
+.Bd -unfilled -offset indent
+raidctl -I 112341 raid0
+.Ed
+.Pp
+where
+.Sq 112341
+is a user-specified serial number for the RAID set. This
+initialization step is
+.Ar required
+for all RAID sets. As well, using different
+serial numbers between RAID sets is
+.Ar strongly encouraged ,
+as using the same serial number for all RAID sets will only serve to
+decrease the usefulness of the component label checking.
+.Pp
+Initializing the RAID set is done via the
+.Fl i
+option. This initialization
+.Ar MUST
+be done for
+.Ar all
+RAID sets, since among other things it verifies that the parity (if
+any) on the RAID set is correct. Since this initialization may be
+quite time-consuming, the
+.Fl v
+option may be also used in conjunction with
+.Fl i :
+.Bd -unfilled -offset indent
+raidctl -iv raid0
+.Ed
+.Pp
+This will give more verbose output on the
+status of the initialization:
+.Bd -unfilled -offset indent
+Initiating re-write of parity
+Parity Re-write status:
+ 10% |**** | ETA: 06:03 /
+.Ed
+.Pp
+The output provides a
+.Sq Percent Complete
+in both a numeric and graphical format, as well as an estimated time
+to completion of the operation.
+.Pp
+Since it is the parity that provides the
+.Sq redundancy
+part of RAID, it is critical that the parity is correct
+as much as possible. If the parity is not correct, then there is no
+guarantee that data will not be lost if a component fails.
+.Pp
+Once the parity is known to be correct,
+it is then safe to perform
+.Xr disklabel 8 ,
+.Xr newfs 8 ,
+or
+.Xr fsck 8
+on the device or its filesystems, and then to mount the filesystems
+for use.
+.Pp
+Under certain circumstances (e.g. the additional component has not
+arrived, or data is being migrated off of a disk destined to become a
+component) it may be desirable to configure a RAID 1 set with only
+a single component. This can be achieved by configuring the set with
+a physically existing component (as either the first or second
+component) and with a
+.Sq fake
+component. In the following:
+.Bd -unfilled -offset indent
+START array
+# numRow numCol numSpare
+1 2 0
+
+START disks
+/dev/da6s1e
+/dev/da0s1e
+
+START layout
+# sectPerSU SUsPerParityUnit SUsPerReconUnit RAID_level_1
+128 1 1 1
+
+START queue
+fifo 100
+.Ed
+.Pp
+/dev/da0s1e is the real component, and will be the second disk of a RAID 1
+set. The component /dev/da6s1e, which must exist, but have no physical
+device associated with it, is simply used as a placeholder.
+Configuration (using
+.Fl C
+and
+.Fl I Ar 12345
+as above) proceeds normally, but initialization of the RAID set will
+have to wait until all physical components are present. After
+configuration, this set can be used normally, but will be operating
+in degraded mode. Once a second physical component is obtained, it
+can be hot-added, the existing data mirrored, and normal operation
+resumed.
+.Pp
+.Ss Maintenance of the RAID set
+After the parity has been initialized for the first time, the command:
+.Bd -unfilled -offset indent
+raidctl -p raid0
+.Ed
+.Pp
+can be used to check the current status of the parity. To check the
+parity and rebuild it if necessary (for example, after an unclean
+shutdown) the command:
+.Bd -unfilled -offset indent
+raidctl -P raid0
+.Ed
+.Pp
+is used. Note that re-writing the parity can be done while
+other operations on the RAID set are taking place (e.g. while doing a
+.Xr fsck 8
+on a filesystem on the RAID set). However: for maximum effectiveness
+of the RAID set, the parity should be known to be correct before any
+data on the set is modified.
+.Pp
+To see how the RAID set is doing, the following command can be used to
+show the RAID set's status:
+.Bd -unfilled -offset indent
+raidctl -s raid0
+.Ed
+.Pp
+The output will look something like:
+.Bd -unfilled -offset indent
+Components:
+ /dev/da1s1e: optimal
+ /dev/da2s1e: optimal
+ /dev/da3s1e: optimal
+Spares:
+ /dev/da4s1e: spare
+Component label for /dev/da1s1e:
+ Row: 0 Column: 0 Num Rows: 1 Num Columns: 3
+ Version: 2 Serial Number: 13432 Mod Counter: 65
+ Clean: No Status: 0
+ sectPerSU: 32 SUsPerPU: 1 SUsPerRU: 1
+ RAID Level: 5 blocksize: 512 numBlocks: 1799936
+ Autoconfig: No
+ Last configured as: raid0
+Component label for /dev/da2s1e:
+ Row: 0 Column: 1 Num Rows: 1 Num Columns: 3
+ Version: 2 Serial Number: 13432 Mod Counter: 65
+ Clean: No Status: 0
+ sectPerSU: 32 SUsPerPU: 1 SUsPerRU: 1
+ RAID Level: 5 blocksize: 512 numBlocks: 1799936
+ Autoconfig: No
+ Last configured as: raid0
+Component label for /dev/da3s1e:
+ Row: 0 Column: 2 Num Rows: 1 Num Columns: 3
+ Version: 2 Serial Number: 13432 Mod Counter: 65
+ Clean: No Status: 0
+ sectPerSU: 32 SUsPerPU: 1 SUsPerRU: 1
+ RAID Level: 5 blocksize: 512 numBlocks: 1799936
+ Autoconfig: No
+ Last configured as: raid0
+Parity status: clean
+Reconstruction is 100% complete.
+Parity Re-write is 100% complete.
+Copyback is 100% complete.
+.Ed
+.Pp
+This indicates that all is well with the RAID set. Of importance here
+are the component lines which read
+.Sq optimal ,
+and the
+.Sq Parity status
+line which indicates that the parity is up-to-date. Note that if
+there are filesystems open on the RAID set, the individual components
+will not be
+.Sq clean
+but the set as a whole can still be clean.
+.Pp
+To check the component label of /dev/da1s1e, the following is used:
+.Bd -unfilled -offset indent
+raidctl -g /dev/da1s1e raid0
+.Ed
+.Pp
+The output of this command will look something like:
+.Bd -unfilled -offset indent
+Component label for /dev/da1s1e:
+ Row: 0 Column: 0 Num Rows: 1 Num Columns: 3
+ Version: 2 Serial Number: 13432 Mod Counter: 65
+ Clean: No Status: 0
+ sectPerSU: 32 SUsPerPU: 1 SUsPerRU: 1
+ RAID Level: 5 blocksize: 512 numBlocks: 1799936
+ Autoconfig: No
+ Last configured as: raid0
+.Ed
+.Pp
+.Ss Dealing with Component Failures
+If for some reason
+(perhaps to test reconstruction) it is necessary to pretend a drive
+has failed, the following will perform that function:
+.Bd -unfilled -offset indent
+raidctl -f /dev/da2s1e raid0
+.Ed
+.Pp
+The system will then be performing all operations in degraded mode,
+where missing data is re-computed from existing data and the parity.
+In this case, obtaining the status of raid0 will return (in part):
+.Bd -unfilled -offset indent
+Components:
+ /dev/da1s1e: optimal
+ /dev/da2s1e: failed
+ /dev/da3s1e: optimal
+Spares:
+ /dev/da4s1e: spare
+.Ed
+.Pp
+Note that with the use of
+.Fl f
+a reconstruction has not been started. To both fail the disk and
+start a reconstruction, the
+.Fl F
+option must be used:
+.Bd -unfilled -offset indent
+raidctl -F /dev/da2s1e raid0
+.Ed
+.Pp
+The
+.Fl f
+option may be used first, and then the
+.Fl F
+option used later, on the same disk, if desired.
+Immediately after the reconstruction is started, the status will report:
+.Bd -unfilled -offset indent
+Components:
+ /dev/da1s1e: optimal
+ /dev/da2s1e: reconstructing
+ /dev/da3s1e: optimal
+Spares:
+ /dev/da4s1e: used_spare
+[...]
+Parity status: clean
+Reconstruction is 10% complete.
+Parity Re-write is 100% complete.
+Copyback is 100% complete.
+.Ed
+.Pp
+This indicates that a reconstruction is in progress. To find out how
+the reconstruction is progressing the
+.Fl S
+option may be used. This will indicate the progress in terms of the
+percentage of the reconstruction that is completed. When the
+reconstruction is finished the
+.Fl s
+option will show:
+.Bd -unfilled -offset indent
+Components:
+ /dev/da1s1e: optimal
+ /dev/da2s1e: spared
+ /dev/da3s1e: optimal
+Spares:
+ /dev/da4s1e: used_spare
+[...]
+Parity status: clean
+Reconstruction is 100% complete.
+Parity Re-write is 100% complete.
+Copyback is 100% complete.
+.Ed
+.Pp
+At this point there are at least two options. First, if /dev/da2s1e is
+known to be good (i.e. the failure was either caused by
+.Fl f
+or
+.Fl F ,
+or the failed disk was replaced), then a copyback of the data can
+be initiated with the
+.Fl B
+option. In this example, this would copy the entire contents of
+/dev/da4s1e to /dev/da2s1e. Once the copyback procedure is complete, the
+status of the device would be (in part):
+.Bd -unfilled -offset indent
+Components:
+ /dev/da1s1e: optimal
+ /dev/da2s1e: optimal
+ /dev/da3s1e: optimal
+Spares:
+ /dev/da4s1e: spare
+.Ed
+.Pp
+and the system is back to normal operation.
+.Pp
+The second option after the reconstruction is to simply use /dev/da4s1e
+in place of /dev/da2s1e in the configuration file. For example, the
+configuration file (in part) might now look like:
+.Bd -unfilled -offset indent
+START array
+1 3 0
+
+START drives
+/dev/da1s1e
+/dev/da4s1e
+/dev/da3s1e
+.Ed
+.Pp
+This can be done as /dev/da4s1e is completely interchangeable with
+/dev/da2s1e at this point. Note that extreme care must be taken when
+changing the order of the drives in a configuration. This is one of
+the few instances where the devices and/or their orderings can be
+changed without loss of data! In general, the ordering of components
+in a configuration file should
+.Ar never
+be changed.
+.Pp
+If a component fails and there are no hot spares
+available on-line, the status of the RAID set might (in part) look like:
+.Bd -unfilled -offset indent
+Components:
+ /dev/da1s1e: optimal
+ /dev/da2s1e: failed
+ /dev/da3s1e: optimal
+No spares.
+.Ed
+.Pp
+In this case there are a number of options. The first option is to add a hot
+spare using:
+.Bd -unfilled -offset indent
+raidctl -a /dev/da4s1e raid0
+.Ed
+.Pp
+After the hot add, the status would then be:
+.Bd -unfilled -offset indent
+Components:
+ /dev/da1s1e: optimal
+ /dev/da2s1e: failed
+ /dev/da3s1e: optimal
+Spares:
+ /dev/da4s1e: spare
+.Ed
+.Pp
+Reconstruction could then take place using
+.Fl F
+as described above.
+.Pp
+A second option is to rebuild directly onto /dev/da2s1e. Once the disk
+containing /dev/da2s1e has been replaced, one can simply use:
+.Bd -unfilled -offset indent
+raidctl -R /dev/da2s1e raid0
+.Ed
+.Pp
+to rebuild the /dev/da2s1e component. As the rebuilding is in progress,
+the status will be:
+.Bd -unfilled -offset indent
+Components:
+ /dev/da1s1e: optimal
+ /dev/da2s1e: reconstructing
+ /dev/da3s1e: optimal
+No spares.
+.Ed
+.Pp
+and when completed, will be:
+.Bd -unfilled -offset indent
+Components:
+ /dev/da1s1e: optimal
+ /dev/da2s1e: optimal
+ /dev/da3s1e: optimal
+No spares.
+.Ed
+.Pp
+In circumstances where a particular component is completely
+unavailable after a reboot, a special component name will be used to
+indicate the missing component. For example:
+.Bd -unfilled -offset indent
+Components:
+ /dev/da2s1e: optimal
+ component1: failed
+No spares.
+.Ed
+.Pp
+indicates that the second component of this RAID set was not detected
+at all by the auto-configuration code. The name
+.Sq component1
+can be used anywhere a normal component name would be used. For
+example, to add a hot spare to the above set, and rebuild to that hot
+spare, the following could be done:
+.Bd -unfilled -offset indent
+raidctl -a /dev/da3s1e raid0
+raidctl -F component1 raid0
+.Ed
+.Pp
+at which point the data missing from
+.Sq component1
+would be reconstructed onto /dev/da3s1e.
+.Pp
+.Ss RAID on RAID
+RAID sets can be layered to create more complex and much larger RAID
+sets. A RAID 0 set, for example, could be constructed from four RAID
+5 sets. The following configuration file shows such a setup:
+.Bd -unfilled -offset indent
+START array
+# numRow numCol numSpare
+1 4 0
+
+START disks
+/dev/raid1e
+/dev/raid2e
+/dev/raid3e
+/dev/raid4e
+
+START layout
+# sectPerSU SUsPerParityUnit SUsPerReconUnit RAID_level_0
+128 1 1 0
+
+START queue
+fifo 100
+.Ed
+.Pp
+A similar configuration file might be used for a RAID 0 set
+constructed from components on RAID 1 sets. In such a configuration,
+the mirroring provides a high degree of redundancy, while the striping
+provides additional speed benefits.
+.Pp
+.Ss Auto-configuration and Root on RAID
+RAID sets can also be auto-configured at boot. To make a set
+auto-configurable, simply prepare the RAID set as above, and then do
+a:
+.Bd -unfilled -offset indent
+raidctl -A yes raid0
+.Ed
+.Pp
+to turn on auto-configuration for that set. To turn off
+auto-configuration, use:
+.Bd -unfilled -offset indent
+raidctl -A no raid0
+.Ed
+.Pp
+RAID sets which are auto-configurable will be configured before the
+root filesystem is mounted. These RAID sets are thus available for
+use as a root filesystem, or for any other filesystem. A primary
+advantage of using the auto-configuration is that RAID components
+become more independent of the disks they reside on. For example,
+SCSI ID's can change, but auto-configured sets will always be
+configured correctly, even if the SCSI ID's of the component disks
+have become scrambled.
+.Pp
+Having a system's root filesystem (/) on a RAID set is also allowed,
+with the
+.Sq a
+partition of such a RAID set being used for /.
+To use raid0a as the root filesystem, simply use:
+.Bd -unfilled -offset indent
+raidctl -A root raid0
+.Ed
+.Pp
+To return raid0a to be just an auto-configuring set simply use the
+.Fl A Ar yes
+arguments.
+.Pp
+Note that kernels can only be directly read from RAID 1 components on
+alpha and pmax architectures. On those architectures, the
+.Dv FS_RAID
+filesystem is recognized by the bootblocks, and will properly load the
+kernel directly from a RAID 1 component. For other architectures, or
+to support the root filesystem on other RAID sets, some other
+mechanism must be used to get a kernel booting. For example, a small
+partition containing only the secondary boot-blocks and an alternate
+kernel (or two) could be used. Once a kernel is booting however, and
+an auto-configuring RAID set is found that is eligible to be root,
+then that RAID set will be auto-configured and used as the root
+device. If two or more RAID sets claim to be root devices, then the
+user will be prompted to select the root device. At this time, RAID
+0, 1, 4, and 5 sets are all supported as root devices.
+.Pp
+A typical RAID 1 setup with root on RAID might be as follows:
+.Bl -enum
+.It
+wd0a - a small partition, which contains a complete, bootable, basic
+FreeBSD installation.
+.It
+wd1a - also contains a complete, bootable, basic FreeBSD installation.
+.It
+wd0e and wd1e - a RAID 1 set, raid0, used for the root filesystem.
+.It
+wd0f and wd1f - a RAID 1 set, raid1, which will be used only for
+swap space.
+.It
+wd0g and wd1g - a RAID 1 set, raid2, used for /usr, /home, or other
+data, if desired.
+.It
+wd0h and wd1h - a RAID 1 set, raid3, if desired.
+.El
+.Pp
+RAID sets raid0, raid1, and raid2 are all marked as
+auto-configurable. raid0 is marked as being a root filesystem.
+When new kernels are installed, the kernel is not only copied to /,
+but also to wd0a and wd1a. The kernel on wd0a is required, since that
+is the kernel the system boots from. The kernel on wd1a is also
+required, since that will be the kernel used should wd0 fail. The
+important point here is to have redundant copies of the kernel
+available, in the event that one of the drives fails.
+.Pp
+There is no requirement that the root filesystem be on the same disk
+as the kernel. For example, obtaining the kernel from wd0a, and using
+da0s1e and da1s1e for raid0, and the root filesystem, is fine. It
+.Ar is
+critical, however, that there be multiple kernels available, in the
+event of media failure.
+.Pp
+Multi-layered RAID devices (such as a RAID 0 set made
+up of RAID 1 sets) are
+.Ar not
+supported as root devices or auto-configurable devices at this point.
+(Multi-layered RAID devices
+.Ar are
+supported in general, however, as mentioned earlier.) Note that in
+order to enable component auto-detection and auto-configuration of
+RAID devices, the line:
+.Bd -unfilled -offset indent
+options RAID_AUTOCONFIG
+.Ed
+.Pp
+must be in the kernel configuration file. See
+.Xr raid 4
+for more details.
+.Pp
+.Ss Unconfiguration
+The final operation performed by
+.Nm
+is to unconfigure a
+.Xr raid 4
+device. This is accomplished via a simple:
+.Bd -unfilled -offset indent
+raidctl -u raid0
+.Ed
+.Pp
+at which point the device is ready to be reconfigured.
+.Pp
+.Ss Performance Tuning
+Selection of the various parameter values which result in the best
+performance can be quite tricky, and often requires a bit of
+trial-and-error to get those values most appropriate for a given system.
+A whole range of factors come into play, including:
+.Bl -enum
+.It
+Types of components (e.g. SCSI vs. IDE) and their bandwidth
+.It
+Types of controller cards and their bandwidth
+.It
+Distribution of components among controllers
+.It
+IO bandwidth
+.It
+Filesystem access patterns
+.It
+CPU speed
+.El
+.Pp
+As with most performance tuning, benchmarking under real-life loads
+may be the only way to measure expected performance. Understanding
+some of the underlying technology is also useful in tuning. The goal
+of this section is to provide pointers to those parameters which may
+make significant differences in performance.
+.Pp
+For a RAID 1 set, a SectPerSU value of 64 or 128 is typically
+sufficient. Since data in a RAID 1 set is arranged in a linear
+fashion on each component, selecting an appropriate stripe size is
+somewhat less critical than it is for a RAID 5 set. However: a stripe
+size that is too small will cause large IO's to be broken up into a
+number of smaller ones, hurting performance. At the same time, a
+large stripe size may cause problems with concurrent accesses to
+stripes, which may also affect performance. Thus values in the range
+of 32 to 128 are often the most effective.
+.Pp
+Tuning RAID 5 sets is trickier. In the best case, IO is presented to
+the RAID set one stripe at a time. Since the entire stripe is
+available at the beginning of the IO, the parity of that stripe can
+be calculated before the stripe is written, and then the stripe data
+and parity can be written in parallel. When the amount of data being
+written is less than a full stripe worth, the
+.Sq small write
+problem occurs. Since a
+.Sq small write
+means only a portion of the stripe on the components is going to
+change, the data (and parity) on the components must be updated
+slightly differently. First, the
+.Sq old parity
+and
+.Sq old data
+must be read from the components. Then the new parity is constructed,
+using the new data to be written, and the old data and old parity.
+Finally, the new data and new parity are written. All this extra data
+shuffling results in a serious loss of performance, and is typically 2
+to 4 times slower than a full stripe write (or read). To combat this
+problem in the real world, it may be useful to ensure that stripe
+sizes are small enough that a
+.Sq large IO
+from the system will use exactly one large stripe write. As is seen
+later, there are some filesystem dependencies which may come into play
+here as well.
+.Pp
+Since the size of a
+.Sq large IO
+is often (currently) only 32K or 64K, on a 5-drive RAID 5 set it may
+be desirable to select a SectPerSU value of 16 blocks (8K) or 32
+blocks (16K). Since there are 4 data sectors per stripe, the maximum
+data per stripe is 64 blocks (32K) or 128 blocks (64K). Again,
+empirical measurement will provide the best indicators of which
+values will yield better performance.
+.Pp
+The parameters used for the filesystem are also critical to good
+performance. For
+.Xr newfs 8 ,
+for example, increasing the block size to 32K or 64K may improve
+performance dramatically. As well, changing the cylinders-per-group
+parameter from 16 to 32 or higher is often not only necessary for
+larger filesystems, but may also have positive performance
+implications.
+.Pp
+.Ss Summary
+Despite the length of this man-page, configuring a RAID set is a
+relatively straight-forward process. All that needs to be done is the
+following steps:
+.Bl -enum
+.It
+Use
+.Xr disklabel 8
+to create the components (of type RAID).
+.It
+Construct a RAID configuration file: e.g.
+.Sq raid0.conf
+.It
+Configure the RAID set with:
+.Bd -unfilled -offset indent
+raidctl -C raid0.conf
+.Ed
+.Pp
+.It
+Initialize the component labels with:
+.Bd -unfilled -offset indent
+raidctl -I 123456 raid0
+.Ed
+.Pp
+.It
+Initialize other important parts of the set with:
+.Bd -unfilled -offset indent
+raidctl -i raid0
+.Ed
+.Pp
+.It
+Get the default label for the RAID set:
+.Bd -unfilled -offset indent
+disklabel raid0 > /tmp/label
+.Ed
+.Pp
+.It
+Edit the label:
+.Bd -unfilled -offset indent
+vi /tmp/label
+.Ed
+.Pp
+.It
+Put the new label on the RAID set:
+.Bd -unfilled -offset indent
+disklabel -R -r raid0 /tmp/label
+.Ed
+.Pp
+.It
+Create the filesystem:
+.Bd -unfilled -offset indent
+newfs /dev/rraid0e
+.Ed
+.Pp
+.It
+Mount the filesystem:
+.Bd -unfilled -offset indent
+mount /dev/raid0e /mnt
+.Ed
+.Pp
+.It
+Use:
+.Bd -unfilled -offset indent
+raidctl -c raid0.conf
+.Ed
+.Pp
+To re-configure the RAID set the next time it is needed, or put
+raid0.conf into /etc where it will automatically be started by
+the /etc/rc scripts.
+.El
+.Pp
+.Sh WARNINGS
+Certain RAID levels (1, 4, 5, 6, and others) can protect against some
+data loss due to component failure. However the loss of two
+components of a RAID 4 or 5 system, or the loss of a single component
+of a RAID 0 system will result in the entire filesystem being lost.
+RAID is
+.Ar NOT
+a substitute for good backup practices.
+.Pp
+Recomputation of parity
+.Ar MUST
+be performed whenever there is a chance that it may have been
+compromised. This includes after system crashes, or before a RAID
+device has been used for the first time. Failure to keep parity
+correct will be catastrophic should a component ever fail -- it is
+better to use RAID 0 and get the additional space and speed, than it
+is to use parity, but not keep the parity correct. At least with RAID
+0 there is no perception of increased data security.
+.Pp
+.Sh FILES
+.Bl -tag -width /dev/XXrXraidX -compact
+.It Pa /dev/{,r}raid*
+.Cm raid
+device special files.
+.El
+.Pp
+.Sh SEE ALSO
+.Xr raid 4 ,
+.Xr ccd 4 ,
+.Xr rc 8
+.Sh BUGS
+Hot-spare removal is currently not available.
+.Sh HISTORY
+RAIDframe is a framework for rapid prototyping of RAID structures
+developed by the folks at the Parallel Data Laboratory at Carnegie
+Mellon University (CMU).
+A more complete description of the internals and functionality of
+RAIDframe is found in the paper "RAIDframe: A Rapid Prototyping Tool
+for RAID Systems", by William V. Courtright II, Garth Gibson, Mark
+Holland, LeAnn Neal Reilly, and Jim Zelenka, and published by the
+Parallel Data Laboratory of Carnegie Mellon University.
+.Pp
+The
+.Nm
+command first appeared as a program in CMU's RAIDframe v1.1 distribution. This
+version of
+.Nm
+is a complete re-write, and first appeared in
+.Fx 4.4 .
+.Sh COPYRIGHT
+.Bd -unfilled
+The RAIDframe Copyright is as follows:
+
+Copyright (c) 1994-1996 Carnegie-Mellon University.
+All rights reserved.
+
+Permission to use, copy, modify and distribute this software and
+its documentation is hereby granted, provided that both the copyright
+notice and this permission notice appear in all copies of the
+software, derivative works or modified versions, and any portions
+thereof, and that both notices appear in supporting documentation.
+
+CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+
+Carnegie Mellon requests users of this software to return to
+
+ Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ School of Computer Science
+ Carnegie Mellon University
+ Pittsburgh PA 15213-3890
+
+any improvements or extensions that they make and grant Carnegie the
+rights to redistribute these changes.
+.Ed
diff --git a/sbin/raidctl/raidctl.c b/sbin/raidctl/raidctl.c
new file mode 100644
index 0000000..4b7d27d
--- /dev/null
+++ b/sbin/raidctl/raidctl.c
@@ -0,0 +1,1110 @@
+/*-
+ * Copyright (c) 2002 Scott Long <scottl@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/* $NetBSD: raidctl.c,v 1.25 2000/10/31 14:18:39 lukem Exp $ */
+/*-
+ * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Greg Oster
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the NetBSD
+ * Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This program is a re-write of the original rf_ctrl program
+ * distributed by CMU with RAIDframe 1.1.
+ *
+ * This program is the user-land interface to the RAIDframe kernel
+ * driver in NetBSD and FreeBSD.
+ */
+
+#include <sys/param.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/disklabel.h>
+#if defined(__FreeBSD__)
+#include <sys/linker.h>
+#include <sys/module.h>
+#endif
+
+#include <ctype.h>
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#ifdef __FreeBSD__
+#include <paths.h>
+#endif
+#if defined(__NetBSD__)
+#include <util.h>
+#endif
+
+#include <dev/raidframe/rf_raidframe.h>
+
+int main(int, char *[]);
+void do_ioctl(int, u_long, void *, const char *);
+static void rf_configure(int, char*, int);
+static const char *device_status(RF_DiskStatus_t);
+static void rf_get_device_status(int);
+static void get_component_number(int, char *, int *, int *);
+static void rf_fail_disk(int, char *, int);
+static void usage(void);
+static void get_component_label(int, char *);
+static void set_component_label(int, char *);
+static void init_component_labels(int, int);
+static void set_autoconfig(int, char *, char *);
+static void add_hot_spare(int, char *);
+static void remove_hot_spare(int, char *);
+static void rebuild_in_place(int, char *);
+static void check_status(int,int);
+static void check_parity(int,int, char *);
+static void do_meter(int, u_long);
+static void get_bar(char *, double, int);
+static void get_time_string(char *, int);
+#if defined(__FreeBSD__)
+static void check_driver(void);
+
+extern char *__progname;
+#define PROGNAME __progname
+
+#define RAIDCTLDEV "/dev/raidctl"
+#elif defined(__NetBSD__)
+#define PROGNAME getprogname()
+#endif
+
+int verbose;
+
+int
+main(argc,argv)
+	int argc;
+	char *argv[];
+{
+	int ch;
+	int num_options;
+	unsigned long action;
+	char config_filename[PATH_MAX];
+	char dev_name[PATH_MAX];
+	char name[PATH_MAX];
+	char component[PATH_MAX];
+	char autoconf[10];
+	int do_recon;
+	int do_rewrite;
+	int is_clean;
+	int serial_number;
+	struct stat st;
+	int fd;
+	int force;
+	int raidID;
+
+	num_options = 0;
+	action = 0;
+	do_recon = 0;
+	do_rewrite = 0;
+	is_clean = 0;
+	force = 0;
+
+	while ((ch = getopt(argc, argv, "a:A:Bc:C:f:F:g:iI:l:r:R:sSpPuv"))
+	       != -1)
+		switch(ch) {
+		case 'a':
+			action = RAIDFRAME_ADD_HOT_SPARE;
+			strncpy(component, optarg, PATH_MAX);
+			num_options++;
+			break;
+		case 'A':
+			action = RAIDFRAME_SET_AUTOCONFIG;
+			strncpy(autoconf, optarg, 10);
+			num_options++;
+			break;
+		case 'B':
+			action = RAIDFRAME_COPYBACK;
+			num_options++;
+			break;
+		case 'c':
+		case 'C':
+			strncpy(config_filename,optarg,PATH_MAX);
+			action = RAIDFRAME_CONFIGURE;
+			force = (ch == 'c') ? 0 : 1;
+#if defined(__FreeBSD__)
+			check_driver();
+			fd = open(RAIDCTLDEV, O_RDWR);
+			if (fd < 0) {
+				fprintf(stderr, "%s: unable to open raid "
+					"control device %s\n", PROGNAME,
+					RAIDCTLDEV);
+				fprintf(stderr, "Error: %s\n", strerror(errno));
+				exit(1);
+			}
+			rf_configure(fd, config_filename, force);
+			close(fd);
+			exit(0);
+#elif defined(__NetBSD__)
+			num_options++;
+			break;
+#endif
+		case 'f':
+			action = RAIDFRAME_FAIL_DISK;
+			strncpy(component, optarg, PATH_MAX);
+			do_recon = 0;
+			num_options++;
+			break;
+		case 'F':
+			action = RAIDFRAME_FAIL_DISK;
+			strncpy(component, optarg, PATH_MAX);
+			do_recon = 1;
+			num_options++;
+			break;
+		case 'g':
+			action = RAIDFRAME_GET_COMPONENT_LABEL;
+			strncpy(component, optarg, PATH_MAX);
+			num_options++;
+			break;
+		case 'i':
+			action = RAIDFRAME_REWRITEPARITY;
+			num_options++;
+			break;
+		case 'I':
+			action = RAIDFRAME_INIT_LABELS;
+			serial_number = atoi(optarg);
+			num_options++;
+			break;
+		case 'l':
+			action = RAIDFRAME_SET_COMPONENT_LABEL;
+			strncpy(component, optarg, PATH_MAX);
+			num_options++;
+			break;
+		case 'r':
+			action = RAIDFRAME_REMOVE_HOT_SPARE;
+			strncpy(component, optarg, PATH_MAX);
+			num_options++;
+			break;
+		case 'R':
+			strncpy(component,optarg,PATH_MAX);
+			action = RAIDFRAME_REBUILD_IN_PLACE;
+			num_options++;
+			break;
+		case 's':
+			action = RAIDFRAME_GET_INFO;
+			num_options++;
+			break;
+		case 'S':
+			action = RAIDFRAME_CHECK_RECON_STATUS_EXT;
+			num_options++;
+			break;
+		case 'p':
+			action = RAIDFRAME_CHECK_PARITY;
+			num_options++;
+			break;
+		case 'P':
+			action = RAIDFRAME_CHECK_PARITY;
+			do_rewrite = 1;
+			num_options++;
+			break;
+		case 'u':
+			action = RAIDFRAME_SHUTDOWN;
+			num_options++;
+			break;
+		case 'v':
+			verbose = 1;
+			/* Don't bump num_options, as '-v' is not
+			   an option like the others */
+			break;
+		default:
+			usage();
+		}
+	argc -= optind;
+	argv += optind;
+
+	if ((num_options > 1) || (argc == 0))
+		usage();
+
+	strncpy(name,argv[0],PATH_MAX);
+#if defined(__NetBSD__)
+	fd = opendisk(name, O_RDWR, dev_name, sizeof(dev_name), 1);
+#elif defined(__FreeBSD__)
+	check_driver();
+
+	if (name[0] != '/') {
+		char name1[PATH_MAX];
+		snprintf(name1, PATH_MAX, "%s%s", _PATH_DEV, name);
+		strncpy(name, name1, PATH_MAX);
+	}
+	strncpy(dev_name, name, PATH_MAX); /* dev_name is used in messages */
+	fd = open(name, O_RDWR);
+#endif
+	if (fd == -1) {
+		fprintf(stderr, "%s: unable to open device file: %s\n",
+			PROGNAME, name);
+		exit(1);
+	}
+	if (fstat(fd, &st) != 0) {
+		fprintf(stderr,"%s: stat failure on: %s\n",
+			PROGNAME, dev_name);
+		exit(1);
+	}
+	if (!S_ISBLK(st.st_mode) && !S_ISCHR(st.st_mode)) {
+		fprintf(stderr,"%s: invalid device: %s\n",
+			PROGNAME, dev_name);
+		exit(1);
+	}
+
+	switch(action) {
+	case RAIDFRAME_ADD_HOT_SPARE:
+		add_hot_spare(fd, component);
+		break;
+	case RAIDFRAME_REMOVE_HOT_SPARE:
+		remove_hot_spare(fd, component);
+		break;
+#if defined(__NetBSD__)
+	case RAIDFRAME_CONFIGURE:
+		rf_configure(fd, config_filename, force);
+		break;
+#endif
+	case RAIDFRAME_SET_AUTOCONFIG:
+		set_autoconfig(fd, name, autoconf);
+		break;
+	case RAIDFRAME_COPYBACK:
+		printf("Copyback.\n");
+		do_ioctl(fd, RAIDFRAME_COPYBACK, NULL, "RAIDFRAME_COPYBACK");
+		if (verbose) {
+			sleep(3); /* XXX give the copyback a chance to start */
+			printf("Copyback status:\n");
+			do_meter(fd,RAIDFRAME_CHECK_COPYBACK_STATUS_EXT);
+		}
+		break;
+	case RAIDFRAME_FAIL_DISK:
+		rf_fail_disk(fd, component, do_recon);
+		break;
+	case RAIDFRAME_SET_COMPONENT_LABEL:
+		set_component_label(fd, component);
+		break;
+	case RAIDFRAME_GET_COMPONENT_LABEL:
+		get_component_label(fd, component);
+		break;
+	case RAIDFRAME_INIT_LABELS:
+		init_component_labels(fd, serial_number);
+		break;
+	case RAIDFRAME_REWRITEPARITY:
+		printf("Initiating re-write of parity\n");
+		do_ioctl(fd, RAIDFRAME_REWRITEPARITY, NULL,
+			 "RAIDFRAME_REWRITEPARITY");
+		if (verbose) {
+			sleep(3); /* XXX give it time to get started */
+			printf("Parity Re-write status:\n");
+			do_meter(fd, RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT);
+		}
+		break;
+	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
+		check_status(fd,1);
+		break;
+	case RAIDFRAME_GET_INFO:
+		rf_get_device_status(fd);
+		break;
+	case RAIDFRAME_REBUILD_IN_PLACE:
+		rebuild_in_place(fd, component);
+		break;
+	case RAIDFRAME_CHECK_PARITY:
+		check_parity(fd, do_rewrite, dev_name);
+		break;
+	case RAIDFRAME_SHUTDOWN:
+#if defined(__NetBSD__)
+		do_ioctl(fd, RAIDFRAME_SHUTDOWN, NULL, "RAIDFRAME_SHUTDOWN");
+#elif defined(__FreeBSD__)
+		/* Find out the unit number of the raid device */
+		do_ioctl(fd, RAIDFRAME_GET_UNIT, &raidID, "RAIDFRAME_GET_UNIT");
+		close (fd);
+
+		fd = open(RAIDCTLDEV, O_RDWR);
+		if (fd < 0) {
+			fprintf(stderr, "%s: unable to open raid control "
+				"device %s\n", PROGNAME, RAIDCTLDEV);
+			fprintf(stderr, "Error: %s\n", strerror(errno));
+			exit(1);
+		}
+		do_ioctl(fd, RAIDFRAME_SHUTDOWN, &raidID, "RAIDFRAME_SHUTDOWN");
+		close(fd);
+#endif
+		break;
+	default:
+		break;
+	}
+
+	close(fd);
+	exit(0);
+}
+
+void
+do_ioctl(fd, command, arg, ioctl_name)
+	int fd;
+	unsigned long command;
+	void *arg;
+	const char *ioctl_name;
+{
+	if (ioctl(fd, command, arg) >= 0)
+		return;
+	warn("ioctl (%s) failed", ioctl_name);
+	exit(1);
+}
+
+
+static void
+rf_configure(fd,config_file,force)
+	int fd;
+	char *config_file;
+	int force;
+{
+	void *generic;
+	RF_Config_t cfg;
+
+	if (rf_MakeConfig( config_file, &cfg ) != 0) {
+		fprintf(stderr,"%s: unable to create RAIDframe "
+			"configuration structure\n", PROGNAME);
+		exit(1);
+	}
+
+	cfg.force = force;
+
+	/*
+	 * Note the extra level of redirection needed here, since
+	 * what we really want to pass in is a pointer to the pointer to
+	 * the configuration structure.
+	 */
+
+	generic = (void *) &cfg;
+	do_ioctl(fd, RAIDFRAME_CONFIGURE, &generic, "RAIDFRAME_CONFIGURE");
+}
+
+/*
+ * device_status:
+ *
+ *	Map a RAIDframe component disk status code to the string
+ *	displayed by the status (-s) output.  Unknown status values
+ *	are reported as "UNKNOWN" rather than treated as an error.
+ */
+static const char *
+device_status(status)
+	RF_DiskStatus_t status;
+{
+
+	switch (status) {
+	case rf_ds_optimal:
+		return ("optimal");
+	case rf_ds_failed:
+		return ("failed");
+	case rf_ds_reconstructing:
+		return ("reconstructing");
+	case rf_ds_dist_spared:
+		return ("dist_spared");
+	case rf_ds_spared:
+		return ("spared");
+	case rf_ds_spare:
+		return ("spare");
+	case rf_ds_used_spare:
+		return ("used_spare");
+	default:
+		return ("UNKNOWN");
+	}
+	/* NOTREACHED */
+}
+
+static void
+rf_get_device_status(fd)
+	int fd;
+{
+	RF_DeviceConfig_t device_config;
+	void *cfg_ptr;
+	int is_clean;
+	int i;
+
+	/* The kernel fills in device_config via the pointer passed in. */
+	cfg_ptr = &device_config;
+	do_ioctl(fd, RAIDFRAME_GET_INFO, &cfg_ptr, "RAIDFRAME_GET_INFO");
+
+	printf("Components:\n");
+	for(i=0; i < device_config.ndevs; i++) {
+		printf("%20s: %s\n", device_config.devs[i].devname,
+		       device_status(device_config.devs[i].status));
+	}
+	if (device_config.nspares > 0) {
+		printf("Spares:\n");
+		for(i=0; i < device_config.nspares; i++) {
+			printf("%20s: %s\n",
+			       device_config.spares[i].devname,
+			       device_status(device_config.spares[i].status));
+		}
+	} else {
+		printf("No spares.\n");
+	}
+	for(i=0; i < device_config.ndevs; i++) {
+		if (device_config.devs[i].status == rf_ds_optimal) {
+			get_component_label(fd, device_config.devs[i].devname);
+		} else {
+			printf("%s status is: %s.  Skipping label.\n",
+			       device_config.devs[i].devname,
+			       device_status(device_config.devs[i].status));
+		}
+	}
+
+	if (device_config.nspares > 0) {
+		for(i=0; i < device_config.nspares; i++) {
+			if ((device_config.spares[i].status ==
+			     rf_ds_optimal) ||
+			    (device_config.spares[i].status ==
+			     rf_ds_used_spare)) {
+				get_component_label(fd,
+				    device_config.spares[i].devname);
+			} else {
+				printf("%s status is: %s.  Skipping label.\n",
+				    device_config.spares[i].devname,
+				    device_status(device_config.spares[i].status));
+			}
+		}
+	}
+
+	do_ioctl(fd, RAIDFRAME_CHECK_PARITY, &is_clean,
+		 "RAIDFRAME_CHECK_PARITY");
+	if (is_clean) {
+		printf("Parity status: clean\n");
+	} else {
+		printf("Parity status: DIRTY\n");
+	}
+	check_status(fd,0);
+}
+
/*
 * Translate a component device name into a flat component number and
 * report the column count of the array open on fd.  Spares are
 * numbered after the real components, and in that case *num_columns
 * is widened to cover the spare slots as well.  Exits if the name is
 * not part of this array.
 */
static void
get_component_number(fd, component_name, component_number, num_columns)
	int fd;
	char *component_name;
	int *component_number;
	int *num_columns;
{
	RF_DeviceConfig_t device_config;
	void *cfg_ptr;
	int i;
	int found;

	*component_number = -1;

	/* Assuming a full path spec... */
	cfg_ptr = &device_config;
	do_ioctl(fd, RAIDFRAME_GET_INFO, &cfg_ptr, "RAIDFRAME_GET_INFO");

	*num_columns = device_config.cols;

	/* First look among the real components... */
	found = 0;
	for(i=0; i < device_config.ndevs; i++) {
		if (strncmp(component_name, device_config.devs[i].devname,
			    PATH_MAX)==0) {
			found = 1;
			*component_number = i;
		}
	}
	if (!found) { /* maybe it's a spare? */
		for(i=0; i < device_config.nspares; i++) {
			if (strncmp(component_name,
				    device_config.spares[i].devname,
				    PATH_MAX)==0) {
				found = 1;
				/* spares are numbered after the real
				   components */
				*component_number = i + device_config.ndevs;
				/* the way spares are done should
				   really change... */
				*num_columns = device_config.cols +
				    device_config.nspares;
			}
		}
	}

	if (!found) {
		fprintf(stderr,"%s: %s is not a component %s", PROGNAME,
			component_name, "of this device\n");
		exit(1);
	}
}
+
/*
 * Mark the named component as failed via RAIDFRAME_FAIL_DISK.  If
 * do_recon is set, reconstruction onto a spare is requested as well
 * and (in verbose mode) a progress meter is displayed.
 */
static void
rf_fail_disk(fd, component_to_fail, do_recon)
	int fd;
	char *component_to_fail;
	int do_recon;
{
	struct rf_recon_req recon_request;
	int component_num;
	int num_cols;

	get_component_number(fd, component_to_fail, &component_num, &num_cols);

	/* Convert the flat component number into row/column form. */
	recon_request.row = component_num / num_cols;
	recon_request.col = component_num % num_cols;
	if (do_recon) {
		recon_request.flags = RF_FDFLAGS_RECON;
	} else {
		recon_request.flags = RF_FDFLAGS_NONE;
	}
	do_ioctl(fd, RAIDFRAME_FAIL_DISK, &recon_request,
		 "RAIDFRAME_FAIL_DISK");
	if (do_recon && verbose) {
		printf("Reconstruction status:\n");
		sleep(3); /* XXX give reconstruction a chance to start */
		do_meter(fd,RAIDFRAME_CHECK_RECON_STATUS_EXT);
	}
}
+
/*
 * Fetch the component label for the named component via
 * RAIDFRAME_GET_COMPONENT_LABEL and pretty-print all of its fields.
 */
static void
get_component_label(fd, component)
	int fd;
	char *component;
{
	RF_ComponentLabel_t component_label;
	int component_num;
	int num_cols;

	get_component_number(fd, component, &component_num, &num_cols);

	/* Zero the label; row/column tell the kernel which label to read. */
	memset( &component_label, 0, sizeof(RF_ComponentLabel_t));
	component_label.row = component_num / num_cols;
	component_label.column = component_num % num_cols;

	do_ioctl( fd, RAIDFRAME_GET_COMPONENT_LABEL, &component_label,
		  "RAIDFRAME_GET_COMPONENT_LABEL");

	printf("Component label for %s:\n",component);

	printf("   Row: %d, Column: %d, Num Rows: %d, Num Columns: %d\n",
	       component_label.row, component_label.column,
	       component_label.num_rows, component_label.num_columns);
	printf("   Version: %d, Serial Number: %d, Mod Counter: %d\n",
	       component_label.version, component_label.serial_number,
	       component_label.mod_counter);
	printf("   Clean: %s, Status: %d\n",
	       component_label.clean ? "Yes" : "No",
	       component_label.status );
	printf("   sectPerSU: %d, SUsPerPU: %d, SUsPerRU: %d\n",
	       component_label.sectPerSU, component_label.SUsPerPU,
	       component_label.SUsPerRU);
	printf("   Queue size: %d, blocksize: %d, numBlocks: %d\n",
	       component_label.maxOutstanding, component_label.blockSize,
	       component_label.numBlocks);
	printf("   RAID Level: %c\n", (char) component_label.parityConfig);
	printf("   Autoconfig: %s\n",
	       component_label.autoconfigure ? "Yes" : "No" );
	printf("   Root partition: %s\n",
	       component_label.root_partition ? "Yes" : "No" );
	printf("   Last configured as: raid%d\n", component_label.last_unit );
}
+
+static void
+set_component_label(fd, component)
+ int fd;
+ char *component;
+{
+ RF_ComponentLabel_t component_label;
+ int component_num;
+ int num_cols;
+
+ get_component_number(fd, component, &component_num, &num_cols);
+
+ /* XXX This is currently here for testing, and future expandability */
+
+ component_label.version = 1;
+ component_label.serial_number = 123456;
+ component_label.mod_counter = 0;
+ component_label.row = component_num / num_cols;
+ component_label.column = component_num % num_cols;
+ component_label.num_rows = 0;
+ component_label.num_columns = 5;
+ component_label.clean = 0;
+ component_label.status = 1;
+
+ do_ioctl( fd, RAIDFRAME_SET_COMPONENT_LABEL, &component_label,
+ "RAIDFRAME_SET_COMPONENT_LABEL");
+}
+
+
+static void
+init_component_labels(fd, serial_number)
+ int fd;
+ int serial_number;
+{
+ RF_ComponentLabel_t component_label;
+
+ component_label.version = 0;
+ component_label.serial_number = serial_number;
+ component_label.mod_counter = 0;
+ component_label.row = 0;
+ component_label.column = 0;
+ component_label.num_rows = 0;
+ component_label.num_columns = 0;
+ component_label.clean = 0;
+ component_label.status = 0;
+
+ do_ioctl( fd, RAIDFRAME_INIT_LABELS, &component_label,
+ "RAIDFRAME_SET_COMPONENT_LABEL");
+}
+
+static void
+set_autoconfig(fd, name, autoconf)
+ int fd;
+ char *name;
+ char *autoconf;
+{
+ int auto_config;
+ int root_config;
+
+ auto_config = 0;
+ root_config = 0;
+
+ if (strncasecmp(autoconf,"root", 4) == 0) {
+ root_config = 1;
+ }
+
+ if ((strncasecmp(autoconf,"yes", 3) == 0) ||
+ root_config == 1) {
+ auto_config = 1;
+ }
+
+ do_ioctl(fd, RAIDFRAME_SET_AUTOCONFIG, &auto_config,
+ "RAIDFRAME_SET_AUTOCONFIG");
+
+ do_ioctl(fd, RAIDFRAME_SET_ROOT, &root_config,
+ "RAIDFRAME_SET_ROOT");
+
+ printf("%s: Autoconfigure: %s\n", name,
+ auto_config ? "Yes" : "No");
+
+ if (root_config == 1) {
+ printf("%s: Root: %s\n", name,
+ auto_config ? "Yes" : "No");
+ }
+}
+
+static void
+add_hot_spare(fd, component)
+ int fd;
+ char *component;
+{
+ RF_SingleComponent_t hot_spare;
+
+ hot_spare.row = 0;
+ hot_spare.column = 0;
+ strncpy(hot_spare.component_name, component,
+ sizeof(hot_spare.component_name));
+
+ do_ioctl( fd, RAIDFRAME_ADD_HOT_SPARE, &hot_spare,
+ "RAIDFRAME_ADD_HOT_SPARE");
+}
+
+static void
+remove_hot_spare(fd, component)
+ int fd;
+ char *component;
+{
+ RF_SingleComponent_t hot_spare;
+ int component_num;
+ int num_cols;
+
+ get_component_number(fd, component, &component_num, &num_cols);
+
+ hot_spare.row = component_num / num_cols;
+ hot_spare.column = component_num % num_cols;
+
+ strncpy(hot_spare.component_name, component,
+ sizeof(hot_spare.component_name));
+
+ do_ioctl( fd, RAIDFRAME_REMOVE_HOT_SPARE, &hot_spare,
+ "RAIDFRAME_REMOVE_HOT_SPARE");
+}
+
/*
 * Rebuild the named (failed) component in place, i.e. reconstruct the
 * data back onto the same disk rather than onto a spare.  In verbose
 * mode a progress meter is shown.
 *
 * NOTE(review): comp.row is hard-coded to 0 here while rf_fail_disk
 * computes component_num / num_cols -- presumably only single-row
 * arrays are supported by this path; confirm against the kernel side.
 */
static void
rebuild_in_place( fd, component )
	int fd;
	char *component;
{
	RF_SingleComponent_t comp;
	int component_num;
	int num_cols;

	get_component_number(fd, component, &component_num, &num_cols);

	comp.row = 0;
	comp.column = component_num;
	strncpy(comp.component_name, component, sizeof(comp.component_name));

	do_ioctl( fd, RAIDFRAME_REBUILD_IN_PLACE, &comp,
		  "RAIDFRAME_REBUILD_IN_PLACE");

	if (verbose) {
		printf("Reconstruction status:\n");
		sleep(3); /* XXX give reconstruction a chance to start */
		do_meter(fd,RAIDFRAME_CHECK_RECON_STATUS_EXT);
	}

}
+
/*
 * Check whether the array's parity is clean.  If it is dirty and
 * do_rewrite is set, kick off a parity re-write and wait for it to
 * finish (with a progress meter in verbose mode, otherwise by polling
 * RAIDFRAME_CHECK_PARITYREWRITE_STATUS).  If parity is dirty and no
 * rewrite was requested, exit with an error.
 */
static void
check_parity( fd, do_rewrite, dev_name )
	int fd;
	int do_rewrite;
	char *dev_name;
{
	int is_clean;
	int percent_done;

	is_clean = 0;
	percent_done = 0;
	do_ioctl(fd, RAIDFRAME_CHECK_PARITY, &is_clean,
		 "RAIDFRAME_CHECK_PARITY");
	if (is_clean) {
		printf("%s: Parity status: clean\n",dev_name);
	} else {
		printf("%s: Parity status: DIRTY\n",dev_name);
		if (do_rewrite) {
			printf("%s: Initiating re-write of parity\n",
			       dev_name);
			do_ioctl(fd, RAIDFRAME_REWRITEPARITY, NULL,
				 "RAIDFRAME_REWRITEPARITY");
			sleep(3); /* XXX give it time to
				     get started. */
			if (verbose) {
				printf("Parity Re-write status:\n");
				do_meter(fd, RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT);
			} else {
				/* Non-verbose: poll quietly until the
				   rewrite reports 100%. */
				do_ioctl(fd,
					 RAIDFRAME_CHECK_PARITYREWRITE_STATUS,
					 &percent_done,
					 "RAIDFRAME_CHECK_PARITYREWRITE_STATUS"
					 );
				while( percent_done < 100 ) {
					sleep(3); /* wait a bit... */
					do_ioctl(fd, RAIDFRAME_CHECK_PARITYREWRITE_STATUS,
						 &percent_done, "RAIDFRAME_CHECK_PARITYREWRITE_STATUS");
				}

			}
			printf("%s: Parity Re-write complete\n",
			       dev_name);
		} else {
			/* parity is wrong, and is not being fixed.
			   Exit w/ an error. */
			exit(1);
		}
	}
}
+
+
/*
 * Report the completion percentage of any reconstruction, parity
 * re-write, or copyback in progress.  If meter is non-zero, attach an
 * interactive progress meter to whichever of the three (at most one
 * at a time) is still running.
 */
static void
check_status( fd, meter )
	int fd;
	int meter;
{
	int recon_percent_done = 0;
	int parity_percent_done = 0;
	int copyback_percent_done = 0;

	do_ioctl(fd, RAIDFRAME_CHECK_RECON_STATUS, &recon_percent_done,
		 "RAIDFRAME_CHECK_RECON_STATUS");
	printf("Reconstruction is %d%% complete.\n", recon_percent_done);
	do_ioctl(fd, RAIDFRAME_CHECK_PARITYREWRITE_STATUS,
		 &parity_percent_done,
		 "RAIDFRAME_CHECK_PARITYREWRITE_STATUS");
	printf("Parity Re-write is %d%% complete.\n", parity_percent_done);
	do_ioctl(fd, RAIDFRAME_CHECK_COPYBACK_STATUS, &copyback_percent_done,
		 "RAIDFRAME_CHECK_COPYBACK_STATUS");
	printf("Copyback is %d%% complete.\n", copyback_percent_done);

	if (meter) {
		/* These 3 should be mutually exclusive at this point */
		if (recon_percent_done < 100) {
			printf("Reconstruction status:\n");
			do_meter(fd,RAIDFRAME_CHECK_RECON_STATUS_EXT);
		} else if (parity_percent_done < 100) {
			printf("Parity Re-write status:\n");
			do_meter(fd,RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT);
		} else if (copyback_percent_done < 100) {
			printf("Copyback status:\n");
			do_meter(fd,RAIDFRAME_CHECK_COPYBACK_STATUS_EXT);
		}
	}
}

/* Spinner characters cycled through by do_meter(). */
const char *tbits = "|/-\\";
+
+static void
+do_meter(fd, option)
+ int fd;
+ u_long option;
+{
+ int percent_done;
+ int last_value;
+ int start_value;
+ RF_ProgressInfo_t progressInfo;
+ struct timeval start_time;
+ struct timeval last_time;
+ struct timeval current_time;
+ double elapsed;
+ int elapsed_sec;
+ int elapsed_usec;
+ int simple_eta,last_eta;
+ double rate;
+ int amount;
+ int tbit_value;
+ int wait_for_more_data;
+ char buffer[1024];
+ char bar_buffer[1024];
+ char eta_buffer[1024];
+
+ if (gettimeofday(&start_time,NULL)) {
+ fprintf(stderr,"%s: gettimeofday failed!?!?\n", PROGNAME);
+ exit(errno);
+ }
+ memset(&progressInfo, 0, sizeof(RF_ProgressInfo_t));
+
+ percent_done = 0;
+ do_ioctl(fd, option, &progressInfo, "");
+ last_value = progressInfo.completed;
+ start_value = last_value;
+ last_time = start_time;
+ current_time = start_time;
+
+ wait_for_more_data = 0;
+ tbit_value = 0;
+ while(progressInfo.completed < progressInfo.total) {
+
+ percent_done = (progressInfo.completed * 100) /
+ progressInfo.total;
+
+ get_bar(bar_buffer, percent_done, 40);
+
+ elapsed_sec = current_time.tv_sec - start_time.tv_sec;
+ elapsed_usec = current_time.tv_usec - start_time.tv_usec;
+ if (elapsed_usec < 0) {
+ elapsed_usec-=1000000;
+ elapsed_sec++;
+ }
+
+ elapsed = (double) elapsed_sec +
+ (double) elapsed_usec / 1000000.0;
+
+ amount = progressInfo.completed - start_value;
+
+ if (amount <= 0) { /* we don't do negatives (yet?) */
+ amount = 0;
+ wait_for_more_data = 1;
+ } else {
+ wait_for_more_data = 0;
+ }
+
+ if (elapsed == 0)
+ rate = 0.0;
+ else
+ rate = amount / elapsed;
+
+ if (rate > 0.0) {
+ simple_eta = (int) (((double)progressInfo.total -
+ (double) progressInfo.completed)
+ / rate);
+ } else {
+ simple_eta = -1;
+ }
+
+ if (simple_eta <=0) {
+ simple_eta = last_eta;
+ } else {
+ last_eta = simple_eta;
+ }
+
+ get_time_string(eta_buffer, simple_eta);
+
+ snprintf(buffer,1024,"\r%3d%% |%s| ETA: %s %c",
+ percent_done,bar_buffer,eta_buffer,tbits[tbit_value]);
+
+ write(fileno(stdout),buffer,strlen(buffer));
+ fflush(stdout);
+
+ /* resolution wasn't high enough... wait until we get another
+ timestamp and perhaps more "work" done. */
+
+ if (!wait_for_more_data) {
+ last_time = current_time;
+ last_value = progressInfo.completed;
+ }
+
+ if (++tbit_value>3)
+ tbit_value = 0;
+
+ sleep(2);
+
+ if (gettimeofday(&current_time,NULL)) {
+ fprintf(stderr,"%s: gettimeofday failed!?!?\n",
+ PROGNAME);
+ exit(errno);
+ }
+
+ do_ioctl( fd, option, &progressInfo, "");
+
+
+ }
+ printf("\n");
+}
/* 40 '*''s per line, then 40 ' ''s line. */
/* If you've got a screen wider than 160 characters, "tough" */

#define STAR_MIDPOINT 4*40
const char stars[] = "****************************************"
		     "****************************************"
		     "****************************************"
		     "****************************************"
		     "                                        "
		     "                                        "
		     "                                        "
		     "                                        "
		     "                                        ";

/*
 * Render a progress bar into `string': the leading percent of the bar
 * is drawn with '*', the remainder with spaces, by picking a window
 * into the stars[] template above.  At most max_strlen-1 characters
 * (capped at STAR_MIDPOINT) are produced, plus the terminating NUL.
 */
static void
get_bar(char *string, double percent, int max_strlen)
{
	int skip;

	if (max_strlen > STAR_MIDPOINT)
		max_strlen = STAR_MIDPOINT;

	/* How far into the template the window starts: 0 == all stars. */
	skip = STAR_MIDPOINT - (int)((percent * max_strlen) / 100);
	if (skip < 0)
		skip = 0;

	snprintf(string, max_strlen, "%s", stars + skip);
}
+
/*
 * Format an ETA given in whole seconds as [HH:]MM:SS into `string'
 * (the caller's buffer must hold at least 1024 bytes -- the size this
 * routine passes to snprintf).  A negative time means "unknown" and
 * renders as a dashed placeholder.
 */
static void
get_time_string(char *string, int simple_time)
{
	int hours, minutes, seconds;
	char hours_buffer[5];
	char minutes_buffer[5];
	char seconds_buffer[5];

	if (simple_time < 0) {
		snprintf(string,1024," --:--");
		return;
	}

	seconds = simple_time % 60;
	minutes = (simple_time / 60) % 60;
	hours = simple_time / 3600;

	if (hours > 0)
		snprintf(hours_buffer,5,"%02d:",hours);
	else
		snprintf(hours_buffer,5," ");

	snprintf(minutes_buffer,5,"%02d:",minutes);
	snprintf(seconds_buffer,5,"%02d",seconds);
	snprintf(string,1024,"%s%s%s",
		 hours_buffer, minutes_buffer, seconds_buffer);
}
+
+static void
+usage()
+{
+ const char *progname = PROGNAME;
+
+ fprintf(stderr, "usage: %s [-v] -a component dev\n", progname);
+ fprintf(stderr, " %s [-v] -A yes | no | root dev\n", progname);
+ fprintf(stderr, " %s [-v] -B dev\n", progname);
+ fprintf(stderr, " %s [-v] -c config_file dev\n", progname);
+ fprintf(stderr, " %s [-v] -C config_file dev\n", progname);
+ fprintf(stderr, " %s [-v] -f component dev\n", progname);
+ fprintf(stderr, " %s [-v] -F component dev\n", progname);
+ fprintf(stderr, " %s [-v] -g component dev\n", progname);
+ fprintf(stderr, " %s [-v] -i dev\n", progname);
+ fprintf(stderr, " %s [-v] -I serial_number dev\n", progname);
+ fprintf(stderr, " %s [-v] -r component dev\n", progname);
+ fprintf(stderr, " %s [-v] -R component dev\n", progname);
+ fprintf(stderr, " %s [-v] -s dev\n", progname);
+ fprintf(stderr, " %s [-v] -S dev\n", progname);
+ fprintf(stderr, " %s [-v] -u dev\n", progname);
+#if 0
+ fprintf(stderr, "usage: %s %s\n", progname,
+ "-a | -f | -F | -g | -r | -R component dev");
+ fprintf(stderr, " %s -B | -i | -s | -S -u dev\n", progname);
+ fprintf(stderr, " %s -c | -C config_file dev\n", progname);
+ fprintf(stderr, " %s -I serial_number dev\n", progname);
+#endif
+ exit(1);
+ /* NOTREACHED */
+}
+
#if defined(__FreeBSD__)
/*
 * Make sure the RAIDframe kernel module is available: if it is not
 * already present (modfind) try to load it (kldload); give up and exit
 * if neither works.  FreeBSD-only.
 */
static void
check_driver(void)
{
	if (modfind("raidframe") == -1 && kldload("raidframe") == -1) {
		printf("Error: Cannot load RAIDframe driver.\n");
		exit(1);
	}
}
#endif
+
diff --git a/sbin/raidctl/rf_configure.c b/sbin/raidctl/rf_configure.c
new file mode 100644
index 0000000..8df7889
--- /dev/null
+++ b/sbin/raidctl/rf_configure.c
@@ -0,0 +1,583 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_configure.c,v 1.13 2001/01/27 19:32:47 oster Exp $ */
+
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/***************************************************************
+ *
+ * rf_configure.c -- code related to configuring the raidframe system
+ *
+ * configuration is complicated by the fact that we want the same
+ * driver to work both in the kernel and at user level. In the
+ * kernel, we can't read the configuration file, so we configure
+ * by running a user-level program that reads the config file,
+ * creates a data structure describing the configuration and
+ * passes it into the kernel via an ioctl. Since we want the config
+ * code to be common between the two versions of the driver, we
+ * configure using the same two-step process when running at
+ * user level. Of course, at user level, the config structure is
+ * passed directly to the config routine, rather than via ioctl.
+ *
+ * This file is not compiled into the kernel, so we have no
+ * need for KERNEL ifdefs.
+ *
+ **************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_raidframe.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_decluster.h>
+#include <dev/raidframe/rf_configure.h>
+
/*
 * XXX we include this here so we don't need to drag rf_debugMem.c into
 * the picture... This is userland, after all...
 */
+
+/*
+ * XXX sucky hack to override the defn. of RF_Malloc as given in
+ * rf_debugMem.c... but I *really* don't want (nor need) to link with
+ * that file here in userland.. GO
+ */
+
+#undef RF_Malloc
+#define RF_Malloc(_p_, _size_, _cast_) \
+ { \
+ _p_ = _cast_ malloc((u_long)_size_); \
+ bzero((char *)_p_, _size_); \
+ }
+
/* Flags handed to the declustered-layout parser as its argument:
 * whether the layout uses distributed sparing. */
int distSpareYes = 1;
int distSpareNo = 0;

/* The mapsw[] table below contains all the various RAID types that
   might be supported by the kernel.  The actual supported types are
   found in sys/dev/raidframe/rf_layout.c.  Each entry maps the
   single-character parity configuration code from the config file to
   a human-readable name, the layout-specific parser to run, and the
   argument to pass it.  The list is terminated by a '\0' entry. */

static RF_LayoutSW_t mapsw[] = {
	/* parity declustering */
	{'T', "Parity declustering",
	 rf_MakeLayoutSpecificDeclustered, &distSpareNo},
	/* parity declustering with distributed sparing */
	{'D', "Distributed sparing parity declustering",
	 rf_MakeLayoutSpecificDeclustered, &distSpareYes},
	/* declustered P+Q */
	{'Q', "Declustered P+Q",
	 rf_MakeLayoutSpecificDeclustered, &distSpareNo},
	/* RAID 5 with rotated sparing */
	{'R', "RAID Level 5 rotated sparing", rf_MakeLayoutSpecificNULL, NULL},
	/* Chained Declustering */
	{'C', "Chained Declustering", rf_MakeLayoutSpecificNULL, NULL},
	/* Interleaved Declustering */
	{'I', "Interleaved Declustering", rf_MakeLayoutSpecificNULL, NULL},
	/* RAID level 0 */
	{'0', "RAID Level 0", rf_MakeLayoutSpecificNULL, NULL},
	/* RAID level 1 */
	{'1', "RAID Level 1", rf_MakeLayoutSpecificNULL, NULL},
	/* RAID level 4 */
	{'4', "RAID Level 4", rf_MakeLayoutSpecificNULL, NULL},
	/* RAID level 5 */
	{'5', "RAID Level 5", rf_MakeLayoutSpecificNULL, NULL},
	/* Evenodd */
	{'E', "EvenOdd", rf_MakeLayoutSpecificNULL, NULL},
	/* Declustered Evenodd */
	{'e', "Declustered EvenOdd",
	 rf_MakeLayoutSpecificDeclustered, &distSpareNo},
	/* parity logging */
	{'L', "Parity logging", rf_MakeLayoutSpecificNULL, NULL},
	/* end-of-list marker */
	{'\0', NULL, NULL, NULL}
};
+RF_LayoutSW_t *
+rf_GetLayout(RF_ParityConfig_t parityConfig)
+{
+ RF_LayoutSW_t *p;
+
+ /* look up the specific layout */
+ for (p = &mapsw[0]; p->parityConfig; p++)
+ if (p->parityConfig == parityConfig)
+ break;
+ if (!p->parityConfig)
+ return (NULL);
+ RF_ASSERT(p->parityConfig == parityConfig);
+ return (p);
+}
+
+static int rf_search_file_for_start_of(const char *string, char *buf,
+ int len, FILE * fp);
+static int rf_get_next_nonblank_line(char *buf, int len, FILE * fp,
+ const char *errmsg);
+
+/*
+ * called from user level to read the configuration file and create
+ * a configuration control structure. This is used in the user-level
+ * version of the driver, and in the user-level program that configures
+ * the system via ioctl.
+ */
+int
+rf_MakeConfig(configname, cfgPtr)
+ char *configname;
+ RF_Config_t *cfgPtr;
+{
+ int numscanned, val, r, c, retcode, aa, bb, cc;
+ char buf[256], buf1[256], *cp;
+ RF_LayoutSW_t *lp;
+ FILE *fp;
+
+ bzero((char *) cfgPtr, sizeof(RF_Config_t));
+
+ fp = fopen(configname, "r");
+ if (!fp) {
+ RF_ERRORMSG1("Can't open config file %s\n", configname);
+ return (-1);
+ }
+ rewind(fp);
+ if (rf_search_file_for_start_of("array", buf, 256, fp)) {
+ RF_ERRORMSG1("Unable to find start of \"array\" params in config file %s\n", configname);
+ retcode = -1;
+ goto out;
+ }
+ rf_get_next_nonblank_line(buf, 256, fp, "Config file error (\"array\" section): unable to get numRow and numCol\n");
+
+ /*
+ * wackiness with aa, bb, cc to get around size problems on
+ * different platforms
+ */
+ numscanned = sscanf(buf, "%d %d %d", &aa, &bb, &cc);
+ if (numscanned != 3) {
+ RF_ERRORMSG("Config file error (\"array\" section): unable to get numRow, numCol, numSpare\n");
+ retcode = -1;
+ goto out;
+ }
+ cfgPtr->numRow = (RF_RowCol_t) aa;
+ cfgPtr->numCol = (RF_RowCol_t) bb;
+ cfgPtr->numSpare = (RF_RowCol_t) cc;
+
+ /* debug section is optional */
+ for (c = 0; c < RF_MAXDBGV; c++)
+ cfgPtr->debugVars[c][0] = '\0';
+ rewind(fp);
+ if (!rf_search_file_for_start_of("debug", buf, 256, fp)) {
+ for (c = 0; c < RF_MAXDBGV; c++) {
+ if (rf_get_next_nonblank_line(buf, 256, fp, NULL))
+ break;
+ cp = rf_find_non_white(buf);
+ if (!strncmp(cp, "START", strlen("START")))
+ break;
+ (void) strcpy(&cfgPtr->debugVars[c][0], cp);
+ }
+ }
+ rewind(fp);
+ strcpy(cfgPtr->diskQueueType, "fifo");
+ cfgPtr->maxOutstandingDiskReqs = 1;
+ /* scan the file for the block related to disk queues */
+ if (rf_search_file_for_start_of("queue", buf, 256, fp)) {
+ RF_ERRORMSG2("[No disk queue discipline specified in config file %s. Using %s.]\n", configname, cfgPtr->diskQueueType);
+ } else {
+ if (rf_get_next_nonblank_line(buf, 256, fp, NULL)) {
+ RF_ERRORMSG2("[No disk queue discipline specified in config file %s. Using %s.]\n", configname, cfgPtr->diskQueueType);
+ }
+ }
+
+ /* the queue specifier line contains two entries: 1st char of first
+ * word specifies queue to be used 2nd word specifies max num reqs
+ * that can be outstanding on the disk itself (typically 1) */
+ if (sscanf(buf, "%s %d", buf1, &val) != 2) {
+ RF_ERRORMSG1("Can't determine queue type and/or max outstanding reqs from line: %s", buf);
+ RF_ERRORMSG2("Using %s-%d\n", cfgPtr->diskQueueType, cfgPtr->maxOutstandingDiskReqs);
+ } else {
+ char *ch;
+ bcopy(buf1, cfgPtr->diskQueueType,
+ RF_MIN(sizeof(cfgPtr->diskQueueType), strlen(buf1) + 1));
+ for (ch = buf1; *ch; ch++) {
+ if (*ch == ' ') {
+ *ch = '\0';
+ break;
+ }
+ }
+ cfgPtr->maxOutstandingDiskReqs = val;
+ }
+
+ rewind(fp);
+
+ if (rf_search_file_for_start_of("disks", buf, 256, fp)) {
+ RF_ERRORMSG1("Can't find \"disks\" section in config file %s\n", configname);
+ retcode = -1;
+ goto out;
+ }
+ for (r = 0; r < cfgPtr->numRow; r++) {
+ for (c = 0; c < cfgPtr->numCol; c++) {
+ int devfd;
+ char bfr[256], *bfr1;
+ if (rf_get_next_nonblank_line(&bfr[0], 256, fp, NULL)) {
+ RF_ERRORMSG2("Config file error: unable to get device file for disk at row %d col %d\n", r, c);
+ retcode = -1;
+ goto out;
+ }
+ /* Get rid of the newline at the end of the string */
+ if ((bfr1 = strchr(&bfr[0], '\n')) != NULL)
+ *bfr1 = NULL;
+ /* Make sure the device exists */
+ if ((devfd = open(&bfr[0], O_RDWR)) < 0) {
+ RF_ERRORMSG2(
+ "Config file error: device %s, %s\n",
+ &bfr[0], strerror(errno));
+ retcode = -1;
+ goto out;
+ }
+ close(devfd);
+ strncpy(&cfgPtr->devnames[r][c][0], &bfr[0], 50);
+ }
+ }
+
+ /* "spare" section is optional */
+ rewind(fp);
+ if (rf_search_file_for_start_of("spare", buf, 256, fp))
+ cfgPtr->numSpare = 0;
+ for (c = 0; c < cfgPtr->numSpare; c++) {
+ if (rf_get_next_nonblank_line(&cfgPtr->spare_names[c][0],
+ 256, fp, NULL)) {
+ RF_ERRORMSG1("Config file error: unable to get device file for spare disk %d\n", c);
+ retcode = -1;
+ goto out;
+ }
+ }
+
+ /* scan the file for the block related to layout */
+ rewind(fp);
+ if (rf_search_file_for_start_of("layout", buf, 256, fp)) {
+ RF_ERRORMSG1("Can't find \"layout\" section in configuration file %s\n", configname);
+ retcode = -1;
+ goto out;
+ }
+ if (rf_get_next_nonblank_line(buf, 256, fp, NULL)) {
+ RF_ERRORMSG("Config file error (\"layout\" section): unable to find common layout param line\n");
+ retcode = -1;
+ goto out;
+ }
+ c = sscanf(buf, "%d %d %d %c", &aa, &bb, &cc, &cfgPtr->parityConfig);
+ cfgPtr->sectPerSU = (RF_SectorNum_t) aa;
+ cfgPtr->SUsPerPU = (RF_StripeNum_t) bb;
+ cfgPtr->SUsPerRU = (RF_StripeNum_t) cc;
+ if (c != 4) {
+ RF_ERRORMSG("Unable to scan common layout line\n");
+ retcode = -1;
+ goto out;
+ }
+ lp = rf_GetLayout(cfgPtr->parityConfig);
+ if (lp == NULL) {
+ RF_ERRORMSG1("Unknown parity config '%c'\n",
+ cfgPtr->parityConfig);
+ retcode = -1;
+ goto out;
+ }
+
+ retcode = lp->MakeLayoutSpecific(fp, cfgPtr, lp->makeLayoutSpecificArg);
+out:
+ fclose(fp);
+ if (retcode < 0)
+ retcode = errno = EINVAL;
+ else
+ errno = retcode;
+ return (retcode);
+}
+
+
+/* used in architectures such as RAID0 where there is no layout-specific
+ * information to be passed into the configuration code.
+ */
+int
+rf_MakeLayoutSpecificNULL(fp, cfgPtr, ignored)
+ FILE *fp;
+ RF_Config_t *cfgPtr;
+ void *ignored;
+{
+ cfgPtr->layoutSpecificSize = 0;
+ cfgPtr->layoutSpecific = NULL;
+ return (0);
+}
+
+int
+rf_MakeLayoutSpecificDeclustered(configfp, cfgPtr, arg)
+ FILE *configfp;
+ RF_Config_t *cfgPtr;
+ void *arg;
+{
+ int b, v, k, r, lambda, norotate, i, val, distSpare;
+ char *cfgBuf, *bdfile, *p, *smname;
+ char buf[256], smbuf[256];
+ FILE *fp;
+
+ distSpare = *((int *) arg);
+
+ /* get the block design file name */
+ if (rf_get_next_nonblank_line(buf, 256, configfp,
+ "Can't find block design file name in config file\n"))
+ return (EINVAL);
+ bdfile = rf_find_non_white(buf);
+ if (bdfile[strlen(bdfile) - 1] == '\n') {
+ /* strip newline char */
+ bdfile[strlen(bdfile) - 1] = '\0';
+ }
+ /* open bd file, check validity of configuration */
+ if ((fp = fopen(bdfile, "r")) == NULL) {
+ RF_ERRORMSG1("RAID: config error: Can't open layout table file %s\n", bdfile);
+ return (EINVAL);
+ }
+ if (fgets(buf, 256, fp) == NULL) {
+ RF_ERRORMSG1("RAID: config error: Can't read layout from layout table file %s\n", bdfile);
+ return (EINVAL);
+ }
+ i = sscanf(buf, "%u %u %u %u %u %u", &b, &v, &k, &r, &lambda, &norotate);
+ if (i == 5)
+ norotate = 0; /* no-rotate flag is optional */
+ else if (i != 6) {
+ RF_ERRORMSG("Unable to parse header line in block design file\n");
+ return (EINVAL);
+ }
+ /* set the sparemap directory. In the in-kernel version, there's a
+ * daemon that's responsible for finding the sparemaps */
+ if (distSpare) {
+ if (rf_get_next_nonblank_line(smbuf, 256, configfp,
+ "Can't find sparemap file name in config file\n"))
+ return (EINVAL);
+ smname = rf_find_non_white(smbuf);
+ if (smname[strlen(smname) - 1] == '\n') {
+ /* strip newline char */
+ smname[strlen(smname) - 1] = '\0';
+ }
+ } else {
+ smbuf[0] = '\0';
+ smname = smbuf;
+ }
+
+ /* allocate a buffer to hold the configuration info */
+ cfgPtr->layoutSpecificSize = RF_SPAREMAP_NAME_LEN +
+ 6 * sizeof(int) + b * k;
+ /* can't use RF_Malloc here b/c debugMem module not yet init'd */
+ cfgBuf = (char *) malloc(cfgPtr->layoutSpecificSize);
+ cfgPtr->layoutSpecific = (void *) cfgBuf;
+ p = cfgBuf;
+
+ /* install name of sparemap file */
+ for (i = 0; smname[i]; i++)
+ *p++ = smname[i];
+ /* pad with zeros */
+ while (i < RF_SPAREMAP_NAME_LEN) {
+ *p++ = '\0';
+ i++;
+ }
+
+ /*
+ * fill in the buffer with the block design parameters
+ * and then the block design itself
+ */
+ *((int *) p) = b;
+ p += sizeof(int);
+ *((int *) p) = v;
+ p += sizeof(int);
+ *((int *) p) = k;
+ p += sizeof(int);
+ *((int *) p) = r;
+ p += sizeof(int);
+ *((int *) p) = lambda;
+ p += sizeof(int);
+ *((int *) p) = norotate;
+ p += sizeof(int);
+
+ while (fscanf(fp, "%d", &val) == 1)
+ *p++ = (char) val;
+ fclose(fp);
+ if (p - cfgBuf != cfgPtr->layoutSpecificSize) {
+ RF_ERRORMSG2("Size mismatch creating layout specific data: is %d sb %d bytes\n", (int) (p - cfgBuf), (int) (6 * sizeof(int) + b * k));
+ return (EINVAL);
+ }
+ return (0);
+}
+
+/****************************************************************************
+ *
+ * utilities
+ *
+ ***************************************************************************/
+
+/* finds a non-white character in the line */
/*
 * Skip leading blanks/tabs: return a pointer to the first character of
 * p that is neither ' ' nor '\t' (possibly the terminating NUL).
 */
char *
rf_find_non_white(char *p)
{
	while (*p == ' ' || *p == '\t')
		p++;
	return (p);
}
+
+/* finds a white character in the line */
/*
 * Skip a word: return a pointer to the first blank or tab in p
 * (or to the terminating NUL if there is none).
 */
char *
rf_find_white(char *p)
{
	while (*p != '\0' && *p != ' ' && *p != '\t')
		p++;
	return (p);
}
+
+/*
+ * searches a file for a line that says "START string", where string is
+ * specified as a parameter
+ */
/*
 * Scan fp forward for a line of the form "START <string>" (leading
 * whitespace allowed).  Returns 0 with the matching line left in buf,
 * or -1 at end of file.
 */
static int
rf_search_file_for_start_of(const char *string, char *buf, int len, FILE *fp)
{
	char *p;

	for (;;) {
		if (fgets(buf, len, fp) == NULL)
			return (-1);
		p = rf_find_non_white(buf);
		if (strncmp(p, "START", strlen("START")) != 0)
			continue;
		/* skip the "START" token and the whitespace after it */
		p = rf_find_non_white(rf_find_white(p));
		if (strncmp(p, string, strlen(string)) == 0)
			return (0);
	}
}
+
+/* reads from file fp into buf until it finds an interesting line */
/*
 * Read lines from fp into buf (at most len bytes per fgets call) until
 * one is found that is not blank and not a '#' comment.  Returns 0
 * with that line in buf, or 1 at end of file, printing errmsg (when
 * non-NULL) in the latter case.
 */
int
rf_get_next_nonblank_line(buf, len, fp, errmsg)
	char *buf;
	int len;
	FILE *fp;
	const char *errmsg;
{
	char *p;

	/* Bug fix: the len parameter was previously ignored and 256 was
	 * hardcoded, silently truncating reads into larger buffers and
	 * overrunning smaller ones. */
	while (fgets(buf, len, fp) != NULL) {
		p = rf_find_non_white(buf);
		if (*p == '\n' || *p == '\0' || *p == '#')
			continue;
		return (0);
	}
	if (errmsg)
		RF_ERRORMSG1("%s", errmsg);
	return (1);
}
+
+/*
+ * Allocates an array for the spare table, and initializes it from a file.
+ * In the user-level version, this is called when recon is initiated.
+ * When/if I move recon into the kernel, there'll be a daemon that does
+ * an ioctl into raidframe which will block until a spare table is needed.
+ * When it returns, it will read a spare table from the file system,
+ * pass it into the kernel via a different ioctl, and then block again
+ * on the original ioctl.
+ *
+ * This is specific to the declustered layout, but doesn't belong in
+ * rf_decluster.c because it uses stuff that can't be compiled into
+ * the kernel, and it needs to be compiled into the user-level sparemap daemon.
+ *
+ */
+void *
+rf_ReadSpareTable(req, fname)
+ RF_SparetWait_t *req;
+ char *fname;
+{
+ int i, j, numFound, linecount, tableNum, tupleNum,
+ spareDisk, spareBlkOffset;
+ char buf[1024], targString[100], errString[100];
+ RF_SpareTableEntry_t **table;
+ FILE *fp;
+
+ /* allocate and initialize the table */
+ RF_Malloc(table,
+ req->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *),
+ (RF_SpareTableEntry_t **));
+ for (i = 0; i < req->TablesPerSpareRegion; i++) {
+ RF_Malloc(table[i],
+ req->BlocksPerTable * sizeof(RF_SpareTableEntry_t),
+ (RF_SpareTableEntry_t *));
+ for (j = 0; j < req->BlocksPerTable; j++)
+ table[i][j].spareDisk =
+ table[i][j].spareBlockOffsetInSUs = -1;
+ }
+
+ /* 2. open sparemap file, sanity check */
+ if ((fp = fopen(fname, "r")) == NULL) {
+ fprintf(stderr,
+ "rf_ReadSpareTable: Can't open sparemap file %s\n", fname);
+ return (NULL);
+ }
+ if (rf_get_next_nonblank_line(buf, 1024, fp,
+ "Invalid sparemap file: can't find header line\n"))
+ return (NULL);
+ if (buf[strlen(buf) - 1] == '\n')
+ buf[strlen(buf) - 1] = '\0';
+
+ sprintf(targString, "fdisk %d\n", req->fcol);
+ sprintf(errString,
+ "Invalid sparemap file: can't find \"fdisk %d\" line\n",
+ req->fcol);
+ while (1) {
+ rf_get_next_nonblank_line(buf, 1024, fp, errString);
+ if (!strncmp(buf, targString, strlen(targString)))
+ break;
+ }
+
+ /* no more blank lines or comments allowed now */
+ linecount = req->TablesPerSpareRegion * req->TableDepthInPUs;
+ for (i = 0; i < linecount; i++) {
+ numFound = fscanf(fp, " %d %d %d %d", &tableNum, &tupleNum,
+ &spareDisk, &spareBlkOffset);
+ if (numFound != 4) {
+ fprintf(stderr, "Sparemap file prematurely exhausted after %d of %d lines\n", i, linecount);
+ return (NULL);
+ }
+ RF_ASSERT(tableNum >= 0 &&
+ tableNum < req->TablesPerSpareRegion);
+ RF_ASSERT(tupleNum >= 0 && tupleNum < req->BlocksPerTable);
+ RF_ASSERT(spareDisk >= 0 && spareDisk < req->C);
+ RF_ASSERT(spareBlkOffset >= 0 && spareBlkOffset <
+ req->SpareSpaceDepthPerRegionInSUs / req->SUsPerPU);
+
+ table[tableNum][tupleNum].spareDisk = spareDisk;
+ table[tableNum][tupleNum].spareBlockOffsetInSUs =
+ spareBlkOffset * req->SUsPerPU;
+ }
+
+ fclose(fp);
+ return ((void *) table);
+}
diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile
index 94d54f4..cf80fb8 100644
--- a/share/man/man4/Makefile
+++ b/share/man/man4/Makefile
@@ -146,6 +146,7 @@ MAN= aac.4 \
pt.4 \
pty.4 \
puc.4 \
+ raid.4 \
random.4 \
rl.4 \
route.4 \
diff --git a/share/man/man4/raid.4 b/share/man/man4/raid.4
new file mode 100644
index 0000000..54c8b77
--- /dev/null
+++ b/share/man/man4/raid.4
@@ -0,0 +1,342 @@
+.\" $FreeBSD$
+.\" $NetBSD: raid.4,v 1.16 2000/11/02 03:34:08 oster Exp $
+.\"
+.\" Copyright (c) 1998 The NetBSD Foundation, Inc.
+.\" All rights reserved.
+.\"
+.\" This code is derived from software contributed to The NetBSD Foundation
+.\" by Greg Oster
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\" must display the following acknowledgement:
+.\" This product includes software developed by the NetBSD
+.\" Foundation, Inc. and its contributors.
+.\" 4. Neither the name of The NetBSD Foundation nor the names of its
+.\" contributors may be used to endorse or promote products derived
+.\" from this software without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+.\" PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+.\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+.\" POSSIBILITY OF SUCH DAMAGE.
+.\"
+.\"
+.\" Copyright (c) 1995 Carnegie-Mellon University.
+.\" All rights reserved.
+.\"
+.\" Author: Mark Holland
+.\"
+.\" Permission to use, copy, modify and distribute this software and
+.\" its documentation is hereby granted, provided that both the copyright
+.\" notice and this permission notice appear in all copies of the
+.\" software, derivative works or modified versions, and any portions
+.\" thereof, and that both notices appear in supporting documentation.
+.\"
+.\" CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+.\" CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+.\" FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+.\"
+.\" Carnegie Mellon requests users of this software to return to
+.\"
+.\" Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+.\" School of Computer Science
+.\" Carnegie Mellon University
+.\" Pittsburgh PA 15213-3890
+.\"
+.\" any improvements or extensions that they make and grant Carnegie the
+.\" rights to redistribute these changes.
+.\"
+.Dd October 20, 2002
+.Dt RAID 4
+.Os
+.Sh NAME
+.Nm raid
+.Nd RAIDframe disk driver
+.Sh SYNOPSIS
+.Cd device raidframe
+.Sh DESCRIPTION
+The
+.Nm
+driver provides RAID 0, 1, 4, and 5 (and more!) capabilities to
+.Fx .
+This
+document assumes that the reader has at least some familiarity with RAID
+and RAID concepts. The reader is also assumed to know how to configure
+disks and pseudo-devices into kernels, how to generate kernels, and how
+to partition disks.
+.Pp
+RAIDframe provides a number of different RAID levels including:
+.Bl -tag -width indent
+.It RAID 0
+provides simple data striping across the components.
+.It RAID 1
+provides mirroring.
+.It RAID 4
+provides data striping across the components, with parity
+stored on a dedicated drive (in this case, the last component).
+.It RAID 5
+provides data striping across the components, with parity
+distributed across all the components.
+.El
+.Pp
+There are a wide variety of other RAID levels supported by RAIDframe,
+including Even-Odd parity, RAID level 5 with rotated sparing, Chained
+declustering, and Interleaved declustering. The reader is referred
+to the RAIDframe documentation mentioned in the
+.Sx HISTORY
+section for more detail on these various RAID configurations.
+.Pp
+Depending on the parity level configured, the device driver can
+support the failure of component drives. The number of failures
+allowed depends on the parity level selected. If the driver is able
+to handle drive failures, and a drive does fail, then the system is
+operating in "degraded mode". In this mode, all missing data must be
+reconstructed from the data and parity present on the other
+components. This results in much slower data accesses, but
+does mean that a failure need not bring the system to a complete halt.
+.Pp
+The RAID driver supports and enforces the use of
+.Sq component labels .
+A
+.Sq component label
+contains important information about the component, including a
+user-specified serial number, the row and column of that component in
+the RAID set, and whether the data (and parity) on the component is
+.Sq clean .
+If the driver determines that the labels are very inconsistent with
+respect to each other (e.g. two or more serial numbers do not match)
+or that the component label is not consistent with its assigned place
+in the set (e.g. the component label claims the component should be
+the 3rd one in a 6-disk set, but the RAID set has it as the 3rd component
+in a 5-disk set) then the device will fail to configure. If the
+driver determines that exactly one component label seems to be
+incorrect, and the RAID set is being configured as a set that supports
+a single failure, then the RAID set will be allowed to configure, but
+the incorrectly labeled component will be marked as
+.Sq failed ,
+and the RAID set will begin operation in degraded mode.
+If all of the components are consistent among themselves, the RAID set
+will configure normally.
+.Pp
+Component labels are also used to support the auto-detection and
+auto-configuration of RAID sets. A RAID set can be flagged as
+auto-configurable, in which case it will be configured automatically
+during the kernel boot process. RAID filesystems which are
+automatically configured are also eligible to be the root filesystem.
+There is currently only limited support (alpha and pmax architectures)
+for booting a kernel directly from a RAID 1 set, and no support for
+booting from any other RAID sets. To use a RAID set as the root
+filesystem, a kernel is usually obtained from a small non-RAID
+partition, after which any auto-configuring RAID set can be used for the
+root filesystem. See
+.Xr raidctl 8
+for more information on auto-configuration of RAID sets.
+.Pp
+The driver supports
+.Sq hot spares ,
+disks which are on-line, but are not
+actively used in an existing filesystem. Should a disk fail, the
+driver is capable of reconstructing the failed disk onto a hot spare
+or back onto a replacement drive.
+If the components are hot swappable, the failed disk can then be
+removed, a new disk put in its place, and a copyback operation
+performed. The copyback operation, as its name indicates, will copy
+the reconstructed data from the hot spare to the previously failed
+(and now replaced) disk. Hot spares can also be hot-added using
+.Xr raidctl 8 .
+.Pp
+If a component cannot be detected when the RAID device is configured,
+that component will be simply marked as 'failed'.
+.Pp
+The user-land utility for doing all
+.Nm
+configuration and other operations
+is
+.Xr raidctl 8 .
+Most importantly,
+.Xr raidctl 8
+must be used with the
+.Fl i
+option to initialize all RAID sets. In particular, this
+initialization includes re-building the parity data. This rebuilding
+of parity data is also required when either a) a new RAID device is
+brought up for the first time or b) after an un-clean shutdown of a
+RAID device. By using the
+.Fl P
+option to
+.Xr raidctl 8 ,
+and performing this on-demand recomputation of all parity
+before doing a
+.Xr fsck 8
+or a
+.Xr newfs 8 ,
+filesystem integrity and parity integrity can be ensured. It bears
+repeating again that parity recomputation is
+.Ar required
+before any filesystems are created or used on the RAID device. If the
+parity is not correct, then missing data cannot be correctly recovered.
+.Pp
+RAID levels may be combined in a hierarchical fashion. For example, a RAID 0
+device can be constructed out of a number of RAID 5 devices (which, in turn,
+may be constructed out of the physical disks, or of other RAID devices).
+.Pp
+It is important that drives be hard-coded at their respective
+addresses (i.e. not left free-floating, where a drive with SCSI ID of
+4 can end up as /dev/da0c) for well-behaved functioning of the RAID
+device. This is true for all types of drives, including IDE, SCSI,
+etc. For IDE drivers, use the option ATAPI_STATIC_ID in your kernel
+config file. For SCSI, you should 'wire down' the devices according to
+their ID. See
+.Xr cam 4
+for examples of this.
+The rationale for fixing the device addresses
+is as follows: Consider a system with three SCSI drives at SCSI ID's
+4, 5, and 6, and which map to components /dev/da0e, /dev/da1e, and
+/dev/da2e of a RAID 5 set. If the drive with SCSI ID 5 fails, and the
+system reboots, the old /dev/da2e will show up as /dev/da1e. The RAID
+driver is able to detect that component positions have changed, and
+will not allow normal configuration. If the device addresses are hard
+coded, however, the RAID driver would detect that the middle component
+is unavailable, and bring the RAID 5 set up in degraded mode. Note
+that the auto-detection and auto-configuration code does not care
+about where the components live. The auto-configuration code will
+correctly configure a device even after any number of the components
+have been re-arranged.
+.Pp
+The first step to using the
+.Nm
+driver is to ensure that it is suitably configured in the kernel. This is
+done by adding a line similar to:
+.Bd -unfilled -offset indent
+pseudo-device raidframe # RAIDframe disk device
+.Ed
+.Pp
+to the kernel configuration file. No count argument is required as the
+driver will automatically create and configure new device units as needed.
+To turn on component auto-detection and auto-configuration of RAID
+sets, simply add:
+.Bd -unfilled -offset indent
+options RAID_AUTOCONFIG
+.Ed
+.Pp
+to the kernel configuration file.
+.Pp
+All component partitions must be of the type
+.Dv FS_BSDFFS
+(e.g. 4.2BSD) or
+.Dv FS_RAID .
+The use of the latter is strongly encouraged, and is required if
+auto-configuration of the RAID set is desired. Since RAIDframe leaves
+room for disklabels, RAID components can be simply raw disks, or
+partitions which use an entire disk.
+.Pp
+A more detailed treatment of actually using a
+.Nm
+device is found in
+.Xr raidctl 8 .
+It is highly recommended that the steps to reconstruct, copyback, and
+re-compute parity are well understood by the system administrator(s)
+.Ar before
+a component failure. Doing the wrong thing when a component fails may
+result in data loss.
+.Pp
+.Sh WARNINGS
+Certain RAID levels (1, 4, 5, 6, and others) can protect against some
+data loss due to component failure. However the loss of two
+components of a RAID 4 or 5 system, or the loss of a single component
+of a RAID 0 system, will result in the entire filesystems on that RAID
+device being lost.
+RAID is
+.Ar NOT
+a substitute for good backup practices.
+.Pp
+Recomputation of parity
+.Ar MUST
+be performed whenever there is a chance that it may have been
+compromised. This includes after system crashes, or before a RAID
+device has been used for the first time. Failure to keep parity
+correct will be catastrophic should a component ever fail -- it is
+better to use RAID 0 and get the additional space and speed, than it
+is to use parity, but not keep the parity correct. At least with RAID
+0 there is no perception of increased data security.
+.Pp
+.Sh FILES
+.Bl -tag -width /dev/XXrXraidX -compact
+.It Pa /dev/raid*
+.Nm
+device special files.
+.El
+.Pp
+.Sh SEE ALSO
+.Xr raidctl 8 ,
+.Xr config 8 ,
+.Xr fsck 8 ,
+.Xr mount 8 ,
+.Xr newfs 8
+.Sh HISTORY
+The
+.Nm
+driver in
+.Fx
+is a port of RAIDframe, a framework for rapid prototyping of RAID
+structures developed by the folks at the Parallel Data Laboratory at
+Carnegie Mellon University (CMU). RAIDframe, as originally distributed
+by CMU, provides a RAID simulator for a number of different
+architectures, and a user-level device driver and a kernel device
+driver for Digital Unix. The
+.Nm
+driver is a kernelized version of RAIDframe v1.1, based on the
+.Nx
+port of RAIDframe by Greg Oster.
+.Pp
+A more complete description of the internals and functionality of
+RAIDframe is found in the paper "RAIDframe: A Rapid Prototyping Tool
+for RAID Systems", by William V. Courtright II, Garth Gibson, Mark
+Holland, LeAnn Neal Reilly, and Jim Zelenka, and published by the
+Parallel Data Laboratory of Carnegie Mellon University.
+The
+.Nm
+driver first appeared in
+.Fx 4.4 .
+.Sh COPYRIGHT
+.Bd -unfilled
+The RAIDframe Copyright is as follows:
+
+Copyright (c) 1994-1996 Carnegie-Mellon University.
+All rights reserved.
+
+Permission to use, copy, modify and distribute this software and
+its documentation is hereby granted, provided that both the copyright
+notice and this permission notice appear in all copies of the
+software, derivative works or modified versions, and any portions
+thereof, and that both notices appear in supporting documentation.
+
+CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+
+Carnegie Mellon requests users of this software to return to
+
+ Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ School of Computer Science
+ Carnegie Mellon University
+ Pittsburgh PA 15213-3890
+
+any improvements or extensions that they make and grant Carnegie the
+rights to redistribute these changes.
+.Ed
diff --git a/sys/conf/NOTES b/sys/conf/NOTES
index 02e1d9f..ea14607 100644
--- a/sys/conf/NOTES
+++ b/sys/conf/NOTES
@@ -979,6 +979,12 @@ device ccd #Concatenated disk driver
device vinum #Vinum concat/mirror/raid driver
options VINUMDEBUG #enable Vinum debugging hooks
+# RAIDframe device. RAID_AUTOCONFIG allows RAIDframe to search all of the
+# disk devices in the system looking for components that it recognizes (already
+# configured once before) and auto-configured them into arrays.
+device raidframe
+options RAID_AUTOCONFIG
+
# Kernel side iconv library
options LIBICONV
diff --git a/sys/conf/files b/sys/conf/files
index c003bc3..d2a8210 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -547,6 +547,66 @@ dev/puc/puc.c optional puc
dev/puc/puc_pci.c optional puc pci
dev/puc/puc_pccard.c optional puc pccard
dev/puc/pucdata.c optional puc pci
+dev/raidframe/rf_acctrace.c optional raidframe
+dev/raidframe/rf_alloclist.c optional raidframe
+dev/raidframe/rf_aselect.c optional raidframe
+dev/raidframe/rf_callback.c optional raidframe
+dev/raidframe/rf_chaindecluster.c optional raidframe
+dev/raidframe/rf_copyback.c optional raidframe
+dev/raidframe/rf_cvscan.c optional raidframe
+dev/raidframe/rf_dagdegrd.c optional raidframe
+dev/raidframe/rf_dagdegwr.c optional raidframe
+dev/raidframe/rf_dagffrd.c optional raidframe
+dev/raidframe/rf_dagffwr.c optional raidframe
+dev/raidframe/rf_dagfuncs.c optional raidframe
+dev/raidframe/rf_dagutils.c optional raidframe
+dev/raidframe/rf_debugMem.c optional raidframe
+dev/raidframe/rf_debugprint.c optional raidframe
+dev/raidframe/rf_decluster.c optional raidframe
+dev/raidframe/rf_declusterPQ.c optional raidframe
+dev/raidframe/rf_diskqueue.c optional raidframe
+dev/raidframe/rf_disks.c optional raidframe
+dev/raidframe/rf_driver.c optional raidframe
+dev/raidframe/rf_engine.c optional raidframe
+dev/raidframe/rf_evenodd.c optional raidframe
+dev/raidframe/rf_evenodd_dagfuncs.c optional raidframe
+dev/raidframe/rf_evenodd_dags.c optional raidframe
+dev/raidframe/rf_fifo.c optional raidframe
+dev/raidframe/rf_freebsdkintf.c optional raidframe
+dev/raidframe/rf_interdecluster.c optional raidframe
+dev/raidframe/rf_invertq.c optional raidframe
+dev/raidframe/rf_layout.c optional raidframe
+dev/raidframe/rf_map.c optional raidframe
+dev/raidframe/rf_mcpair.c optional raidframe
+dev/raidframe/rf_memchunk.c optional raidframe
+dev/raidframe/rf_nwayxor.c optional raidframe
+dev/raidframe/rf_options.c optional raidframe
+dev/raidframe/rf_paritylog.c optional raidframe
+dev/raidframe/rf_paritylogDiskMgr.c optional raidframe
+dev/raidframe/rf_paritylogging.c optional raidframe
+dev/raidframe/rf_parityloggingdags.c optional raidframe
+dev/raidframe/rf_parityscan.c optional raidframe
+dev/raidframe/rf_pq.c optional raidframe
+dev/raidframe/rf_pqdeg.c optional raidframe
+dev/raidframe/rf_pqdegdags.c optional raidframe
+dev/raidframe/rf_psstatus.c optional raidframe
+dev/raidframe/rf_raid0.c optional raidframe
+dev/raidframe/rf_raid1.c optional raidframe
+dev/raidframe/rf_raid4.c optional raidframe
+dev/raidframe/rf_raid5.c optional raidframe
+dev/raidframe/rf_raid5_rotatedspare.c optional raidframe
+dev/raidframe/rf_reconbuffer.c optional raidframe
+dev/raidframe/rf_reconmap.c optional raidframe
+dev/raidframe/rf_reconstruct.c optional raidframe
+dev/raidframe/rf_reconutil.c optional raidframe
+dev/raidframe/rf_revent.c optional raidframe
+dev/raidframe/rf_shutdown.c optional raidframe
+dev/raidframe/rf_sstf.c optional raidframe
+dev/raidframe/rf_states.c optional raidframe
+dev/raidframe/rf_stripelocks.c optional raidframe
+dev/raidframe/rf_strutils.c optional raidframe
+dev/raidframe/rf_threadstuff.c optional raidframe
+dev/raidframe/rf_utils.c optional raidframe
dev/random/harvest.c standard
dev/random/randomdev.c optional random
dev/random/yarrow.c optional random
diff --git a/sys/conf/majors b/sys/conf/majors
index 1703c68..a83fa3d 100644
--- a/sys/conf/majors
+++ b/sys/conf/majors
@@ -193,6 +193,8 @@ chrdev name comments
175 ips IBM/Adaptec ServeRAID (control device)
176 ipsd IBM/Adaptec ServeRAID (disk device)
177 openfirm OpenFirmware control device <tmm>
+178 raidctl RAIDframe (control device)
+179 raid RAIDframe (disk device)
200 ?? entries from 200-252 are reserved for local use
252 ?? entries from 200-252 are reserved for local use
254 internal Used internally by the kernel
diff --git a/sys/conf/options b/sys/conf/options
index 0311849..9305b2a 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -549,6 +549,10 @@ ROOTDEVNAME opt_rootdevname.h
FDC_DEBUG opt_fdc.h
PCFCLOCK_VERBOSE opt_pcfclock.h
PCFCLOCK_MAX_RETRIES opt_pcfclock.h
+
+# RAIDframe options
+RAID_AUTOCONFIG opt_raid.h
+RAID_DEBUG opt_raid.h
TDFX_LINUX opt_tdfx.h
KTR opt_global.h
diff --git a/sys/dev/raidframe/rf_acctrace.c b/sys/dev/raidframe/rf_acctrace.c
new file mode 100644
index 0000000..eaa4b2a
--- /dev/null
+++ b/sys/dev/raidframe/rf_acctrace.c
@@ -0,0 +1,172 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_acctrace.c,v 1.4 1999/08/13 03:41:52 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************************************
+ *
+ * acctrace.c -- code to support collecting information about each access
+ *
+ *****************************************************************************/
+
+#if defined(__FreeBSD__)
+#include <sys/types.h>
+#include <sys/time.h>
+#endif
+#include <sys/stat.h>
+#if defined(__NetBSD__)
+#include <sys/types.h>
+#endif
+
+#include <dev/raidframe/rf_threadstuff.h>
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_debugMem.h>
+#include <dev/raidframe/rf_acctrace.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_etimer.h>
+#include <dev/raidframe/rf_hist.h>
+#include <dev/raidframe/rf_shutdown.h>
+
+static long numTracesSoFar;
+static int accessTraceBufCount = 0;
+static RF_AccTraceEntry_t *access_tracebuf;
+static long traceCount;
+
+int rf_stopCollectingTraces;
+RF_DECLARE_MUTEX(rf_tracing_mutex)
+ int rf_trace_fd;
+
+ static void rf_ShutdownAccessTrace(void *);
+
+/*
+ * Shutdown-list callback registered by rf_ConfigureAccessTrace().
+ * Flushes any buffered trace records, frees the trace buffer (which is
+ * only allocated when rf_accessTraceBufSize != 0), and destroys the
+ * tracing mutex.  The argument from the shutdown list is unused.
+ */
+ static void rf_ShutdownAccessTrace(ignored)
+ void *ignored;
+{
+ if (rf_accessTraceBufSize) {
+ if (accessTraceBufCount)
+ rf_FlushAccessTraceBuf();
+ RF_Free(access_tracebuf, rf_accessTraceBufSize * sizeof(RF_AccTraceEntry_t));
+ }
+ rf_mutex_destroy(&rf_tracing_mutex);
+}
+
+/*
+ * Initialize the access-trace subsystem: reset counters, allocate the
+ * trace buffer when rf_accessTraceBufSize is nonzero, create the
+ * tracing mutex, and register rf_ShutdownAccessTrace on the shutdown
+ * list *listp.  Returns 0 on success or a nonzero error code.
+ */
+int
+rf_ConfigureAccessTrace(listp)
+ RF_ShutdownList_t **listp;
+{
+ int rc;
+
+ numTracesSoFar = accessTraceBufCount = rf_stopCollectingTraces = 0;
+ if (rf_accessTraceBufSize) {
+ RF_Malloc(access_tracebuf, rf_accessTraceBufSize * sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *));
+ accessTraceBufCount = 0;
+ }
+ traceCount = 0;
+ numTracesSoFar = 0;
+ rc = rf_mutex_init(&rf_tracing_mutex, __FUNCTION__);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ }
+ /*
+ * NOTE(review): if rf_mutex_init() failed above, that rc is only
+ * logged and is then overwritten here by the rf_ShutdownCreate()
+ * result -- the mutex failure is never returned to the caller.
+ * Matches the imported upstream code; confirm intentional.
+ */
+ rc = rf_ShutdownCreate(listp, rf_ShutdownAccessTrace, NULL);
+ if (rc) {
+ RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ /* Undo the allocation and mutex init on registration failure. */
+ if (rf_accessTraceBufSize) {
+ RF_Free(access_tracebuf, rf_accessTraceBufSize * sizeof(RF_AccTraceEntry_t));
+ rf_mutex_destroy(&rf_tracing_mutex);
+ }
+ }
+ return (rc);
+}
+/* install a trace record. cause a flush to disk or to the trace collector daemon
+ * if the trace buffer is at least 1/2 full.
+ */
+/*
+ * Fold one per-access trace record "rec" into the running totals kept
+ * on "raid" (raid->acc_totals).  Does nothing once tracing has been
+ * stopped, once the configured trace limit has been reached, or when
+ * the array is not keeping totals.  Reconstruction accesses and user
+ * accesses update disjoint groups of counters.
+ */
+void
+rf_LogTraceRec(raid, rec)
+ RF_Raid_t *raid;
+ RF_AccTraceEntry_t *rec;
+{
+ RF_AccTotals_t *acc = &raid->acc_totals;
+#if 0
+ RF_Etimer_t timer;
+ int i, n;
+#endif
+
+ if (rf_stopCollectingTraces || ((rf_maxNumTraces >= 0) && (numTracesSoFar >= rf_maxNumTraces)))
+ return;
+
+ /* update AccTotals for this device */
+ if (!raid->keep_acc_totals)
+ return;
+ acc->num_log_ents++;
+ if (rec->reconacc) {
+ acc->recon_start_to_fetch_us += rec->specific.recon.recon_start_to_fetch_us;
+ acc->recon_fetch_to_return_us += rec->specific.recon.recon_fetch_to_return_us;
+ acc->recon_return_to_submit_us += rec->specific.recon.recon_return_to_submit_us;
+ acc->recon_num_phys_ios += rec->num_phys_ios;
+ acc->recon_phys_io_us += rec->phys_io_us;
+ acc->recon_diskwait_us += rec->diskwait_us;
+ acc->recon_reccount++;
+ } else {
+ RF_HIST_ADD(acc->tot_hist, rec->total_us);
+ RF_HIST_ADD(acc->dw_hist, rec->diskwait_us);
+ /* count of physical ios which are too big. often due to
+ * thermal recalibration */
+ /* if bigvals > 0, you should probably ignore this data set */
+ if (rec->diskwait_us > 100000)
+ acc->bigvals++;
+ acc->total_us += rec->total_us;
+ acc->suspend_ovhd_us += rec->specific.user.suspend_ovhd_us;
+ acc->map_us += rec->specific.user.map_us;
+ acc->lock_us += rec->specific.user.lock_us;
+ acc->dag_create_us += rec->specific.user.dag_create_us;
+ acc->dag_retry_us += rec->specific.user.dag_retry_us;
+ acc->exec_us += rec->specific.user.exec_us;
+ acc->cleanup_us += rec->specific.user.cleanup_us;
+ acc->exec_engine_us += rec->specific.user.exec_engine_us;
+ acc->xor_us += rec->xor_us;
+ acc->q_us += rec->q_us;
+ acc->plog_us += rec->plog_us;
+ acc->diskqueue_us += rec->diskqueue_us;
+ acc->diskwait_us += rec->diskwait_us;
+ acc->num_phys_ios += rec->num_phys_ios;
+ /*
+ * NOTE(review): plain "=" where every sibling total above uses
+ * "+=", so phys_io_us holds only the last record's value, not a
+ * total.  This matches the imported upstream source, but looks
+ * like an upstream bug -- confirm before relying on this stat.
+ */
+ acc->phys_io_us = rec->phys_io_us;
+ acc->user_reccount++;
+ }
+}
+
+
+/* assumes the tracing mutex is locked at entry. In order to allow this to be called
+ * from interrupt context, we don't do any copyouts here, but rather just wake trace
+ * buffer collector thread.
+ */
+void
+rf_FlushAccessTraceBuf()
+{
+ /*
+ * NOTE(review): only resets the fill count; no copyout and no
+ * wakeup of a collector thread happens here, despite the comment
+ * above -- presumably the collector was elided in this kernel
+ * port.  Confirm against the trace-collection code.
+ */
+ accessTraceBufCount = 0;
+}
diff --git a/sys/dev/raidframe/rf_acctrace.h b/sys/dev/raidframe/rf_acctrace.h
new file mode 100644
index 0000000..c211514
--- /dev/null
+++ b/sys/dev/raidframe/rf_acctrace.h
@@ -0,0 +1,134 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_acctrace.h,v 1.3 1999/02/05 00:06:06 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************************************
+ *
+ * acctrace.h -- header file for acctrace.c
+ *
+ *****************************************************************************/
+
+
+#ifndef _RF__RF_ACCTRACE_H_
+#define _RF__RF_ACCTRACE_H_
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_hist.h>
+#include <dev/raidframe/rf_etimer.h>
+
+typedef struct RF_user_acc_stats_s {
+ RF_uint64 suspend_ovhd_us; /* us spent mucking in the
+ * access-suspension code */
+ RF_uint64 map_us; /* us spent mapping the access */
+ RF_uint64 lock_us; /* us spent locking & unlocking stripes,
+ * including time spent blocked */
+ RF_uint64 dag_create_us;/* us spent creating the DAGs */
+ RF_uint64 dag_retry_us; /* _total_ us spent retrying the op -- not
+ * broken down into components */
+ RF_uint64 exec_us; /* us spent in DispatchDAG */
+ RF_uint64 exec_engine_us; /* us spent in engine, not including
+ * blocking time */
+ RF_uint64 cleanup_us; /* us spent tearing down the dag & maps, and
+ * generally cleaning up */
+} RF_user_acc_stats_t;
+
+typedef struct RF_recon_acc_stats_s {
+ RF_uint32 recon_start_to_fetch_us;
+ RF_uint32 recon_fetch_to_return_us;
+ RF_uint32 recon_return_to_submit_us;
+} RF_recon_acc_stats_t;
+
+/* One trace record per access; folded into RF_AccTotals_t by rf_LogTraceRec(). */
+typedef struct RF_acctrace_entry_s {
+ union {
+ RF_user_acc_stats_t user;
+ RF_recon_acc_stats_t recon;
+ } specific;
+ RF_uint8 reconacc; /* whether this is a tracerec for a user acc
+ * or a recon acc */
+ RF_uint64 xor_us; /* us spent doing XORs */
+ RF_uint64 q_us; /* us spent on Q computations, presumably the
+ * second (Q) parity -- original comment was a
+ * copy-paste of xor_us's; confirm */
+ RF_uint64 plog_us; /* us spent waiting to stuff parity into log */
+ RF_uint64 diskqueue_us; /* _total_ us spent in disk queue(s), incl
+ * concurrent ops */
+ RF_uint64 diskwait_us; /* _total_ us spent waiting actually waiting
+ * on the disk, incl concurrent ops */
+ RF_uint64 total_us; /* total us spent on this access */
+ RF_uint64 num_phys_ios; /* number of physical I/Os invoked */
+ RF_uint64 phys_io_us; /* time of physical I/O */
+ RF_Etimer_t tot_timer; /* a timer used to compute total access time */
+ RF_Etimer_t timer; /* a generic timer val for timing events that
+ * live across procedure boundaries */
+ RF_Etimer_t recon_timer;/* generic timer for recon stuff */
+ RF_uint64 index;
+} RF_AccTraceEntry_t;
+
+typedef struct RF_AccTotals_s {
+ /* user acc stats */
+ RF_uint64 suspend_ovhd_us;
+ RF_uint64 map_us;
+ RF_uint64 lock_us;
+ RF_uint64 dag_create_us;
+ RF_uint64 dag_retry_us;
+ RF_uint64 exec_us;
+ RF_uint64 exec_engine_us;
+ RF_uint64 cleanup_us;
+ RF_uint64 user_reccount;
+ /* recon acc stats */
+ RF_uint64 recon_start_to_fetch_us;
+ RF_uint64 recon_fetch_to_return_us;
+ RF_uint64 recon_return_to_submit_us;
+ RF_uint64 recon_io_overflow_count;
+ RF_uint64 recon_phys_io_us;
+ RF_uint64 recon_num_phys_ios;
+ RF_uint64 recon_diskwait_us;
+ RF_uint64 recon_reccount;
+ /* trace entry stats */
+ RF_uint64 xor_us;
+ RF_uint64 q_us;
+ RF_uint64 plog_us;
+ RF_uint64 diskqueue_us;
+ RF_uint64 diskwait_us;
+ RF_uint64 total_us;
+ RF_uint64 num_log_ents;
+ RF_uint64 phys_io_overflow_count;
+ RF_uint64 num_phys_ios;
+ RF_uint64 phys_io_us;
+ RF_uint64 bigvals;
+ /* histograms */
+ RF_Hist_t dw_hist[RF_HIST_NUM_BUCKETS];
+ RF_Hist_t tot_hist[RF_HIST_NUM_BUCKETS];
+} RF_AccTotals_t;
+#if RF_UTILITY == 0
+RF_DECLARE_EXTERN_MUTEX(rf_tracing_mutex)
+#endif /* RF_UTILITY == 0 */
+
+ int rf_ConfigureAccessTrace(RF_ShutdownList_t ** listp);
+ void rf_LogTraceRec(RF_Raid_t * raid, RF_AccTraceEntry_t * rec);
+ void rf_FlushAccessTraceBuf(void);
+
+#endif /* !_RF__RF_ACCTRACE_H_ */
diff --git a/sys/dev/raidframe/rf_alloclist.c b/sys/dev/raidframe/rf_alloclist.c
new file mode 100644
index 0000000..2f0f63a
--- /dev/null
+++ b/sys/dev/raidframe/rf_alloclist.c
@@ -0,0 +1,188 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_alloclist.c,v 1.4 1999/08/13 03:41:53 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/****************************************************************************
+ *
+ * Alloclist.c -- code to manipulate allocation lists
+ *
+ * an allocation list is just a list of AllocListElem structures. Each
+ * such structure contains a fixed-size array of pointers. Calling
+ * FreeAList() causes each pointer to be freed.
+ *
+ ***************************************************************************/
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_threadstuff.h>
+#include <dev/raidframe/rf_alloclist.h>
+#include <dev/raidframe/rf_debugMem.h>
+#include <dev/raidframe/rf_etimer.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_shutdown.h>
+
+RF_DECLARE_STATIC_MUTEX(alist_mutex)
+ static unsigned int fl_hit_count, fl_miss_count;
+
+ static RF_AllocListElem_t *al_free_list = NULL;
+ static int al_free_list_count;
+
+#define RF_AL_FREELIST_MAX 256
+
+#define DO_FREE(_p,_sz) RF_Free((_p),(_sz))
+
+ static void rf_ShutdownAllocList(void *);
+
+/*
+ * Shutdown hook (registered by rf_ConfigureAllocList): free every element
+ * still cached on al_free_list, then destroy the allocation-list mutex.
+ * The argument is unused; it exists to match the shutdown-callback shape.
+ */
+ static void rf_ShutdownAllocList(ignored)
+ void *ignored;
+{
+ RF_AllocListElem_t *p, *pt;
+
+ for (p = al_free_list; p;) {
+ pt = p;
+ p = p->next; /* advance before freeing the current element */
+ DO_FREE(pt, sizeof(*pt));
+ }
+ rf_mutex_destroy(&alist_mutex);
+ /*
+ printf("Alloclist: Free list hit count %lu (%lu %%) miss count %lu (%lu %%)\n",
+ fl_hit_count, (100*fl_hit_count)/(fl_hit_count+fl_miss_count),
+ fl_miss_count, (100*fl_miss_count)/(fl_hit_count+fl_miss_count));
+ */
+}
+
+/*
+ * Module initialization: create the allocation-list mutex, reset the
+ * element free list and its hit/miss counters, and register
+ * rf_ShutdownAllocList() on the shutdown list *listp.
+ * Returns 0 on success, else the error code from mutex or shutdown setup.
+ */
+int
+rf_ConfigureAllocList(listp)
+ RF_ShutdownList_t **listp;
+{
+ int rc;
+
+ rc = rf_mutex_init(&alist_mutex, __FUNCTION__);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ return (rc);
+ }
+ al_free_list = NULL;
+ fl_hit_count = fl_miss_count = al_free_list_count = 0;
+ rc = rf_ShutdownCreate(listp, rf_ShutdownAllocList, NULL);
+ if (rc) {
+ RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
+ __FILE__, __LINE__, rc);
+ rf_mutex_destroy(&alist_mutex); /* undo the mutex init on failure */
+ return (rc);
+ }
+ return (0);
+}
+
+
+/* we expect the lists to have at most one or two elements, so we're willing
+ * to search for the end. If you ever observe the lists growing longer,
+ * increase POINTERS_PER_ALLOC_LIST_ELEMENT.
+ */
+/*
+ * Record allocation (p, size) on alloc list l: walk to the last element and
+ * chain a fresh element when the current one is full.
+ * NOTE(review): the rf_real_MakeAllocList() result is not NULL-checked; an
+ * allocation failure here would be dereferenced immediately below -- verify
+ * that allocation cannot fail in this configuration.
+ */
+void
+rf_real_AddToAllocList(l, p, size, lockflag)
+ RF_AllocListElem_t *l;
+ void *p;
+ int size;
+ int lockflag;
+{
+ RF_AllocListElem_t *newelem;
+
+ for (; l->next; l = l->next)
+ RF_ASSERT(l->numPointers == RF_POINTERS_PER_ALLOC_LIST_ELEMENT); /* find end of list */
+
+ RF_ASSERT(l->numPointers >= 0 && l->numPointers <= RF_POINTERS_PER_ALLOC_LIST_ELEMENT);
+ if (l->numPointers == RF_POINTERS_PER_ALLOC_LIST_ELEMENT) {
+ newelem = rf_real_MakeAllocList(lockflag);
+ l->next = newelem;
+ l = newelem;
+ }
+ l->pointers[l->numPointers] = p;
+ l->sizes[l->numPointers] = size;
+ l->numPointers++;
+
+}
+
+
+/* we use the debug_mem_mutex here because we need to lock it anyway to call free.
+ * this is probably a bug somewhere else in the code, but when I call malloc/free
+ * outside of any lock I have endless trouble with malloc appearing to return the
+ * same pointer twice. Since we have to lock it anyway, we might as well use it
+ * as the lock around the al_free_list. Note that we can't call Free with the
+ * debug_mem_mutex locked.
+ */
+/*
+ * Release alloc list l in two passes: first free every pointer the list
+ * tracks, then recycle the list elements themselves onto al_free_list
+ * (capped near RF_AL_FREELIST_MAX cached elements; overflow is freed).
+ * NOTE(review): al_free_list is updated without taking alist_mutex; per
+ * the comment above, callers appear to rely on debug_mem_mutex for
+ * serialization -- confirm.
+ */
+void
+rf_FreeAllocList(l)
+ RF_AllocListElem_t *l;
+{
+ int i;
+ RF_AllocListElem_t *temp, *p;
+
+ for (p = l; p; p = p->next) {
+ RF_ASSERT(p->numPointers >= 0 && p->numPointers <= RF_POINTERS_PER_ALLOC_LIST_ELEMENT);
+ for (i = 0; i < p->numPointers; i++) {
+ RF_ASSERT(p->pointers[i]);
+ RF_Free(p->pointers[i], p->sizes[i]);
+ }
+ }
+ while (l) {
+ temp = l;
+ l = l->next;
+ if (al_free_list_count > RF_AL_FREELIST_MAX) {
+ DO_FREE(temp, sizeof(*temp));
+ } else {
+ temp->next = al_free_list;
+ al_free_list = temp;
+ al_free_list_count++;
+ }
+ }
+}
+
+/*
+ * Return one zeroed alloc-list element, recycled from al_free_list when
+ * possible.  Returns NULL if RF_Malloc fails.  lockflag is not referenced
+ * in this body (see the in-kernel comment at the RF_Malloc call).
+ * NOTE(review): the free-list pop is not protected by alist_mutex --
+ * presumably serialized externally (debug_mem_mutex); verify.
+ */
+RF_AllocListElem_t *
+rf_real_MakeAllocList(lockflag)
+ int lockflag;
+{
+ RF_AllocListElem_t *p;
+
+ if (al_free_list) {
+ fl_hit_count++; /* reuse a cached element */
+ p = al_free_list;
+ al_free_list = p->next;
+ al_free_list_count--;
+ } else {
+ fl_miss_count++;
+ RF_Malloc(p, sizeof(RF_AllocListElem_t), (RF_AllocListElem_t *)); /* no allocation locking
+ * in kernel, so this is
+ * fine */
+ }
+ if (p == NULL) {
+ return (NULL);
+ }
+ bzero((char *) p, sizeof(RF_AllocListElem_t));
+ return (p);
+}
diff --git a/sys/dev/raidframe/rf_alloclist.h b/sys/dev/raidframe/rf_alloclist.h
new file mode 100644
index 0000000..c746452
--- /dev/null
+++ b/sys/dev/raidframe/rf_alloclist.h
@@ -0,0 +1,60 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_alloclist.h,v 1.3 1999/02/05 00:06:06 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/****************************************************************************
+ *
+ * alloclist.h -- header file for alloclist.c
+ *
+ ***************************************************************************/
+
+#ifndef _RF__RF_ALLOCLIST_H_
+#define _RF__RF_ALLOCLIST_H_
+
+#include <dev/raidframe/rf_types.h>
+
+#define RF_POINTERS_PER_ALLOC_LIST_ELEMENT 20
+
+/* One element of an allocation list: a fixed-size batch of tracked
+ * allocations plus a link to the next batch (see rf_alloclist.c). */
+struct RF_AllocListElem_s {
+ void *pointers[RF_POINTERS_PER_ALLOC_LIST_ELEMENT]; /* tracked allocations */
+ int sizes[RF_POINTERS_PER_ALLOC_LIST_ELEMENT]; /* byte size of each allocation */
+ int numPointers; /* number of slots in use */
+ RF_AllocListElem_t *next; /* next batch, or NULL */
+};
+/* Convenience wrappers passing lockflag=1 (flag is unused by the kernel
+ * implementation in rf_alloclist.c -- see rf_real_MakeAllocList). */
+#define rf_MakeAllocList(_ptr_) _ptr_ = rf_real_MakeAllocList(1);
+#define rf_AddToAllocList(_l_,_ptr_,_sz_) rf_real_AddToAllocList((_l_), (_ptr_), (_sz_), 1)
+
+int rf_ConfigureAllocList(RF_ShutdownList_t ** listp);
+
+#if RF_UTILITY == 0
+void rf_real_AddToAllocList(RF_AllocListElem_t * l, void *p, int size, int lockflag);
+void rf_FreeAllocList(RF_AllocListElem_t * l);
+RF_AllocListElem_t *rf_real_MakeAllocList(int lockflag);
+#endif /* RF_UTILITY == 0 */
+
+#endif /* !_RF__RF_ALLOCLIST_H_ */
diff --git a/sys/dev/raidframe/rf_archs.h b/sys/dev/raidframe/rf_archs.h
new file mode 100644
index 0000000..faef157
--- /dev/null
+++ b/sys/dev/raidframe/rf_archs.h
@@ -0,0 +1,75 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_archs.h,v 1.11 2001/01/26 04:43:16 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_archs.h -- defines for which architectures you want to
+ * include in some particular build of raidframe. Unfortunately,
+ * it's difficult to exclude declustering, P+Q, and distributed
+ * sparing because the code is intermixed with RAID5 code. This
+ * should be fixed.
+ *
+ * this is really intended only for use in the kernel, where I
+ * am worried about the size of the object module. At user level and
+ * in the simulator, I don't really care that much, so all the
+ * architectures can be compiled together. Note that by itself, turning
+ * off these defines does not affect the size of the executable; you
+ * have to edit the makefile for that.
+ *
+ * comment out any line below to eliminate that architecture.
+ * the list below includes all the modules that can be compiled
+ * out.
+ *
+ */
+
+#ifndef _RF__RF_ARCHS_H_
+#define _RF__RF_ARCHS_H_
+
+#define RF_INCLUDE_EVENODD 1
+
+#define RF_INCLUDE_RAID5_RS 1
+#define RF_INCLUDE_PARITYLOGGING 1
+
+#define RF_INCLUDE_CHAINDECLUSTER 1
+#define RF_INCLUDE_INTERDECLUSTER 1
+
+#define RF_INCLUDE_PARITY_DECLUSTERING 1
+#define RF_INCLUDE_PARITY_DECLUSTERING_DS 1
+
+#define RF_INCLUDE_RAID0 1
+#define RF_INCLUDE_RAID1 1
+#define RF_INCLUDE_RAID4 1
+#define RF_INCLUDE_RAID5 1
+#define RF_INCLUDE_RAID6 0
+#define RF_INCLUDE_DECL_PQ 0
+
+#define RF_MEMORY_REDZONES 0
+#define RF_RECON_STATS 1
+
+#include <dev/raidframe/rf_options.h>
+
+#endif /* !_RF__RF_ARCHS_H_ */
diff --git a/sys/dev/raidframe/rf_aselect.c b/sys/dev/raidframe/rf_aselect.c
new file mode 100644
index 0000000..4fe69e3
--- /dev/null
+++ b/sys/dev/raidframe/rf_aselect.c
@@ -0,0 +1,494 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_aselect.c,v 1.3 1999/02/05 00:06:06 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************************************
+ *
+ * aselect.c -- algorithm selection code
+ *
+ *****************************************************************************/
+
+#include <dev/raidframe/rf_archs.h>
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_dagutils.h>
+#include <dev/raidframe/rf_dagfuncs.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_desc.h>
+#include <dev/raidframe/rf_map.h>
+
+#if defined(__NetBSD__) || defined(__FreeBSD__) && defined(_KERNEL)
+/* the function below is not used... so don't define it! */
+#else
+static void TransferDagMemory(RF_DagHeader_t *, RF_DagHeader_t *);
+#endif
+
+static int InitHdrNode(RF_DagHeader_t **, RF_Raid_t *, int);
+static void UpdateNodeHdrPtr(RF_DagHeader_t *, RF_DagNode_t *);
+int rf_SelectAlgorithm(RF_RaidAccessDesc_t *, RF_RaidAccessFlags_t);
+
+
+/******************************************************************************
+ *
+ * Create and Initialize a dag header and termination node
+ *
+ *****************************************************************************/
+/*
+ * Allocate a DAG header, attach a fresh alloc list, and initialize its
+ * bookkeeping fields.  Returns 0 on success, ENOMEM if the alloc list
+ * could not be created (the header is released in that case).
+ * memChunkEnable is unused in this body -- kept for interface compatibility.
+ */
+static int
+InitHdrNode(hdr, raidPtr, memChunkEnable)
+ RF_DagHeader_t **hdr;
+ RF_Raid_t *raidPtr;
+ int memChunkEnable;
+{
+ /* create and initialize dag hdr */
+ *hdr = rf_AllocDAGHeader(); /* NOTE(review): result not NULL-checked -- verify rf_AllocDAGHeader cannot fail */
+ rf_MakeAllocList((*hdr)->allocList);
+ if ((*hdr)->allocList == NULL) {
+ rf_FreeDAGHeader(*hdr);
+ return (ENOMEM);
+ }
+ (*hdr)->status = rf_enable;
+ (*hdr)->numSuccedents = 0;
+ (*hdr)->raidPtr = raidPtr;
+ (*hdr)->next = NULL;
+ return (0);
+}
+/******************************************************************************
+ *
+ * Transfer allocation list and mem chunks from one dag to another
+ *
+ *****************************************************************************/
+#if defined(__NetBSD__) || defined(__FreeBSD__) && defined(_KERNEL)
+/* the function below is not used... so don't define it! */
+#else
+/*
+ * Move all memory bookkeeping (alloc-list pointers, mem chunks, asm list)
+ * from dagb into daga, leaving dagb empty.  Compiled out on NetBSD/FreeBSD
+ * kernel builds (see the surrounding #if) -- review notes below apply if
+ * this code is ever re-enabled.
+ */
+static void
+TransferDagMemory(daga, dagb)
+ RF_DagHeader_t *daga;
+ RF_DagHeader_t *dagb;
+{
+ RF_AccessStripeMapHeader_t *end;
+ RF_AllocListElem_t *p;
+ int i, memChunksXfrd = 0, xtraChunksXfrd = 0;
+
+ /* transfer allocList from dagb to daga */
+ for (p = dagb->allocList; p; p = p->next) {
+ for (i = 0; i < p->numPointers; i++) {
+ rf_AddToAllocList(daga->allocList, p->pointers[i], p->sizes[i]);
+ p->pointers[i] = NULL;
+ p->sizes[i] = 0;
+ }
+ p->numPointers = 0;
+ }
+
+ /* transfer chunks from dagb to daga */
+ while ((memChunksXfrd + xtraChunksXfrd < dagb->chunkIndex + dagb->xtraChunkIndex) && (daga->chunkIndex < RF_MAXCHUNKS)) {
+ /* stuff chunks into daga's memChunk array */
+ if (memChunksXfrd < dagb->chunkIndex) {
+ daga->memChunk[daga->chunkIndex++] = dagb->memChunk[memChunksXfrd];
+ dagb->memChunk[memChunksXfrd++] = NULL;
+ } else {
+ /* NOTE(review): this branch indexes memChunk[] with
+ * xtraChunkIndex, unlike the if-branch (chunkIndex) and
+ * the overflow loop below (xtraMemChunk[xtraChunkIndex]).
+ * Looks like an array/index mismatch -- verify before
+ * enabling this function. */
+ daga->memChunk[daga->xtraChunkIndex++] = dagb->xtraMemChunk[xtraChunksXfrd];
+ dagb->xtraMemChunk[xtraChunksXfrd++] = NULL;
+ }
+ }
+ /* use escape hatch to hold excess chunks */
+ while (memChunksXfrd + xtraChunksXfrd < dagb->chunkIndex + dagb->xtraChunkIndex) {
+ if (memChunksXfrd < dagb->chunkIndex) {
+ daga->xtraMemChunk[daga->xtraChunkIndex++] = dagb->memChunk[memChunksXfrd];
+ dagb->memChunk[memChunksXfrd++] = NULL;
+ } else {
+ daga->xtraMemChunk[daga->xtraChunkIndex++] = dagb->xtraMemChunk[xtraChunksXfrd];
+ dagb->xtraMemChunk[xtraChunksXfrd++] = NULL;
+ }
+ }
+ RF_ASSERT((memChunksXfrd == dagb->chunkIndex) && (xtraChunksXfrd == dagb->xtraChunkIndex));
+ RF_ASSERT(daga->chunkIndex <= RF_MAXCHUNKS);
+ RF_ASSERT(daga->xtraChunkIndex <= daga->xtraChunkCnt);
+ dagb->chunkIndex = 0;
+ dagb->xtraChunkIndex = 0;
+
+ /* transfer asmList from dagb to daga */
+ if (dagb->asmList) {
+ if (daga->asmList) {
+ /* append dagb's asm list to the tail of daga's */
+ end = daga->asmList;
+ while (end->next)
+ end = end->next;
+ end->next = dagb->asmList;
+ } else
+ daga->asmList = dagb->asmList;
+ dagb->asmList = NULL;
+ }
+}
+#endif /* __NetBSD__ || __FreeBSD__ && _KERNEL */
+
+/*****************************************************************************************
+ *
+ * Ensure that all node->dagHdr fields in a dag are consistent
+ *
+ * IMPORTANT: This routine recursively searches all succedents of the node. If a
+ * succedent is encountered whose dagHdr ptr does not require adjusting, that node's
+ * succedents WILL NOT BE EXAMINED.
+ *
+ ****************************************************************************************/
+static void
+UpdateNodeHdrPtr(hdr, node)
+ RF_DagHeader_t *hdr;
+ RF_DagNode_t *node;
+{
+ int i;
+ RF_ASSERT(hdr != NULL && node != NULL);
+ /* Recurse only into succedents whose dagHdr still needs fixing;
+ * subtrees already pointing at hdr are assumed consistent (see the
+ * IMPORTANT caveat in the comment block above). */
+ for (i = 0; i < node->numSuccedents; i++)
+ if (node->succedents[i]->dagHdr != hdr)
+ UpdateNodeHdrPtr(hdr, node->succedents[i]);
+ node->dagHdr = hdr;
+}
+/******************************************************************************
+ *
+ * Create a DAG to do a read or write operation.
+ *
+ * create an array of dagLists, one list per parity stripe.
+ * return the lists in the array desc->dagArray.
+ *
+ * Normally, each list contains one dag for the entire stripe. In some
+ * tricky cases, we break this into multiple dags, either one per stripe
+ * unit or one per block (sector). When this occurs, these dags are returned
+ * as a linked list (dagList) which is executed sequentially (to preserve
+ * atomic parity updates in the stripe).
+ *
+ * dags which operate on independent parity groups (stripes) are returned in
+ * independent dagLists (distinct elements in desc->dagArray) and may be
+ * executed concurrently.
+ *
+ * Finally, if the SelectionFunc fails to create a dag for a block, we punt
+ * and return 1.
+ *
+ * The above process is performed in two phases:
+ * 1) create an array(s) of creation functions (eg stripeFuncs)
+ * 2) create dags and concatenate/merge to form the final dag.
+ *
+ * Because dag's are basic blocks (single entry, single exit, unconditional
+ * control flow), we can add the following optimizations (future work):
+ * first-pass optimizer to allow max concurrency (need all data dependencies)
+ * second-pass optimizer to eliminate common subexpressions (need true
+ * data dependencies)
+ * third-pass optimizer to eliminate dead code (need true data dependencies)
+ *****************************************************************************/
+
+#define MAXNSTRIPES 5
+
+int
+rf_SelectAlgorithm(desc, flags)
+ RF_RaidAccessDesc_t *desc;
+ RF_RaidAccessFlags_t flags;
+{
+ RF_AccessStripeMapHeader_t *asm_h = desc->asmap;
+ RF_IoType_t type = desc->type;
+ RF_Raid_t *raidPtr = desc->raidPtr;
+ void *bp = desc->bp;
+
+ RF_AccessStripeMap_t *asmap = asm_h->stripeMap;
+ RF_AccessStripeMap_t *asm_p;
+ RF_DagHeader_t *dag_h = NULL, *tempdag_h, *lastdag_h;
+ int i, j, k;
+ RF_VoidFuncPtr *stripeFuncs, normalStripeFuncs[MAXNSTRIPES];
+ RF_AccessStripeMap_t *asm_up, *asm_bp;
+ RF_AccessStripeMapHeader_t ***asmh_u, *endASMList;
+ RF_AccessStripeMapHeader_t ***asmh_b;
+ RF_VoidFuncPtr **stripeUnitFuncs, uFunc;
+ RF_VoidFuncPtr **blockFuncs, bFunc;
+ int numStripesBailed = 0, cantCreateDAGs = RF_FALSE;
+ int numStripeUnitsBailed = 0;
+ int stripeNum, numUnitDags = 0, stripeUnitNum, numBlockDags = 0;
+ RF_StripeNum_t numStripeUnits;
+ RF_SectorNum_t numBlocks;
+ RF_RaidAddr_t address;
+ int length;
+ RF_PhysDiskAddr_t *physPtr;
+ caddr_t buffer;
+ /* NOTE(review): k is a plain int while numBlocks is RF_SectorNum_t;
+ * the k < numBlocks loops below assume per-unit block counts fit in
+ * an int -- verify the type widths. */
+
+ lastdag_h = NULL;
+ asmh_u = asmh_b = NULL;
+ stripeUnitFuncs = NULL;
+ blockFuncs = NULL;
+
+ /* get an array of dag-function creation pointers, try to avoid
+ * calling malloc */
+ if (asm_h->numStripes <= MAXNSTRIPES)
+ stripeFuncs = normalStripeFuncs;
+ else
+ RF_Calloc(stripeFuncs, asm_h->numStripes, sizeof(RF_VoidFuncPtr), (RF_VoidFuncPtr *));
+
+ /* walk through the asm list once collecting information */
+ /* attempt to find a single creation function for each stripe */
+ desc->numStripes = 0;
+ for (i = 0, asm_p = asmap; asm_p; asm_p = asm_p->next, i++) {
+ desc->numStripes++;
+ (raidPtr->Layout.map->SelectionFunc) (raidPtr, type, asm_p, &stripeFuncs[i]);
+ /* check to see if we found a creation func for this stripe */
+ if (stripeFuncs[i] == (RF_VoidFuncPtr) NULL) {
+ /* could not find creation function for entire stripe
+ * so, let's see if we can find one for each stripe
+ * unit in the stripe */
+
+ if (numStripesBailed == 0) {
+ /* one stripe map header for each stripe we
+ * bail on */
+ RF_Malloc(asmh_u, sizeof(RF_AccessStripeMapHeader_t **) * asm_h->numStripes, (RF_AccessStripeMapHeader_t ***));
+ /* create an array of ptrs to arrays of
+ * stripeFuncs */
+ RF_Calloc(stripeUnitFuncs, asm_h->numStripes, sizeof(RF_VoidFuncPtr), (RF_VoidFuncPtr **));
+ }
+ /* create an array of creation funcs (called
+ * stripeFuncs) for this stripe */
+ numStripeUnits = asm_p->numStripeUnitsAccessed;
+ RF_Calloc(stripeUnitFuncs[numStripesBailed], numStripeUnits, sizeof(RF_VoidFuncPtr), (RF_VoidFuncPtr *));
+ RF_Malloc(asmh_u[numStripesBailed], numStripeUnits * sizeof(RF_AccessStripeMapHeader_t *), (RF_AccessStripeMapHeader_t **));
+
+ /* lookup array of stripeUnitFuncs for this stripe */
+ for (j = 0, physPtr = asm_p->physInfo; physPtr; physPtr = physPtr->next, j++) {
+ /* remap for series of single stripe-unit
+ * accesses */
+ address = physPtr->raidAddress;
+ length = physPtr->numSector;
+ buffer = physPtr->bufPtr;
+
+ asmh_u[numStripesBailed][j] = rf_MapAccess(raidPtr, address, length, buffer, RF_DONT_REMAP);
+ asm_up = asmh_u[numStripesBailed][j]->stripeMap;
+
+ /* get the creation func for this stripe unit */
+ (raidPtr->Layout.map->SelectionFunc) (raidPtr, type, asm_up, &(stripeUnitFuncs[numStripesBailed][j]));
+
+ /* check to see if we found a creation func
+ * for this stripe unit */
+ if (stripeUnitFuncs[numStripesBailed][j] == (RF_VoidFuncPtr) NULL) {
+ /* could not find creation function
+ * for stripe unit so, let's see if we
+ * can find one for each block in the
+ * stripe unit */
+ if (numStripeUnitsBailed == 0) {
+ /* one stripe map header for
+ * each stripe unit we bail on */
+ RF_Malloc(asmh_b, sizeof(RF_AccessStripeMapHeader_t **) * asm_h->numStripes * raidPtr->Layout.numDataCol, (RF_AccessStripeMapHeader_t ***));
+ /* create an array of ptrs to
+ * arrays of blockFuncs */
+ RF_Calloc(blockFuncs, asm_h->numStripes * raidPtr->Layout.numDataCol, sizeof(RF_VoidFuncPtr), (RF_VoidFuncPtr **));
+ }
+ /* create an array of creation funcs
+ * (called blockFuncs) for this stripe
+ * unit */
+ numBlocks = physPtr->numSector;
+ numBlockDags += numBlocks;
+ RF_Calloc(blockFuncs[numStripeUnitsBailed], numBlocks, sizeof(RF_VoidFuncPtr), (RF_VoidFuncPtr *));
+ RF_Malloc(asmh_b[numStripeUnitsBailed], numBlocks * sizeof(RF_AccessStripeMapHeader_t *), (RF_AccessStripeMapHeader_t **));
+
+ /* lookup array of blockFuncs for this
+ * stripe unit */
+ for (k = 0; k < numBlocks; k++) {
+ /* remap for series of single
+ * stripe-unit accesses */
+ address = physPtr->raidAddress + k;
+ length = 1;
+ buffer = physPtr->bufPtr + (k * (1 << raidPtr->logBytesPerSector));
+
+ asmh_b[numStripeUnitsBailed][k] = rf_MapAccess(raidPtr, address, length, buffer, RF_DONT_REMAP);
+ asm_bp = asmh_b[numStripeUnitsBailed][k]->stripeMap;
+
+ /* get the creation func for
+ * this stripe unit */
+ (raidPtr->Layout.map->SelectionFunc) (raidPtr, type, asm_bp, &(blockFuncs[numStripeUnitsBailed][k]));
+
+ /* check to see if we found a
+ * creation func for this
+ * stripe unit */
+ if (blockFuncs[numStripeUnitsBailed][k] == NULL)
+ cantCreateDAGs = RF_TRUE;
+ }
+ numStripeUnitsBailed++;
+ } else {
+ numUnitDags++;
+ }
+ }
+ RF_ASSERT(j == numStripeUnits);
+ numStripesBailed++;
+ }
+ }
+
+ if (cantCreateDAGs) {
+ /* free memory and punt */
+ if (asm_h->numStripes > MAXNSTRIPES)
+ RF_Free(stripeFuncs, asm_h->numStripes * sizeof(RF_VoidFuncPtr));
+ if (numStripesBailed > 0) {
+ stripeNum = 0;
+ for (i = 0, asm_p = asmap; asm_p; asm_p = asm_p->next, i++)
+ if (stripeFuncs[i] == NULL) {
+ numStripeUnits = asm_p->numStripeUnitsAccessed;
+ for (j = 0; j < numStripeUnits; j++)
+ rf_FreeAccessStripeMap(asmh_u[stripeNum][j]);
+ RF_Free(asmh_u[stripeNum], numStripeUnits * sizeof(RF_AccessStripeMapHeader_t *));
+ RF_Free(stripeUnitFuncs[stripeNum], numStripeUnits * sizeof(RF_VoidFuncPtr));
+ stripeNum++;
+ }
+ RF_ASSERT(stripeNum == numStripesBailed);
+ RF_Free(stripeUnitFuncs, asm_h->numStripes * sizeof(RF_VoidFuncPtr));
+ RF_Free(asmh_u, asm_h->numStripes * sizeof(RF_AccessStripeMapHeader_t **));
+ }
+ return (1);
+ } else {
+ /* begin dag creation */
+ stripeNum = 0;
+ stripeUnitNum = 0;
+
+ /* create an array of dagLists and fill them in */
+ RF_CallocAndAdd(desc->dagArray, desc->numStripes, sizeof(RF_DagList_t), (RF_DagList_t *), desc->cleanupList);
+
+ for (i = 0, asm_p = asmap; asm_p; asm_p = asm_p->next, i++) {
+ /* grab dag header for this stripe */
+ dag_h = NULL;
+ desc->dagArray[i].desc = desc;
+
+ if (stripeFuncs[i] == (RF_VoidFuncPtr) NULL) {
+ /* use bailout functions for this stripe */
+ for (j = 0, physPtr = asm_p->physInfo; physPtr; physPtr = physPtr->next, j++) {
+ uFunc = stripeUnitFuncs[stripeNum][j];
+ if (uFunc == (RF_VoidFuncPtr) NULL) {
+ /* use bailout functions for
+ * this stripe unit */
+ for (k = 0; k < physPtr->numSector; k++) {
+ /* create a dag for
+ * this block */
+ /* NOTE(review): InitHdrNode()'s ENOMEM
+ * return is ignored here and in the two
+ * calls below -- verify */
+ InitHdrNode(&tempdag_h, raidPtr, rf_useMemChunks);
+ desc->dagArray[i].numDags++;
+ if (dag_h == NULL) {
+ dag_h = tempdag_h;
+ } else {
+ lastdag_h->next = tempdag_h;
+ }
+ lastdag_h = tempdag_h;
+
+ bFunc = blockFuncs[stripeUnitNum][k];
+ RF_ASSERT(bFunc);
+ asm_bp = asmh_b[stripeUnitNum][k]->stripeMap;
+ (*bFunc) (raidPtr, asm_bp, tempdag_h, bp, flags, tempdag_h->allocList);
+ }
+ stripeUnitNum++;
+ } else {
+ /* create a dag for this unit */
+ InitHdrNode(&tempdag_h, raidPtr, rf_useMemChunks);
+ desc->dagArray[i].numDags++;
+ if (dag_h == NULL) {
+ dag_h = tempdag_h;
+ } else {
+ lastdag_h->next = tempdag_h;
+ }
+ lastdag_h = tempdag_h;
+
+ asm_up = asmh_u[stripeNum][j]->stripeMap;
+ (*uFunc) (raidPtr, asm_up, tempdag_h, bp, flags, tempdag_h->allocList);
+ }
+ }
+ RF_ASSERT(j == asm_p->numStripeUnitsAccessed);
+ /* merge linked bailout dag to existing dag
+ * collection */
+ stripeNum++;
+ } else {
+ /* Create a dag for this parity stripe */
+ InitHdrNode(&tempdag_h, raidPtr, rf_useMemChunks);
+ desc->dagArray[i].numDags++;
+ if (dag_h == NULL) {
+ dag_h = tempdag_h;
+ } else {
+ lastdag_h->next = tempdag_h;
+ }
+ lastdag_h = tempdag_h;
+
+ (stripeFuncs[i]) (raidPtr, asm_p, tempdag_h, bp, flags, tempdag_h->allocList);
+ }
+ desc->dagArray[i].dags = dag_h;
+ }
+ RF_ASSERT(i == desc->numStripes);
+
+ /* free memory */
+ if (asm_h->numStripes > MAXNSTRIPES)
+ RF_Free(stripeFuncs, asm_h->numStripes * sizeof(RF_VoidFuncPtr));
+ if ((numStripesBailed > 0) || (numStripeUnitsBailed > 0)) {
+ stripeNum = 0;
+ stripeUnitNum = 0;
+ if (dag_h->asmList) {
+ endASMList = dag_h->asmList;
+ while (endASMList->next)
+ endASMList = endASMList->next;
+ } else
+ endASMList = NULL;
+ /* walk through io, stripe by stripe */
+ for (i = 0, asm_p = asmap; asm_p; asm_p = asm_p->next, i++)
+ if (stripeFuncs[i] == NULL) {
+ numStripeUnits = asm_p->numStripeUnitsAccessed;
+ /* walk through stripe, stripe unit by
+ * stripe unit */
+ for (j = 0, physPtr = asm_p->physInfo; physPtr; physPtr = physPtr->next, j++) {
+ if (stripeUnitFuncs[stripeNum][j] == NULL) {
+ numBlocks = physPtr->numSector;
+ /* walk through stripe
+ * unit, block by
+ * block */
+ for (k = 0; k < numBlocks; k++)
+ if (dag_h->asmList == NULL) {
+ dag_h->asmList = asmh_b[stripeUnitNum][k];
+ endASMList = dag_h->asmList;
+ } else {
+ endASMList->next = asmh_b[stripeUnitNum][k];
+ endASMList = endASMList->next;
+ }
+ RF_Free(asmh_b[stripeUnitNum], numBlocks * sizeof(RF_AccessStripeMapHeader_t *));
+ RF_Free(blockFuncs[stripeUnitNum], numBlocks * sizeof(RF_VoidFuncPtr));
+ stripeUnitNum++;
+ }
+ if (dag_h->asmList == NULL) {
+ dag_h->asmList = asmh_u[stripeNum][j];
+ endASMList = dag_h->asmList;
+ } else {
+ endASMList->next = asmh_u[stripeNum][j];
+ endASMList = endASMList->next;
+ }
+ }
+ RF_Free(asmh_u[stripeNum], numStripeUnits * sizeof(RF_AccessStripeMapHeader_t *));
+ RF_Free(stripeUnitFuncs[stripeNum], numStripeUnits * sizeof(RF_VoidFuncPtr));
+ stripeNum++;
+ }
+ RF_ASSERT(stripeNum == numStripesBailed);
+ RF_Free(stripeUnitFuncs, asm_h->numStripes * sizeof(RF_VoidFuncPtr));
+ RF_Free(asmh_u, asm_h->numStripes * sizeof(RF_AccessStripeMapHeader_t **));
+ if (numStripeUnitsBailed > 0) {
+ RF_ASSERT(stripeUnitNum == numStripeUnitsBailed);
+ RF_Free(blockFuncs, raidPtr->Layout.numDataCol * asm_h->numStripes * sizeof(RF_VoidFuncPtr));
+ RF_Free(asmh_b, raidPtr->Layout.numDataCol * asm_h->numStripes * sizeof(RF_AccessStripeMapHeader_t **));
+ }
+ }
+ return (0);
+ }
+}
diff --git a/sys/dev/raidframe/rf_aselect.h b/sys/dev/raidframe/rf_aselect.h
new file mode 100644
index 0000000..de9cd76
--- /dev/null
+++ b/sys/dev/raidframe/rf_aselect.h
@@ -0,0 +1,43 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_aselect.h,v 1.3 1999/02/05 00:06:06 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************************************
+ *
+ * aselect.h -- header file for algorithm selection code
+ *
+ *****************************************************************************/
+
+#ifndef _RF__RF_ASELECT_H_
+#define _RF__RF_ASELECT_H_
+
+#include <dev/raidframe/rf_desc.h>
+
+int rf_SelectAlgorithm(RF_RaidAccessDesc_t * desc, RF_RaidAccessFlags_t flags);
+
+#endif /* !_RF__RF_ASELECT_H_ */
diff --git a/sys/dev/raidframe/rf_bsd.h b/sys/dev/raidframe/rf_bsd.h
new file mode 100644
index 0000000..14c10f5
--- /dev/null
+++ b/sys/dev/raidframe/rf_bsd.h
@@ -0,0 +1,152 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_netbsd.h,v 1.12 2000/05/28 22:53:49 oster Exp $ */
+
+/*-
+ * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Greg Oster; Jason R. Thorpe.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the NetBSD
+ * Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RF__RF_BSD_H_
+#define _RF__RF_BSD_H_
+
+#ifdef _KERNEL
+#include <sys/fcntl.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include "opt_raid.h"
+
+#ifdef RAID_DEBUG
+/*
+ * rf_printf(lvl, fmt, ...): emit a kernel debug message when the
+ * compile-time debug level RAID_DEBUG is >= lvl.  Uses the GCC named
+ * variadic-macro extension (args... / ##args).
+ */
+#define rf_printf(lvl, fmt, args...) \
+	do { \
+		if (lvl <= RAID_DEBUG) printf(fmt, ##args); \
+	} while(0)
+
+#else /* RAID_DEBUG */
+/*
+ * Non-debug stub.  Must be do { } while (0), not an empty block, so
+ * that "if (x) rf_printf(...); else ..." still parses correctly.
+ */
+#define rf_printf(lvl, fmt, args...) do { } while (0)
+#endif /* RAID_DEBUG */
+#endif /* _KERNEL */
+
+/* The per-component label information that the user can set */
+typedef struct RF_ComponentInfo_s {
+	/* NOTE(review): identifies a single component slot in the array;
+	 * presumably passed through the configuration ioctls -- confirm
+	 * against callers. */
+	int row;		/* the row number of this component */
+	int column;		/* the column number of this component */
+	int serial_number;	/* a user-specified serial number for this
+				   RAID set */
+} RF_ComponentInfo_t;
+
+/* The per-component label information */
+typedef struct RF_ComponentLabel_s {
+	/* NOTE(review): the fixed int fields plus the future_use padding
+	 * suggest this is a persistent, fixed-size record -- confirm how
+	 * and where it is stored before changing the layout. */
+	int version;		/* The version of this label. */
+	int serial_number;	/* a user-specified serial number for this
+				   RAID set */
+	int mod_counter;	/* modification counter.  Changed (usually
+				   by incrementing) every time the label
+				   is changed */
+	int row;		/* the row number of this component */
+	int column;		/* the column number of this component */
+	int num_rows;		/* number of rows in this RAID set */
+	int num_columns;	/* number of columns in this RAID set */
+	int clean;		/* 1 when clean, 0 when dirty */
+	int status;		/* rf_ds_optimal, rf_ds_dist_spared, whatever. */
+	/* stuff that will be in version 2 of the label */
+	int sectPerSU;		/* Sectors per Stripe Unit */
+	int SUsPerPU;		/* Stripe Units per Parity Units */
+	int SUsPerRU;		/* Stripe Units per Reconstruction Units */
+	int parityConfig;	/* '0' == RAID0, '1' == RAID1, etc. */
+	int maxOutstanding;	/* maxOutstanding disk requests */
+	int blockSize;		/* size of component block.
+				   (disklabel->d_secsize) */
+	int numBlocks;		/* number of blocks on this component.  May
+				   be smaller than the partition size. */
+	int partitionSize;	/* number of blocks on this *partition*.
+				   Must exactly match the partition size
+				   from the disklabel. */
+	int future_use[33];	/* Future expansion */
+	int autoconfigure;	/* automatically configure this RAID set.
+				   0 == no, 1 == yes */
+	int root_partition;	/* Use this set as /
+				   0 == no, 1 == yes*/
+	int last_unit;		/* last unit number (e.g. 0 for /dev/raid0)
+				   of this component.  Used for autoconfigure
+				   only. */
+	int config_order;	/* 0 .. n.  The order in which the component
+				   should be auto-configured.  E.g. 0 is will
+				   done first, (and would become raid0).
+				   This may be in conflict with last_unit!!?!
+				   Not currently used. */
+	int future_use2[44];	/* More future expansion */
+} RF_ComponentLabel_t;
+
+typedef struct RF_SingleComponent_s {
+	/* Names one component by its (row, column) position and its
+	 * device name. */
+	int row;
+	int column;
+	char component_name[50];	/* name of the component */
+} RF_SingleComponent_t;
+
+#ifdef _KERNEL
+
+/* Per-component kernel bookkeeping: the open vnode/dev_t of the
+ * component plus its RAIDframe label. */
+struct raidcinfo {
+	struct vnode *ci_vp;	/* component device's vnode */
+	dev_t ci_dev;		/* component device's dev_t */
+	RF_ComponentLabel_t ci_label; /* components RAIDframe label */
+#if 0
+	size_t ci_size;		/* size */
+	char *ci_path;		/* path to component */
+	size_t ci_pathlen;	/* length of component path */
+#endif
+};
+
+
+
+/* XXX probably belongs in a different .h file. */
+/* XXX probably belongs in a different .h file. */
+/* One discovered component in an auto-configuration scan; linked into
+ * a per-set list via `next'. */
+typedef struct RF_AutoConfig_s {
+	char devname[56];	/* the name of this component */
+	int flag;		/* a general-purpose flag */
+	dev_t dev;		/* the device for this component */
+	struct vnode *vp;	/* Mr. Vnode Pointer */
+	RF_ComponentLabel_t *clabel;	/* the label */
+	struct RF_AutoConfig_s *next;	/* the next autoconfig structure
+					   in this set. */
+} RF_AutoConfig_t;
+
+/* A candidate RAID set assembled from auto-configured components;
+ * sets themselves form a linked list via `next'. */
+typedef struct RF_ConfigSet_s {
+	struct RF_AutoConfig_s *ac;	/* all of the autoconfig structures for
+					   this config set. */
+	int rootable;		/* Set to 1 if this set can be root */
+	struct RF_ConfigSet_s *next;
+} RF_ConfigSet_t;
+
+#endif /* _KERNEL */
+#endif /* _RF__RF_BSD_H_ */
diff --git a/sys/dev/raidframe/rf_callback.c b/sys/dev/raidframe/rf_callback.c
new file mode 100644
index 0000000..4b79d8b
--- /dev/null
+++ b/sys/dev/raidframe/rf_callback.c
@@ -0,0 +1,94 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_callback.c,v 1.3 1999/02/05 00:06:06 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************************************************
+ *
+ * callback.c -- code to manipulate callback descriptor
+ *
+ ****************************************************************************************/
+
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_threadstuff.h>
+#include <dev/raidframe/rf_callback.h>
+#include <dev/raidframe/rf_debugMem.h>
+#include <dev/raidframe/rf_freelist.h>
+#include <dev/raidframe/rf_shutdown.h>
+
+static RF_FreeList_t *rf_callback_freelist;
+
+#define RF_MAX_FREE_CALLBACK 64
+#define RF_CALLBACK_INC 4
+#define RF_CALLBACK_INITIAL 4
+
+static void rf_ShutdownCallback(void *);
+
+/*
+ * Shutdown hook: tear down the callback-descriptor freelist.
+ * The argument is unused.
+ */
+static void
+rf_ShutdownCallback(void *ignored)
+{
+	RF_FREELIST_DESTROY(rf_callback_freelist, next, (RF_CallbackDesc_t *));
+}
+
+/*
+ * Create the freelist of callback descriptors, prime it with a few
+ * entries, and register a shutdown hook to destroy it.
+ *
+ * Returns 0 on success, ENOMEM if the freelist cannot be created, or
+ * the error from rf_ShutdownCreate() (in which case the freelist is
+ * torn down again before returning).
+ */
+int
+rf_ConfigureCallback(RF_ShutdownList_t **listp)
+{
+	int error;
+
+	RF_FREELIST_CREATE(rf_callback_freelist, RF_MAX_FREE_CALLBACK,
+	    RF_CALLBACK_INC, sizeof(RF_CallbackDesc_t));
+	if (rf_callback_freelist == NULL)
+		return (ENOMEM);
+
+	error = rf_ShutdownCreate(listp, rf_ShutdownCallback, NULL);
+	if (error != 0) {
+		RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
+		    __FILE__, __LINE__, error);
+		rf_ShutdownCallback(NULL);
+		return (error);
+	}
+
+	RF_FREELIST_PRIME(rf_callback_freelist, RF_CALLBACK_INITIAL, next,
+	    (RF_CallbackDesc_t *));
+	return (0);
+}
+
+/*
+ * Allocate a callback descriptor from the freelist.  The result comes
+ * straight from RF_FREELIST_GET; caller must check it before use.
+ */
+RF_CallbackDesc_t *
+rf_AllocCallbackDesc(void)
+{
+	RF_CallbackDesc_t *desc;
+
+	RF_FREELIST_GET(rf_callback_freelist, desc, next, (RF_CallbackDesc_t *));
+	return (desc);
+}
+
+/* Return a callback descriptor to the freelist. */
+void
+rf_FreeCallbackDesc(RF_CallbackDesc_t *p)
+{
+	RF_FREELIST_FREE(rf_callback_freelist, p, next);
+}
diff --git a/sys/dev/raidframe/rf_callback.h b/sys/dev/raidframe/rf_callback.h
new file mode 100644
index 0000000..feda31d
--- /dev/null
+++ b/sys/dev/raidframe/rf_callback.h
@@ -0,0 +1,65 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_callback.h,v 1.3 1999/02/05 00:06:06 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************************************************
+ *
+ * callback.h -- header file for callback.c
+ *
+ * the reconstruction code must manage concurrent I/Os on multiple drives.
+ * it sometimes needs to suspend operation on a particular drive until some
+ * condition occurs. we can't block the thread, of course, or we wouldn't
+ * be able to manage our other outstanding I/Os. Instead we just suspend
+ * new activity on the indicated disk, and create a callback descriptor and
+ * put it someplace where it will get invoked when the condition that's
+ * stalling us has cleared. When the descriptor is invoked, it will call
+ * a function that will restart operation on the indicated disk.
+ *
+ ****************************************************************************************/
+
+#ifndef _RF__RF_CALLBACK_H_
+#define _RF__RF_CALLBACK_H_
+
+#include <dev/raidframe/rf_types.h>
+
+/* A deferred-work descriptor: a function, its arguments, and the
+ * (row, col) of the disk it concerns; chained into lists via `next'. */
+struct RF_CallbackDesc_s {
+	void (*callbackFunc) (RF_CBParam_t);	/* function to call */
+	RF_CBParam_t callbackArg;	/* args to give to function, or just
+					 * info about this callback */
+	RF_CBParam_t callbackArg2;
+	RF_RowCol_t row;	/* disk row and column IDs to give to the
+				 * callback func */
+	RF_RowCol_t col;
+	RF_CallbackDesc_t *next;/* next entry in list */
+};
+
+int rf_ConfigureCallback(RF_ShutdownList_t ** listp);
+RF_CallbackDesc_t *rf_AllocCallbackDesc(void);
+void rf_FreeCallbackDesc(RF_CallbackDesc_t * p);
+
+#endif /* !_RF__RF_CALLBACK_H_ */
diff --git a/sys/dev/raidframe/rf_chaindecluster.c b/sys/dev/raidframe/rf_chaindecluster.c
new file mode 100644
index 0000000..ff12bb5
--- /dev/null
+++ b/sys/dev/raidframe/rf_chaindecluster.c
@@ -0,0 +1,290 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_chaindecluster.c,v 1.6 2001/01/26 04:27:16 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Khalil Amiri
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/******************************************************************************
+ *
+ * rf_chaindecluster.c -- implements chained declustering
+ *
+ *****************************************************************************/
+
+#include <dev/raidframe/rf_archs.h>
+
+#if (RF_INCLUDE_CHAINDECLUSTER > 0)
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_chaindecluster.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_dagutils.h>
+#include <dev/raidframe/rf_dagffrd.h>
+#include <dev/raidframe/rf_dagffwr.h>
+#include <dev/raidframe/rf_dagdegrd.h>
+#include <dev/raidframe/rf_dagfuncs.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_utils.h>
+
+/* Layout-specific state for chained declustering, built once in
+ * rf_ConfigureChainDecluster() and hung off layoutPtr->layoutSpecificInfo. */
+typedef struct RF_ChaindeclusterConfigInfo_s {
+	RF_RowCol_t **stripeIdentifier;	/* filled in at config time and used
+					 * by IdentifyStripe */
+	RF_StripeCount_t numSparingRegions;
+	RF_StripeCount_t stripeUnitsPerSparingRegion;
+	RF_SectorNum_t mirrorStripeOffset;	/* stripe-unit offset of the
+						 * mirror copies (see
+						 * rf_MapParityChainDecluster) */
+} RF_ChaindeclusterConfigInfo_t;
+
+/*
+ * Build the layout-specific configuration for a chained-declustering
+ * array: allocate the config-info structure, precompute the
+ * stripe-identifier table, and fill in the derived layout parameters.
+ * Returns 0 on success or ENOMEM on allocation failure (allocations
+ * are attached to raidPtr->cleanupList, so nothing is leaked on the
+ * error paths).
+ */
+int
+rf_ConfigureChainDecluster(
+    RF_ShutdownList_t ** listp,
+    RF_Raid_t * raidPtr,
+    RF_Config_t * cfgPtr)
+{
+	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+	RF_StripeCount_t num_used_stripeUnitsPerDisk;
+	RF_ChaindeclusterConfigInfo_t *info;
+	RF_RowCol_t i;
+
+	/* create a Chained Declustering configuration structure */
+	RF_MallocAndAdd(info, sizeof(RF_ChaindeclusterConfigInfo_t), (RF_ChaindeclusterConfigInfo_t *), raidPtr->cleanupList);
+	if (info == NULL)
+		return (ENOMEM);
+	layoutPtr->layoutSpecificInfo = (void *) info;
+
+	/* fill in the config structure. */
+	/* Each stripe whose primary copy lives on disk i also touches
+	 * disk (i + 1) mod numCol, which holds the mirror copy. */
+	info->stripeIdentifier = rf_make_2d_array(raidPtr->numCol, 2, raidPtr->cleanupList);
+	if (info->stripeIdentifier == NULL)
+		return (ENOMEM);
+	for (i = 0; i < raidPtr->numCol; i++) {
+		info->stripeIdentifier[i][0] = i % raidPtr->numCol;
+		info->stripeIdentifier[i][1] = (i + 1) % raidPtr->numCol;
+	}
+
+	/* Chained declustering only supports single-row arrays. */
+	RF_ASSERT(raidPtr->numRow == 1);
+
+	/* fill in the remaining layout parameters */
+	/* Truncate the per-disk stripe-unit count down to a multiple of
+	 * (2 * numCol - 2) so it divides into whole sparing regions. */
+	num_used_stripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk - (layoutPtr->stripeUnitsPerDisk %
+	    (2 * raidPtr->numCol - 2));
+	info->numSparingRegions = num_used_stripeUnitsPerDisk / (2 * raidPtr->numCol - 2);
+	info->stripeUnitsPerSparingRegion = raidPtr->numCol * (raidPtr->numCol - 1);
+	info->mirrorStripeOffset = info->numSparingRegions * (raidPtr->numCol - 1);
+	layoutPtr->numStripe = info->numSparingRegions * info->stripeUnitsPerSparingRegion;
+	layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
+	/* One data column and one "parity" (mirror) column per stripe. */
+	layoutPtr->numDataCol = 1;
+	layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
+	layoutPtr->numParityCol = 1;
+
+	layoutPtr->dataStripeUnitsPerDisk = num_used_stripeUnitsPerDisk;
+
+	raidPtr->sectorsPerDisk =
+	    num_used_stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit;
+
+	raidPtr->totalSectors =
+	    (layoutPtr->numStripe) * layoutPtr->sectorsPerStripeUnit;
+
+	/* Recompute stripeUnitsPerDisk from the truncated disk size. */
+	layoutPtr->stripeUnitsPerDisk = raidPtr->sectorsPerDisk / layoutPtr->sectorsPerStripeUnit;
+
+	return (0);
+}
+
+/*
+ * Number of spare reconstruction units available in this layout.
+ * The layout uses two stripe units per disk as spare within each
+ * sparing region, hence 2 * numSparingRegions.
+ */
+RF_ReconUnitCount_t
+rf_GetNumSpareRUsChainDecluster(RF_Raid_t *raidPtr)
+{
+	RF_ChaindeclusterConfigInfo_t *info;
+
+	info = (RF_ChaindeclusterConfigInfo_t *)
+	    raidPtr->Layout.layoutSpecificInfo;
+	return (info->numSparingRegions * 2);
+}
+
+
+/* Maps to the primary copy of the data, i.e. the first mirror pair */
+/*
+ * Map a logical raid sector to the disk (*row, *col) and *diskSector
+ * holding the primary copy of the data.  If `remap' is set, map to the
+ * spare space inside the stripe unit's sparing region instead.
+ * Rows are not used by this layout, so *row is always 0.
+ */
+void
+rf_MapSectorChainDecluster(
+    RF_Raid_t * raidPtr,
+    RF_RaidAddr_t raidSector,
+    RF_RowCol_t * row,
+    RF_RowCol_t * col,
+    RF_SectorNum_t * diskSector,
+    int remap)
+{
+	RF_ChaindeclusterConfigInfo_t *info = (RF_ChaindeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+	RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
+	RF_SectorNum_t index_within_region, index_within_disk;
+	RF_StripeNum_t sparing_region_id;
+	int col_before_remap;
+
+	*row = 0;
+	/* Locate the stripe unit: which sparing region it belongs to,
+	 * its index inside that region, and its depth on its disk. */
+	sparing_region_id = SUID / info->stripeUnitsPerSparingRegion;
+	index_within_region = SUID % info->stripeUnitsPerSparingRegion;
+	index_within_disk = index_within_region / raidPtr->numCol;
+	col_before_remap = SUID % raidPtr->numCol;
+
+	if (!remap) {
+		/* Primary copies round-robin across the columns. */
+		*col = col_before_remap;
+		*diskSector = (index_within_disk + ((raidPtr->numCol - 1) * sparing_region_id)) *
+		    raidPtr->Layout.sectorsPerStripeUnit;
+		*diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+	} else {
+		/* remap sector to spare space... */
+		*diskSector = sparing_region_id * (raidPtr->numCol + 1) * raidPtr->Layout.sectorsPerStripeUnit;
+		*diskSector += (raidPtr->numCol - 1) * raidPtr->Layout.sectorsPerStripeUnit;
+		*diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+		index_within_disk = index_within_region / raidPtr->numCol;
+		/* Pick a spare column that avoids the failed disk; the
+		 * numCol-2 case needs the second spare stripe unit, hence
+		 * the extra stripe-unit offset. */
+		if (index_within_disk < col_before_remap)
+			*col = index_within_disk;
+		else
+			if (index_within_disk == raidPtr->numCol - 2) {
+				*col = (col_before_remap + raidPtr->numCol - 1) % raidPtr->numCol;
+				*diskSector += raidPtr->Layout.sectorsPerStripeUnit;
+			} else
+				*col = (index_within_disk + 2) % raidPtr->numCol;
+	}
+
+}
+
+
+
+/* Maps to the second copy of the mirror pair, which is chain declustered. The second copy is contained
+ in the next disk (mod numCol) after the disk containing the primary copy.
+ The offset into the disk is one-half disk down */
+/*
+ * Map a logical raid sector to the disk (*row, *col) and *diskSector
+ * holding the mirror ("parity") copy: the next disk (mod numCol) after
+ * the primary, offset by mirrorStripeOffset stripe units.  If `remap'
+ * is set, map into the sparing region's spare space instead.
+ * Rows are not used by this layout, so *row is always 0.
+ */
+void
+rf_MapParityChainDecluster(
+    RF_Raid_t * raidPtr,
+    RF_RaidAddr_t raidSector,
+    RF_RowCol_t * row,
+    RF_RowCol_t * col,
+    RF_SectorNum_t * diskSector,
+    int remap)
+{
+	RF_ChaindeclusterConfigInfo_t *info = (RF_ChaindeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+	RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
+	RF_SectorNum_t index_within_region, index_within_disk;
+	RF_StripeNum_t sparing_region_id;
+	int col_before_remap;
+
+	*row = 0;
+	if (!remap) {
+		/* Mirror copy lives one column over, in the mirror half
+		 * of the disk (mirrorStripeOffset stripe units down). */
+		*col = SUID % raidPtr->numCol;
+		*col = (*col + 1) % raidPtr->numCol;
+		*diskSector = info->mirrorStripeOffset * raidPtr->Layout.sectorsPerStripeUnit;
+		*diskSector += (SUID / raidPtr->numCol) * raidPtr->Layout.sectorsPerStripeUnit;
+		*diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+	} else {
+		/* remap parity to spare space ... */
+		sparing_region_id = SUID / info->stripeUnitsPerSparingRegion;
+		index_within_region = SUID % info->stripeUnitsPerSparingRegion;
+		index_within_disk = index_within_region / raidPtr->numCol;
+		*diskSector = sparing_region_id * (raidPtr->numCol + 1) * raidPtr->Layout.sectorsPerStripeUnit;
+		*diskSector += (raidPtr->numCol) * raidPtr->Layout.sectorsPerStripeUnit;
+		*diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+		col_before_remap = SUID % raidPtr->numCol;
+		/* Choose a spare column avoiding the failed disk; the
+		 * numCol-2 case steps back one stripe unit (contrast the
+		 * sector-remap path, which steps forward). */
+		if (index_within_disk < col_before_remap)
+			*col = index_within_disk;
+		else
+			if (index_within_disk == raidPtr->numCol - 2) {
+				*col = (col_before_remap + 2) % raidPtr->numCol;
+				*diskSector -= raidPtr->Layout.sectorsPerStripeUnit;
+			} else
+				*col = (index_within_disk + 2) % raidPtr->numCol;
+	}
+
+}
+
+/*
+ * Report which disks are touched by the stripe containing `addr'.
+ * The table built at configuration time maps column c to the pair
+ * (c, (c + 1) mod numCol); rows are unused, so *outRow is always 0.
+ */
+void
+rf_IdentifyStripeChainDecluster(
+    RF_Raid_t * raidPtr,
+    RF_RaidAddr_t addr,
+    RF_RowCol_t ** diskids,
+    RF_RowCol_t * outRow)
+{
+	RF_ChaindeclusterConfigInfo_t *info;
+	RF_StripeNum_t stripeUnitID;
+
+	info = (RF_ChaindeclusterConfigInfo_t *)
+	    raidPtr->Layout.layoutSpecificInfo;
+	stripeUnitID = addr / raidPtr->Layout.sectorsPerStripeUnit;
+	*outRow = 0;
+	*diskids = info->stripeIdentifier[stripeUnitID % raidPtr->numCol];
+}
+
+/*
+ * Stripe IDs map one-to-one onto parity stripe IDs in this layout,
+ * and there is only a single reconstruction unit per parity stripe.
+ */
+void
+rf_MapSIDToPSIDChainDecluster(
+    RF_RaidLayout_t * layoutPtr,
+    RF_StripeNum_t stripeID,
+    RF_StripeNum_t * psID,
+    RF_ReconUnitNum_t * which_ru)
+{
+	*psID = stripeID;
+	*which_ru = 0;
+}
+/******************************************************************************
+ * select a graph to perform a single-stripe access
+ *
+ * Parameters: raidPtr - description of the physical array
+ * type - type of operation (read or write) requested
+ * asmap - logical & physical addresses for this access
+ * createFunc - function to use to create the graph (return value)
+ *****************************************************************************/
+
+/*
+ * Select the DAG creation function for a single-stripe access.
+ *
+ * raidPtr    - description of the physical array
+ * type       - type of operation (read or write) requested
+ * asmap      - logical & physical addresses for this access
+ * createFunc - (out) DAG creation function; set to NULL when more than
+ *              one disk in the mirror group has failed.
+ *
+ * Note: the original code first assigned *createFunc from a ternary
+ * expression and then unconditionally overwrote it in the if/else
+ * below; that dead store (and its never-used reference to
+ * rf_CreateFaultFreeReadDAG) has been removed.
+ */
+void
+rf_RAIDCDagSelect(
+    RF_Raid_t * raidPtr,
+    RF_IoType_t type,
+    RF_AccessStripeMap_t * asmap,
+    RF_VoidFuncPtr * createFunc)
+#if 0
+    void (**createFunc) (RF_Raid_t *, RF_AccessStripeMap_t *,
+	RF_DagHeader_t *, void *, RF_RaidAccessFlags_t,
+	RF_AllocListElem_t *)
+#endif
+{
+	RF_ASSERT(RF_IO_IS_R_OR_W(type));
+	RF_ASSERT(raidPtr->numRow == 1);
+
+	/* Losing both halves of a mirror pair is unrecoverable. */
+	if (asmap->numDataFailed + asmap->numParityFailed > 1) {
+		RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n");
+		*createFunc = NULL;
+		return;
+	}
+
+	if (type == RF_IO_TYPE_READ) {
+		if ((raidPtr->status[0] == rf_rs_degraded) ||
+		    (raidPtr->status[0] == rf_rs_reconstructing))
+			/* array status is degraded, implement workload
+			 * shifting */
+			*createFunc = (RF_VoidFuncPtr) rf_CreateRaidCDegradedReadDAG;
+		else
+			/* array status not degraded, so use mirror
+			 * partition dag */
+			*createFunc = (RF_VoidFuncPtr) rf_CreateMirrorPartitionReadDAG;
+	} else
+		*createFunc = (RF_VoidFuncPtr) rf_CreateRaidOneWriteDAG;
+}
+#endif /* (RF_INCLUDE_CHAINDECLUSTER > 0) */
diff --git a/sys/dev/raidframe/rf_chaindecluster.h b/sys/dev/raidframe/rf_chaindecluster.h
new file mode 100644
index 0000000..6030289
--- /dev/null
+++ b/sys/dev/raidframe/rf_chaindecluster.h
@@ -0,0 +1,68 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_chaindecluster.h,v 1.4 2001/01/26 04:14:14 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Khalil Amiri
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_chaindecluster.h
+ * header file for Chained Declustering
+ */
+
+
+#ifndef _RF__RF_CHAINDECLUSTER_H_
+#define _RF__RF_CHAINDECLUSTER_H_
+
+int
+rf_ConfigureChainDecluster(RF_ShutdownList_t ** listp, RF_Raid_t * raidPtr,
+ RF_Config_t * cfgPtr);
+RF_ReconUnitCount_t rf_GetNumSpareRUsChainDecluster(RF_Raid_t * raidPtr);
+void
+rf_MapSectorChainDecluster(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
+void
+rf_MapParityChainDecluster(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
+void
+rf_IdentifyStripeChainDecluster(RF_Raid_t * raidPtr, RF_RaidAddr_t addr,
+ RF_RowCol_t ** diskids, RF_RowCol_t * outRow);
+void
+rf_MapSIDToPSIDChainDecluster(RF_RaidLayout_t * layoutPtr,
+ RF_StripeNum_t stripeID, RF_StripeNum_t * psID,
+ RF_ReconUnitNum_t * which_ru);
+void
+rf_RAIDCDagSelect(RF_Raid_t * raidPtr, RF_IoType_t type,
+ RF_AccessStripeMap_t * asmap,
+ RF_VoidFuncPtr *);
+#if 0
+void (**createFunc) (RF_Raid_t *,
+ RF_AccessStripeMap_t *,
+ RF_DagHeader_t *,
+ void *,
+ RF_RaidAccessFlags_t,
+ RF_AllocListElem_t *);
+#endif
+
+#endif /* !_RF__RF_CHAINDECLUSTER_H_ */
diff --git a/sys/dev/raidframe/rf_configure.h b/sys/dev/raidframe/rf_configure.h
new file mode 100644
index 0000000..c51b8a3
--- /dev/null
+++ b/sys/dev/raidframe/rf_configure.h
@@ -0,0 +1,99 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_configure.h,v 1.4 1999/03/02 03:18:49 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/********************************
+ *
+ * rf_configure.h
+ *
+ * header file for raidframe configuration in the kernel version only.
+ * configuration is invoked via ioctl rather than at boot time
+ *
+ *******************************/
+
+
+#ifndef _RF__RF_CONFIGURE_H_
+#define _RF__RF_CONFIGURE_H_
+
+#include <dev/raidframe/rf_archs.h>
+#include <dev/raidframe/rf_types.h>
+
+#include <sys/param.h>
+#include <sys/proc.h>
+
+#if defined(__NetBSD__)
+#include <sys/ioctl.h>
+#elif defined(__FreeBSD__)
+#include <sys/ioccom.h>
+#include <sys/filio.h>
+#endif
+
+/* the raidframe configuration, passed down through an ioctl.
+ * the driver can be reconfigured (with total loss of data) at any time,
+ * but it must be shut down first.
+ */
+struct RF_Config_s {
+	RF_RowCol_t numRow, numCol, numSpare;	/* number of rows, columns,
+						 * and spare disks */
+	dev_t devs[RF_MAXROW][RF_MAXCOL];	/* device numbers for disks
+						 * comprising array */
+	char devnames[RF_MAXROW][RF_MAXCOL][50];	/* device names */
+	dev_t spare_devs[RF_MAXSPARE];	/* device numbers for spare
+					 * disks */
+	char spare_names[RF_MAXSPARE][50];	/* device names */
+	RF_SectorNum_t sectPerSU;	/* sectors per stripe unit */
+	RF_StripeNum_t SUsPerPU;/* stripe units per parity unit */
+	RF_StripeNum_t SUsPerRU;/* stripe units per reconstruction unit */
+	RF_ParityConfig_t parityConfig;	/* identifies the RAID architecture to
+					 * be used */
+	RF_DiskQueueType_t diskQueueType;	/* 'f' = fifo, 'c' = cvscan,
+						 * not used in kernel */
+	char maxOutstandingDiskReqs;	/* # concurrent reqs to be sent to a
+					 * disk.  not used in kernel. */
+	char debugVars[RF_MAXDBGV][50];	/* space for specifying debug
+					 * variables & their values */
+	unsigned int layoutSpecificSize;	/* size in bytes of
+						 * layout-specific info */
+	void *layoutSpecific;	/* a pointer to a layout-specific structure to
+				 * be copied in */
+	int force;		/* if !0, ignore many fatal
+				   configuration conditions */
+	/*
+	   "force" is used to override cases where the component labels would
+	   indicate that configuration should not proceed without user
+	   intervention
+	 */
+};
+#ifndef _KERNEL
+int rf_MakeConfig(char *configname, RF_Config_t * cfgPtr);
+int rf_MakeLayoutSpecificNULL(FILE * fp, RF_Config_t * cfgPtr, void *arg);
+int rf_MakeLayoutSpecificDeclustered(FILE * configfp, RF_Config_t * cfgPtr, void *arg);
+void *rf_ReadSpareTable(RF_SparetWait_t * req, char *fname);
+#endif /* !_KERNEL */
+
+#endif /* !_RF__RF_CONFIGURE_H_ */
diff --git a/sys/dev/raidframe/rf_copyback.c b/sys/dev/raidframe/rf_copyback.c
new file mode 100644
index 0000000..0e9a84d
--- /dev/null
+++ b/sys/dev/raidframe/rf_copyback.c
@@ -0,0 +1,431 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_copyback.c,v 1.15 2001/01/26 02:16:24 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************************************************
+ *
+ * copyback.c -- code to copy reconstructed data back from spare space to
+ * the replaced disk.
+ *
+ * the code operates using callbacks on the I/Os to continue with the next
+ * unit to be copied back. We do this because a simple loop containing blocking I/Os
+ * will not work in the simulator.
+ *
+ ****************************************************************************************/
+
+#include <dev/raidframe/rf_types.h>
+
+#if defined(__FreeBSD__)
+#include <sys/types.h>
+#include <sys/systm.h>
+#if __FreeBSD_version > 500005
+#include <sys/bio.h>
+#endif
+#endif
+
+#include <sys/time.h>
+#include <sys/buf.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_mcpair.h>
+#include <dev/raidframe/rf_acctrace.h>
+#include <dev/raidframe/rf_etimer.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_utils.h>
+#include <dev/raidframe/rf_copyback.h>
+#include <dev/raidframe/rf_decluster.h>
+#include <dev/raidframe/rf_driver.h>
+#include <dev/raidframe/rf_shutdown.h>
+#include <dev/raidframe/rf_kintf.h>
+
+#define RF_COPYBACK_DATA 0
+#define RF_COPYBACK_PARITY 1
+
+int rf_copyback_in_progress;
+
+static int rf_CopybackReadDoneProc(RF_CopybackDesc_t * desc, int status);
+static int rf_CopybackWriteDoneProc(RF_CopybackDesc_t * desc, int status);
+static void rf_CopybackOne(RF_CopybackDesc_t * desc, int typ,
+ RF_RaidAddr_t addr, RF_RowCol_t testRow,
+ RF_RowCol_t testCol,
+ RF_SectorNum_t testOffs);
+static void rf_CopybackComplete(RF_CopybackDesc_t * desc, int status);
+
+/*
+ * One-time module initialization for the copyback code: clear the
+ * "copyback in progress" flag.  The shutdown list is accepted for
+ * interface symmetry with the other rf_Configure* routines but no
+ * shutdown action is registered.  Always returns 0 (success).
+ */
+int
+rf_ConfigureCopyback(listp)
+ RF_ShutdownList_t **listp;
+{
+ rf_copyback_in_progress = 0;
+ return (0);
+}
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#if defined(__NetBSD__)
+#include <sys/ioctl.h>
+#elif defined(__FreeBSD__)
+#include <sys/ioccom.h>
+#include <sys/filio.h>
+#endif
+#include <sys/fcntl.h>
+#include <sys/vnode.h>
+
+/* do a complete copyback */
+/*
+ * Locate the (first) disk whose contents currently live in spare space,
+ * re-open its replacement, quiesce the array, copy all reconstructed
+ * data back from spare space (driven synchronously by
+ * rf_ContinueCopyback()), and finally rewrite the component label on
+ * the restored disk.
+ *
+ * Fixes vs. the original: the component-label buffer was leaked on
+ * every early-return error path, and the data-buffer/descriptor
+ * allocations were never checked; all allocations are now checked and
+ * released on failure.
+ */
+void
+rf_CopybackReconstructedData(raidPtr)
+ RF_Raid_t *raidPtr;
+{
+ RF_ComponentLabel_t *c_label;
+ int done, retcode;
+ RF_CopybackDesc_t *desc;
+ RF_RowCol_t frow, fcol;
+ RF_RaidDisk_t *badDisk;
+ struct vnode *vp;
+ char *databuf;
+ int ac;
+
+ RF_Malloc(c_label, sizeof(RF_ComponentLabel_t), (RF_ComponentLabel_t *));
+ if (c_label == NULL) {
+ printf("rf_CopybackReconstructedData: Out of memory?\n");
+ return;
+ }
+
+ /* scan the array for a disk that has been spared; on success
+ * frow/fcol identify it */
+ done = 0;
+ fcol = 0;
+ for (frow = 0; frow < raidPtr->numRow; frow++) {
+ for (fcol = 0; fcol < raidPtr->numCol; fcol++) {
+ if (raidPtr->Disks[frow][fcol].status == rf_ds_dist_spared
+ || raidPtr->Disks[frow][fcol].status == rf_ds_spared) {
+ done = 1;
+ break;
+ }
+ }
+ if (done)
+ break;
+ }
+
+ if (frow == raidPtr->numRow) {
+ printf("COPYBACK: no disks need copyback\n");
+ RF_Free(c_label, sizeof(RF_ComponentLabel_t));
+ return;
+ }
+ badDisk = &raidPtr->Disks[frow][fcol];
+
+ /* This device may have been opened successfully the first time. Close
+ * it before trying to open it again.. */
+
+ if (raidPtr->raid_cinfo[frow][fcol].ci_vp != NULL) {
+ printf("Closed the open device: %s\n",
+ raidPtr->Disks[frow][fcol].devname);
+ vp = raidPtr->raid_cinfo[frow][fcol].ci_vp;
+ ac = raidPtr->Disks[frow][fcol].auto_configured;
+ rf_close_component(raidPtr, vp, ac);
+ raidPtr->raid_cinfo[frow][fcol].ci_vp = NULL;
+
+ }
+ /* note that this disk was *not* auto_configured (any longer) */
+ raidPtr->Disks[frow][fcol].auto_configured = 0;
+
+ printf("About to (re-)open the device: %s\n",
+ raidPtr->Disks[frow][fcol].devname);
+
+ retcode = raid_getcomponentsize(raidPtr, frow, fcol);
+
+ if (retcode) {
+ printf("COPYBACK: raidlookup on device: %s failed: %d!\n",
+ raidPtr->Disks[frow][fcol].devname, retcode);
+
+ /* XXX the component isn't responding properly... must be
+ * still dead :-( */
+ RF_Free(c_label, sizeof(RF_ComponentLabel_t));
+ return;
+
+ }
+#if 0
+ /* This is the way it was done before the CAM stuff was removed */
+
+ if (rf_extract_ids(badDisk->devname, &bus, &targ, &lun)) {
+ printf("COPYBACK: unable to extract bus, target, lun from devname %s\n",
+ badDisk->devname);
+ return;
+ }
+ /* TUR the disk that's marked as bad to be sure that it's actually
+ * alive */
+ rf_SCSI_AllocTUR(&tur_op);
+ retcode = rf_SCSI_DoTUR(tur_op, bus, targ, lun, badDisk->dev);
+ rf_SCSI_FreeDiskOp(tur_op, 0);
+#endif
+
+ /* NOTE: retcode is known to be zero here -- this test is vestigial
+ * from the removed CAM TUR code above, kept for fidelity */
+ if (retcode) {
+ printf("COPYBACK: target disk failed TUR\n");
+ RF_Free(c_label, sizeof(RF_ComponentLabel_t));
+ return;
+ }
+ /* get a buffer to hold one SU */
+ RF_Malloc(databuf, rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit), (char *));
+ if (databuf == NULL) {
+ printf("rf_CopybackReconstructedData: Out of memory?\n");
+ RF_Free(c_label, sizeof(RF_ComponentLabel_t));
+ return;
+ }
+
+ /* create a descriptor */
+ RF_Malloc(desc, sizeof(*desc), (RF_CopybackDesc_t *));
+ if (desc == NULL) {
+ printf("rf_CopybackReconstructedData: Out of memory?\n");
+ RF_Free(databuf, rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit));
+ RF_Free(c_label, sizeof(RF_ComponentLabel_t));
+ return;
+ }
+ desc->raidPtr = raidPtr;
+ desc->status = 0;
+ desc->frow = frow;
+ desc->fcol = fcol;
+ desc->spRow = badDisk->spareRow;
+ desc->spCol = badDisk->spareCol;
+ desc->stripeAddr = 0;
+ desc->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
+ desc->sectPerStripe = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.numDataCol;
+ desc->databuf = databuf;
+ desc->mcpair = rf_AllocMCPair();
+
+ printf("COPYBACK: Quiescing the array\n");
+ /* quiesce the array, since we don't want to code support for user
+ * accs here */
+ rf_SuspendNewRequestsAndWait(raidPtr);
+
+ /* adjust state of the array and of the disks */
+ RF_LOCK_MUTEX(raidPtr->mutex);
+ raidPtr->Disks[desc->frow][desc->fcol].status = rf_ds_optimal;
+ raidPtr->status[desc->frow] = rf_rs_optimal;
+ rf_copyback_in_progress = 1; /* debug only */
+ RF_UNLOCK_MUTEX(raidPtr->mutex);
+
+ printf("COPYBACK: Beginning\n");
+ RF_GETTIME(desc->starttime);
+ /* rf_ContinueCopyback() blocks on each read/write pair and frees
+ * desc/databuf via rf_CopybackComplete() before returning */
+ rf_ContinueCopyback(desc);
+
+ /* Data has been restored. Fix up the component label. */
+ /* Don't actually need the read here.. */
+ raidread_component_label( raidPtr->raid_cinfo[frow][fcol].ci_dev,
+ raidPtr->raid_cinfo[frow][fcol].ci_vp,
+ c_label);
+
+ raid_init_component_label( raidPtr, c_label );
+
+ c_label->row = frow;
+ c_label->column = fcol;
+ c_label->partitionSize = raidPtr->Disks[frow][fcol].partitionSize;
+
+ raidwrite_component_label( raidPtr->raid_cinfo[frow][fcol].ci_dev,
+ raidPtr->raid_cinfo[frow][fcol].ci_vp,
+ c_label);
+ RF_Free(c_label, sizeof(RF_ComponentLabel_t));
+}
+
+
+/*
+ * invoked via callback after a copyback I/O has completed to
+ * continue on with the next one
+ */
+/*
+ * Main copyback loop: one iteration per stripe.  For each stripe, map
+ * every stripe unit (and the parity unit) and copy back the one that
+ * lives on the failed (frow,fcol) position.  In the kernel each
+ * rf_CopybackOne() call blocks until its read/write pair completes, so
+ * this loop runs to completion before returning.  Terminates via
+ * rf_CopybackComplete() on either completion or I/O error.
+ */
+void
+rf_ContinueCopyback(desc)
+ RF_CopybackDesc_t *desc;
+{
+ RF_SectorNum_t testOffs, stripeAddr;
+ RF_Raid_t *raidPtr = desc->raidPtr;
+ RF_RaidAddr_t addr;
+ RF_RowCol_t testRow, testCol;
+ int old_pctg, new_pctg, done;
+ struct timeval t, diff;
+
+ old_pctg = (-1);
+ while (1) {
+ /* stripeAddr holds the address of the stripe being processed
+ * this iteration; desc->stripeAddr advances to the next one */
+ stripeAddr = desc->stripeAddr;
+ desc->raidPtr->copyback_stripes_done = stripeAddr
+ / desc->sectPerStripe;
+ if (rf_prReconSched) {
+ old_pctg = 100 * desc->stripeAddr / raidPtr->totalSectors;
+ }
+ desc->stripeAddr += desc->sectPerStripe;
+ if (rf_prReconSched) {
+ /* print elapsed time at each whole-percent boundary */
+ new_pctg = 100 * desc->stripeAddr / raidPtr->totalSectors;
+ if (new_pctg != old_pctg) {
+ RF_GETTIME(t);
+ RF_TIMEVAL_DIFF(&desc->starttime, &t, &diff);
+ printf("%d %d.%06d\n", new_pctg, (int) diff.tv_sec, (int) diff.tv_usec);
+ }
+ }
+ /* note: tests the pre-increment address -- done when the
+ * previous iteration consumed the last stripe */
+ if (stripeAddr >= raidPtr->totalSectors) {
+ rf_CopybackComplete(desc, 0);
+ return;
+ }
+ /* walk through the current stripe, su-by-su */
+ for (done = 0, addr = stripeAddr; addr < stripeAddr + desc->sectPerStripe; addr += desc->sectPerSU) {
+
+ /* map the SU, disallowing remap to spare space */
+ (raidPtr->Layout.map->MapSector) (raidPtr, addr, &testRow, &testCol, &testOffs, RF_DONT_REMAP);
+
+ if (testRow == desc->frow && testCol == desc->fcol) {
+ rf_CopybackOne(desc, RF_COPYBACK_DATA, addr, testRow, testCol, testOffs);
+ done = 1;
+ break;
+ }
+ }
+
+ if (!done) {
+ /* we didn't find the failed disk in the data part.
+ * check parity. */
+
+ /* map the parity for this stripe, disallowing remap
+ * to spare space */
+ (raidPtr->Layout.map->MapParity) (raidPtr, stripeAddr, &testRow, &testCol, &testOffs, RF_DONT_REMAP);
+
+ if (testRow == desc->frow && testCol == desc->fcol) {
+ rf_CopybackOne(desc, RF_COPYBACK_PARITY, stripeAddr, testRow, testCol, testOffs);
+ }
+ }
+ /* check to see if the last read/write pair failed */
+ if (desc->status) {
+ rf_CopybackComplete(desc, 1);
+ return;
+ }
+ /* we didn't find any units to copy back in this stripe.
+ * Continue with the next one */
+ }
+}
+
+
+/* copyback one unit */
+/*
+ * Copy a single stripe unit back from spare space: map the unit's
+ * spare location, issue a read of the spare copy, and (via the read
+ * completion callback) a write to the original location.  Blocks on
+ * the descriptor's mcpair until the read/write pair completes; any
+ * failure is recorded in desc->status by the write-done callback.
+ */
+static void
+rf_CopybackOne(desc, typ, addr, testRow, testCol, testOffs)
+ RF_CopybackDesc_t *desc;
+ int typ;
+ RF_RaidAddr_t addr;
+ RF_RowCol_t testRow;
+ RF_RowCol_t testCol;
+ RF_SectorNum_t testOffs;
+{
+ RF_SectorCount_t sectPerSU = desc->sectPerSU;
+ RF_Raid_t *raidPtr = desc->raidPtr;
+ RF_RowCol_t spRow = desc->spRow;
+ RF_RowCol_t spCol = desc->spCol;
+ RF_SectorNum_t spOffs;
+
+ /* find the spare spare location for this SU */
+ if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
+ /* distributed sparing: remap through the layout to find where
+ * the reconstructed unit was placed */
+ if (typ == RF_COPYBACK_DATA)
+ raidPtr->Layout.map->MapSector(raidPtr, addr, &spRow, &spCol, &spOffs, RF_REMAP);
+ else
+ raidPtr->Layout.map->MapParity(raidPtr, addr, &spRow, &spCol, &spOffs, RF_REMAP);
+ } else {
+ /* dedicated spare: same offset, on the spare disk
+ * (spRow/spCol already set from the descriptor) */
+ spOffs = testOffs;
+ }
+
+ /* create reqs to read the old location & write the new */
+ desc->readreq = rf_CreateDiskQueueData(RF_IO_TYPE_READ, spOffs,
+ sectPerSU, desc->databuf, 0L, 0,
+ (int (*) (void *, int)) rf_CopybackReadDoneProc, desc,
+ NULL, NULL, (void *) raidPtr, RF_DISKQUEUE_DATA_FLAGS_NONE, NULL);
+ desc->writereq = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, testOffs,
+ sectPerSU, desc->databuf, 0L, 0,
+ (int (*) (void *, int)) rf_CopybackWriteDoneProc, desc,
+ NULL, NULL, (void *) raidPtr, RF_DISKQUEUE_DATA_FLAGS_NONE, NULL);
+ /* testRow/testCol equal desc->frow/fcol by construction (the caller
+ * only invokes us for units mapping to the failed position) */
+ desc->frow = testRow;
+ desc->fcol = testCol;
+
+ /* enqueue the read. the write will go out as part of the callback on
+ * the read. at user-level & in the kernel, wait for the read-write
+ * pair to complete. in the simulator, just return, since everything
+ * will happen as callbacks */
+
+ RF_LOCK_MUTEX(desc->mcpair->mutex);
+ desc->mcpair->flag = 0;
+
+ rf_DiskIOEnqueue(&raidPtr->Queues[spRow][spCol], desc->readreq, RF_IO_NORMAL_PRIORITY);
+
+ /* sleep until rf_CopybackWriteDoneProc() (or the error path in the
+ * read callback) sets the flag and wakes us */
+ while (!desc->mcpair->flag) {
+ RF_WAIT_MCPAIR(desc->mcpair);
+ }
+ RF_UNLOCK_MUTEX(desc->mcpair->mutex);
+ rf_FreeDiskQueueData(desc->readreq);
+ rf_FreeDiskQueueData(desc->writereq);
+
+}
+
+
+/* called at interrupt context when the read has completed. just send out the write */
+/*
+ * On read failure, short-circuit by invoking the write request's
+ * completion function directly with the sentinel status -100 (which
+ * rf_CopybackWriteDoneProc recognizes to suppress a duplicate error
+ * message); otherwise enqueue the paired write.  Always returns 0.
+ */
+static int
+rf_CopybackReadDoneProc(desc, status)
+ RF_CopybackDesc_t *desc;
+ int status;
+{
+ if (status) { /* invoke the callback with bad status */
+ printf("COPYBACK: copyback read failed. Aborting.\n");
+ (desc->writereq->CompleteFunc) (desc, -100);
+ } else {
+ rf_DiskIOEnqueue(&(desc->raidPtr->Queues[desc->frow][desc->fcol]), desc->writereq, RF_IO_NORMAL_PRIORITY);
+ }
+ return (0);
+}
+/* called at interrupt context when the write has completed.
+ * at user level & in the kernel, wake up the copyback thread.
+ * in the simulator, invoke the next copyback directly.
+ * can't free diskqueuedata structs in the kernel b/c we're at interrupt context.
+ */
+/*
+ * Records the final status of the read/write pair in desc->status
+ * (status -100 is the read-failure sentinel from
+ * rf_CopybackReadDoneProc, whose message was already printed) and
+ * wakes the thread blocked in rf_CopybackOne().  Always returns 0.
+ */
+static int
+rf_CopybackWriteDoneProc(desc, status)
+ RF_CopybackDesc_t *desc;
+ int status;
+{
+ if (status && status != -100) {
+ printf("COPYBACK: copyback write failed. Aborting.\n");
+ }
+ desc->status = status;
+ rf_MCPairWakeupFunc(desc->mcpair);
+ return (0);
+}
+/* invoked when the copyback has completed */
+/*
+ * Tear down the copyback: on success (status == 0), either release the
+ * distributed spare table or mark the dedicated spare disk as spare
+ * again, and print the elapsed time.  In all cases free the data
+ * buffer, the mcpair, and the descriptor, clear the in-progress flag,
+ * and resume normal array requests (paired with the
+ * rf_SuspendNewRequestsAndWait() in rf_CopybackReconstructedData()).
+ */
+static void
+rf_CopybackComplete(desc, status)
+ RF_CopybackDesc_t *desc;
+ int status;
+{
+ RF_Raid_t *raidPtr = desc->raidPtr;
+ struct timeval t, diff;
+
+ if (!status) {
+ RF_LOCK_MUTEX(raidPtr->mutex);
+ if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
+ /* distributed sparing is only used by the 'D'
+ * (declustered) parity configuration */
+ RF_ASSERT(raidPtr->Layout.map->parityConfig == 'D');
+ rf_FreeSpareTable(raidPtr);
+ } else {
+ raidPtr->Disks[desc->spRow][desc->spCol].status = rf_ds_spare;
+ }
+ RF_UNLOCK_MUTEX(raidPtr->mutex);
+
+ RF_GETTIME(t);
+ RF_TIMEVAL_DIFF(&desc->starttime, &t, &diff);
+ printf("Copyback time was %d.%06d seconds\n",
+ (int) diff.tv_sec, (int) diff.tv_usec);
+ } else
+ printf("COPYBACK: Failure.\n");
+
+ RF_Free(desc->databuf, rf_RaidAddressToByte(raidPtr, desc->sectPerSU));
+ rf_FreeMCPair(desc->mcpair);
+ RF_Free(desc, sizeof(*desc));
+
+ rf_copyback_in_progress = 0;
+ rf_ResumeNewRequests(raidPtr);
+}
diff --git a/sys/dev/raidframe/rf_copyback.h b/sys/dev/raidframe/rf_copyback.h
new file mode 100644
index 0000000..67da842
--- /dev/null
+++ b/sys/dev/raidframe/rf_copyback.h
@@ -0,0 +1,61 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_copyback.h,v 1.3 1999/02/05 00:06:06 oster Exp $ */
+/*
+ * rf_copyback.h
+ */
+/*
+ * Copyright (c) 1996 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#ifndef _RF__RF_COPYBACK_H_
+#define _RF__RF_COPYBACK_H_
+
+#include <dev/raidframe/rf_types.h>
+
+/* state for one copyback operation (one spared disk being restored) */
+typedef struct RF_CopybackDesc_s {
+ RF_Raid_t *raidPtr; /* the array being copied back */
+ RF_RowCol_t frow; /* row of the disk being restored */
+ RF_RowCol_t fcol; /* column of the disk being restored */
+ RF_RowCol_t spRow; /* row of the (dedicated) spare */
+ RF_RowCol_t spCol; /* column of the (dedicated) spare */
+ int status; /* status of last read/write pair; nonzero aborts */
+ RF_StripeNum_t stripeAddr; /* address of next stripe to process */
+ RF_SectorCount_t sectPerSU; /* sectors per stripe unit */
+ RF_SectorCount_t sectPerStripe; /* sectors per full stripe */
+ char *databuf; /* one-SU bounce buffer */
+ RF_DiskQueueData_t *readreq; /* read of the spare copy */
+ RF_DiskQueueData_t *writereq; /* write back to the original disk */
+ struct timeval starttime; /* for progress/elapsed-time reporting */
+ RF_MCPair_t *mcpair; /* mutex/cond pair to wait on each I/O pair */
+} RF_CopybackDesc_t;
+
+extern int rf_copyback_in_progress;
+
+int rf_ConfigureCopyback(RF_ShutdownList_t ** listp);
+void rf_CopybackReconstructedData(RF_Raid_t * raidPtr);
+void rf_ContinueCopyback(RF_CopybackDesc_t * desc);
+
+#endif /* !_RF__RF_COPYBACK_H_ */
diff --git a/sys/dev/raidframe/rf_cvscan.c b/sys/dev/raidframe/rf_cvscan.c
new file mode 100644
index 0000000..f52f938
--- /dev/null
+++ b/sys/dev/raidframe/rf_cvscan.c
@@ -0,0 +1,439 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_cvscan.c,v 1.5 1999/08/13 03:41:53 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*******************************************************************************
+ *
+ * cvscan.c -- prioritized cvscan disk queueing code.
+ *
+ * Nov 9, 1994, adapted from raidSim version (MCH)
+ *
+ ******************************************************************************/
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_alloclist.h>
+#include <dev/raidframe/rf_stripelocks.h>
+#include <dev/raidframe/rf_layout.h>
+#include <dev/raidframe/rf_diskqueue.h>
+#include <dev/raidframe/rf_cvscan.h>
+#include <dev/raidframe/rf_debugMem.h>
+#include <dev/raidframe/rf_general.h>
+
+#define DO_CHECK_STATE(_hdr_) CheckCvscanState((_hdr_), __FILE__, __LINE__)
+
+#define pri_ok(p) ( ((p) == RF_IO_NORMAL_PRIORITY) || ((p) == RF_IO_LOW_PRIORITY))
+
+/*
+ * Debug-only invariant check for a cvscan queue header (invoked via the
+ * DO_CHECK_STATE macro).  Verifies: the left list is sorted descending
+ * and entirely below cur_block, the right list is sorted ascending at
+ * or above cur_block, both lists match their cached counts and hold
+ * only nxt_priority requests, and the back-burner list is sorted by
+ * descending priority strictly below nxt_priority.  The file/line
+ * arguments are currently unused (asserts report their own location).
+ */
+static void
+CheckCvscanState(RF_CvscanHeader_t * hdr, char *file, int line)
+{
+ long i, key;
+ RF_DiskQueueData_t *tmp;
+
+ if (hdr->left != (RF_DiskQueueData_t *) NULL)
+ RF_ASSERT(hdr->left->sectorOffset < hdr->cur_block);
+ for (key = hdr->cur_block, i = 0, tmp = hdr->left;
+ tmp != (RF_DiskQueueData_t *) NULL;
+ key = tmp->sectorOffset, i++, tmp = tmp->next)
+ RF_ASSERT(tmp->sectorOffset <= key
+ && tmp->priority == hdr->nxt_priority && pri_ok(tmp->priority));
+ RF_ASSERT(i == hdr->left_cnt);
+
+ for (key = hdr->cur_block, i = 0, tmp = hdr->right;
+ tmp != (RF_DiskQueueData_t *) NULL;
+ key = tmp->sectorOffset, i++, tmp = tmp->next) {
+ RF_ASSERT(key <= tmp->sectorOffset);
+ RF_ASSERT(tmp->priority == hdr->nxt_priority);
+ RF_ASSERT(pri_ok(tmp->priority));
+ }
+ RF_ASSERT(i == hdr->right_cnt);
+
+ for (key = hdr->nxt_priority - 1, tmp = hdr->burner;
+ tmp != (RF_DiskQueueData_t *) NULL;
+ key = tmp->priority, tmp = tmp->next) {
+ RF_ASSERT(tmp);
+ RF_ASSERT(hdr);
+ RF_ASSERT(pri_ok(tmp->priority));
+ RF_ASSERT(key >= tmp->priority);
+ RF_ASSERT(tmp->priority < hdr->nxt_priority);
+ }
+}
+
+
+
+/*
+ * Insert req into the singly-linked list rooted at *list_ptr, keeping
+ * the list sorted by descending priority (req goes after all entries
+ * of strictly higher priority).  Used for the back-burner list.
+ */
+static void
+PriorityInsert(RF_DiskQueueData_t ** list_ptr, RF_DiskQueueData_t * req)
+{
+ /* * insert block pointed to by req in to list whose first * entry is
+ * pointed to by the pointer that list_ptr points to * ie., list_ptr
+ * is a grandparent of the first entry */
+
+ for (; (*list_ptr) != (RF_DiskQueueData_t *) NULL &&
+ (*list_ptr)->priority > req->priority;
+ list_ptr = &((*list_ptr)->next)) {
+ }
+ req->next = (*list_ptr);
+ (*list_ptr) = req;
+}
+
+
+
+/*
+ * Insert req into the list rooted at *list_ptr in arm-scan order:
+ * ascending sectorOffset for the RIGHT list, descending for the LEFT
+ * list (both lists are therefore ordered nearest-first relative to the
+ * current arm position).
+ */
+static void
+ReqInsert(RF_DiskQueueData_t ** list_ptr, RF_DiskQueueData_t * req, RF_CvscanArmDir_t order)
+{
+ /* * insert block pointed to by req in to list whose first * entry is
+ * pointed to by the pointer that list_ptr points to * ie., list_ptr
+ * is a grandparent of the first entry */
+
+ for (; (*list_ptr) != (RF_DiskQueueData_t *) NULL &&
+
+ ((order == rf_cvscan_RIGHT && (*list_ptr)->sectorOffset <= req->sectorOffset)
+ || (order == rf_cvscan_LEFT && (*list_ptr)->sectorOffset > req->sectorOffset));
+ list_ptr = &((*list_ptr)->next)) {
+ }
+ req->next = (*list_ptr);
+ (*list_ptr) = req;
+}
+
+
+
+/*
+ * Pop and return the head of the list rooted at *list_ptr, or NULL if
+ * the list is empty.
+ */
+static RF_DiskQueueData_t *
+ReqDequeue(RF_DiskQueueData_t ** list_ptr)
+{
+ RF_DiskQueueData_t *ret = (*list_ptr);
+ if ((*list_ptr) != (RF_DiskQueueData_t *) NULL) {
+ (*list_ptr) = (*list_ptr)->next;
+ }
+ return (ret);
+}
+
+
+
+/*
+ * After cur_block moves, migrate any right-list requests that now fall
+ * below the arm position over to the left list, keeping the cached
+ * left/right counts in sync.
+ */
+static void
+ReBalance(RF_CvscanHeader_t * hdr)
+{
+ /* DO_CHECK_STATE(hdr); */
+ while (hdr->right != (RF_DiskQueueData_t *) NULL
+ && hdr->right->sectorOffset < hdr->cur_block) {
+ hdr->right_cnt--;
+ hdr->left_cnt++;
+ ReqInsert(&hdr->left, ReqDequeue(&hdr->right), rf_cvscan_LEFT);
+ }
+ /* DO_CHECK_STATE(hdr); */
+}
+
+
+
+/*
+ * Move every request from *from_list_ptr into *to_list_ptr via
+ * priority-ordered insertion, leaving the source list empty.  Used to
+ * dump the active left/right lists onto the back burner when a
+ * higher-priority request arrives.
+ */
+static void
+Transfer(RF_DiskQueueData_t ** to_list_ptr, RF_DiskQueueData_t ** from_list_ptr)
+{
+ RF_DiskQueueData_t *gp;
+ for (gp = (*from_list_ptr); gp != (RF_DiskQueueData_t *) NULL;) {
+ RF_DiskQueueData_t *p = gp->next; /* save next before relinking gp */
+ PriorityInsert(to_list_ptr, gp);
+ gp = p;
+ }
+ (*from_list_ptr) = (RF_DiskQueueData_t *) NULL;
+}
+
+
+
+/*
+ * Core enqueue: place req either on the back burner (if its priority is
+ * below the currently-serviced priority) or on the left/right list
+ * according to its position relative to the arm (cur_block).  If req
+ * outranks the current priority level, all active requests are demoted
+ * to the back burner first and req's priority becomes the new level.
+ */
+static void
+RealEnqueue(RF_CvscanHeader_t * hdr, RF_DiskQueueData_t * req)
+{
+ RF_ASSERT(req->priority == RF_IO_NORMAL_PRIORITY || req->priority == RF_IO_LOW_PRIORITY);
+
+ DO_CHECK_STATE(hdr);
+ if (hdr->left_cnt == 0 && hdr->right_cnt == 0) {
+ /* queue empty: adopt the incoming request's priority level */
+ hdr->nxt_priority = req->priority;
+ }
+ if (req->priority > hdr->nxt_priority) {
+ /*
+ ** dump all other outstanding requests on the back burner
+ */
+ Transfer(&hdr->burner, &hdr->left);
+ Transfer(&hdr->burner, &hdr->right);
+ hdr->left_cnt = 0;
+ hdr->right_cnt = 0;
+ hdr->nxt_priority = req->priority;
+ }
+ if (req->priority < hdr->nxt_priority) {
+ /*
+ ** yet another low priority task!
+ */
+ PriorityInsert(&hdr->burner, req);
+ } else {
+ if (req->sectorOffset < hdr->cur_block) {
+ /* this request is to the left of the current arms */
+ ReqInsert(&hdr->left, req, rf_cvscan_LEFT);
+ hdr->left_cnt++;
+ } else {
+ /* this request is to the right of the current arms */
+ ReqInsert(&hdr->right, req, rf_cvscan_RIGHT);
+ hdr->right_cnt++;
+ }
+ }
+ DO_CHECK_STATE(hdr);
+}
+
+
+
+/*
+ * Public enqueue entry point for the cvscan policy.  The priority
+ * argument is unused: the request carries its own priority, which
+ * RealEnqueue() consults.
+ */
+void
+rf_CvscanEnqueue(void *q_in, RF_DiskQueueData_t * elem, int priority)
+{
+ RF_CvscanHeader_t *hdr = (RF_CvscanHeader_t *) q_in;
+ RealEnqueue(hdr, elem /* req */ );
+}
+
+
+
+/*
+ * Dequeue the next request per the CVSCAN(N, r) policy: average the
+ * distance from cur_block to the nearest `range' requests on each
+ * side (adding change_penalty to the side we'd have to turn toward),
+ * service the cheaper side's nearest request, advance cur_block past
+ * it, and rebalance.  When the active lists drain, the
+ * highest-priority batch on the back burner is re-enqueued so the
+ * next dequeue can serve it.  Returns NULL if the queue is empty.
+ */
+RF_DiskQueueData_t *
+rf_CvscanDequeue(void *q_in)
+{
+ RF_CvscanHeader_t *hdr = (RF_CvscanHeader_t *) q_in;
+ long range, i, sum_dist_left, sum_dist_right;
+ RF_DiskQueueData_t *ret;
+ RF_DiskQueueData_t *tmp;
+
+ DO_CHECK_STATE(hdr);
+
+ if (hdr->left_cnt == 0 && hdr->right_cnt == 0)
+ return ((RF_DiskQueueData_t *) NULL);
+
+ /* N' = min(n1, n2, N); a side currently moving away pays the
+ * direction-change penalty up front */
+ range = RF_MIN(hdr->range_for_avg, RF_MIN(hdr->left_cnt, hdr->right_cnt));
+ for (i = 0, tmp = hdr->left, sum_dist_left =
+ ((hdr->direction == rf_cvscan_RIGHT) ? range * hdr->change_penalty : 0);
+ tmp != (RF_DiskQueueData_t *) NULL && i < range;
+ tmp = tmp->next, i++) {
+ sum_dist_left += hdr->cur_block - tmp->sectorOffset;
+ }
+ for (i = 0, tmp = hdr->right, sum_dist_right =
+ ((hdr->direction == rf_cvscan_LEFT) ? range * hdr->change_penalty : 0);
+ tmp != (RF_DiskQueueData_t *) NULL && i < range;
+ tmp = tmp->next, i++) {
+ sum_dist_right += tmp->sectorOffset - hdr->cur_block;
+ }
+
+ if (hdr->right_cnt == 0 || sum_dist_left < sum_dist_right) {
+ hdr->direction = rf_cvscan_LEFT;
+ hdr->cur_block = hdr->left->sectorOffset + hdr->left->numSector;
+ hdr->left_cnt = RF_MAX(hdr->left_cnt - 1, 0);
+ tmp = hdr->left; /* NOTE: dead store -- overwritten below */
+ ret = (ReqDequeue(&hdr->left)) /*->parent*/ ;
+ } else {
+ hdr->direction = rf_cvscan_RIGHT;
+ hdr->cur_block = hdr->right->sectorOffset + hdr->right->numSector;
+ hdr->right_cnt = RF_MAX(hdr->right_cnt - 1, 0);
+ tmp = hdr->right; /* NOTE: dead store -- overwritten below */
+ ret = (ReqDequeue(&hdr->right)) /*->parent*/ ;
+ }
+ ReBalance(hdr);
+
+ if (hdr->left_cnt == 0 && hdr->right_cnt == 0
+ && hdr->burner != (RF_DiskQueueData_t *) NULL) {
+ /*
+ ** restore low priority requests for next dequeue
+ */
+ RF_DiskQueueData_t *burner = hdr->burner;
+ hdr->nxt_priority = burner->priority;
+ while (burner != (RF_DiskQueueData_t *) NULL
+ && burner->priority == hdr->nxt_priority) {
+ RF_DiskQueueData_t *next = burner->next;
+ RealEnqueue(hdr, burner);
+ burner = next;
+ }
+ hdr->burner = burner;
+ }
+ DO_CHECK_STATE(hdr);
+ return (ret);
+}
+
+
+
+/*
+ * Return (without removing) the request that rf_CvscanDequeue() would
+ * service next, using the same CVSCAN distance computation, or NULL if
+ * both active lists are empty.  Does not modify queue state.
+ */
+RF_DiskQueueData_t *
+rf_CvscanPeek(void *q_in)
+{
+ RF_CvscanHeader_t *hdr = (RF_CvscanHeader_t *) q_in;
+ long range, i, sum_dist_left, sum_dist_right;
+ RF_DiskQueueData_t *tmp, *headElement;
+
+ DO_CHECK_STATE(hdr);
+
+ if (hdr->left_cnt == 0 && hdr->right_cnt == 0)
+ headElement = NULL;
+ else {
+ /* same cost computation as rf_CvscanDequeue() */
+ range = RF_MIN(hdr->range_for_avg, RF_MIN(hdr->left_cnt, hdr->right_cnt));
+ for (i = 0, tmp = hdr->left, sum_dist_left =
+ ((hdr->direction == rf_cvscan_RIGHT) ? range * hdr->change_penalty : 0);
+ tmp != (RF_DiskQueueData_t *) NULL && i < range;
+ tmp = tmp->next, i++) {
+ sum_dist_left += hdr->cur_block - tmp->sectorOffset;
+ }
+ for (i = 0, tmp = hdr->right, sum_dist_right =
+ ((hdr->direction == rf_cvscan_LEFT) ? range * hdr->change_penalty : 0);
+ tmp != (RF_DiskQueueData_t *) NULL && i < range;
+ tmp = tmp->next, i++) {
+ sum_dist_right += tmp->sectorOffset - hdr->cur_block;
+ }
+
+ if (hdr->right_cnt == 0 || sum_dist_left < sum_dist_right)
+ headElement = hdr->left;
+ else
+ headElement = hdr->right;
+ }
+ return (headElement);
+}
+
+
+
+/*
+** CVSCAN( 1, 0 ) is Shortest Seek Time First (SSTF)
+** lowest average response time
+** CVSCAN( 1, infinity ) is SCAN
+** lowest response time standard deviation
+*/
+
+
+/*
+ * One-time module initialization for the cvscan policy.  Nothing to
+ * set up; always returns 0 (success).
+ */
+int
+rf_CvscanConfigure()
+{
+ return (0);
+}
+
+
+
+/*
+ * Allocate and initialize one cvscan queue header (per disk queue).
+ * Allocation is tied to the caller's alloc-list, so the header is
+ * released when the list is freed; listp is unused here.  The CVSCAN
+ * parameters are fixed: N (range_for_avg) = 2 and the
+ * direction-change penalty = sectPerDisk / 5.  Returns the header as
+ * an opaque pointer for the queue-policy switch table.
+ */
+void *
+rf_CvscanCreate(RF_SectorCount_t sectPerDisk,
+ RF_AllocListElem_t * clList,
+ RF_ShutdownList_t ** listp)
+{
+ RF_CvscanHeader_t *hdr;
+ long range = 2; /* Currently no mechanism to change these */
+ long penalty = sectPerDisk / 5;
+
+ RF_MallocAndAdd(hdr, sizeof(RF_CvscanHeader_t), (RF_CvscanHeader_t *), clList);
+ bzero((char *) hdr, sizeof(RF_CvscanHeader_t));
+ hdr->range_for_avg = RF_MAX(range, 1);
+ hdr->change_penalty = RF_MAX(penalty, 0);
+ hdr->direction = rf_cvscan_RIGHT;
+ hdr->cur_block = 0;
+ hdr->left_cnt = hdr->right_cnt = 0;
+ hdr->left = hdr->right = (RF_DiskQueueData_t *) NULL;
+ hdr->burner = (RF_DiskQueueData_t *) NULL;
+ DO_CHECK_STATE(hdr);
+
+ return ((void *) hdr);
+}
+
+
+/* NOTE(review): && binds tighter than ||, so this guard reads as
+ * __NetBSD__ || (__FreeBSD__ && _KERNEL); a FreeBSD *userland* build
+ * would therefore compile PrintCvscanQueue -- confirm that is intended */
+#if defined(__NetBSD__) || defined(__FreeBSD__) && defined(_KERNEL)
+/* PrintCvscanQueue is not used, so we ignore it... */
+#else
+/*
+ * Debug dump of a cvscan queue: current parameters, arm position and
+ * direction, followed by the left, right, and back-burner lists as
+ * (start, end, priority) triples.
+ */
+static void
+PrintCvscanQueue(RF_CvscanHeader_t * hdr)
+{
+ RF_DiskQueueData_t *tmp;
+
+ printf("CVSCAN(%d,%d) at %d going %s\n",
+ (int) hdr->range_for_avg,
+ (int) hdr->change_penalty,
+ (int) hdr->cur_block,
+ (hdr->direction == rf_cvscan_LEFT) ? "LEFT" : "RIGHT");
+ printf("\tLeft(%d): ", hdr->left_cnt);
+ for (tmp = hdr->left; tmp != (RF_DiskQueueData_t *) NULL; tmp = tmp->next)
+ printf("(%d,%ld,%d) ",
+ (int) tmp->sectorOffset,
+ (long) (tmp->sectorOffset + tmp->numSector),
+ tmp->priority);
+ printf("\n");
+ printf("\tRight(%d): ", hdr->right_cnt);
+ for (tmp = hdr->right; tmp != (RF_DiskQueueData_t *) NULL; tmp = tmp->next)
+ printf("(%d,%ld,%d) ",
+ (int) tmp->sectorOffset,
+ (long) (tmp->sectorOffset + tmp->numSector),
+ tmp->priority);
+ printf("\n");
+ printf("\tBurner: ");
+ for (tmp = hdr->burner; tmp != (RF_DiskQueueData_t *) NULL; tmp = tmp->next)
+ printf("(%d,%ld,%d) ",
+ (int) tmp->sectorOffset,
+ (long) (tmp->sectorOffset + tmp->numSector),
+ tmp->priority);
+ printf("\n");
+}
+#endif
+
+
+/* promotes reconstruction accesses for the given stripeID to normal priority.
+ * returns 1 if an access was found and zero otherwise. Normally, we should
+ * only have one or zero entries in the burner queue, so execution time should
+ * be short.
+ */
+/*
+ * Two-phase unlink from the back-burner list: first pop matching
+ * entries off the head (no trailer yet), then walk the remainder with
+ * a trailing pointer.  Matches are collected on a temp list with
+ * priority raised to RF_IO_NORMAL_PRIORITY and then re-enqueued
+ * through RealEnqueue(), which places them on the active lists.
+ */
+int
+rf_CvscanPromote(void *q_in, RF_StripeNum_t parityStripeID, RF_ReconUnitNum_t which_ru)
+{
+ RF_CvscanHeader_t *hdr = (RF_CvscanHeader_t *) q_in;
+ RF_DiskQueueData_t *trailer = NULL, *tmp = hdr->burner, *tlist = NULL;
+ int retval = 0;
+
+ DO_CHECK_STATE(hdr);
+ while (tmp) { /* handle entries at the front of the list */
+ if (tmp->parityStripeID == parityStripeID && tmp->which_ru == which_ru) {
+ hdr->burner = tmp->next;
+ tmp->priority = RF_IO_NORMAL_PRIORITY;
+ tmp->next = tlist;
+ tlist = tmp;
+ tmp = hdr->burner;
+ } else
+ break;
+ }
+ if (tmp) {
+ trailer = tmp;
+ tmp = tmp->next;
+ }
+ while (tmp) { /* handle entries on the rest of the list */
+ if (tmp->parityStripeID == parityStripeID && tmp->which_ru == which_ru) {
+ trailer->next = tmp->next;
+ tmp->priority = RF_IO_NORMAL_PRIORITY;
+ tmp->next = tlist;
+ tlist = tmp; /* insert on a temp queue */
+ tmp = trailer->next;
+ } else {
+ trailer = tmp;
+ tmp = tmp->next;
+ }
+ }
+ while (tmp = tlist, tlist != NULL ? 0 : 0, tlist) {
+ }
+ while (tlist) {
+ retval++;
+ tmp = tlist->next;
+ RealEnqueue(hdr, tlist);
+ tlist = tmp;
+ }
+ RF_ASSERT(retval == 0 || retval == 1);
+ DO_CHECK_STATE((RF_CvscanHeader_t *) q_in);
+ return (retval);
+}
diff --git a/sys/dev/raidframe/rf_cvscan.h b/sys/dev/raidframe/rf_cvscan.h
new file mode 100644
index 0000000..7f536a8
--- /dev/null
+++ b/sys/dev/raidframe/rf_cvscan.h
@@ -0,0 +1,85 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_cvscan.h,v 1.3 1999/02/05 00:06:07 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+** Disk scheduling by CVSCAN( N, r )
+**
+** Given a set of requests, partition them into one set on each
+** side of the current arm position. The trick is to pick which
+** side you are going to service next; once a side is picked you will
+** service the closest request.
+** Let there be n1 requests on one side and n2 requests on the other
+** side. If one of n1 or n2 is zero, select the other side.
+** If both n1 and n2 are nonzero, select a "range" for examination
+** that is N' = min( n1, n2, N ). Average the distance from the
+** current position to the nearest N' requests on each side giving
+** d1 and d2.
+** Suppose the last decision was to move toward set 2, then the
+** current direction is toward set 2, and you will only switch to set
+** 1 if d1+R < d2 where R is r*(total number of cylinders), r in [0,1].
+**
+** I extend this by applying only to the set of requests that all
+** share the same, highest priority level.
+*/
+
+#ifndef _RF__RF_CVSCAN_H_
+#define _RF__RF_CVSCAN_H_
+
+#include <dev/raidframe/rf_diskqueue.h>
+
+typedef enum RF_CvscanArmDir_e {
+ rf_cvscan_LEFT,
+ rf_cvscan_RIGHT
+} RF_CvscanArmDir_t;
+
+/* per-disk-queue state for the CVSCAN(N, r) scheduling policy */
+typedef struct RF_CvscanHeader_s {
+ long range_for_avg; /* CVSCAN param N */
+ long change_penalty; /* CVSCAN param R */
+ RF_CvscanArmDir_t direction; /* which way the arm last moved */
+ RF_SectorNum_t cur_block; /* current arm position (sector) */
+ int nxt_priority; /* priority level currently being serviced */
+ RF_DiskQueueData_t *left; /* requests below cur_block, nearest first */
+ int left_cnt; /* cached length of the left list */
+ RF_DiskQueueData_t *right; /* requests at/above cur_block, nearest first */
+ int right_cnt; /* cached length of the right list */
+ RF_DiskQueueData_t *burner; /* lower-priority requests, on the back burner */
+} RF_CvscanHeader_t;
+
+int rf_CvscanConfigure(void);
+void *
+rf_CvscanCreate(RF_SectorCount_t sect_per_disk,
+ RF_AllocListElem_t * cl_list, RF_ShutdownList_t ** listp);
+void rf_CvscanEnqueue(void *qptr, RF_DiskQueueData_t * req, int priority);
+RF_DiskQueueData_t *rf_CvscanDequeue(void *qptr);
+RF_DiskQueueData_t *rf_CvscanPeek(void *qptr);
+int
+rf_CvscanPromote(void *qptr, RF_StripeNum_t parityStripeID,
+ RF_ReconUnitNum_t which_ru);
+
+#endif /* !_RF__RF_CVSCAN_H_ */
diff --git a/sys/dev/raidframe/rf_dag.h b/sys/dev/raidframe/rf_dag.h
new file mode 100644
index 0000000..15cd4a8
--- /dev/null
+++ b/sys/dev/raidframe/rf_dag.h
@@ -0,0 +1,239 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_dag.h,v 1.3 1999/02/05 00:06:07 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: William V. Courtright II, Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/****************************************************************************
+ * *
+ * dag.h -- header file for DAG-related data structures *
+ * *
+ ****************************************************************************/
+
+#ifndef _RF__RF_DAG_H_
+#define _RF__RF_DAG_H_
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_threadstuff.h>
+#include <dev/raidframe/rf_alloclist.h>
+#include <dev/raidframe/rf_stripelocks.h>
+#include <dev/raidframe/rf_layout.h>
+#include <dev/raidframe/rf_dagflags.h>
+#include <dev/raidframe/rf_acctrace.h>
+#include <dev/raidframe/rf_memchunk.h>
+
+#define RF_THREAD_CONTEXT 0 /* we were invoked from thread context */
+#define RF_INTR_CONTEXT 1 /* we were invoked from interrupt context */
+#define RF_MAX_ANTECEDENTS 20 /* max num of antecedents a node may possess */
+
+#if defined(__FreeBSD__) && __FreeBSD_version > 500005
+#include <sys/bio.h>
+#endif
+#include <sys/buf.h>
+
+struct RF_PropHeader_s { /* structure for propagation of results */
+ int resultNum; /* bind result # resultNum */
+ int paramNum; /* to parameter # paramNum */
+ RF_PropHeader_t *next; /* linked list for multiple results/params */
+};
+
+typedef enum RF_NodeStatus_e {
+ rf_bwd1, /* node is ready for undo logging (backward
+ * error recovery only) */
+ rf_bwd2, /* node has completed undo logging (backward
+ * error recovery only) */
+ rf_wait, /* node is waiting to be executed */
+ rf_fired, /* node is currently executing its do function */
+ rf_good, /* node successfully completed execution of
+ * its do function */
+ rf_bad, /* node failed to successfully execute its do
+ * function */
+ rf_skipped, /* not used anymore, used to imply a node was
+ * not executed */
+ rf_recover, /* node is currently executing its undo
+ * function */
+ rf_panic, /* node failed to successfully execute its
+ * undo function */
+ rf_undone /* node successfully executed its undo
+ * function */
+} RF_NodeStatus_t;
+/*
+ * These were used to control skipping a node.
+ * Now, these are only used as comments.
+ */
+typedef enum RF_AntecedentType_e {
+ rf_trueData,
+ rf_antiData,
+ rf_outputData,
+ rf_control
+} RF_AntecedentType_t;
+#define RF_DAG_PTRCACHESIZE 40
+#define RF_DAG_PARAMCACHESIZE 12
+
+typedef RF_uint8 RF_DagNodeFlags_t;
+
+struct RF_DagNode_s {
+ RF_NodeStatus_t status; /* current status of this node */
+ int (*doFunc) (RF_DagNode_t *); /* normal function */
+ int (*undoFunc) (RF_DagNode_t *); /* func to remove effect of
+ * doFunc */
+ int (*wakeFunc) (RF_DagNode_t *, int status); /* func called when the
+ * node completes an I/O */
+ int numParams; /* number of parameters required by *funcPtr */
+ int numResults; /* number of results produced by *funcPtr */
+ int numAntecedents; /* number of antecedents */
+ int numAntDone; /* number of antecedents which have finished */
+ int numSuccedents; /* number of succedents */
+ int numSuccFired; /* incremented when a succedent is fired
+ * during forward execution */
+ int numSuccDone; /* incremented when a succedent finishes
+ * during rollBackward */
+ int commitNode; /* boolean flag - if true, this is a commit
+ * node */
+ RF_DagNode_t **succedents; /* succedents, array size
+ * numSuccedents */
+ RF_DagNode_t **antecedents; /* antecedents, array size
+ * numAntecedents */
+ RF_AntecedentType_t antType[RF_MAX_ANTECEDENTS]; /* type of each
+ * antecedent */
+ void **results; /* array of results produced by *funcPtr */
+ RF_DagParam_t *params; /* array of parameters required by *funcPtr */
+ RF_PropHeader_t **propList; /* propagation list, size
+ * numSuccedents */
+ RF_DagHeader_t *dagHdr; /* ptr to head of dag containing this node */
+ void *dagFuncData; /* dag execution func uses this for whatever
+ * it wants */
+ RF_DagNode_t *next;
+ int nodeNum; /* used by PrintDAG for debug only */
+ int visited; /* used to avoid re-visiting nodes on DAG
+ * walks */
+ /* ANY CODE THAT USES THIS FIELD MUST MAINTAIN THE PROPERTY THAT AFTER
+ * IT FINISHES, ALL VISITED FLAGS IN THE DAG ARE IDENTICAL */
+ char *name; /* debug only */
+ RF_DagNodeFlags_t flags;/* see below */
+ RF_DagNode_t *dag_ptrs[RF_DAG_PTRCACHESIZE]; /* cache for performance */
+ RF_DagParam_t dag_params[RF_DAG_PARAMCACHESIZE]; /* cache for performance */
+};
+/*
+ * Bit values for flags field of RF_DagNode_t
+ */
+#define RF_DAGNODE_FLAG_NONE 0x00
+#define RF_DAGNODE_FLAG_YIELD 0x01 /* in the kernel, yield the processor
+ * before firing this node */
+
+/* enable - DAG ready for normal execution, no errors encountered
+ * rollForward - DAG encountered an error after commit point, rolling forward
+ * rollBackward - DAG encountered an error prior to commit point, rolling backward
+ */
+typedef enum RF_DagStatus_e {
+ rf_enable,
+ rf_rollForward,
+ rf_rollBackward
+} RF_DagStatus_t;
+#define RF_MAX_HDR_SUCC 1
+
+#define RF_MAXCHUNKS 10
+
+struct RF_DagHeader_s {
+ RF_DagStatus_t status; /* status of this DAG */
+ int numSuccedents; /* DAG may be a tree, i.e. may have > 1 root */
+ int numCommitNodes; /* number of commit nodes in graph */
+ int numCommits; /* number of commit nodes which have been
+ * fired */
+ RF_DagNode_t *succedents[RF_MAX_HDR_SUCC]; /* array of succedents,
+ * size numSuccedents */
+ RF_DagHeader_t *next; /* ptr to allow a list of dags */
+ RF_AllocListElem_t *allocList; /* ptr to list of ptrs to be freed
+ * prior to freeing DAG */
+ RF_AccessStripeMapHeader_t *asmList; /* list of access stripe maps
+ * to be freed */
+ int nodeNum; /* used by PrintDAG for debug only */
+ int numNodesCompleted;
+ RF_AccTraceEntry_t *tracerec; /* perf mon only */
+
+ void (*cbFunc) (void *); /* function to call when the dag
+ * completes */
+ void *cbArg; /* argument for cbFunc */
+ char *creator; /* name of function used to create this dag */
+
+ RF_Raid_t *raidPtr; /* the descriptor for the RAID device this DAG
+ * is for */
+ void *bp; /* the bp for this I/O passed down from the
+ * file system. ignored outside kernel */
+
+ RF_ChunkDesc_t *memChunk[RF_MAXCHUNKS]; /* experimental- Chunks of
+ * memory to be retained upon
+ * DAG free for re-use */
+ int chunkIndex; /* the idea is to avoid calls to alloc and
+ * free */
+
+ RF_ChunkDesc_t **xtraMemChunk; /* escape hatch which allows
+ * SelectAlgorithm to merge memChunks
+ * from several dags */
+ int xtraChunkIndex; /* number of ptrs to valid chunks */
+ int xtraChunkCnt; /* number of ptrs to chunks allocated */
+
+};
+
+struct RF_DagList_s {
+ /* common info for a list of dags which will be fired sequentially */
+ int numDags; /* number of dags in the list */
+ int numDagsFired; /* number of dags in list which have initiated
+ * execution */
+ int numDagsDone; /* number of dags in list which have completed
+ * execution */
+ RF_DagHeader_t *dags; /* list of dags */
+ RF_RaidAccessDesc_t *desc; /* ptr to descriptor for this access */
+ RF_AccTraceEntry_t tracerec; /* perf mon info for dags (not user
+ * info) */
+};
+/* resets a node so that it can be fired again */
+#define RF_ResetNode(_n_) { \
+ (_n_)->status = rf_wait; \
+ (_n_)->numAntDone = 0; \
+ (_n_)->numSuccFired = 0; \
+ (_n_)->numSuccDone = 0; \
+ (_n_)->next = NULL; \
+}
+
+#define RF_ResetDagHeader(_h_) { \
+ (_h_)->numNodesCompleted = 0; \
+ (_h_)->numCommits = 0; \
+ (_h_)->status = rf_enable; \
+}
+
+/* convenience macro for declaring a create dag function */
+
+#define RF_CREATE_DAG_FUNC_DECL(_name_) \
+void _name_ ( \
+ RF_Raid_t *raidPtr, \
+ RF_AccessStripeMap_t *asmap, \
+ RF_DagHeader_t *dag_h, \
+ void *bp, \
+ RF_RaidAccessFlags_t flags, \
+ RF_AllocListElem_t *allocList)
+
+#endif /* !_RF__RF_DAG_H_ */
diff --git a/sys/dev/raidframe/rf_dagdegrd.c b/sys/dev/raidframe/rf_dagdegrd.c
new file mode 100644
index 0000000..8e4c15a
--- /dev/null
+++ b/sys/dev/raidframe/rf_dagdegrd.c
@@ -0,0 +1,1130 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_dagdegrd.c,v 1.7 2001/01/26 14:06:16 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * rf_dagdegrd.c
+ *
+ * code for creating degraded read DAGs
+ */
+
+#include <dev/raidframe/rf_archs.h>
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_dagutils.h>
+#include <dev/raidframe/rf_dagfuncs.h>
+#include <dev/raidframe/rf_debugMem.h>
+#include <dev/raidframe/rf_memchunk.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_dagdegrd.h>
+
+
+/******************************************************************************
+ *
+ * General comments on DAG creation:
+ *
+ * All DAGs in this file use roll-away error recovery. Each DAG has a single
+ * commit node, usually called "Cmt." If an error occurs before the Cmt node
+ * is reached, the execution engine will halt forward execution and work
+ * backward through the graph, executing the undo functions. Assuming that
+ * each node in the graph prior to the Cmt node is undoable and atomic - or -
+ * does not make changes to permanent state, the graph will fail atomically.
+ * If an error occurs after the Cmt node executes, the engine will roll-forward
+ * through the graph, blindly executing nodes until it reaches the end.
+ * If a graph reaches the end, it is assumed to have completed successfully.
+ *
+ * A graph has only 1 Cmt node.
+ *
+ */
+
+
+/******************************************************************************
+ *
+ * The following wrappers map the standard DAG creation interface to the
+ * DAG creation routines. Additionally, these wrappers enable experimentation
+ * with new DAG structures by providing an extra level of indirection, allowing
+ * the DAG creation routines to be replaced at this single point.
+ */
+
+void
+rf_CreateRaidFiveDegradedReadDAG(
+ RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList)
+{
+ rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
+ &rf_xorRecoveryFuncs);
+}
+
+
+/******************************************************************************
+ *
+ * DAG creation code begins here
+ */
+
+
+/******************************************************************************
+ * Create a degraded read DAG for RAID level 1
+ *
+ * Hdr -> Nil -> R(p/s)d -> Commit -> Trm
+ *
+ * The "Rd" node reads data from the surviving disk in the mirror pair
+ * Rpd - read of primary copy
+ * Rsd - read of secondary copy
+ *
+ * Parameters: raidPtr - description of the physical array
+ * asmap - logical & physical addresses for this access
+ * bp - buffer ptr (for holding write data)
+ * flags - general flags (e.g. disk locking)
+ * allocList - list of memory allocated in DAG creation
+ *****************************************************************************/
+
+void
+rf_CreateRaidOneDegradedReadDAG(
+ RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList)
+{
+ RF_DagNode_t *nodes, *rdNode, *blockNode, *commitNode, *termNode;
+ RF_StripeNum_t parityStripeID;
+ RF_ReconUnitNum_t which_ru;
+ RF_PhysDiskAddr_t *pda;
+ int useMirror, i;
+
+ useMirror = 0;
+ parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
+ asmap->raidAddress, &which_ru);
+ if (rf_dagDebug) {
+ printf("[Creating RAID level 1 degraded read DAG]\n");
+ }
+ dag_h->creator = "RaidOneDegradedReadDAG";
+ /* alloc the Wnd nodes and the Wmir node */
+ if (asmap->numDataFailed == 0)
+ useMirror = RF_FALSE;
+ else
+ useMirror = RF_TRUE;
+
+ /* total number of nodes = 1 + (block + commit + terminator) */
+ RF_CallocAndAdd(nodes, 4, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
+ i = 0;
+ rdNode = &nodes[i];
+ i++;
+ blockNode = &nodes[i];
+ i++;
+ commitNode = &nodes[i];
+ i++;
+ termNode = &nodes[i];
+ i++;
+
+ /* this dag can not commit until the commit node is reached. errors
+ * prior to the commit point imply the dag has failed and must be
+ * retried */
+ dag_h->numCommitNodes = 1;
+ dag_h->numCommits = 0;
+ dag_h->numSuccedents = 1;
+
+ /* initialize the block, commit, and terminator nodes */
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, 1, 0, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, 1, 1, 0, 0, dag_h, "Cmt", allocList);
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
+ NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
+
+ pda = asmap->physInfo;
+ RF_ASSERT(pda != NULL);
+ /* parityInfo must describe entire parity unit */
+ RF_ASSERT(asmap->parityInfo->next == NULL);
+
+ /* initialize the data node */
+ if (!useMirror) {
+ /* read primary copy of data */
+ rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rpd", allocList);
+ rdNode->params[0].p = pda;
+ rdNode->params[1].p = pda->bufPtr;
+ rdNode->params[2].v = parityStripeID;
+ rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ } else {
+ /* read secondary copy of data */
+ rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rsd", allocList);
+ rdNode->params[0].p = asmap->parityInfo;
+ rdNode->params[1].p = pda->bufPtr;
+ rdNode->params[2].v = parityStripeID;
+ rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ }
+
+ /* connect header to block node */
+ RF_ASSERT(dag_h->numSuccedents == 1);
+ RF_ASSERT(blockNode->numAntecedents == 0);
+ dag_h->succedents[0] = blockNode;
+
+ /* connect block node to rdnode */
+ RF_ASSERT(blockNode->numSuccedents == 1);
+ RF_ASSERT(rdNode->numAntecedents == 1);
+ blockNode->succedents[0] = rdNode;
+ rdNode->antecedents[0] = blockNode;
+ rdNode->antType[0] = rf_control;
+
+ /* connect rdnode to commit node */
+ RF_ASSERT(rdNode->numSuccedents == 1);
+ RF_ASSERT(commitNode->numAntecedents == 1);
+ rdNode->succedents[0] = commitNode;
+ commitNode->antecedents[0] = rdNode;
+ commitNode->antType[0] = rf_control;
+
+ /* connect commit node to terminator */
+ RF_ASSERT(commitNode->numSuccedents == 1);
+ RF_ASSERT(termNode->numAntecedents == 1);
+ RF_ASSERT(termNode->numSuccedents == 0);
+ commitNode->succedents[0] = termNode;
+ termNode->antecedents[0] = commitNode;
+ termNode->antType[0] = rf_control;
+}
+
+
+
+/******************************************************************************
+ *
+ * creates a DAG to perform a degraded-mode read of data within one stripe.
+ * This DAG is as follows:
+ *
+ * Hdr -> Block -> Rud -> Xor -> Cmt -> T
+ * -> Rrd ->
+ * -> Rp -->
+ *
+ * Each R node is a successor of the L node
+ * One successor arc from each R node goes to C, and the other to X
+ * There is one Rud for each chunk of surviving user data requested by the
+ * user, and one Rrd for each chunk of surviving user data _not_ being read by
+ * the user
+ * R = read, ud = user data, rd = recovery (surviving) data, p = parity
+ * X = XOR, C = Commit, T = terminate
+ *
+ * The block node guarantees a single source node.
+ *
+ * Note: The target buffer for the XOR node is set to the actual user buffer
+ * where the failed data is supposed to end up. This buffer is zero'd by the
+ * code here. Thus, if you create a degraded read dag, use it, and then
+ * re-use, you have to be sure to zero the target buffer prior to the re-use.
+ *
+ * The recfunc argument at the end specifies the name and function used for
+ * the redundancy
+ * recovery function.
+ *
+ *****************************************************************************/
+
+void
+rf_CreateDegradedReadDAG(
+ RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList,
+ RF_RedFuncs_t * recFunc)
+{
+ RF_DagNode_t *nodes, *rudNodes, *rrdNodes, *xorNode, *blockNode;
+ RF_DagNode_t *commitNode, *rpNode, *termNode;
+ int nNodes, nRrdNodes, nRudNodes, nXorBufs, i;
+ int j, paramNum;
+ RF_SectorCount_t sectorsPerSU;
+ RF_ReconUnitNum_t which_ru;
+ char *overlappingPDAs;/* a temporary array of flags */
+ RF_AccessStripeMapHeader_t *new_asm_h[2];
+ RF_PhysDiskAddr_t *pda, *parityPDA;
+ RF_StripeNum_t parityStripeID;
+ RF_PhysDiskAddr_t *failedPDA;
+ RF_RaidLayout_t *layoutPtr;
+ char *rpBuf;
+
+ layoutPtr = &(raidPtr->Layout);
+ /* failedPDA points to the pda within the asm that targets the failed
+ * disk */
+ failedPDA = asmap->failedPDAs[0];
+ parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr,
+ asmap->raidAddress, &which_ru);
+ sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
+
+ if (rf_dagDebug) {
+ printf("[Creating degraded read DAG]\n");
+ }
+ RF_ASSERT(asmap->numDataFailed == 1);
+ dag_h->creator = "DegradedReadDAG";
+
+ /*
+ * generate two ASMs identifying the surviving data we need
+ * in order to recover the lost data
+ */
+
+ /* overlappingPDAs array must be zero'd */
+ RF_Calloc(overlappingPDAs, asmap->numStripeUnitsAccessed, sizeof(char), (char *));
+ rf_GenerateFailedAccessASMs(raidPtr, asmap, failedPDA, dag_h, new_asm_h, &nXorBufs,
+ &rpBuf, overlappingPDAs, allocList);
+
+ /*
+ * create all the nodes at once
+ *
+ * -1 because no access is generated for the failed pda
+ */
+ nRudNodes = asmap->numStripeUnitsAccessed - 1;
+ nRrdNodes = ((new_asm_h[0]) ? new_asm_h[0]->stripeMap->numStripeUnitsAccessed : 0) +
+ ((new_asm_h[1]) ? new_asm_h[1]->stripeMap->numStripeUnitsAccessed : 0);
+ nNodes = 5 + nRudNodes + nRrdNodes; /* lock, unlock, xor, Rp, Rud,
+ * Rrd */
+ RF_CallocAndAdd(nodes, nNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *),
+ allocList);
+ i = 0;
+ blockNode = &nodes[i];
+ i++;
+ commitNode = &nodes[i];
+ i++;
+ xorNode = &nodes[i];
+ i++;
+ rpNode = &nodes[i];
+ i++;
+ termNode = &nodes[i];
+ i++;
+ rudNodes = &nodes[i];
+ i += nRudNodes;
+ rrdNodes = &nodes[i];
+ i += nRrdNodes;
+ RF_ASSERT(i == nNodes);
+
+ /* initialize nodes */
+ dag_h->numCommitNodes = 1;
+ dag_h->numCommits = 0;
+ /* this dag can not commit until the commit node is reached errors
+ * prior to the commit point imply the dag has failed */
+ dag_h->numSuccedents = 1;
+
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, nRudNodes + nRrdNodes + 1, 0, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, 1, 1, 0, 0, dag_h, "Cmt", allocList);
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
+ NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
+ rf_InitNode(xorNode, rf_wait, RF_FALSE, recFunc->simple, rf_NullNodeUndoFunc,
+ NULL, 1, nRudNodes + nRrdNodes + 1, 2 * nXorBufs + 2, 1, dag_h,
+ recFunc->SimpleName, allocList);
+
+ /* fill in the Rud nodes */
+ for (pda = asmap->physInfo, i = 0; i < nRudNodes; i++, pda = pda->next) {
+ if (pda == failedPDA) {
+ i--;
+ continue;
+ }
+ rf_InitNode(&rudNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc,
+ rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
+ "Rud", allocList);
+ RF_ASSERT(pda);
+ rudNodes[i].params[0].p = pda;
+ rudNodes[i].params[1].p = pda->bufPtr;
+ rudNodes[i].params[2].v = parityStripeID;
+ rudNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ }
+
+ /* fill in the Rrd nodes */
+ i = 0;
+ if (new_asm_h[0]) {
+ for (pda = new_asm_h[0]->stripeMap->physInfo;
+ i < new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
+ i++, pda = pda->next) {
+ rf_InitNode(&rrdNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc,
+ rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0,
+ dag_h, "Rrd", allocList);
+ RF_ASSERT(pda);
+ rrdNodes[i].params[0].p = pda;
+ rrdNodes[i].params[1].p = pda->bufPtr;
+ rrdNodes[i].params[2].v = parityStripeID;
+ rrdNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ }
+ }
+ if (new_asm_h[1]) {
+ for (j = 0, pda = new_asm_h[1]->stripeMap->physInfo;
+ j < new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
+ j++, pda = pda->next) {
+ rf_InitNode(&rrdNodes[i + j], rf_wait, RF_FALSE, rf_DiskReadFunc,
+ rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0,
+ dag_h, "Rrd", allocList);
+ RF_ASSERT(pda);
+ rrdNodes[i + j].params[0].p = pda;
+ rrdNodes[i + j].params[1].p = pda->bufPtr;
+ rrdNodes[i + j].params[2].v = parityStripeID;
+ rrdNodes[i + j].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ }
+ }
+ /* make a PDA for the parity unit */
+ RF_MallocAndAdd(parityPDA, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
+ parityPDA->row = asmap->parityInfo->row;
+ parityPDA->col = asmap->parityInfo->col;
+ parityPDA->startSector = ((asmap->parityInfo->startSector / sectorsPerSU)
+ * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU);
+ parityPDA->numSector = failedPDA->numSector;
+
+ /* initialize the Rp node */
+ rf_InitNode(rpNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rp ", allocList);
+ rpNode->params[0].p = parityPDA;
+ rpNode->params[1].p = rpBuf;
+ rpNode->params[2].v = parityStripeID;
+ rpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+
+ /*
+ * the last and nastiest step is to assign all
+ * the parameters of the Xor node
+ */
+ paramNum = 0;
+ for (i = 0; i < nRrdNodes; i++) {
+ /* all the Rrd nodes need to be xored together */
+ xorNode->params[paramNum++] = rrdNodes[i].params[0];
+ xorNode->params[paramNum++] = rrdNodes[i].params[1];
+ }
+ for (i = 0; i < nRudNodes; i++) {
+ /* any Rud nodes that overlap the failed access need to be
+ * xored in */
+ if (overlappingPDAs[i]) {
+ RF_MallocAndAdd(pda, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
+ bcopy((char *) rudNodes[i].params[0].p, (char *) pda, sizeof(RF_PhysDiskAddr_t));
+ rf_RangeRestrictPDA(raidPtr, failedPDA, pda, RF_RESTRICT_DOBUFFER, 0);
+ xorNode->params[paramNum++].p = pda;
+ xorNode->params[paramNum++].p = pda->bufPtr;
+ }
+ }
+ RF_Free(overlappingPDAs, asmap->numStripeUnitsAccessed * sizeof(char));
+
+ /* install parity pda as last set of params to be xor'd */
+ xorNode->params[paramNum++].p = parityPDA;
+ xorNode->params[paramNum++].p = rpBuf;
+
+ /*
+ * the last 2 params to the recovery xor node are
+ * the failed PDA and the raidPtr
+ */
+ xorNode->params[paramNum++].p = failedPDA;
+ xorNode->params[paramNum++].p = raidPtr;
+ RF_ASSERT(paramNum == 2 * nXorBufs + 2);
+
+ /*
+ * The xor node uses results[0] as the target buffer.
+ * Set pointer and zero the buffer. In the kernel, this
+ * may be a user buffer in which case we have to remap it.
+ */
+ xorNode->results[0] = failedPDA->bufPtr;
+ RF_BZERO(bp, failedPDA->bufPtr, rf_RaidAddressToByte(raidPtr,
+ failedPDA->numSector));
+
+ /* connect nodes to form graph */
+ /* connect the header to the block node */
+ RF_ASSERT(dag_h->numSuccedents == 1);
+ RF_ASSERT(blockNode->numAntecedents == 0);
+ dag_h->succedents[0] = blockNode;
+
+ /* connect the block node to the read nodes */
+ RF_ASSERT(blockNode->numSuccedents == (1 + nRrdNodes + nRudNodes));
+ RF_ASSERT(rpNode->numAntecedents == 1);
+ blockNode->succedents[0] = rpNode;
+ rpNode->antecedents[0] = blockNode;
+ rpNode->antType[0] = rf_control;
+ for (i = 0; i < nRrdNodes; i++) {
+ RF_ASSERT(rrdNodes[i].numSuccedents == 1);
+ blockNode->succedents[1 + i] = &rrdNodes[i];
+ rrdNodes[i].antecedents[0] = blockNode;
+ rrdNodes[i].antType[0] = rf_control;
+ }
+ for (i = 0; i < nRudNodes; i++) {
+ RF_ASSERT(rudNodes[i].numSuccedents == 1);
+ blockNode->succedents[1 + nRrdNodes + i] = &rudNodes[i];
+ rudNodes[i].antecedents[0] = blockNode;
+ rudNodes[i].antType[0] = rf_control;
+ }
+
+ /* connect the read nodes to the xor node */
+ RF_ASSERT(xorNode->numAntecedents == (1 + nRrdNodes + nRudNodes));
+ RF_ASSERT(rpNode->numSuccedents == 1);
+ rpNode->succedents[0] = xorNode;
+ xorNode->antecedents[0] = rpNode;
+ xorNode->antType[0] = rf_trueData;
+ for (i = 0; i < nRrdNodes; i++) {
+ RF_ASSERT(rrdNodes[i].numSuccedents == 1);
+ rrdNodes[i].succedents[0] = xorNode;
+ xorNode->antecedents[1 + i] = &rrdNodes[i];
+ xorNode->antType[1 + i] = rf_trueData;
+ }
+ for (i = 0; i < nRudNodes; i++) {
+ RF_ASSERT(rudNodes[i].numSuccedents == 1);
+ rudNodes[i].succedents[0] = xorNode;
+ xorNode->antecedents[1 + nRrdNodes + i] = &rudNodes[i];
+ xorNode->antType[1 + nRrdNodes + i] = rf_trueData;
+ }
+
+ /* connect the xor node to the commit node */
+ RF_ASSERT(xorNode->numSuccedents == 1);
+ RF_ASSERT(commitNode->numAntecedents == 1);
+ xorNode->succedents[0] = commitNode;
+ commitNode->antecedents[0] = xorNode;
+ commitNode->antType[0] = rf_control;
+
+ /* connect the termNode to the commit node */
+ RF_ASSERT(commitNode->numSuccedents == 1);
+ RF_ASSERT(termNode->numAntecedents == 1);
+ RF_ASSERT(termNode->numSuccedents == 0);
+ commitNode->succedents[0] = termNode;
+ termNode->antType[0] = rf_control;
+ termNode->antecedents[0] = commitNode;
+}
+
+#if (RF_INCLUDE_CHAINDECLUSTER > 0)
+/******************************************************************************
+ * Create a degraded read DAG for Chained Declustering
+ *
+ * Hdr -> Nil -> R(p/s)d -> Cmt -> Trm
+ *
+ * The "Rd" node reads data from the surviving disk in the mirror pair
+ * Rpd - read of primary copy
+ * Rsd - read of secondary copy
+ *
+ * Parameters: raidPtr - description of the physical array
+ * asmap - logical & physical addresses for this access
+ * bp - buffer ptr (for holding write data)
+ * flags - general flags (e.g. disk locking)
+ * allocList - list of memory allocated in DAG creation
+ *****************************************************************************/
+
+void
+rf_CreateRaidCDegradedReadDAG(
+ RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList)
+{
+ RF_DagNode_t *nodes, *rdNode, *blockNode, *commitNode, *termNode;
+ RF_StripeNum_t parityStripeID;
+ int useMirror, i, shiftable;
+ RF_ReconUnitNum_t which_ru;
+ RF_PhysDiskAddr_t *pda;
+
+ if ((asmap->numDataFailed + asmap->numParityFailed) == 0) {
+ shiftable = RF_TRUE;
+ } else {
+ shiftable = RF_FALSE;
+ }
+ useMirror = 0;
+ parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
+ asmap->raidAddress, &which_ru);
+
+ if (rf_dagDebug) {
+ printf("[Creating RAID C degraded read DAG]\n");
+ }
+ dag_h->creator = "RaidCDegradedReadDAG";
+ /* alloc the Wnd nodes and the Wmir node */
+ if (asmap->numDataFailed == 0)
+ useMirror = RF_FALSE;
+ else
+ useMirror = RF_TRUE;
+
+ /* total number of nodes = 1 + (block + commit + terminator) */
+ RF_CallocAndAdd(nodes, 4, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
+ i = 0;
+ rdNode = &nodes[i];
+ i++;
+ blockNode = &nodes[i];
+ i++;
+ commitNode = &nodes[i];
+ i++;
+ termNode = &nodes[i];
+ i++;
+
+ /*
+ * This dag can not commit until the commit node is reached.
+ * Errors prior to the commit point imply the dag has failed
+ * and must be retried.
+ */
+ dag_h->numCommitNodes = 1;
+ dag_h->numCommits = 0;
+ dag_h->numSuccedents = 1;
+
+ /* initialize the block, commit, and terminator nodes */
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, 1, 0, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, 1, 1, 0, 0, dag_h, "Cmt", allocList);
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
+ NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
+
+ pda = asmap->physInfo;
+ RF_ASSERT(pda != NULL);
+ /* parityInfo must describe entire parity unit */
+ RF_ASSERT(asmap->parityInfo->next == NULL);
+
+ /* initialize the data node */
+ if (!useMirror) {
+ rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rpd", allocList);
+ if (shiftable && rf_compute_workload_shift(raidPtr, pda)) {
+ /* shift this read to the next disk in line */
+ rdNode->params[0].p = asmap->parityInfo;
+ rdNode->params[1].p = pda->bufPtr;
+ rdNode->params[2].v = parityStripeID;
+ rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ } else {
+ /* read primary copy */
+ rdNode->params[0].p = pda;
+ rdNode->params[1].p = pda->bufPtr;
+ rdNode->params[2].v = parityStripeID;
+ rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ }
+ } else {
+ /* read secondary copy of data */
+ rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rsd", allocList);
+ rdNode->params[0].p = asmap->parityInfo;
+ rdNode->params[1].p = pda->bufPtr;
+ rdNode->params[2].v = parityStripeID;
+ rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ }
+
+ /* connect header to block node */
+ RF_ASSERT(dag_h->numSuccedents == 1);
+ RF_ASSERT(blockNode->numAntecedents == 0);
+ dag_h->succedents[0] = blockNode;
+
+ /* connect block node to rdnode */
+ RF_ASSERT(blockNode->numSuccedents == 1);
+ RF_ASSERT(rdNode->numAntecedents == 1);
+ blockNode->succedents[0] = rdNode;
+ rdNode->antecedents[0] = blockNode;
+ rdNode->antType[0] = rf_control;
+
+ /* connect rdnode to commit node */
+ RF_ASSERT(rdNode->numSuccedents == 1);
+ RF_ASSERT(commitNode->numAntecedents == 1);
+ rdNode->succedents[0] = commitNode;
+ commitNode->antecedents[0] = rdNode;
+ commitNode->antType[0] = rf_control;
+
+ /* connect commit node to terminator */
+ RF_ASSERT(commitNode->numSuccedents == 1);
+ RF_ASSERT(termNode->numAntecedents == 1);
+ RF_ASSERT(termNode->numSuccedents == 0);
+ commitNode->succedents[0] = termNode;
+ termNode->antecedents[0] = commitNode;
+ termNode->antType[0] = rf_control;
+}
+#endif /* (RF_INCLUDE_CHAINDECLUSTER > 0) */
+
+#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0)
+/*
+ * XXX move this elsewhere?
+ */
+void
+rf_DD_GenerateFailedAccessASMs(
+ RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap,
+ RF_PhysDiskAddr_t ** pdap,
+ int *nNodep,
+ RF_PhysDiskAddr_t ** pqpdap,
+ int *nPQNodep,
+ RF_AllocListElem_t * allocList)
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ int PDAPerDisk, i;
+ RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
+ int numDataCol = layoutPtr->numDataCol;
+ int state;
+ RF_SectorNum_t suoff, suend;
+ unsigned firstDataCol, napdas, count;
+ RF_SectorNum_t fone_start, fone_end, ftwo_start = 0, ftwo_end = 0;
+ RF_PhysDiskAddr_t *fone = asmap->failedPDAs[0], *ftwo = asmap->failedPDAs[1];
+ RF_PhysDiskAddr_t *pda_p;
+ RF_PhysDiskAddr_t *phys_p;
+ RF_RaidAddr_t sosAddr;
+
+	/* determine how many pda's we will have to generate per unaccessed
+ * stripe. If there is only one failed data unit, it is one; if two,
+	 * possibly two, depending on whether they overlap. */
+
+ fone_start = rf_StripeUnitOffset(layoutPtr, fone->startSector);
+ fone_end = fone_start + fone->numSector;
+
+#define CONS_PDA(if,start,num) \
+ pda_p->row = asmap->if->row; pda_p->col = asmap->if->col; \
+ pda_p->startSector = ((asmap->if->startSector / secPerSU) * secPerSU) + start; \
+ pda_p->numSector = num; \
+ pda_p->next = NULL; \
+ RF_MallocAndAdd(pda_p->bufPtr,rf_RaidAddressToByte(raidPtr,num),(char *), allocList)
+
+ if (asmap->numDataFailed == 1) {
+ PDAPerDisk = 1;
+ state = 1;
+ RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
+ pda_p = *pqpdap;
+ /* build p */
+ CONS_PDA(parityInfo, fone_start, fone->numSector);
+ pda_p->type = RF_PDA_TYPE_PARITY;
+ pda_p++;
+ /* build q */
+ CONS_PDA(qInfo, fone_start, fone->numSector);
+ pda_p->type = RF_PDA_TYPE_Q;
+ } else {
+ ftwo_start = rf_StripeUnitOffset(layoutPtr, ftwo->startSector);
+ ftwo_end = ftwo_start + ftwo->numSector;
+ if (fone->numSector + ftwo->numSector > secPerSU) {
+ PDAPerDisk = 1;
+ state = 2;
+ RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
+ pda_p = *pqpdap;
+ CONS_PDA(parityInfo, 0, secPerSU);
+ pda_p->type = RF_PDA_TYPE_PARITY;
+ pda_p++;
+ CONS_PDA(qInfo, 0, secPerSU);
+ pda_p->type = RF_PDA_TYPE_Q;
+ } else {
+ PDAPerDisk = 2;
+ state = 3;
+ /* four of them, fone, then ftwo */
+ RF_MallocAndAdd(*pqpdap, 4 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
+ pda_p = *pqpdap;
+ CONS_PDA(parityInfo, fone_start, fone->numSector);
+ pda_p->type = RF_PDA_TYPE_PARITY;
+ pda_p++;
+ CONS_PDA(qInfo, fone_start, fone->numSector);
+ pda_p->type = RF_PDA_TYPE_Q;
+ pda_p++;
+ CONS_PDA(parityInfo, ftwo_start, ftwo->numSector);
+ pda_p->type = RF_PDA_TYPE_PARITY;
+ pda_p++;
+ CONS_PDA(qInfo, ftwo_start, ftwo->numSector);
+ pda_p->type = RF_PDA_TYPE_Q;
+ }
+ }
+ /* figure out number of nonaccessed pda */
+ napdas = PDAPerDisk * (numDataCol - asmap->numStripeUnitsAccessed - (ftwo == NULL ? 1 : 0));
+ *nPQNodep = PDAPerDisk;
+
+	/* sweep over the accessed pda's, figuring out the number of
+ * additional pda's to generate. Of course, skip the failed ones */
+
+ count = 0;
+ for (pda_p = asmap->physInfo; pda_p; pda_p = pda_p->next) {
+ if ((pda_p == fone) || (pda_p == ftwo))
+ continue;
+ suoff = rf_StripeUnitOffset(layoutPtr, pda_p->startSector);
+ suend = suoff + pda_p->numSector;
+ switch (state) {
+ case 1: /* one failed PDA to overlap */
+ /* if a PDA doesn't contain the failed unit, it can
+ * only miss the start or end, not both */
+ if ((suoff > fone_start) || (suend < fone_end))
+ count++;
+ break;
+ case 2: /* whole stripe */
+			if (suoff)	/* leak at beginning */
+ count++;
+ if (suend < numDataCol) /* leak at end */
+ count++;
+ break;
+ case 3: /* two disjoint units */
+ if ((suoff > fone_start) || (suend < fone_end))
+ count++;
+ if ((suoff > ftwo_start) || (suend < ftwo_end))
+ count++;
+ break;
+ default:
+ RF_PANIC();
+ }
+ }
+
+ napdas += count;
+ *nNodep = napdas;
+ if (napdas == 0)
+ return; /* short circuit */
+
+ /* allocate up our list of pda's */
+
+ RF_CallocAndAdd(pda_p, napdas, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
+ *pdap = pda_p;
+
+ /* linkem together */
+ for (i = 0; i < (napdas - 1); i++)
+ pda_p[i].next = pda_p + (i + 1);
+
+	/* march through the ones up to the first accessed disk */
+ firstDataCol = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), asmap->physInfo->raidAddress) % numDataCol;
+ sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
+ for (i = 0; i < firstDataCol; i++) {
+ if ((pda_p - (*pdap)) == napdas)
+ continue;
+ pda_p->type = RF_PDA_TYPE_DATA;
+ pda_p->raidAddress = sosAddr + (i * secPerSU);
+ (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
+ /* skip over dead disks */
+ if (RF_DEAD_DISK(raidPtr->Disks[pda_p->row][pda_p->col].status))
+ continue;
+ switch (state) {
+ case 1: /* fone */
+ pda_p->numSector = fone->numSector;
+ pda_p->raidAddress += fone_start;
+ pda_p->startSector += fone_start;
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
+ break;
+ case 2: /* full stripe */
+ pda_p->numSector = secPerSU;
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, secPerSU), (char *), allocList);
+ break;
+ case 3: /* two slabs */
+ pda_p->numSector = fone->numSector;
+ pda_p->raidAddress += fone_start;
+ pda_p->startSector += fone_start;
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
+ pda_p++;
+ pda_p->type = RF_PDA_TYPE_DATA;
+ pda_p->raidAddress = sosAddr + (i * secPerSU);
+ (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
+ pda_p->numSector = ftwo->numSector;
+ pda_p->raidAddress += ftwo_start;
+ pda_p->startSector += ftwo_start;
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
+ break;
+ default:
+ RF_PANIC();
+ }
+ pda_p++;
+ }
+
+ /* march through the touched stripe units */
+ for (phys_p = asmap->physInfo; phys_p; phys_p = phys_p->next, i++) {
+ if ((phys_p == asmap->failedPDAs[0]) || (phys_p == asmap->failedPDAs[1]))
+ continue;
+ suoff = rf_StripeUnitOffset(layoutPtr, phys_p->startSector);
+ suend = suoff + phys_p->numSector;
+ switch (state) {
+ case 1: /* single buffer */
+ if (suoff > fone_start) {
+ RF_ASSERT(suend >= fone_end);
+ /* The data read starts after the mapped
+				 * access, snip off the beginning */
+ pda_p->numSector = suoff - fone_start;
+ pda_p->raidAddress = sosAddr + (i * secPerSU) + fone_start;
+ (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
+ pda_p++;
+ }
+ if (suend < fone_end) {
+ RF_ASSERT(suoff <= fone_start);
+ /* The data read stops before the end of the
+ * failed access, extend */
+ pda_p->numSector = fone_end - suend;
+ pda_p->raidAddress = sosAddr + (i * secPerSU) + suend; /* off by one? */
+ (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
+ pda_p++;
+ }
+ break;
+ case 2: /* whole stripe unit */
+ RF_ASSERT((suoff == 0) || (suend == secPerSU));
+ if (suend < secPerSU) { /* short read, snip from end
+ * on */
+ pda_p->numSector = secPerSU - suend;
+ pda_p->raidAddress = sosAddr + (i * secPerSU) + suend; /* off by one? */
+ (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
+ pda_p++;
+ } else
+ if (suoff > 0) { /* short at front */
+ pda_p->numSector = suoff;
+ pda_p->raidAddress = sosAddr + (i * secPerSU);
+ (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
+ pda_p++;
+ }
+ break;
+ case 3: /* two nonoverlapping failures */
+ if ((suoff > fone_start) || (suend < fone_end)) {
+ if (suoff > fone_start) {
+ RF_ASSERT(suend >= fone_end);
+ /* The data read starts after the
+ * mapped access, snip off the
+					 * beginning */
+ pda_p->numSector = suoff - fone_start;
+ pda_p->raidAddress = sosAddr + (i * secPerSU) + fone_start;
+ (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
+ pda_p++;
+ }
+ if (suend < fone_end) {
+ RF_ASSERT(suoff <= fone_start);
+ /* The data read stops before the end
+ * of the failed access, extend */
+ pda_p->numSector = fone_end - suend;
+ pda_p->raidAddress = sosAddr + (i * secPerSU) + suend; /* off by one? */
+ (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
+ pda_p++;
+ }
+ }
+ if ((suoff > ftwo_start) || (suend < ftwo_end)) {
+ if (suoff > ftwo_start) {
+ RF_ASSERT(suend >= ftwo_end);
+ /* The data read starts after the
+ * mapped access, snip off the
+					 * beginning */
+ pda_p->numSector = suoff - ftwo_start;
+ pda_p->raidAddress = sosAddr + (i * secPerSU) + ftwo_start;
+ (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
+ pda_p++;
+ }
+ if (suend < ftwo_end) {
+ RF_ASSERT(suoff <= ftwo_start);
+ /* The data read stops before the end
+ * of the failed access, extend */
+ pda_p->numSector = ftwo_end - suend;
+ pda_p->raidAddress = sosAddr + (i * secPerSU) + suend; /* off by one? */
+ (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
+ pda_p++;
+ }
+ }
+ break;
+ default:
+ RF_PANIC();
+ }
+ }
+
+ /* after the last accessed disk */
+ for (; i < numDataCol; i++) {
+ if ((pda_p - (*pdap)) == napdas)
+ continue;
+ pda_p->type = RF_PDA_TYPE_DATA;
+ pda_p->raidAddress = sosAddr + (i * secPerSU);
+ (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
+ /* skip over dead disks */
+ if (RF_DEAD_DISK(raidPtr->Disks[pda_p->row][pda_p->col].status))
+ continue;
+ switch (state) {
+ case 1: /* fone */
+ pda_p->numSector = fone->numSector;
+ pda_p->raidAddress += fone_start;
+ pda_p->startSector += fone_start;
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
+ break;
+ case 2: /* full stripe */
+ pda_p->numSector = secPerSU;
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, secPerSU), (char *), allocList);
+ break;
+ case 3: /* two slabs */
+ pda_p->numSector = fone->numSector;
+ pda_p->raidAddress += fone_start;
+ pda_p->startSector += fone_start;
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
+ pda_p++;
+ pda_p->type = RF_PDA_TYPE_DATA;
+ pda_p->raidAddress = sosAddr + (i * secPerSU);
+ (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
+ pda_p->numSector = ftwo->numSector;
+ pda_p->raidAddress += ftwo_start;
+ pda_p->startSector += ftwo_start;
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
+ break;
+ default:
+ RF_PANIC();
+ }
+ pda_p++;
+ }
+
+ RF_ASSERT(pda_p - *pdap == napdas);
+ return;
+}
+#define INIT_DISK_NODE(node,name) \
+rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \
+(node)->succedents[0] = unblockNode; \
+(node)->succedents[1] = recoveryNode; \
+(node)->antecedents[0] = blockNode; \
+(node)->antType[0] = rf_control
+
+#define DISK_NODE_PARAMS(_node_,_p_) \
+ (_node_).params[0].p = _p_ ; \
+ (_node_).params[1].p = (_p_)->bufPtr; \
+ (_node_).params[2].v = parityStripeID; \
+ (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru)
+
+void
+rf_DoubleDegRead(
+ RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList,
+ char *redundantReadNodeName,
+ char *recoveryNodeName,
+ int (*recovFunc) (RF_DagNode_t *))
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ RF_DagNode_t *nodes, *rudNodes, *rrdNodes, *recoveryNode, *blockNode,
+ *unblockNode, *rpNodes, *rqNodes, *termNode;
+ RF_PhysDiskAddr_t *pda, *pqPDAs;
+ RF_PhysDiskAddr_t *npdas;
+ int nNodes, nRrdNodes, nRudNodes, i;
+ RF_ReconUnitNum_t which_ru;
+ int nReadNodes, nPQNodes;
+ RF_PhysDiskAddr_t *failedPDA = asmap->failedPDAs[0];
+ RF_PhysDiskAddr_t *failedPDAtwo = asmap->failedPDAs[1];
+ RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress, &which_ru);
+
+ if (rf_dagDebug)
+ printf("[Creating Double Degraded Read DAG]\n");
+ rf_DD_GenerateFailedAccessASMs(raidPtr, asmap, &npdas, &nRrdNodes, &pqPDAs, &nPQNodes, allocList);
+
+ nRudNodes = asmap->numStripeUnitsAccessed - (asmap->numDataFailed);
+ nReadNodes = nRrdNodes + nRudNodes + 2 * nPQNodes;
+ nNodes = 4 /* block, unblock, recovery, term */ + nReadNodes;
+
+ RF_CallocAndAdd(nodes, nNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
+ i = 0;
+ blockNode = &nodes[i];
+ i += 1;
+ unblockNode = &nodes[i];
+ i += 1;
+ recoveryNode = &nodes[i];
+ i += 1;
+ termNode = &nodes[i];
+ i += 1;
+ rudNodes = &nodes[i];
+ i += nRudNodes;
+ rrdNodes = &nodes[i];
+ i += nRrdNodes;
+ rpNodes = &nodes[i];
+ i += nPQNodes;
+ rqNodes = &nodes[i];
+ i += nPQNodes;
+ RF_ASSERT(i == nNodes);
+
+ dag_h->numSuccedents = 1;
+ dag_h->succedents[0] = blockNode;
+ dag_h->creator = "DoubleDegRead";
+ dag_h->numCommits = 0;
+ dag_h->numCommitNodes = 1; /* unblock */
+
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 2, 0, 0, dag_h, "Trm", allocList);
+ termNode->antecedents[0] = unblockNode;
+ termNode->antType[0] = rf_control;
+ termNode->antecedents[1] = recoveryNode;
+ termNode->antType[1] = rf_control;
+
+ /* init the block and unblock nodes */
+ /* The block node has all nodes except itself, unblock and recovery as
+ * successors. Similarly for predecessors of the unblock. */
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nReadNodes, 0, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(unblockNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nReadNodes, 0, 0, dag_h, "Nil", allocList);
+
+ for (i = 0; i < nReadNodes; i++) {
+ blockNode->succedents[i] = rudNodes + i;
+ unblockNode->antecedents[i] = rudNodes + i;
+ unblockNode->antType[i] = rf_control;
+ }
+ unblockNode->succedents[0] = termNode;
+
+ /* The recovery node has all the reads as predecessors, and the term
+ * node as successors. It gets a pda as a param from each of the read
+	 * nodes plus the raidPtr. For each failed unit it has a result pda. */
+ rf_InitNode(recoveryNode, rf_wait, RF_FALSE, recovFunc, rf_NullNodeUndoFunc, NULL,
+	    1,			/* successors */
+ nReadNodes, /* preds */
+ nReadNodes + 2, /* params */
+ asmap->numDataFailed, /* results */
+ dag_h, recoveryNodeName, allocList);
+
+ recoveryNode->succedents[0] = termNode;
+ for (i = 0; i < nReadNodes; i++) {
+ recoveryNode->antecedents[i] = rudNodes + i;
+ recoveryNode->antType[i] = rf_trueData;
+ }
+
+ /* build the read nodes, then come back and fill in recovery params
+ * and results */
+ pda = asmap->physInfo;
+ for (i = 0; i < nRudNodes; pda = pda->next) {
+ if ((pda == failedPDA) || (pda == failedPDAtwo))
+ continue;
+ INIT_DISK_NODE(rudNodes + i, "Rud");
+ RF_ASSERT(pda);
+ DISK_NODE_PARAMS(rudNodes[i], pda);
+ i++;
+ }
+
+ pda = npdas;
+ for (i = 0; i < nRrdNodes; i++, pda = pda->next) {
+ INIT_DISK_NODE(rrdNodes + i, "Rrd");
+ RF_ASSERT(pda);
+ DISK_NODE_PARAMS(rrdNodes[i], pda);
+ }
+
+ /* redundancy pdas */
+ pda = pqPDAs;
+ INIT_DISK_NODE(rpNodes, "Rp");
+ RF_ASSERT(pda);
+ DISK_NODE_PARAMS(rpNodes[0], pda);
+ pda++;
+ INIT_DISK_NODE(rqNodes, redundantReadNodeName);
+ RF_ASSERT(pda);
+ DISK_NODE_PARAMS(rqNodes[0], pda);
+ if (nPQNodes == 2) {
+ pda++;
+ INIT_DISK_NODE(rpNodes + 1, "Rp");
+ RF_ASSERT(pda);
+ DISK_NODE_PARAMS(rpNodes[1], pda);
+ pda++;
+ INIT_DISK_NODE(rqNodes + 1, redundantReadNodeName);
+ RF_ASSERT(pda);
+ DISK_NODE_PARAMS(rqNodes[1], pda);
+ }
+ /* fill in recovery node params */
+ for (i = 0; i < nReadNodes; i++)
+ recoveryNode->params[i] = rudNodes[i].params[0]; /* pda */
+ recoveryNode->params[i++].p = (void *) raidPtr;
+ recoveryNode->params[i++].p = (void *) asmap;
+ recoveryNode->results[0] = failedPDA;
+ if (asmap->numDataFailed == 2)
+ recoveryNode->results[1] = failedPDAtwo;
+
+ /* zero fill the target data buffers? */
+}
+
+#endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0) */
diff --git a/sys/dev/raidframe/rf_dagdegrd.h b/sys/dev/raidframe/rf_dagdegrd.h
new file mode 100644
index 0000000..2e899d8
--- /dev/null
+++ b/sys/dev/raidframe/rf_dagdegrd.h
@@ -0,0 +1,64 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_dagdegrd.h,v 1.3 1999/02/05 00:06:07 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#ifndef _RF__RF_DAGDEGRD_H_
+#define _RF__RF_DAGDEGRD_H_
+
+#include <dev/raidframe/rf_types.h>
+
+/* degraded read DAG creation routines */
+void
+rf_CreateRaidFiveDegradedReadDAG(RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
+ RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList);
+void
+rf_CreateRaidOneDegradedReadDAG(RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
+ RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList);
+void
+rf_CreateDegradedReadDAG(RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
+ RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList,
+ RF_RedFuncs_t * recFunc);
+void
+rf_CreateRaidCDegradedReadDAG(RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
+ RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList);
+void
+rf_DD_GenerateFailedAccessASMs(RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap, RF_PhysDiskAddr_t ** pdap,
+ int *nNodep, RF_PhysDiskAddr_t ** pqpdap, int *nPQNodep,
+ RF_AllocListElem_t * allocList);
+void
+rf_DoubleDegRead(RF_Raid_t * raidPtr, RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h, void *bp, RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList, char *redundantReadNodeName,
+ char *recoveryNodeName, int (*recovFunc) (RF_DagNode_t *));
+
+#endif /* !_RF__RF_DAGDEGRD_H_ */
diff --git a/sys/dev/raidframe/rf_dagdegwr.c b/sys/dev/raidframe/rf_dagdegwr.c
new file mode 100644
index 0000000..68d1899
--- /dev/null
+++ b/sys/dev/raidframe/rf_dagdegwr.c
@@ -0,0 +1,844 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_dagdegwr.c,v 1.6 2001/01/26 04:05:08 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * rf_dagdegwr.c
+ *
+ * code for creating degraded write DAGs
+ *
+ */
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_dagutils.h>
+#include <dev/raidframe/rf_dagfuncs.h>
+#include <dev/raidframe/rf_debugMem.h>
+#include <dev/raidframe/rf_memchunk.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_dagdegwr.h>
+
+
+/******************************************************************************
+ *
+ * General comments on DAG creation:
+ *
+ * All DAGs in this file use roll-away error recovery. Each DAG has a single
+ * commit node, usually called "Cmt." If an error occurs before the Cmt node
+ * is reached, the execution engine will halt forward execution and work
+ * backward through the graph, executing the undo functions. Assuming that
+ * each node in the graph prior to the Cmt node is undoable and atomic - or -
+ * does not make changes to permanent state, the graph will fail atomically.
+ * If an error occurs after the Cmt node executes, the engine will roll-forward
+ * through the graph, blindly executing nodes until it reaches the end.
+ * If a graph reaches the end, it is assumed to have completed successfully.
+ *
+ * A graph has only 1 Cmt node.
+ *
+ */
+
+
+/******************************************************************************
+ *
+ * The following wrappers map the standard DAG creation interface to the
+ * DAG creation routines. Additionally, these wrappers enable experimentation
+ * with new DAG structures by providing an extra level of indirection, allowing
+ * the DAG creation routines to be replaced at this single point.
+ */
+
+static
+RF_CREATE_DAG_FUNC_DECL(rf_CreateSimpleDegradedWriteDAG)
+{
+ rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp,
+ flags, allocList, 1, rf_RecoveryXorFunc, RF_TRUE);
+}
+
+void
+rf_CreateDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList)
+ RF_Raid_t *raidPtr;
+ RF_AccessStripeMap_t *asmap;
+ RF_DagHeader_t *dag_h;
+ void *bp;
+ RF_RaidAccessFlags_t flags;
+ RF_AllocListElem_t *allocList;
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ RF_PhysDiskAddr_t *failedPDA = asmap->failedPDAs[0];
+
+ RF_ASSERT(asmap->numDataFailed == 1);
+ dag_h->creator = "DegradedWriteDAG";
+
+ /* if the access writes only a portion of the failed unit, and also
+ * writes some portion of at least one surviving unit, we create two
+ * DAGs, one for the failed component and one for the non-failed
+ * component, and do them sequentially. Note that the fact that we're
+ * accessing only a portion of the failed unit indicates that the
+ * access either starts or ends in the failed unit, and hence we need
+ * create only two dags. This is inefficient in that the same data or
+ * parity can get read and written twice using this structure. I need
+ * to fix this to do the access all at once. */
+ RF_ASSERT(!(asmap->numStripeUnitsAccessed != 1 && failedPDA->numSector != layoutPtr->sectorsPerStripeUnit));
+ rf_CreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList);
+}
+
+
+
+/******************************************************************************
+ *
+ * DAG creation code begins here
+ */
+
+
+
+/******************************************************************************
+ *
+ * CommonCreateSimpleDegradedWriteDAG -- creates a DAG to do a degraded-mode
+ * write, which is as follows
+ *
+ * / {Wnq} --\
+ * hdr -> blockNode -> Rod -> Xor -> Cmt -> Wnp ----> unblock -> term
+ * \ {Rod} / \ Wnd ---/
+ * \ {Wnd} -/
+ *
+ * commit nodes: Xor, Wnd
+ *
+ * IMPORTANT:
+ * This DAG generator does not work for double-degraded archs since it does not
+ * generate Q
+ *
+ * This dag is essentially identical to the large-write dag, except that the
+ * write to the failed data unit is suppressed.
+ *
+ * IMPORTANT: this dag does not work in the case where the access writes only
+ * a portion of the failed unit, and also writes some portion of at least one
+ * surviving SU. this case is handled in CreateDegradedWriteDAG above.
+ *
+ * The block & unblock nodes are leftovers from a previous version. They
+ * do nothing, but I haven't deleted them because it would be a tremendous
+ * effort to put them back in.
+ *
+ * This dag is used whenever one of the data units in a write has failed.
+ * If it is the parity unit that failed, the nonredundant write dag (below)
+ * is used.
+ *****************************************************************************/
+
+void
+rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags,
+ allocList, nfaults, redFunc, allowBufferRecycle)
+ RF_Raid_t *raidPtr;
+ RF_AccessStripeMap_t *asmap;
+ RF_DagHeader_t *dag_h;
+ void *bp;
+ RF_RaidAccessFlags_t flags;
+ RF_AllocListElem_t *allocList;
+ int nfaults;
+ int (*redFunc) (RF_DagNode_t *);
+ int allowBufferRecycle;
+{
+ int nNodes, nRrdNodes, nWndNodes, nXorBufs, i, j, paramNum,
+ rdnodesFaked;
+ RF_DagNode_t *blockNode, *unblockNode, *wnpNode, *wnqNode, *termNode;
+ RF_DagNode_t *nodes, *wndNodes, *rrdNodes, *xorNode, *commitNode;
+ RF_SectorCount_t sectorsPerSU;
+ RF_ReconUnitNum_t which_ru;
+ char *xorTargetBuf = NULL; /* the target buffer for the XOR
+ * operation */
+ char *overlappingPDAs;/* a temporary array of flags */
+ RF_AccessStripeMapHeader_t *new_asm_h[2];
+ RF_PhysDiskAddr_t *pda, *parityPDA;
+ RF_StripeNum_t parityStripeID;
+ RF_PhysDiskAddr_t *failedPDA;
+ RF_RaidLayout_t *layoutPtr;
+
+ layoutPtr = &(raidPtr->Layout);
+ parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress,
+ &which_ru);
+ sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
+ /* failedPDA points to the pda within the asm that targets the failed
+ * disk */
+ failedPDA = asmap->failedPDAs[0];
+
+ if (rf_dagDebug)
+ printf("[Creating degraded-write DAG]\n");
+
+ RF_ASSERT(asmap->numDataFailed == 1);
+ dag_h->creator = "SimpleDegradedWriteDAG";
+
+ /*
+ * Generate two ASMs identifying the surviving data
+ * we need in order to recover the lost data.
+ */
+ /* overlappingPDAs array must be zero'd */
+ RF_Calloc(overlappingPDAs, asmap->numStripeUnitsAccessed, sizeof(char), (char *));
+ rf_GenerateFailedAccessASMs(raidPtr, asmap, failedPDA, dag_h, new_asm_h,
+ &nXorBufs, NULL, overlappingPDAs, allocList);
+
+ /* create all the nodes at once */
+ nWndNodes = asmap->numStripeUnitsAccessed - 1; /* no access is
+ * generated for the
+ * failed pda */
+
+ nRrdNodes = ((new_asm_h[0]) ? new_asm_h[0]->stripeMap->numStripeUnitsAccessed : 0) +
+ ((new_asm_h[1]) ? new_asm_h[1]->stripeMap->numStripeUnitsAccessed : 0);
+ /*
+ * XXX
+ *
+ * There's a bug with a complete stripe overwrite- that means 0 reads
+ * of old data, and the rest of the DAG generation code doesn't like
+ * that. A release is coming, and I don't wanna risk breaking a critical
+ * DAG generator, so here's what I'm gonna do- if there's no read nodes,
+ * I'm gonna fake there being a read node, and I'm gonna swap in a
+ * no-op node in its place (to make all the link-up code happy).
+ * This should be fixed at some point. --jimz
+ */
+ if (nRrdNodes == 0) {
+ nRrdNodes = 1;
+ rdnodesFaked = 1;
+ } else {
+ rdnodesFaked = 0;
+ }
+ /* lock, unlock, xor, Wnd, Rrd, W(nfaults) */
+ nNodes = 5 + nfaults + nWndNodes + nRrdNodes;
+ RF_CallocAndAdd(nodes, nNodes, sizeof(RF_DagNode_t),
+ (RF_DagNode_t *), allocList);
+ i = 0;
+ blockNode = &nodes[i];
+ i += 1;
+ commitNode = &nodes[i];
+ i += 1;
+ unblockNode = &nodes[i];
+ i += 1;
+ termNode = &nodes[i];
+ i += 1;
+ xorNode = &nodes[i];
+ i += 1;
+ wnpNode = &nodes[i];
+ i += 1;
+ wndNodes = &nodes[i];
+ i += nWndNodes;
+ rrdNodes = &nodes[i];
+ i += nRrdNodes;
+ if (nfaults == 2) {
+ wnqNode = &nodes[i];
+ i += 1;
+ } else {
+ wnqNode = NULL;
+ }
+ RF_ASSERT(i == nNodes);
+
+ /* this dag can not commit until all rrd and xor Nodes have completed */
+ dag_h->numCommitNodes = 1;
+ dag_h->numCommits = 0;
+ dag_h->numSuccedents = 1;
+
+ RF_ASSERT(nRrdNodes > 0);
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, nRrdNodes, 0, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, nWndNodes + nfaults, 1, 0, 0, dag_h, "Cmt", allocList);
+ rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, 1, nWndNodes + nfaults, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
+ NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
+ rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, rf_NullNodeUndoFunc, NULL, 1,
+ nRrdNodes, 2 * nXorBufs + 2, nfaults, dag_h, "Xrc", allocList);
+
+ /*
+ * Fill in the Rrd nodes. If any of the rrd buffers are the same size as
+ * the failed buffer, save a pointer to it so we can use it as the target
+ * of the XOR. The pdas in the rrd nodes have been range-restricted, so if
+ * a buffer is the same size as the failed buffer, it must also be at the
+ * same alignment within the SU.
+ */
+ i = 0;
+ if (new_asm_h[0]) {
+ for (i = 0, pda = new_asm_h[0]->stripeMap->physInfo;
+ i < new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
+ i++, pda = pda->next) {
+ rf_InitNode(&rrdNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rrd", allocList);
+ RF_ASSERT(pda);
+ rrdNodes[i].params[0].p = pda;
+ rrdNodes[i].params[1].p = pda->bufPtr;
+ rrdNodes[i].params[2].v = parityStripeID;
+ rrdNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ }
+ }
+ /* i now equals the number of stripe units accessed in new_asm_h[0] */
+ if (new_asm_h[1]) {
+ for (j = 0, pda = new_asm_h[1]->stripeMap->physInfo;
+ j < new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
+ j++, pda = pda->next) {
+ rf_InitNode(&rrdNodes[i + j], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rrd", allocList);
+ RF_ASSERT(pda);
+ rrdNodes[i + j].params[0].p = pda;
+ rrdNodes[i + j].params[1].p = pda->bufPtr;
+ rrdNodes[i + j].params[2].v = parityStripeID;
+ rrdNodes[i + j].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ if (allowBufferRecycle && (pda->numSector == failedPDA->numSector))
+ xorTargetBuf = pda->bufPtr;
+ }
+ }
+ if (rdnodesFaked) {
+ /*
+ * This is where we'll init that fake noop read node
+ * (XXX should the wakeup func be different?)
+ */
+ rf_InitNode(&rrdNodes[0], rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, 1, 1, 0, 0, dag_h, "RrN", allocList);
+ }
+ /*
+ * Make a PDA for the parity unit. The parity PDA should start at
+ * the same offset into the SU as the failed PDA.
+ */
+ /* Danner comment: I don't think this copy is really necessary. We are
+ * in one of two cases here. (1) The entire failed unit is written.
+ * Then asmap->parityInfo will describe the entire parity. (2) We are
+ * only writing a subset of the failed unit and nothing else. Then the
+ * asmap->parityInfo describes the failed unit and the copy can also
+ * be avoided. */
+
+ RF_MallocAndAdd(parityPDA, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
+ parityPDA->row = asmap->parityInfo->row;
+ parityPDA->col = asmap->parityInfo->col;
+ parityPDA->startSector = ((asmap->parityInfo->startSector / sectorsPerSU)
+ * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU);
+ parityPDA->numSector = failedPDA->numSector;
+
+ if (!xorTargetBuf) {
+ RF_CallocAndAdd(xorTargetBuf, 1,
+ rf_RaidAddressToByte(raidPtr, failedPDA->numSector), (char *), allocList);
+ }
+ /* init the Wnp node */
+ rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnp", allocList);
+ wnpNode->params[0].p = parityPDA;
+ wnpNode->params[1].p = xorTargetBuf;
+ wnpNode->params[2].v = parityStripeID;
+ wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+
+ /* fill in the Wnq Node */
+ if (nfaults == 2) {
+ {
+ RF_MallocAndAdd(parityPDA, sizeof(RF_PhysDiskAddr_t),
+ (RF_PhysDiskAddr_t *), allocList);
+ parityPDA->row = asmap->qInfo->row;
+ parityPDA->col = asmap->qInfo->col;
+ parityPDA->startSector = ((asmap->qInfo->startSector / sectorsPerSU)
+ * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU);
+ parityPDA->numSector = failedPDA->numSector;
+
+ rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnq", allocList);
+ wnqNode->params[0].p = parityPDA;
+ RF_CallocAndAdd(xorNode->results[1], 1,
+ rf_RaidAddressToByte(raidPtr, failedPDA->numSector), (char *), allocList);
+ wnqNode->params[1].p = xorNode->results[1];
+ wnqNode->params[2].v = parityStripeID;
+ wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ }
+ }
+ /* fill in the Wnd nodes */
+ for (pda = asmap->physInfo, i = 0; i < nWndNodes; i++, pda = pda->next) {
+ if (pda == failedPDA) {
+ i--;
+ continue;
+ }
+ rf_InitNode(&wndNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
+ RF_ASSERT(pda);
+ wndNodes[i].params[0].p = pda;
+ wndNodes[i].params[1].p = pda->bufPtr;
+ wndNodes[i].params[2].v = parityStripeID;
+ wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ }
+
+ /* fill in the results of the xor node */
+ xorNode->results[0] = xorTargetBuf;
+
+ /* fill in the params of the xor node */
+
+ paramNum = 0;
+ if (rdnodesFaked == 0) {
+ for (i = 0; i < nRrdNodes; i++) {
+ /* all the Rrd nodes need to be xored together */
+ xorNode->params[paramNum++] = rrdNodes[i].params[0];
+ xorNode->params[paramNum++] = rrdNodes[i].params[1];
+ }
+ }
+ for (i = 0; i < nWndNodes; i++) {
+ /* any Wnd nodes that overlap the failed access need to be
+ * xored in */
+ if (overlappingPDAs[i]) {
+ RF_MallocAndAdd(pda, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
+ bcopy((char *) wndNodes[i].params[0].p, (char *) pda, sizeof(RF_PhysDiskAddr_t));
+ rf_RangeRestrictPDA(raidPtr, failedPDA, pda, RF_RESTRICT_DOBUFFER, 0);
+ xorNode->params[paramNum++].p = pda;
+ xorNode->params[paramNum++].p = pda->bufPtr;
+ }
+ }
+ RF_Free(overlappingPDAs, asmap->numStripeUnitsAccessed * sizeof(char));
+
+ /*
+ * Install the failed PDA into the xor param list so that the
+ * new data gets xor'd in.
+ */
+ xorNode->params[paramNum++].p = failedPDA;
+ xorNode->params[paramNum++].p = failedPDA->bufPtr;
+
+ /*
+ * The last 2 params to the recovery xor node are always the failed
+ * PDA and the raidPtr. install the failedPDA even though we have just
+ * done so above. This allows us to use the same XOR function for both
+ * degraded reads and degraded writes.
+ */
+ xorNode->params[paramNum++].p = failedPDA;
+ xorNode->params[paramNum++].p = raidPtr;
+ RF_ASSERT(paramNum == 2 * nXorBufs + 2);
+
+ /*
+ * Code to link nodes begins here
+ */
+
+ /* link header to block node */
+ RF_ASSERT(blockNode->numAntecedents == 0);
+ dag_h->succedents[0] = blockNode;
+
+ /* link block node to rd nodes */
+ RF_ASSERT(blockNode->numSuccedents == nRrdNodes);
+ for (i = 0; i < nRrdNodes; i++) {
+ RF_ASSERT(rrdNodes[i].numAntecedents == 1);
+ blockNode->succedents[i] = &rrdNodes[i];
+ rrdNodes[i].antecedents[0] = blockNode;
+ rrdNodes[i].antType[0] = rf_control;
+ }
+
+ /* link read nodes to xor node */
+ RF_ASSERT(xorNode->numAntecedents == nRrdNodes);
+ for (i = 0; i < nRrdNodes; i++) {
+ RF_ASSERT(rrdNodes[i].numSuccedents == 1);
+ rrdNodes[i].succedents[0] = xorNode;
+ xorNode->antecedents[i] = &rrdNodes[i];
+ xorNode->antType[i] = rf_trueData;
+ }
+
+ /* link xor node to commit node */
+ RF_ASSERT(xorNode->numSuccedents == 1);
+ RF_ASSERT(commitNode->numAntecedents == 1);
+ xorNode->succedents[0] = commitNode;
+ commitNode->antecedents[0] = xorNode;
+ commitNode->antType[0] = rf_control;
+
+ /* link commit node to wnd nodes */
+ RF_ASSERT(commitNode->numSuccedents == nfaults + nWndNodes);
+ for (i = 0; i < nWndNodes; i++) {
+ RF_ASSERT(wndNodes[i].numAntecedents == 1);
+ commitNode->succedents[i] = &wndNodes[i];
+ wndNodes[i].antecedents[0] = commitNode;
+ wndNodes[i].antType[0] = rf_control;
+ }
+
+ /* link the commit node to wnp, wnq nodes */
+ RF_ASSERT(wnpNode->numAntecedents == 1);
+ commitNode->succedents[nWndNodes] = wnpNode;
+ wnpNode->antecedents[0] = commitNode;
+ wnpNode->antType[0] = rf_control;
+ if (nfaults == 2) {
+ RF_ASSERT(wnqNode->numAntecedents == 1);
+ commitNode->succedents[nWndNodes + 1] = wnqNode;
+ wnqNode->antecedents[0] = commitNode;
+ wnqNode->antType[0] = rf_control;
+ }
+ /* link write new data nodes to unblock node */
+ RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nfaults));
+ for (i = 0; i < nWndNodes; i++) {
+ RF_ASSERT(wndNodes[i].numSuccedents == 1);
+ wndNodes[i].succedents[0] = unblockNode;
+ unblockNode->antecedents[i] = &wndNodes[i];
+ unblockNode->antType[i] = rf_control;
+ }
+
+ /* link write new parity node to unblock node */
+ RF_ASSERT(wnpNode->numSuccedents == 1);
+ wnpNode->succedents[0] = unblockNode;
+ unblockNode->antecedents[nWndNodes] = wnpNode;
+ unblockNode->antType[nWndNodes] = rf_control;
+
+ /* link write new q node to unblock node */
+ if (nfaults == 2) {
+ RF_ASSERT(wnqNode->numSuccedents == 1);
+ wnqNode->succedents[0] = unblockNode;
+ unblockNode->antecedents[nWndNodes + 1] = wnqNode;
+ unblockNode->antType[nWndNodes + 1] = rf_control;
+ }
+ /* link unblock node to term node */
+ RF_ASSERT(unblockNode->numSuccedents == 1);
+ RF_ASSERT(termNode->numAntecedents == 1);
+ RF_ASSERT(termNode->numSuccedents == 0);
+ unblockNode->succedents[0] = termNode;
+ termNode->antecedents[0] = unblockNode;
+ termNode->antType[0] = rf_control;
+}
+/*
+ * CONS_PDA(if,start,num): fill in the PDA pointed at by pda_p from the
+ * asmap field named by `if` (parityInfo or qInfo): same row/col as that
+ * field, start offset `start` within the stripe unit, `num` sectors, and
+ * a freshly allocated data buffer. NOTE: relies on pda_p, asmap, secPerSU,
+ * raidPtr and allocList being in scope at the expansion site, and does
+ * not advance pda_p — callers do pda_p++ themselves.
+ */
+#define CONS_PDA(if,start,num) \
+ pda_p->row = asmap->if->row; pda_p->col = asmap->if->col; \
+ pda_p->startSector = ((asmap->if->startSector / secPerSU) * secPerSU) + start; \
+ pda_p->numSector = num; \
+ pda_p->next = NULL; \
+ RF_MallocAndAdd(pda_p->bufPtr,rf_RaidAddressToByte(raidPtr,num),(char *), allocList)
+#if (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_EVENODD > 0)
+/*
+ * Build the physical disk addresses needed by a double-degraded small
+ * write: allocates PDAs for the P and Q redundancy units (*pqpdap,
+ * *nPQNodep pairs) and a linked list of PDAs (*pdap, *nNodep entries)
+ * covering the data stripe units that must be read back to recompute
+ * P/Q, skipping dead disks. All memory is charged to allocList.
+ * Internal `state` selects the layout: 1 = one failed unit, 2 = two
+ * failed units covering more than a stripe unit (full-SU PDAs),
+ * 3 = two disjoint slabs (two PDAs per surviving column).
+ */
+void
+rf_WriteGenerateFailedAccessASMs(
+ RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap,
+ RF_PhysDiskAddr_t ** pdap,
+ int *nNodep,
+ RF_PhysDiskAddr_t ** pqpdap,
+ int *nPQNodep,
+ RF_AllocListElem_t * allocList)
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ int PDAPerDisk, i;
+ RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
+ int numDataCol = layoutPtr->numDataCol;
+ int state;
+ unsigned napdas;
+ RF_SectorNum_t fone_start, fone_end, ftwo_start = 0, ftwo_end;
+ RF_PhysDiskAddr_t *fone = asmap->failedPDAs[0], *ftwo = asmap->failedPDAs[1];
+ RF_PhysDiskAddr_t *pda_p;
+ RF_RaidAddr_t sosAddr;
+
+ /* determine how many pda's we will have to generate per unaccessed
+ * stripe. If there is only one failed data unit, it is one; if two,
+ * possibly two, depending whether they overlap. */
+
+ fone_start = rf_StripeUnitOffset(layoutPtr, fone->startSector);
+ fone_end = fone_start + fone->numSector;
+ /* NOTE(review): fone_end and ftwo_end are computed but never read in
+ * this function — candidates for removal upstream. */
+
+ if (asmap->numDataFailed == 1) {
+ PDAPerDisk = 1;
+ state = 1;
+ RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
+ pda_p = *pqpdap;
+ /* build p */
+ CONS_PDA(parityInfo, fone_start, fone->numSector);
+ pda_p->type = RF_PDA_TYPE_PARITY;
+ pda_p++;
+ /* build q */
+ CONS_PDA(qInfo, fone_start, fone->numSector);
+ pda_p->type = RF_PDA_TYPE_Q;
+ } else {
+ ftwo_start = rf_StripeUnitOffset(layoutPtr, ftwo->startSector);
+ ftwo_end = ftwo_start + ftwo->numSector;
+ if (fone->numSector + ftwo->numSector > secPerSU) {
+ /* the two failed regions cover more than one SU:
+ * read/write P and Q for the whole stripe unit */
+ PDAPerDisk = 1;
+ state = 2;
+ RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
+ pda_p = *pqpdap;
+ CONS_PDA(parityInfo, 0, secPerSU);
+ pda_p->type = RF_PDA_TYPE_PARITY;
+ pda_p++;
+ CONS_PDA(qInfo, 0, secPerSU);
+ pda_p->type = RF_PDA_TYPE_Q;
+ } else {
+ PDAPerDisk = 2;
+ state = 3;
+ /* four of them, fone, then ftwo */
+ RF_MallocAndAdd(*pqpdap, 4 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
+ pda_p = *pqpdap;
+ CONS_PDA(parityInfo, fone_start, fone->numSector);
+ pda_p->type = RF_PDA_TYPE_PARITY;
+ pda_p++;
+ CONS_PDA(qInfo, fone_start, fone->numSector);
+ pda_p->type = RF_PDA_TYPE_Q;
+ pda_p++;
+ CONS_PDA(parityInfo, ftwo_start, ftwo->numSector);
+ pda_p->type = RF_PDA_TYPE_PARITY;
+ pda_p++;
+ CONS_PDA(qInfo, ftwo_start, ftwo->numSector);
+ pda_p->type = RF_PDA_TYPE_Q;
+ }
+ }
+ /* figure out number of nonaccessed pda */
+ /* NOTE(review): (numDataCol - 2) presumably excludes the two failed
+ * data columns, even in the single-failure case — confirm against
+ * callers (rf_DoubleDegSmallWrite asserts numDataFailed == 1). */
+ napdas = PDAPerDisk * (numDataCol - 2);
+ *nPQNodep = PDAPerDisk;
+
+ *nNodep = napdas;
+ if (napdas == 0)
+ return; /* short circuit */
+
+ /* allocate up our list of pda's */
+
+ RF_CallocAndAdd(pda_p, napdas, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
+ *pdap = pda_p;
+
+ /* link them together; the last entry keeps the NULL next from calloc */
+ for (i = 0; i < (napdas - 1); i++)
+ pda_p[i].next = pda_p + (i + 1);
+
+ /* walk every data column of the stripe, filling in a PDA (or two, in
+ * state 3) per surviving, non-failed column until napdas are built */
+ sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
+ for (i = 0; i < numDataCol; i++) {
+ if ((pda_p - (*pdap)) == napdas)
+ continue;
+ pda_p->type = RF_PDA_TYPE_DATA;
+ pda_p->raidAddress = sosAddr + (i * secPerSU);
+ (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
+ /* skip over dead disks */
+ if (RF_DEAD_DISK(raidPtr->Disks[pda_p->row][pda_p->col].status))
+ continue;
+ switch (state) {
+ case 1: /* fone */
+ pda_p->numSector = fone->numSector;
+ pda_p->raidAddress += fone_start;
+ pda_p->startSector += fone_start;
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
+ break;
+ case 2: /* full stripe */
+ pda_p->numSector = secPerSU;
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, secPerSU), (char *), allocList);
+ break;
+ case 3: /* two slabs */
+ pda_p->numSector = fone->numSector;
+ pda_p->raidAddress += fone_start;
+ pda_p->startSector += fone_start;
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
+ pda_p++;
+ pda_p->type = RF_PDA_TYPE_DATA;
+ pda_p->raidAddress = sosAddr + (i * secPerSU);
+ (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
+ pda_p->numSector = ftwo->numSector;
+ pda_p->raidAddress += ftwo_start;
+ pda_p->startSector += ftwo_start;
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
+ break;
+ default:
+ RF_PANIC();
+ }
+ pda_p++;
+ }
+
+ RF_ASSERT(pda_p - *pdap == napdas);
+ return;
+}
+/* Extract the PDA stashed in a disk node's first parameter slot. */
+#define DISK_NODE_PDA(node) ((node)->params[0].p)
+
+/*
+ * Fill in the four standard parameters of a disk read/write node:
+ * the PDA, its data buffer, the parity stripe ID, and the packed
+ * priority/reconstruction-unit word. Relies on parityStripeID and
+ * which_ru being in scope at the expansion site.
+ */
+#define DISK_NODE_PARAMS(_node_,_p_) \
+ (_node_).params[0].p = _p_ ; \
+ (_node_).params[1].p = (_p_)->bufPtr; \
+ (_node_).params[2].v = parityStripeID; \
+ (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru)
+
+/*
+ * Create a DAG for a small write while two units of the stripe are
+ * failed (one of them a data unit — asserted below). The first half is
+ * a reconstruct-read of the failed data; the second half writes every
+ * non-failed data unit plus new P and Q. The redundancy node names and
+ * the recovery function are supplied by the caller so this routine can
+ * serve different archs (e.g. P+Q vs. EvenOdd).
+ */
+void
+rf_DoubleDegSmallWrite(
+ RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList,
+ char *redundantReadNodeName,
+ char *redundantWriteNodeName,
+ char *recoveryNodeName,
+ int (*recovFunc) (RF_DagNode_t *))
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ RF_DagNode_t *nodes, *wudNodes, *rrdNodes, *recoveryNode, *blockNode,
+ *unblockNode, *rpNodes, *rqNodes, *wpNodes, *wqNodes, *termNode;
+ RF_PhysDiskAddr_t *pda, *pqPDAs;
+ RF_PhysDiskAddr_t *npdas;
+ int nWriteNodes, nNodes, nReadNodes, nRrdNodes, nWudNodes, i;
+ RF_ReconUnitNum_t which_ru;
+ int nPQNodes;
+ RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress, &which_ru);
+
+ /* simple small write case - First part looks like a reconstruct-read
+ * of the failed data units. Then a write of all data units not
+ * failed. */
+
+
+ /* DAG structure:
+ *
+ * Hdr -> Block -> Rrd ... Rrd, Rp, Rq -> Recovery
+ * -> Wud ... Wud, Wp, Wq -> Unblock -> Trm
+ *
+ * Rrd = read recovery data (potentially none)
+ * Wud = write user data (not incl. failed disks)
+ * Wp = Write P (could be two)
+ * Wq = Write Q (could be two)
+ */
+
+ rf_WriteGenerateFailedAccessASMs(raidPtr, asmap, &npdas, &nRrdNodes, &pqPDAs, &nPQNodes, allocList);
+
+ RF_ASSERT(asmap->numDataFailed == 1);
+
+ nWudNodes = asmap->numStripeUnitsAccessed - (asmap->numDataFailed);
+ nReadNodes = nRrdNodes + 2 * nPQNodes;
+ nWriteNodes = nWudNodes + 2 * nPQNodes;
+ nNodes = 4 + nReadNodes + nWriteNodes;
+
+ /* Allocate one contiguous node array. The wiring loops below depend
+ * on this layout: rrd/rp/rq are adjacent (so `rrdNodes + i` spans all
+ * read nodes) and wud/wp/wq are adjacent (so `wudNodes + i` spans all
+ * write nodes). */
+ RF_CallocAndAdd(nodes, nNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
+ blockNode = nodes;
+ unblockNode = blockNode + 1;
+ termNode = unblockNode + 1;
+ recoveryNode = termNode + 1;
+ rrdNodes = recoveryNode + 1;
+ rpNodes = rrdNodes + nRrdNodes;
+ rqNodes = rpNodes + nPQNodes;
+ wudNodes = rqNodes + nPQNodes;
+ wpNodes = wudNodes + nWudNodes;
+ wqNodes = wpNodes + nPQNodes;
+
+ dag_h->creator = "PQ_DDSimpleSmallWrite";
+ dag_h->numSuccedents = 1;
+ dag_h->succedents[0] = blockNode;
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
+ termNode->antecedents[0] = unblockNode;
+ termNode->antType[0] = rf_control;
+
+ /* init the block and unblock nodes */
+ /* The block node has all the read nodes as successors */
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nReadNodes, 0, 0, 0, dag_h, "Nil", allocList);
+ for (i = 0; i < nReadNodes; i++)
+ blockNode->succedents[i] = rrdNodes + i;
+
+ /* The unblock node has all the writes as successors */
+ rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nWriteNodes, 0, 0, dag_h, "Nil", allocList);
+ for (i = 0; i < nWriteNodes; i++) {
+ unblockNode->antecedents[i] = wudNodes + i;
+ unblockNode->antType[i] = rf_control;
+ }
+ unblockNode->succedents[0] = termNode;
+
+/* Init a disk-read node wired between the block and recovery nodes. */
+#define INIT_READ_NODE(node,name) \
+ rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, allocList); \
+ (node)->succedents[0] = recoveryNode; \
+ (node)->antecedents[0] = blockNode; \
+ (node)->antType[0] = rf_control;
+
+ /* build the read nodes */
+ pda = npdas;
+ for (i = 0; i < nRrdNodes; i++, pda = pda->next) {
+ INIT_READ_NODE(rrdNodes + i, "rrd");
+ DISK_NODE_PARAMS(rrdNodes[i], pda);
+ }
+
+ /* read redundancy pdas (pqPDAs holds P,Q[,P2,Q2] consecutively) */
+ pda = pqPDAs;
+ INIT_READ_NODE(rpNodes, "Rp");
+ RF_ASSERT(pda);
+ DISK_NODE_PARAMS(rpNodes[0], pda);
+ pda++;
+ INIT_READ_NODE(rqNodes, redundantReadNodeName);
+ RF_ASSERT(pda);
+ DISK_NODE_PARAMS(rqNodes[0], pda);
+ if (nPQNodes == 2) {
+ pda++;
+ INIT_READ_NODE(rpNodes + 1, "Rp");
+ RF_ASSERT(pda);
+ DISK_NODE_PARAMS(rpNodes[1], pda);
+ pda++;
+ INIT_READ_NODE(rqNodes + 1, redundantReadNodeName);
+ RF_ASSERT(pda);
+ DISK_NODE_PARAMS(rqNodes[1], pda);
+ }
+ /* the recovery node has all reads as predecessors and all writes as
+ * successors. It generates a result for every write P or write Q
+ * node. As parameters, it takes a pda per read and a pda per stripe
+ * of user data written. It also takes as the last params the raidPtr
+ * and asm. For results, it takes PDA for P & Q. */
+
+
+ rf_InitNode(recoveryNode, rf_wait, RF_FALSE, recovFunc, rf_NullNodeUndoFunc, NULL,
+ nWriteNodes, /* successors */
+ nReadNodes, /* preds */
+ nReadNodes + nWudNodes + 3, /* params */
+ 2 * nPQNodes, /* results */
+ dag_h, recoveryNodeName, allocList);
+
+
+
+ for (i = 0; i < nReadNodes; i++) {
+ recoveryNode->antecedents[i] = rrdNodes + i;
+ recoveryNode->antType[i] = rf_control;
+ recoveryNode->params[i].p = DISK_NODE_PDA(rrdNodes + i);
+ }
+ for (i = 0; i < nWudNodes; i++) {
+ recoveryNode->succedents[i] = wudNodes + i;
+ }
+ recoveryNode->params[nReadNodes + nWudNodes].p = asmap->failedPDAs[0];
+ recoveryNode->params[nReadNodes + nWudNodes + 1].p = raidPtr;
+ recoveryNode->params[nReadNodes + nWudNodes + 2].p = asmap;
+
+ /* continue i past the wud nodes into wp/wq (contiguous array) */
+ for (; i < nWriteNodes; i++)
+ recoveryNode->succedents[i] = wudNodes + i;
+
+ pda = pqPDAs;
+ recoveryNode->results[0] = pda;
+ pda++;
+ recoveryNode->results[1] = pda;
+ if (nPQNodes == 2) {
+ pda++;
+ recoveryNode->results[2] = pda;
+ pda++;
+ recoveryNode->results[3] = pda;
+ }
+ /* fill writes */
+/* Init a disk-write node wired between the recovery and unblock nodes. */
+#define INIT_WRITE_NODE(node,name) \
+ rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, allocList); \
+ (node)->succedents[0] = unblockNode; \
+ (node)->antecedents[0] = recoveryNode; \
+ (node)->antType[0] = rf_control;
+
+ pda = asmap->physInfo;
+ for (i = 0; i < nWudNodes; i++) {
+ INIT_WRITE_NODE(wudNodes + i, "Wd");
+ DISK_NODE_PARAMS(wudNodes[i], pda);
+ recoveryNode->params[nReadNodes + i].p = DISK_NODE_PDA(wudNodes + i);
+ pda = pda->next;
+ }
+ /* write redundancy pdas */
+ pda = pqPDAs;
+ INIT_WRITE_NODE(wpNodes, "Wp");
+ RF_ASSERT(pda);
+ DISK_NODE_PARAMS(wpNodes[0], pda);
+ pda++;
+ INIT_WRITE_NODE(wqNodes, "Wq");
+ RF_ASSERT(pda);
+ DISK_NODE_PARAMS(wqNodes[0], pda);
+ if (nPQNodes == 2) {
+ pda++;
+ INIT_WRITE_NODE(wpNodes + 1, "Wp");
+ RF_ASSERT(pda);
+ DISK_NODE_PARAMS(wpNodes[1], pda);
+ pda++;
+ INIT_WRITE_NODE(wqNodes + 1, "Wq");
+ RF_ASSERT(pda);
+ DISK_NODE_PARAMS(wqNodes[1], pda);
+ }
+}
+#endif /* (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_EVENODD > 0) */
diff --git a/sys/dev/raidframe/rf_dagdegwr.h b/sys/dev/raidframe/rf_dagdegwr.h
new file mode 100644
index 0000000..1e4b5e2
--- /dev/null
+++ b/sys/dev/raidframe/rf_dagdegwr.h
@@ -0,0 +1,55 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_dagdegwr.h,v 1.4 1999/08/15 02:36:03 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+
+#ifndef _RF__RF_DAGDEGWR_H_
+#define _RF__RF_DAGDEGWR_H_
+
+/* degraded write DAG creation routines */
+void rf_CreateDegradedWriteDAG(RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
+ RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList);
+
+void rf_CommonCreateSimpleDegradedWriteDAG(RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
+ RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList,
+ int nfaults, int (*redFunc) (RF_DagNode_t *), int allowBufferRecycle);
+
+void rf_WriteGenerateFailedAccessASMs(RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap, RF_PhysDiskAddr_t ** pdap,
+ int *nNodep, RF_PhysDiskAddr_t ** pqpdap,
+ int *nPQNodep, RF_AllocListElem_t * allocList);
+
+void rf_DoubleDegSmallWrite(RF_Raid_t * raidPtr, RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h, void *bp, RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList, char *redundantReadNodeName,
+ char *redundantWriteNodeName, char *recoveryNodeName,
+ int (*recovFunc) (RF_DagNode_t *));
+
+#endif /* !_RF__RF_DAGDEGWR_H_ */
diff --git a/sys/dev/raidframe/rf_dagffrd.c b/sys/dev/raidframe/rf_dagffrd.c
new file mode 100644
index 0000000..51f3f9f
--- /dev/null
+++ b/sys/dev/raidframe/rf_dagffrd.c
@@ -0,0 +1,439 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_dagffrd.c,v 1.4 2000/01/07 03:40:58 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * rf_dagffrd.c
+ *
+ * code for creating fault-free read DAGs
+ *
+ */
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_dagutils.h>
+#include <dev/raidframe/rf_dagfuncs.h>
+#include <dev/raidframe/rf_debugMem.h>
+#include <dev/raidframe/rf_memchunk.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_dagffrd.h>
+
+/******************************************************************************
+ *
+ * General comments on DAG creation:
+ *
+ * All DAGs in this file use roll-away error recovery. Each DAG has a single
+ * commit node, usually called "Cmt." If an error occurs before the Cmt node
+ * is reached, the execution engine will halt forward execution and work
+ * backward through the graph, executing the undo functions. Assuming that
+ * each node in the graph prior to the Cmt node are undoable and atomic - or -
+ * does not make changes to permanent state, the graph will fail atomically.
+ * If an error occurs after the Cmt node executes, the engine will roll-forward
+ * through the graph, blindly executing nodes until it reaches the end.
+ * If a graph reaches the end, it is assumed to have completed successfully.
+ *
+ * A graph has only 1 Cmt node.
+ *
+ */
+
+
+/******************************************************************************
+ *
+ * The following wrappers map the standard DAG creation interface to the
+ * DAG creation routines. Additionally, these wrappers enable experimentation
+ * with new DAG structures by providing an extra level of indirection, allowing
+ * the DAG creation routines to be replaced at this single point.
+ */
+
+/*
+ * Standard DAG-creation entry point for a fault-free read: simply
+ * delegates to the nonredundant DAG builder in read mode.
+ */
+void
+rf_CreateFaultFreeReadDAG(
+ RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList)
+{
+ rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp,
+ flags, allocList, RF_IO_TYPE_READ);
+}
+
+
+/******************************************************************************
+ *
+ * DAG creation code begins here
+ */
+
+/******************************************************************************
+ *
+ * creates a DAG to perform a nonredundant read or write of data within one
+ * stripe.
+ * For reads, this DAG is as follows:
+ *
+ * /---- read ----\
+ * Header -- Block ---- read ---- Commit -- Terminate
+ * \---- read ----/
+ *
+ * For writes, this DAG is as follows:
+ *
+ * /---- write ----\
+ * Header -- Commit ---- write ---- Block -- Terminate
+ * \---- write ----/
+ *
+ * There is one disk node per stripe unit accessed, and all disk nodes are in
+ * parallel.
+ *
+ * Tricky point here: The first disk node (read or write) is created
+ * normally. Subsequent disk nodes are created by copying the first one,
+ * and modifying a few params. The "succedents" and "antecedents" fields are
+ * _not_ re-created in each node, but rather left pointing to the same array
+ * that was malloc'd when the first node was created. Thus, it's essential
+ * that when this DAG is freed, the succedents and antecedents fields be freed
+ * in ONLY ONE of the read nodes. This does not apply to the "params" field
+ * because it is recreated for each READ node.
+ *
+ * Note that normal-priority accesses do not need to be tagged with their
+ * parity stripe ID, because they will never be promoted. Hence, I've
+ * commented-out the code to do this, and marked it with UNNEEDED.
+ *
+ *****************************************************************************/
+
+/*
+ * Build a DAG for a nonredundant read or write of one stripe: one disk
+ * node per stripe unit accessed, all in parallel, bracketed by block,
+ * commit and terminate nodes. For reads the commit node follows the
+ * disk nodes; for writes it precedes them (writes commit immediately).
+ * `type` must be RF_IO_TYPE_READ or RF_IO_TYPE_WRITE (asserted).
+ */
+void
+rf_CreateNonredundantDAG(
+ RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList,
+ RF_IoType_t type)
+{
+ RF_DagNode_t *nodes, *diskNodes, *blockNode, *commitNode, *termNode;
+ RF_PhysDiskAddr_t *pda = asmap->physInfo;
+ int (*doFunc) (RF_DagNode_t *), (*undoFunc) (RF_DagNode_t *);
+ int i, n, totalNumNodes;
+ char *name;
+
+ n = asmap->numStripeUnitsAccessed;
+ dag_h->creator = "NonredundantDAG";
+
+ /* select the disk do/undo functions and node label for the I/O type */
+ RF_ASSERT(RF_IO_IS_R_OR_W(type));
+ switch (type) {
+ case RF_IO_TYPE_READ:
+ doFunc = rf_DiskReadFunc;
+ undoFunc = rf_DiskReadUndoFunc;
+ name = "R ";
+ if (rf_dagDebug)
+ printf("[Creating non-redundant read DAG]\n");
+ break;
+ case RF_IO_TYPE_WRITE:
+ doFunc = rf_DiskWriteFunc;
+ undoFunc = rf_DiskWriteUndoFunc;
+ name = "W ";
+ if (rf_dagDebug)
+ printf("[Creating non-redundant write DAG]\n");
+ break;
+ default:
+ RF_PANIC();
+ }
+
+ /*
+ * For reads, the dag can not commit until the block node is reached.
+ * for writes, the dag commits immediately.
+ */
+ dag_h->numCommitNodes = 1;
+ dag_h->numCommits = 0;
+ dag_h->numSuccedents = 1;
+
+ /*
+ * Node count:
+ * 1 block node
+ * n data reads (or writes)
+ * 1 commit node
+ * 1 terminator node
+ */
+ RF_ASSERT(n > 0);
+ totalNumNodes = n + 3;
+ RF_CallocAndAdd(nodes, totalNumNodes, sizeof(RF_DagNode_t),
+ (RF_DagNode_t *), allocList);
+ i = 0;
+ diskNodes = &nodes[i];
+ i += n;
+ blockNode = &nodes[i];
+ i += 1;
+ commitNode = &nodes[i];
+ i += 1;
+ termNode = &nodes[i];
+ i += 1;
+ RF_ASSERT(i == totalNumNodes);
+
+ /* initialize nodes; antecedent/successor counts differ per I/O type
+ * because the commit node sits after the disk nodes for reads but
+ * before them for writes */
+ switch (type) {
+ case RF_IO_TYPE_READ:
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, n, 0, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, 1, n, 0, 0, dag_h, "Cmt", allocList);
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
+ NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
+ break;
+ case RF_IO_TYPE_WRITE:
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, 1, 0, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, n, 1, 0, 0, dag_h, "Cmt", allocList);
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
+ NULL, 0, n, 0, 0, dag_h, "Trm", allocList);
+ break;
+ default:
+ RF_PANIC();
+ }
+
+ for (i = 0; i < n; i++) {
+ RF_ASSERT(pda != NULL);
+ rf_InitNode(&diskNodes[i], rf_wait, RF_FALSE, doFunc, undoFunc, rf_GenericWakeupFunc,
+ 1, 1, 4, 0, dag_h, name, allocList);
+ diskNodes[i].params[0].p = pda;
+ diskNodes[i].params[1].p = pda->bufPtr;
+ /* parity stripe id is not necessary */
+ diskNodes[i].params[2].v = 0;
+ diskNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, 0);
+ pda = pda->next;
+ }
+
+ /*
+ * Connect nodes.
+ */
+
+ /* connect hdr to block node */
+ RF_ASSERT(blockNode->numAntecedents == 0);
+ dag_h->succedents[0] = blockNode;
+
+ if (type == RF_IO_TYPE_READ) {
+ /* connecting a nonredundant read DAG:
+ * Block -> read ... read -> Cmt -> Trm */
+ RF_ASSERT(blockNode->numSuccedents == n);
+ RF_ASSERT(commitNode->numAntecedents == n);
+ for (i = 0; i < n; i++) {
+ /* connect block node to each read node */
+ RF_ASSERT(diskNodes[i].numAntecedents == 1);
+ blockNode->succedents[i] = &diskNodes[i];
+ diskNodes[i].antecedents[0] = blockNode;
+ diskNodes[i].antType[0] = rf_control;
+
+ /* connect each read node to the commit node */
+ RF_ASSERT(diskNodes[i].numSuccedents == 1);
+ diskNodes[i].succedents[0] = commitNode;
+ commitNode->antecedents[i] = &diskNodes[i];
+ commitNode->antType[i] = rf_control;
+ }
+ /* connect the commit node to the term node */
+ RF_ASSERT(commitNode->numSuccedents == 1);
+ RF_ASSERT(termNode->numAntecedents == 1);
+ RF_ASSERT(termNode->numSuccedents == 0);
+ commitNode->succedents[0] = termNode;
+ termNode->antecedents[0] = commitNode;
+ termNode->antType[0] = rf_control;
+ } else {
+ /* connecting a nonredundant write DAG:
+ * Block -> Cmt -> write ... write -> Trm */
+ /* connect the block node to the commit node */
+ RF_ASSERT(blockNode->numSuccedents == 1);
+ RF_ASSERT(commitNode->numAntecedents == 1);
+ blockNode->succedents[0] = commitNode;
+ commitNode->antecedents[0] = blockNode;
+ commitNode->antType[0] = rf_control;
+
+ RF_ASSERT(commitNode->numSuccedents == n);
+ RF_ASSERT(termNode->numAntecedents == n);
+ RF_ASSERT(termNode->numSuccedents == 0);
+ for (i = 0; i < n; i++) {
+ /* connect the commit node to each write node */
+ RF_ASSERT(diskNodes[i].numAntecedents == 1);
+ commitNode->succedents[i] = &diskNodes[i];
+ diskNodes[i].antecedents[0] = commitNode;
+ diskNodes[i].antType[0] = rf_control;
+
+ /* connect each write node to the term node */
+ RF_ASSERT(diskNodes[i].numSuccedents == 1);
+ diskNodes[i].succedents[0] = termNode;
+ termNode->antecedents[i] = &diskNodes[i];
+ termNode->antType[i] = rf_control;
+ }
+ }
+}
+/******************************************************************************
+ * Create a fault-free read DAG for RAID level 1
+ *
+ * Hdr -> Nil -> Rmir -> Cmt -> Trm
+ *
+ * The "Rmir" node schedules a read from the disk in the mirror pair with the
+ * shortest disk queue. the proper queue is selected at Rmir execution. this
+ * deferred mapping is unlike other archs in RAIDframe which generally fix
+ * mapping at DAG creation time.
+ *
+ * Parameters: raidPtr - description of the physical array
+ * asmap - logical & physical addresses for this access
+ * bp - buffer ptr (for holding read data)
+ * flags - general flags (e.g. disk locking)
+ * allocList - list of memory allocated in DAG creation
+ *****************************************************************************/
+
+static void
+CreateMirrorReadDAG(
+ RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList,
+ int (*readfunc) (RF_DagNode_t * node))
+{
+ RF_DagNode_t *readNodes, *nodes, *blockNode, *commitNode, *termNode;
+ RF_PhysDiskAddr_t *data_pda = asmap->physInfo;
+ RF_PhysDiskAddr_t *parity_pda = asmap->parityInfo;
+ int i, n, totalNumNodes;
+
+ n = asmap->numStripeUnitsAccessed;
+ dag_h->creator = "RaidOneReadDAG";
+ if (rf_dagDebug) {
+ printf("[Creating RAID level 1 read DAG]\n");
+ }
+ /*
+ * This dag can not commit until the commit node is reached
+ * errors prior to the commit point imply the dag has failed.
+ */
+ dag_h->numCommitNodes = 1;
+ dag_h->numCommits = 0;
+ dag_h->numSuccedents = 1;
+
+ /*
+ * Node count:
+ * n data reads
+ * 1 block node
+ * 1 commit node
+ * 1 terminator node
+ */
+ RF_ASSERT(n > 0);
+ totalNumNodes = n + 3;
+ RF_CallocAndAdd(nodes, totalNumNodes, sizeof(RF_DagNode_t),
+ (RF_DagNode_t *), allocList);
+ i = 0;
+ readNodes = &nodes[i];
+ i += n;
+ blockNode = &nodes[i];
+ i += 1;
+ commitNode = &nodes[i];
+ i += 1;
+ termNode = &nodes[i];
+ i += 1;
+ RF_ASSERT(i == totalNumNodes);
+
+ /* initialize nodes */
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
+ rf_NullNodeUndoFunc, NULL, n, 0, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
+ rf_NullNodeUndoFunc, NULL, 1, n, 0, 0, dag_h, "Cmt", allocList);
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
+ rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
+
+ for (i = 0; i < n; i++) {
+ RF_ASSERT(data_pda != NULL);
+ RF_ASSERT(parity_pda != NULL);
+ rf_InitNode(&readNodes[i], rf_wait, RF_FALSE, readfunc,
+ rf_DiskReadMirrorUndoFunc, rf_GenericWakeupFunc, 1, 1, 5, 0, dag_h,
+ "Rmir", allocList);
+ readNodes[i].params[0].p = data_pda;
+ readNodes[i].params[1].p = data_pda->bufPtr;
+ /* parity stripe id is not necessary */
+ readNodes[i].params[2].p = 0;
+ readNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, 0);
+ readNodes[i].params[4].p = parity_pda;
+ data_pda = data_pda->next;
+ parity_pda = parity_pda->next;
+ }
+
+ /*
+ * Connect nodes
+ */
+
+ /* connect hdr to block node */
+ RF_ASSERT(blockNode->numAntecedents == 0);
+ dag_h->succedents[0] = blockNode;
+
+ /* connect block node to read nodes */
+ RF_ASSERT(blockNode->numSuccedents == n);
+ for (i = 0; i < n; i++) {
+ RF_ASSERT(readNodes[i].numAntecedents == 1);
+ blockNode->succedents[i] = &readNodes[i];
+ readNodes[i].antecedents[0] = blockNode;
+ readNodes[i].antType[0] = rf_control;
+ }
+
+ /* connect read nodes to commit node */
+ RF_ASSERT(commitNode->numAntecedents == n);
+ for (i = 0; i < n; i++) {
+ RF_ASSERT(readNodes[i].numSuccedents == 1);
+ readNodes[i].succedents[0] = commitNode;
+ commitNode->antecedents[i] = &readNodes[i];
+ commitNode->antType[i] = rf_control;
+ }
+
+ /* connect commit node to term node */
+ RF_ASSERT(commitNode->numSuccedents == 1);
+ RF_ASSERT(termNode->numAntecedents == 1);
+ RF_ASSERT(termNode->numSuccedents == 0);
+ commitNode->succedents[0] = termNode;
+ termNode->antecedents[0] = commitNode;
+ termNode->antType[0] = rf_control;
+}
+
+void
+rf_CreateMirrorIdleReadDAG(
+ RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList)
+{
+ CreateMirrorReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
+ rf_DiskReadMirrorIdleFunc);
+}
+
+void
+rf_CreateMirrorPartitionReadDAG(
+ RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList)
+{
+ CreateMirrorReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
+ rf_DiskReadMirrorPartitionFunc);
+}
diff --git a/sys/dev/raidframe/rf_dagffrd.h b/sys/dev/raidframe/rf_dagffrd.h
new file mode 100644
index 0000000..6862a8d
--- /dev/null
+++ b/sys/dev/raidframe/rf_dagffrd.h
@@ -0,0 +1,53 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_dagffrd.h,v 1.3 1999/02/05 00:06:07 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#ifndef _RF__RF_DAGFFRD_H_
+#define _RF__RF_DAGFFRD_H_
+
+#include <dev/raidframe/rf_types.h>
+
+/* fault-free read DAG creation routines */
+void
+rf_CreateFaultFreeReadDAG(RF_Raid_t * raidPtr, RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h, void *bp, RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList);
+void
+rf_CreateNonredundantDAG(RF_Raid_t * raidPtr, RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h, void *bp, RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList, RF_IoType_t type);
+void
+rf_CreateMirrorIdleReadDAG(RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
+ RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList);
+void
+rf_CreateMirrorPartitionReadDAG(RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
+ RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList);
+
+#endif /* !_RF__RF_DAGFFRD_H_ */
diff --git a/sys/dev/raidframe/rf_dagffwr.c b/sys/dev/raidframe/rf_dagffwr.c
new file mode 100644
index 0000000..7520cba
--- /dev/null
+++ b/sys/dev/raidframe/rf_dagffwr.c
@@ -0,0 +1,2129 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_dagffwr.c,v 1.5 2000/01/07 03:40:58 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * rf_dagffwr.c
+ *
+ * code for creating fault-free DAGs
+ *
+ */
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_dagutils.h>
+#include <dev/raidframe/rf_dagfuncs.h>
+#include <dev/raidframe/rf_debugMem.h>
+#include <dev/raidframe/rf_dagffrd.h>
+#include <dev/raidframe/rf_memchunk.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_dagffwr.h>
+
+/******************************************************************************
+ *
+ * General comments on DAG creation:
+ *
+ * All DAGs in this file use roll-away error recovery. Each DAG has a single
+ * commit node, usually called "Cmt." If an error occurs before the Cmt node
+ * is reached, the execution engine will halt forward execution and work
+ * backward through the graph, executing the undo functions. Assuming that
+ * each node in the graph prior to the Cmt node is undoable and atomic - or -
+ * does not make changes to permanent state, the graph will fail atomically.
+ * If an error occurs after the Cmt node executes, the engine will roll-forward
+ * through the graph, blindly executing nodes until it reaches the end.
+ * If a graph reaches the end, it is assumed to have completed successfully.
+ *
+ * A graph has only 1 Cmt node.
+ *
+ */
+
+
+/******************************************************************************
+ *
+ * The following wrappers map the standard DAG creation interface to the
+ * DAG creation routines. Additionally, these wrappers enable experimentation
+ * with new DAG structures by providing an extra level of indirection, allowing
+ * the DAG creation routines to be replaced at this single point.
+ */
+
+
+void
+rf_CreateNonRedundantWriteDAG(
+ RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList,
+ RF_IoType_t type)
+{
+ rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
+ RF_IO_TYPE_WRITE);
+}
+
+void
+rf_CreateRAID0WriteDAG(
+ RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList,
+ RF_IoType_t type)
+{
+ rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
+ RF_IO_TYPE_WRITE);
+}
+
+void
+rf_CreateSmallWriteDAG(
+ RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList)
+{
+ /* "normal" rollaway */
+ rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
+ &rf_xorFuncs, NULL);
+}
+
+void
+rf_CreateLargeWriteDAG(
+ RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList)
+{
+ /* "normal" rollaway */
+ rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
+ 1, rf_RegularXorFunc, RF_TRUE);
+}
+
+
+/******************************************************************************
+ *
+ * DAG creation code begins here
+ */
+
+
+/******************************************************************************
+ *
+ * creates a DAG to perform a large-write operation:
+ *
+ * / Rod \ / Wnd \
+ * H -- block- Rod - Xor - Cmt - Wnd --- T
+ * \ Rod / \ Wnp /
+ * \[Wnq]/
+ *
+ * The XOR node also does the Q calculation in the P+Q architecture.
+ * All nodes before the commit node (Cmt) are assumed to be atomic and
+ * undoable - or - they make no changes to permanent state.
+ *
+ * Rod = read old data
+ * Cmt = commit node
+ * Wnp = write new parity
+ * Wnd = write new data
+ * Wnq = write new "q"
+ * [] denotes optional segments in the graph
+ *
+ * Parameters: raidPtr - description of the physical array
+ * asmap - logical & physical addresses for this access
+ * bp - buffer ptr (holds write data)
+ * flags - general flags (e.g. disk locking)
+ * allocList - list of memory allocated in DAG creation
+ * nfaults - number of faults array can tolerate
+ * (equal to # redundancy units in stripe)
+ * redfuncs - list of redundancy generating functions
+ *
+ *****************************************************************************/
+
+void
+rf_CommonCreateLargeWriteDAG(
+ RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList,
+ int nfaults,
+ int (*redFunc) (RF_DagNode_t *),
+ int allowBufferRecycle)
+{
+ RF_DagNode_t *nodes, *wndNodes, *rodNodes, *xorNode, *wnpNode;
+ RF_DagNode_t *wnqNode, *blockNode, *commitNode, *termNode;
+ int nWndNodes, nRodNodes, i, nodeNum, asmNum;
+ RF_AccessStripeMapHeader_t *new_asm_h[2];
+ RF_StripeNum_t parityStripeID;
+ char *sosBuffer, *eosBuffer;
+ RF_ReconUnitNum_t which_ru;
+ RF_RaidLayout_t *layoutPtr;
+ RF_PhysDiskAddr_t *pda;
+
+ layoutPtr = &(raidPtr->Layout);
+ parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress,
+ &which_ru);
+
+ if (rf_dagDebug) {
+ printf("[Creating large-write DAG]\n");
+ }
+ dag_h->creator = "LargeWriteDAG";
+
+ dag_h->numCommitNodes = 1;
+ dag_h->numCommits = 0;
+ dag_h->numSuccedents = 1;
+
+ /* alloc the nodes: Wnd, xor, commit, block, term, and Wnp */
+ nWndNodes = asmap->numStripeUnitsAccessed;
+ RF_CallocAndAdd(nodes, nWndNodes + 4 + nfaults, sizeof(RF_DagNode_t),
+ (RF_DagNode_t *), allocList);
+ i = 0;
+ wndNodes = &nodes[i];
+ i += nWndNodes;
+ xorNode = &nodes[i];
+ i += 1;
+ wnpNode = &nodes[i];
+ i += 1;
+ blockNode = &nodes[i];
+ i += 1;
+ commitNode = &nodes[i];
+ i += 1;
+ termNode = &nodes[i];
+ i += 1;
+ if (nfaults == 2) {
+ wnqNode = &nodes[i];
+ i += 1;
+ } else {
+ wnqNode = NULL;
+ }
+ rf_MapUnaccessedPortionOfStripe(raidPtr, layoutPtr, asmap, dag_h, new_asm_h,
+ &nRodNodes, &sosBuffer, &eosBuffer, allocList);
+ if (nRodNodes > 0) {
+ RF_CallocAndAdd(rodNodes, nRodNodes, sizeof(RF_DagNode_t),
+ (RF_DagNode_t *), allocList);
+ } else {
+ rodNodes = NULL;
+ }
+
+ /* begin node initialization */
+ if (nRodNodes > 0) {
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, nRodNodes, 0, 0, 0, dag_h, "Nil", allocList);
+ } else {
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, 1, 0, 0, 0, dag_h, "Nil", allocList);
+ }
+
+ rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL,
+ nWndNodes + nfaults, 1, 0, 0, dag_h, "Cmt", allocList);
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL,
+ 0, nWndNodes + nfaults, 0, 0, dag_h, "Trm", allocList);
+
+ /* initialize the Rod nodes */
+ for (nodeNum = asmNum = 0; asmNum < 2; asmNum++) {
+ if (new_asm_h[asmNum]) {
+ pda = new_asm_h[asmNum]->stripeMap->physInfo;
+ while (pda) {
+ rf_InitNode(&rodNodes[nodeNum], rf_wait, RF_FALSE, rf_DiskReadFunc,
+ rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
+ "Rod", allocList);
+ rodNodes[nodeNum].params[0].p = pda;
+ rodNodes[nodeNum].params[1].p = pda->bufPtr;
+ rodNodes[nodeNum].params[2].v = parityStripeID;
+ rodNodes[nodeNum].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
+ 0, 0, which_ru);
+ nodeNum++;
+ pda = pda->next;
+ }
+ }
+ }
+ RF_ASSERT(nodeNum == nRodNodes);
+
+ /* initialize the wnd nodes */
+ pda = asmap->physInfo;
+ for (i = 0; i < nWndNodes; i++) {
+ rf_InitNode(&wndNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
+ RF_ASSERT(pda != NULL);
+ wndNodes[i].params[0].p = pda;
+ wndNodes[i].params[1].p = pda->bufPtr;
+ wndNodes[i].params[2].v = parityStripeID;
+ wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ pda = pda->next;
+ }
+
+ /* initialize the redundancy node */
+ if (nRodNodes > 0) {
+ rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, rf_NullNodeUndoFunc, NULL, 1,
+ nRodNodes, 2 * (nWndNodes + nRodNodes) + 1, nfaults, dag_h,
+ "Xr ", allocList);
+ } else {
+ rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, rf_NullNodeUndoFunc, NULL, 1,
+ 1, 2 * (nWndNodes + nRodNodes) + 1, nfaults, dag_h, "Xr ", allocList);
+ }
+ xorNode->flags |= RF_DAGNODE_FLAG_YIELD;
+ for (i = 0; i < nWndNodes; i++) {
+ xorNode->params[2 * i + 0] = wndNodes[i].params[0]; /* pda */
+ xorNode->params[2 * i + 1] = wndNodes[i].params[1]; /* buf ptr */
+ }
+ for (i = 0; i < nRodNodes; i++) {
+ xorNode->params[2 * (nWndNodes + i) + 0] = rodNodes[i].params[0]; /* pda */
+ xorNode->params[2 * (nWndNodes + i) + 1] = rodNodes[i].params[1]; /* buf ptr */
+ }
+ /* xor node needs to get at RAID information */
+ xorNode->params[2 * (nWndNodes + nRodNodes)].p = raidPtr;
+
+ /*
+ * Look for an Rod node that reads a complete SU. If none, alloc a buffer
+ * to receive the parity info. Note that we can't use a new data buffer
+ * because it will not have gotten written when the xor occurs.
+ */
+ if (allowBufferRecycle) {
+ for (i = 0; i < nRodNodes; i++) {
+ if (((RF_PhysDiskAddr_t *) rodNodes[i].params[0].p)->numSector == raidPtr->Layout.sectorsPerStripeUnit)
+ break;
+ }
+ }
+ if ((!allowBufferRecycle) || (i == nRodNodes)) {
+ RF_CallocAndAdd(xorNode->results[0], 1,
+ rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit),
+ (void *), allocList);
+ } else {
+ xorNode->results[0] = rodNodes[i].params[1].p;
+ }
+
+ /* initialize the Wnp node */
+ rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnp", allocList);
+ wnpNode->params[0].p = asmap->parityInfo;
+ wnpNode->params[1].p = xorNode->results[0];
+ wnpNode->params[2].v = parityStripeID;
+ wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ /* parityInfo must describe entire parity unit */
+ RF_ASSERT(asmap->parityInfo->next == NULL);
+
+ if (nfaults == 2) {
+ /*
+ * We never try to recycle a buffer for the Q calculation
+ * in addition to the parity. This would cause two buffers
+ * to get smashed during the P and Q calculation, guaranteeing
+ * one would be wrong.
+ */
+ RF_CallocAndAdd(xorNode->results[1], 1,
+ rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit),
+ (void *), allocList);
+ rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnq", allocList);
+ wnqNode->params[0].p = asmap->qInfo;
+ wnqNode->params[1].p = xorNode->results[1];
+ wnqNode->params[2].v = parityStripeID;
+ wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ /* parityInfo must describe entire parity unit */
+ RF_ASSERT(asmap->parityInfo->next == NULL);
+ }
+ /*
+ * Connect nodes to form graph.
+ */
+
+ /* connect dag header to block node */
+ RF_ASSERT(blockNode->numAntecedents == 0);
+ dag_h->succedents[0] = blockNode;
+
+ if (nRodNodes > 0) {
+ /* connect the block node to the Rod nodes */
+ RF_ASSERT(blockNode->numSuccedents == nRodNodes);
+ RF_ASSERT(xorNode->numAntecedents == nRodNodes);
+ for (i = 0; i < nRodNodes; i++) {
+ RF_ASSERT(rodNodes[i].numAntecedents == 1);
+ blockNode->succedents[i] = &rodNodes[i];
+ rodNodes[i].antecedents[0] = blockNode;
+ rodNodes[i].antType[0] = rf_control;
+
+ /* connect the Rod nodes to the Xor node */
+ RF_ASSERT(rodNodes[i].numSuccedents == 1);
+ rodNodes[i].succedents[0] = xorNode;
+ xorNode->antecedents[i] = &rodNodes[i];
+ xorNode->antType[i] = rf_trueData;
+ }
+ } else {
+ /* connect the block node to the Xor node */
+ RF_ASSERT(blockNode->numSuccedents == 1);
+ RF_ASSERT(xorNode->numAntecedents == 1);
+ blockNode->succedents[0] = xorNode;
+ xorNode->antecedents[0] = blockNode;
+ xorNode->antType[0] = rf_control;
+ }
+
+ /* connect the xor node to the commit node */
+ RF_ASSERT(xorNode->numSuccedents == 1);
+ RF_ASSERT(commitNode->numAntecedents == 1);
+ xorNode->succedents[0] = commitNode;
+ commitNode->antecedents[0] = xorNode;
+ commitNode->antType[0] = rf_control;
+
+ /* connect the commit node to the write nodes */
+ RF_ASSERT(commitNode->numSuccedents == nWndNodes + nfaults);
+ for (i = 0; i < nWndNodes; i++) {
+ RF_ASSERT(wndNodes->numAntecedents == 1);
+ commitNode->succedents[i] = &wndNodes[i];
+ wndNodes[i].antecedents[0] = commitNode;
+ wndNodes[i].antType[0] = rf_control;
+ }
+ RF_ASSERT(wnpNode->numAntecedents == 1);
+ commitNode->succedents[nWndNodes] = wnpNode;
+ wnpNode->antecedents[0] = commitNode;
+ wnpNode->antType[0] = rf_trueData;
+ if (nfaults == 2) {
+ RF_ASSERT(wnqNode->numAntecedents == 1);
+ commitNode->succedents[nWndNodes + 1] = wnqNode;
+ wnqNode->antecedents[0] = commitNode;
+ wnqNode->antType[0] = rf_trueData;
+ }
+ /* connect the write nodes to the term node */
+ RF_ASSERT(termNode->numAntecedents == nWndNodes + nfaults);
+ RF_ASSERT(termNode->numSuccedents == 0);
+ for (i = 0; i < nWndNodes; i++) {
+ RF_ASSERT(wndNodes->numSuccedents == 1);
+ wndNodes[i].succedents[0] = termNode;
+ termNode->antecedents[i] = &wndNodes[i];
+ termNode->antType[i] = rf_control;
+ }
+ RF_ASSERT(wnpNode->numSuccedents == 1);
+ wnpNode->succedents[0] = termNode;
+ termNode->antecedents[nWndNodes] = wnpNode;
+ termNode->antType[nWndNodes] = rf_control;
+ if (nfaults == 2) {
+ RF_ASSERT(wnqNode->numSuccedents == 1);
+ wnqNode->succedents[0] = termNode;
+ termNode->antecedents[nWndNodes + 1] = wnqNode;
+ termNode->antType[nWndNodes + 1] = rf_control;
+ }
+}
+/******************************************************************************
+ *
+ * creates a DAG to perform a small-write operation (either raid 5 or pq),
+ * which is as follows:
+ *
+ * Hdr -> Nil -> Rop -> Xor -> Cmt ----> Wnp [Unp] --> Trm
+ * \- Rod X / \----> Wnd [Und]-/
+ * [\- Rod X / \---> Wnd [Und]-/]
+ * [\- Roq -> Q / \--> Wnq [Unq]-/]
+ *
+ * Rop = read old parity
+ * Rod = read old data
+ * Roq = read old "q"
+ * Cmt = commit node
+ * Und = unlock data disk
+ * Unp = unlock parity disk
+ * Unq = unlock q disk
+ * Wnp = write new parity
+ * Wnd = write new data
+ * Wnq = write new "q"
+ * [ ] denotes optional segments in the graph
+ *
+ * Parameters: raidPtr - description of the physical array
+ * asmap - logical & physical addresses for this access
+ * bp - buffer ptr (holds write data)
+ * flags - general flags (e.g. disk locking)
+ * allocList - list of memory allocated in DAG creation
+ * pfuncs - list of parity generating functions
+ * qfuncs - list of q generating functions
+ *
+ * A null qfuncs indicates single fault tolerant
+ *****************************************************************************/
+
+void
+rf_CommonCreateSmallWriteDAG(
+ RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList,
+ RF_RedFuncs_t * pfuncs,
+ RF_RedFuncs_t * qfuncs)
+{
+ RF_DagNode_t *readDataNodes, *readParityNodes, *readQNodes, *termNode;
+ RF_DagNode_t *unlockDataNodes, *unlockParityNodes, *unlockQNodes;
+ RF_DagNode_t *xorNodes, *qNodes, *blockNode, *commitNode, *nodes;
+ RF_DagNode_t *writeDataNodes, *writeParityNodes, *writeQNodes;
+ int i, j, nNodes, totalNumNodes, lu_flag;
+ RF_ReconUnitNum_t which_ru;
+ int (*func) (RF_DagNode_t *), (*undoFunc) (RF_DagNode_t *);
+ int (*qfunc) (RF_DagNode_t *);
+ int numDataNodes, numParityNodes;
+ RF_StripeNum_t parityStripeID;
+ RF_PhysDiskAddr_t *pda;
+ char *name, *qname;
+ long nfaults;
+
+ nfaults = qfuncs ? 2 : 1;
+ lu_flag = (rf_enableAtomicRMW) ? 1 : 0; /* lock/unlock flag */
+
+ parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
+ asmap->raidAddress, &which_ru);
+ pda = asmap->physInfo;
+ numDataNodes = asmap->numStripeUnitsAccessed;
+ numParityNodes = (asmap->parityInfo->next) ? 2 : 1;
+
+ if (rf_dagDebug) {
+ printf("[Creating small-write DAG]\n");
+ }
+ RF_ASSERT(numDataNodes > 0);
+ dag_h->creator = "SmallWriteDAG";
+
+ dag_h->numCommitNodes = 1;
+ dag_h->numCommits = 0;
+ dag_h->numSuccedents = 1;
+
+ /*
+ * DAG creation occurs in four steps:
+ * 1. count the number of nodes in the DAG
+ * 2. create the nodes
+ * 3. initialize the nodes
+ * 4. connect the nodes
+ */
+
+ /*
+ * Step 1. compute number of nodes in the graph
+ */
+
+ /* number of nodes: a read and write for each data unit a redundancy
+ * computation node for each parity node (nfaults * nparity) a read
+ * and write for each parity unit a block and commit node (2) a
+ * terminate node if atomic RMW an unlock node for each data unit,
+ * redundancy unit */
+ totalNumNodes = (2 * numDataNodes) + (nfaults * numParityNodes)
+ + (nfaults * 2 * numParityNodes) + 3;
+ if (lu_flag) {
+ totalNumNodes += (numDataNodes + (nfaults * numParityNodes));
+ }
+ /*
+ * Step 2. create the nodes
+ */
+ RF_CallocAndAdd(nodes, totalNumNodes, sizeof(RF_DagNode_t),
+ (RF_DagNode_t *), allocList);
+ i = 0;
+ blockNode = &nodes[i];
+ i += 1;
+ commitNode = &nodes[i];
+ i += 1;
+ readDataNodes = &nodes[i];
+ i += numDataNodes;
+ readParityNodes = &nodes[i];
+ i += numParityNodes;
+ writeDataNodes = &nodes[i];
+ i += numDataNodes;
+ writeParityNodes = &nodes[i];
+ i += numParityNodes;
+ xorNodes = &nodes[i];
+ i += numParityNodes;
+ termNode = &nodes[i];
+ i += 1;
+ if (lu_flag) {
+ unlockDataNodes = &nodes[i];
+ i += numDataNodes;
+ unlockParityNodes = &nodes[i];
+ i += numParityNodes;
+ } else {
+ unlockDataNodes = unlockParityNodes = NULL;
+ }
+ if (nfaults == 2) {
+ readQNodes = &nodes[i];
+ i += numParityNodes;
+ writeQNodes = &nodes[i];
+ i += numParityNodes;
+ qNodes = &nodes[i];
+ i += numParityNodes;
+ if (lu_flag) {
+ unlockQNodes = &nodes[i];
+ i += numParityNodes;
+ } else {
+ unlockQNodes = NULL;
+ }
+ } else {
+ readQNodes = writeQNodes = qNodes = unlockQNodes = NULL;
+ }
+ RF_ASSERT(i == totalNumNodes);
+
+ /*
+ * Step 3. initialize the nodes
+ */
+ /* initialize block node (Nil) */
+ nNodes = numDataNodes + (nfaults * numParityNodes);
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, nNodes, 0, 0, 0, dag_h, "Nil", allocList);
+
+ /* initialize commit node (Cmt) */
+ rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, nNodes, (nfaults * numParityNodes), 0, 0, dag_h, "Cmt", allocList);
+
+ /* initialize terminate node (Trm) */
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
+ NULL, 0, nNodes, 0, 0, dag_h, "Trm", allocList);
+
+ /* initialize nodes which read old data (Rod) */
+ for (i = 0; i < numDataNodes; i++) {
+ rf_InitNode(&readDataNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
+ rf_GenericWakeupFunc, (nfaults * numParityNodes), 1, 4, 0, dag_h,
+ "Rod", allocList);
+ RF_ASSERT(pda != NULL);
+ /* physical disk addr desc */
+ readDataNodes[i].params[0].p = pda;
+ /* buffer to hold old data */
+ readDataNodes[i].params[1].p = rf_AllocBuffer(raidPtr,
+ dag_h, pda, allocList);
+ readDataNodes[i].params[2].v = parityStripeID;
+ readDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
+ lu_flag, 0, which_ru);
+ pda = pda->next;
+ for (j = 0; j < readDataNodes[i].numSuccedents; j++) {
+ readDataNodes[i].propList[j] = NULL;
+ }
+ }
+
+ /* initialize nodes which read old parity (Rop) */
+ pda = asmap->parityInfo;
+ i = 0;
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(pda != NULL);
+ rf_InitNode(&readParityNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc,
+ rf_DiskReadUndoFunc, rf_GenericWakeupFunc, numParityNodes, 1, 4,
+ 0, dag_h, "Rop", allocList);
+ readParityNodes[i].params[0].p = pda;
+ /* buffer to hold old parity */
+ readParityNodes[i].params[1].p = rf_AllocBuffer(raidPtr,
+ dag_h, pda, allocList);
+ readParityNodes[i].params[2].v = parityStripeID;
+ readParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
+ lu_flag, 0, which_ru);
+ pda = pda->next;
+ for (j = 0; j < readParityNodes[i].numSuccedents; j++) {
+ readParityNodes[i].propList[0] = NULL;
+ }
+ }
+
+ /* initialize nodes which read old Q (Roq) */
+ if (nfaults == 2) {
+ pda = asmap->qInfo;
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(pda != NULL);
+ rf_InitNode(&readQNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
+ rf_GenericWakeupFunc, numParityNodes, 1, 4, 0, dag_h, "Roq", allocList);
+ readQNodes[i].params[0].p = pda;
+ /* buffer to hold old Q */
+ readQNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda,
+ allocList);
+ readQNodes[i].params[2].v = parityStripeID;
+ readQNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
+ lu_flag, 0, which_ru);
+ pda = pda->next;
+ for (j = 0; j < readQNodes[i].numSuccedents; j++) {
+ readQNodes[i].propList[0] = NULL;
+ }
+ }
+ }
+ /* initialize nodes which write new data (Wnd) */
+ pda = asmap->physInfo;
+ for (i = 0; i < numDataNodes; i++) {
+ RF_ASSERT(pda != NULL);
+ rf_InitNode(&writeDataNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc,
+ rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
+ "Wnd", allocList);
+ /* physical disk addr desc */
+ writeDataNodes[i].params[0].p = pda;
+ /* buffer holding new data to be written */
+ writeDataNodes[i].params[1].p = pda->bufPtr;
+ writeDataNodes[i].params[2].v = parityStripeID;
+ writeDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
+ 0, 0, which_ru);
+ if (lu_flag) {
+ /* initialize node to unlock the disk queue */
+ rf_InitNode(&unlockDataNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc,
+ rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h,
+ "Und", allocList);
+ /* physical disk addr desc */
+ unlockDataNodes[i].params[0].p = pda;
+ unlockDataNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
+ 0, lu_flag, which_ru);
+ }
+ pda = pda->next;
+ }
+
+ /*
+ * Initialize nodes which compute new parity and Q.
+ */
+ /*
+ * We use the simple XOR func in the double-XOR case, and when
+ * we're accessing only a portion of one stripe unit. The distinction
+ * between the two is that the regular XOR func assumes that the targbuf
+ * is a full SU in size, and examines the pda associated with the buffer
+ * to decide where within the buffer to XOR the data, whereas
+ * the simple XOR func just XORs the data into the start of the buffer.
+ */
+ if ((numParityNodes == 2) || ((numDataNodes == 1)
+ && (asmap->totalSectorsAccessed < raidPtr->Layout.sectorsPerStripeUnit))) {
+ func = pfuncs->simple;
+ undoFunc = rf_NullNodeUndoFunc;
+ name = pfuncs->SimpleName;
+ if (qfuncs) {
+ qfunc = qfuncs->simple;
+ qname = qfuncs->SimpleName;
+ } else {
+ qfunc = NULL;
+ qname = NULL;
+ }
+ } else {
+ func = pfuncs->regular;
+ undoFunc = rf_NullNodeUndoFunc;
+ name = pfuncs->RegularName;
+ if (qfuncs) {
+ qfunc = qfuncs->regular;
+ qname = qfuncs->RegularName;
+ } else {
+ qfunc = NULL;
+ qname = NULL;
+ }
+ }
+ /*
+ * Initialize the xor nodes: params are {pda,buf}
+ * from {Rod,Wnd,Rop} nodes, and raidPtr
+ */
+ if (numParityNodes == 2) {
+ /* double-xor case */
+ for (i = 0; i < numParityNodes; i++) {
+ /* note: no wakeup func for xor */
+ rf_InitNode(&xorNodes[i], rf_wait, RF_FALSE, func, undoFunc, NULL,
+ 1, (numDataNodes + numParityNodes), 7, 1, dag_h, name, allocList);
+ xorNodes[i].flags |= RF_DAGNODE_FLAG_YIELD;
+ xorNodes[i].params[0] = readDataNodes[i].params[0];
+ xorNodes[i].params[1] = readDataNodes[i].params[1];
+ xorNodes[i].params[2] = readParityNodes[i].params[0];
+ xorNodes[i].params[3] = readParityNodes[i].params[1];
+ xorNodes[i].params[4] = writeDataNodes[i].params[0];
+ xorNodes[i].params[5] = writeDataNodes[i].params[1];
+ xorNodes[i].params[6].p = raidPtr;
+ /* use old parity buf as target buf */
+ xorNodes[i].results[0] = readParityNodes[i].params[1].p;
+ if (nfaults == 2) {
+ /* note: no wakeup func for qor */
+ rf_InitNode(&qNodes[i], rf_wait, RF_FALSE, qfunc, undoFunc, NULL, 1,
+ (numDataNodes + numParityNodes), 7, 1, dag_h, qname, allocList);
+ qNodes[i].params[0] = readDataNodes[i].params[0];
+ qNodes[i].params[1] = readDataNodes[i].params[1];
+ qNodes[i].params[2] = readQNodes[i].params[0];
+ qNodes[i].params[3] = readQNodes[i].params[1];
+ qNodes[i].params[4] = writeDataNodes[i].params[0];
+ qNodes[i].params[5] = writeDataNodes[i].params[1];
+ qNodes[i].params[6].p = raidPtr;
+ /* use old Q buf as target buf */
+ qNodes[i].results[0] = readQNodes[i].params[1].p;
+ }
+ }
+ } else {
+ /* there is only one xor node in this case */
+ rf_InitNode(&xorNodes[0], rf_wait, RF_FALSE, func, undoFunc, NULL, 1,
+ (numDataNodes + numParityNodes),
+ (2 * (numDataNodes + numDataNodes + 1) + 1), 1, dag_h, name, allocList);
+ xorNodes[0].flags |= RF_DAGNODE_FLAG_YIELD;
+ for (i = 0; i < numDataNodes + 1; i++) {
+ /* set up params related to Rod and Rop nodes */
+ xorNodes[0].params[2 * i + 0] = readDataNodes[i].params[0]; /* pda */
+ xorNodes[0].params[2 * i + 1] = readDataNodes[i].params[1]; /* buffer ptr */
+ }
+ for (i = 0; i < numDataNodes; i++) {
+ /* set up params related to Wnd and Wnp nodes */
+ xorNodes[0].params[2 * (numDataNodes + 1 + i) + 0] = /* pda */
+ writeDataNodes[i].params[0];
+ xorNodes[0].params[2 * (numDataNodes + 1 + i) + 1] = /* buffer ptr */
+ writeDataNodes[i].params[1];
+ }
+ /* xor node needs to get at RAID information */
+ xorNodes[0].params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr;
+ xorNodes[0].results[0] = readParityNodes[0].params[1].p;
+ if (nfaults == 2) {
+ rf_InitNode(&qNodes[0], rf_wait, RF_FALSE, qfunc, undoFunc, NULL, 1,
+ (numDataNodes + numParityNodes),
+ (2 * (numDataNodes + numDataNodes + 1) + 1), 1, dag_h,
+ qname, allocList);
+ for (i = 0; i < numDataNodes; i++) {
+ /* set up params related to Rod */
+ qNodes[0].params[2 * i + 0] = readDataNodes[i].params[0]; /* pda */
+ qNodes[0].params[2 * i + 1] = readDataNodes[i].params[1]; /* buffer ptr */
+ }
+ /* and read old q */
+ qNodes[0].params[2 * numDataNodes + 0] = /* pda */
+ readQNodes[0].params[0];
+ qNodes[0].params[2 * numDataNodes + 1] = /* buffer ptr */
+ readQNodes[0].params[1];
+ for (i = 0; i < numDataNodes; i++) {
+ /* set up params related to Wnd nodes */
+ qNodes[0].params[2 * (numDataNodes + 1 + i) + 0] = /* pda */
+ writeDataNodes[i].params[0];
+ qNodes[0].params[2 * (numDataNodes + 1 + i) + 1] = /* buffer ptr */
+ writeDataNodes[i].params[1];
+ }
+ /* xor node needs to get at RAID information */
+ qNodes[0].params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr;
+ qNodes[0].results[0] = readQNodes[0].params[1].p;
+ }
+ }
+
+ /* initialize nodes which write new parity (Wnp) */
+ pda = asmap->parityInfo;
+ for (i = 0; i < numParityNodes; i++) {
+ rf_InitNode(&writeParityNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc,
+ rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
+ "Wnp", allocList);
+ RF_ASSERT(pda != NULL);
+ writeParityNodes[i].params[0].p = pda; /* param 1 (bufPtr)
+ * filled in by xor node */
+ writeParityNodes[i].params[1].p = xorNodes[i].results[0]; /* buffer pointer for
+ * parity write
+ * operation */
+ writeParityNodes[i].params[2].v = parityStripeID;
+ writeParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
+ 0, 0, which_ru);
+ if (lu_flag) {
+ /* initialize node to unlock the disk queue */
+ rf_InitNode(&unlockParityNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc,
+ rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h,
+ "Unp", allocList);
+ unlockParityNodes[i].params[0].p = pda; /* physical disk addr
+ * desc */
+ unlockParityNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
+ 0, lu_flag, which_ru);
+ }
+ pda = pda->next;
+ }
+
+ /* initialize nodes which write new Q (Wnq) */
+ if (nfaults == 2) {
+ pda = asmap->qInfo;
+ for (i = 0; i < numParityNodes; i++) {
+ rf_InitNode(&writeQNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc,
+ rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
+ "Wnq", allocList);
+ RF_ASSERT(pda != NULL);
+ writeQNodes[i].params[0].p = pda; /* param 1 (bufPtr)
+ * filled in by xor node */
+ writeQNodes[i].params[1].p = qNodes[i].results[0]; /* buffer pointer for
+ * parity write
+ * operation */
+ writeQNodes[i].params[2].v = parityStripeID;
+ writeQNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
+ 0, 0, which_ru);
+ if (lu_flag) {
+ /* initialize node to unlock the disk queue */
+ rf_InitNode(&unlockQNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc,
+ rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h,
+ "Unq", allocList);
+ unlockQNodes[i].params[0].p = pda; /* physical disk addr
+ * desc */
+ unlockQNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
+ 0, lu_flag, which_ru);
+ }
+ pda = pda->next;
+ }
+ }
+ /*
+ * Step 4. connect the nodes.
+ */
+
+ /* connect header to block node */
+ dag_h->succedents[0] = blockNode;
+
+ /* connect block node to read old data nodes */
+ RF_ASSERT(blockNode->numSuccedents == (numDataNodes + (numParityNodes * nfaults)));
+ for (i = 0; i < numDataNodes; i++) {
+ blockNode->succedents[i] = &readDataNodes[i];
+ RF_ASSERT(readDataNodes[i].numAntecedents == 1);
+ readDataNodes[i].antecedents[0] = blockNode;
+ readDataNodes[i].antType[0] = rf_control;
+ }
+
+ /* connect block node to read old parity nodes */
+ for (i = 0; i < numParityNodes; i++) {
+ blockNode->succedents[numDataNodes + i] = &readParityNodes[i];
+ RF_ASSERT(readParityNodes[i].numAntecedents == 1);
+ readParityNodes[i].antecedents[0] = blockNode;
+ readParityNodes[i].antType[0] = rf_control;
+ }
+
+ /* connect block node to read old Q nodes */
+ if (nfaults == 2) {
+ for (i = 0; i < numParityNodes; i++) {
+ blockNode->succedents[numDataNodes + numParityNodes + i] = &readQNodes[i];
+ RF_ASSERT(readQNodes[i].numAntecedents == 1);
+ readQNodes[i].antecedents[0] = blockNode;
+ readQNodes[i].antType[0] = rf_control;
+ }
+ }
+ /* connect read old data nodes to xor nodes */
+ for (i = 0; i < numDataNodes; i++) {
+ RF_ASSERT(readDataNodes[i].numSuccedents == (nfaults * numParityNodes));
+ for (j = 0; j < numParityNodes; j++) {
+ RF_ASSERT(xorNodes[j].numAntecedents == numDataNodes + numParityNodes);
+ readDataNodes[i].succedents[j] = &xorNodes[j];
+ xorNodes[j].antecedents[i] = &readDataNodes[i];
+ xorNodes[j].antType[i] = rf_trueData;
+ }
+ }
+
+ /* connect read old data nodes to q nodes */
+ if (nfaults == 2) {
+ for (i = 0; i < numDataNodes; i++) {
+ for (j = 0; j < numParityNodes; j++) {
+ RF_ASSERT(qNodes[j].numAntecedents == numDataNodes + numParityNodes);
+ readDataNodes[i].succedents[numParityNodes + j] = &qNodes[j];
+ qNodes[j].antecedents[i] = &readDataNodes[i];
+ qNodes[j].antType[i] = rf_trueData;
+ }
+ }
+ }
+ /* connect read old parity nodes to xor nodes */
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(readParityNodes[i].numSuccedents == numParityNodes);
+ for (j = 0; j < numParityNodes; j++) {
+ readParityNodes[i].succedents[j] = &xorNodes[j];
+ xorNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i];
+ xorNodes[j].antType[numDataNodes + i] = rf_trueData;
+ }
+ }
+
+ /* connect read old q nodes to q nodes */
+ if (nfaults == 2) {
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(readParityNodes[i].numSuccedents == numParityNodes);
+ for (j = 0; j < numParityNodes; j++) {
+ readQNodes[i].succedents[j] = &qNodes[j];
+ qNodes[j].antecedents[numDataNodes + i] = &readQNodes[i];
+ qNodes[j].antType[numDataNodes + i] = rf_trueData;
+ }
+ }
+ }
+ /* connect xor nodes to commit node */
+ RF_ASSERT(commitNode->numAntecedents == (nfaults * numParityNodes));
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(xorNodes[i].numSuccedents == 1);
+ xorNodes[i].succedents[0] = commitNode;
+ commitNode->antecedents[i] = &xorNodes[i];
+ commitNode->antType[i] = rf_control;
+ }
+
+ /* connect q nodes to commit node */
+ if (nfaults == 2) {
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(qNodes[i].numSuccedents == 1);
+ qNodes[i].succedents[0] = commitNode;
+ commitNode->antecedents[i + numParityNodes] = &qNodes[i];
+ commitNode->antType[i + numParityNodes] = rf_control;
+ }
+ }
+ /* connect commit node to write nodes */
+ RF_ASSERT(commitNode->numSuccedents == (numDataNodes + (nfaults * numParityNodes)));
+ for (i = 0; i < numDataNodes; i++) {
+ RF_ASSERT(writeDataNodes[i].numAntecedents == 1);
+ commitNode->succedents[i] = &writeDataNodes[i];
+ writeDataNodes[i].antecedents[0] = commitNode;
+ writeDataNodes[i].antType[0] = rf_trueData;
+ }
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(writeParityNodes[i].numAntecedents == 1);
+ commitNode->succedents[i + numDataNodes] = &writeParityNodes[i];
+ writeParityNodes[i].antecedents[0] = commitNode;
+ writeParityNodes[i].antType[0] = rf_trueData;
+ }
+ if (nfaults == 2) {
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(writeQNodes[i].numAntecedents == 1);
+ commitNode->succedents[i + numDataNodes + numParityNodes] = &writeQNodes[i];
+ writeQNodes[i].antecedents[0] = commitNode;
+ writeQNodes[i].antType[0] = rf_trueData;
+ }
+ }
+ RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
+ RF_ASSERT(termNode->numSuccedents == 0);
+ for (i = 0; i < numDataNodes; i++) {
+ if (lu_flag) {
+ /* connect write new data nodes to unlock nodes */
+ RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
+ RF_ASSERT(unlockDataNodes[i].numAntecedents == 1);
+ writeDataNodes[i].succedents[0] = &unlockDataNodes[i];
+ unlockDataNodes[i].antecedents[0] = &writeDataNodes[i];
+ unlockDataNodes[i].antType[0] = rf_control;
+
+ /* connect unlock nodes to term node */
+ RF_ASSERT(unlockDataNodes[i].numSuccedents == 1);
+ unlockDataNodes[i].succedents[0] = termNode;
+ termNode->antecedents[i] = &unlockDataNodes[i];
+ termNode->antType[i] = rf_control;
+ } else {
+ /* connect write new data nodes to term node */
+ RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
+ RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
+ writeDataNodes[i].succedents[0] = termNode;
+ termNode->antecedents[i] = &writeDataNodes[i];
+ termNode->antType[i] = rf_control;
+ }
+ }
+
+ for (i = 0; i < numParityNodes; i++) {
+ if (lu_flag) {
+ /* connect write new parity nodes to unlock nodes */
+ RF_ASSERT(writeParityNodes[i].numSuccedents == 1);
+ RF_ASSERT(unlockParityNodes[i].numAntecedents == 1);
+ writeParityNodes[i].succedents[0] = &unlockParityNodes[i];
+ unlockParityNodes[i].antecedents[0] = &writeParityNodes[i];
+ unlockParityNodes[i].antType[0] = rf_control;
+
+ /* connect unlock nodes to term node */
+ RF_ASSERT(unlockParityNodes[i].numSuccedents == 1);
+ unlockParityNodes[i].succedents[0] = termNode;
+ termNode->antecedents[numDataNodes + i] = &unlockParityNodes[i];
+ termNode->antType[numDataNodes + i] = rf_control;
+ } else {
+ RF_ASSERT(writeParityNodes[i].numSuccedents == 1);
+ writeParityNodes[i].succedents[0] = termNode;
+ termNode->antecedents[numDataNodes + i] = &writeParityNodes[i];
+ termNode->antType[numDataNodes + i] = rf_control;
+ }
+ }
+
+ if (nfaults == 2) {
+ for (i = 0; i < numParityNodes; i++) {
+ if (lu_flag) {
+ /* connect write new Q nodes to unlock nodes */
+ RF_ASSERT(writeQNodes[i].numSuccedents == 1);
+ RF_ASSERT(unlockQNodes[i].numAntecedents == 1);
+ writeQNodes[i].succedents[0] = &unlockQNodes[i];
+ unlockQNodes[i].antecedents[0] = &writeQNodes[i];
+ unlockQNodes[i].antType[0] = rf_control;
+
+ /* connect unlock nodes to unblock node */
+ RF_ASSERT(unlockQNodes[i].numSuccedents == 1);
+ unlockQNodes[i].succedents[0] = termNode;
+ termNode->antecedents[numDataNodes + numParityNodes + i] = &unlockQNodes[i];
+ termNode->antType[numDataNodes + numParityNodes + i] = rf_control;
+ } else {
+ RF_ASSERT(writeQNodes[i].numSuccedents == 1);
+ writeQNodes[i].succedents[0] = termNode;
+ termNode->antecedents[numDataNodes + numParityNodes + i] = &writeQNodes[i];
+ termNode->antType[numDataNodes + numParityNodes + i] = rf_control;
+ }
+ }
+ }
+}
+
+
+/******************************************************************************
+ * create a write graph (fault-free or degraded) for RAID level 1
+ *
+ * Hdr -> Commit -> Wpd -> Nil -> Trm
+ * -> Wsd ->
+ *
+ * The "Wpd" node writes data to the primary copy in the mirror pair
+ * The "Wsd" node writes data to the secondary copy in the mirror pair
+ *
+ * Parameters: raidPtr - description of the physical array
+ * asmap - logical & physical addresses for this access
+ * bp - buffer ptr (holds write data)
+ * flags - general flags (e.g. disk locking)
+ * allocList - list of memory allocated in DAG creation
+ *****************************************************************************/
+
+void
+rf_CreateRaidOneWriteDAG(
+ RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList)
+{
+ RF_DagNode_t *unblockNode, *termNode, *commitNode;
+ RF_DagNode_t *nodes, *wndNode, *wmirNode;
+ int nWndNodes, nWmirNodes, i;
+ RF_ReconUnitNum_t which_ru;
+ RF_PhysDiskAddr_t *pda, *pdaP;
+ RF_StripeNum_t parityStripeID;
+
+ parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
+ asmap->raidAddress, &which_ru);
+ if (rf_dagDebug) {
+ printf("[Creating RAID level 1 write DAG]\n");
+ }
+ dag_h->creator = "RaidOneWriteDAG";
+
+ /* 2 implies access not SU aligned */
+ nWmirNodes = (asmap->parityInfo->next) ? 2 : 1;
+ nWndNodes = (asmap->physInfo->next) ? 2 : 1;
+
+ /* alloc the Wnd nodes and the Wmir node */
+ /*
+ * Degraded case: the write aimed at a dead disk is simply omitted
+ * from the graph (one fewer Wpd or Wsd node).
+ */
+ if (asmap->numDataFailed == 1)
+ nWndNodes--;
+ if (asmap->numParityFailed == 1)
+ nWmirNodes--;
+
+ /* total number of nodes = nWndNodes + nWmirNodes + (commit + unblock
+ * + terminator) */
+ RF_CallocAndAdd(nodes, nWndNodes + nWmirNodes + 3, sizeof(RF_DagNode_t),
+ (RF_DagNode_t *), allocList);
+ /* carve the single allocation into the individual node groups */
+ i = 0;
+ wndNode = &nodes[i];
+ i += nWndNodes;
+ wmirNode = &nodes[i];
+ i += nWmirNodes;
+ commitNode = &nodes[i];
+ i += 1;
+ unblockNode = &nodes[i];
+ i += 1;
+ termNode = &nodes[i];
+ i += 1;
+ RF_ASSERT(i == (nWndNodes + nWmirNodes + 3));
+
+ /* this dag can commit immediately */
+ dag_h->numCommitNodes = 1;
+ dag_h->numCommits = 0;
+ dag_h->numSuccedents = 1;
+
+ /* initialize the commit, unblock, and term nodes */
+ rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, (nWndNodes + nWmirNodes), 0, 0, 0, dag_h, "Cmt", allocList);
+ rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, 1, (nWndNodes + nWmirNodes), 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
+ NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
+
+ /* initialize the wnd nodes (writes to the primary copy) */
+ if (nWndNodes > 0) {
+ pda = asmap->physInfo;
+ for (i = 0; i < nWndNodes; i++) {
+ rf_InitNode(&wndNode[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wpd", allocList);
+ RF_ASSERT(pda != NULL);
+ wndNode[i].params[0].p = pda;
+ wndNode[i].params[1].p = pda->bufPtr;
+ wndNode[i].params[2].v = parityStripeID;
+ wndNode[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ pda = pda->next;
+ }
+ RF_ASSERT(pda == NULL);
+ }
+ /* initialize the mirror nodes (writes to the secondary copy) */
+ if (nWmirNodes > 0) {
+ pda = asmap->physInfo;
+ pdaP = asmap->parityInfo;
+ for (i = 0; i < nWmirNodes; i++) {
+ rf_InitNode(&wmirNode[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wsd", allocList);
+ RF_ASSERT(pda != NULL);
+ /*
+ * Target the mirror (parityInfo) address but reuse the
+ * data buffer, so both copies receive identical data.
+ */
+ wmirNode[i].params[0].p = pdaP;
+ wmirNode[i].params[1].p = pda->bufPtr;
+ wmirNode[i].params[2].v = parityStripeID;
+ wmirNode[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ pda = pda->next;
+ pdaP = pdaP->next;
+ }
+ RF_ASSERT(pda == NULL);
+ RF_ASSERT(pdaP == NULL);
+ }
+ /* link the header node to the commit node */
+ RF_ASSERT(dag_h->numSuccedents == 1);
+ RF_ASSERT(commitNode->numAntecedents == 0);
+ dag_h->succedents[0] = commitNode;
+
+ /* link the commit node to the write nodes */
+ RF_ASSERT(commitNode->numSuccedents == (nWndNodes + nWmirNodes));
+ for (i = 0; i < nWndNodes; i++) {
+ RF_ASSERT(wndNode[i].numAntecedents == 1);
+ commitNode->succedents[i] = &wndNode[i];
+ wndNode[i].antecedents[0] = commitNode;
+ wndNode[i].antType[0] = rf_control;
+ }
+ for (i = 0; i < nWmirNodes; i++) {
+ RF_ASSERT(wmirNode[i].numAntecedents == 1);
+ commitNode->succedents[i + nWndNodes] = &wmirNode[i];
+ wmirNode[i].antecedents[0] = commitNode;
+ wmirNode[i].antType[0] = rf_control;
+ }
+
+ /* link the write nodes to the unblock node */
+ RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nWmirNodes));
+ for (i = 0; i < nWndNodes; i++) {
+ RF_ASSERT(wndNode[i].numSuccedents == 1);
+ wndNode[i].succedents[0] = unblockNode;
+ unblockNode->antecedents[i] = &wndNode[i];
+ unblockNode->antType[i] = rf_control;
+ }
+ for (i = 0; i < nWmirNodes; i++) {
+ RF_ASSERT(wmirNode[i].numSuccedents == 1);
+ wmirNode[i].succedents[0] = unblockNode;
+ unblockNode->antecedents[i + nWndNodes] = &wmirNode[i];
+ unblockNode->antType[i + nWndNodes] = rf_control;
+ }
+
+ /* link the unblock node to the term node */
+ RF_ASSERT(unblockNode->numSuccedents == 1);
+ RF_ASSERT(termNode->numAntecedents == 1);
+ RF_ASSERT(termNode->numSuccedents == 0);
+ unblockNode->succedents[0] = termNode;
+ termNode->antecedents[0] = unblockNode;
+ termNode->antType[0] = rf_control;
+}
+
+
+
+/* DAGs which have no commit points.
+ *
+ * The following DAGs are used in forward and backward error recovery experiments.
+ * They are identical to the DAGs above this comment with the exception that
+ * the commit points have been removed.
+ */
+
+
+
+/******************************************************************************
+ * create a large-write graph with no commit point (forward error recovery)
+ *
+ * Hdr -> Nil -> [Rod...] -> Nil -> Wnd... -> Trm
+ * \-> Xor -> Wnp [Wnq] -/
+ *
+ * Parameters: raidPtr - description of the physical array
+ * asmap - logical & physical addresses for this access
+ * bp - buffer ptr (holds write data)
+ * flags - general flags (e.g. disk locking)
+ * allocList - list of memory allocated in DAG creation
+ * nfaults - 1 (parity only) or 2 (parity + Q)
+ * redFunc - redundancy (xor) computation function
+ * allowBufferRecycle - if nonzero, try to reuse a full-SU Rod
+ * buffer as the parity target
+ *****************************************************************************/
+void
+rf_CommonCreateLargeWriteDAGFwd(
+ RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList,
+ int nfaults,
+ int (*redFunc) (RF_DagNode_t *),
+ int allowBufferRecycle)
+{
+ RF_DagNode_t *nodes, *wndNodes, *rodNodes, *xorNode, *wnpNode;
+ RF_DagNode_t *wnqNode, *blockNode, *syncNode, *termNode;
+ int nWndNodes, nRodNodes, i, nodeNum, asmNum;
+ RF_AccessStripeMapHeader_t *new_asm_h[2];
+ RF_StripeNum_t parityStripeID;
+ char *sosBuffer, *eosBuffer;
+ RF_ReconUnitNum_t which_ru;
+ RF_RaidLayout_t *layoutPtr;
+ RF_PhysDiskAddr_t *pda;
+
+ layoutPtr = &(raidPtr->Layout);
+ parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru);
+
+ if (rf_dagDebug)
+ printf("[Creating large-write DAG]\n");
+ dag_h->creator = "LargeWriteDAGFwd";
+
+ /* no commit nodes: forward-error-recovery DAGs have none by design */
+ dag_h->numCommitNodes = 0;
+ dag_h->numCommits = 0;
+ dag_h->numSuccedents = 1;
+
+ /* alloc the nodes: Wnd, xor, commit, block, term, and Wnp */
+ nWndNodes = asmap->numStripeUnitsAccessed;
+ RF_CallocAndAdd(nodes, nWndNodes + 4 + nfaults, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
+ i = 0;
+ wndNodes = &nodes[i];
+ i += nWndNodes;
+ xorNode = &nodes[i];
+ i += 1;
+ wnpNode = &nodes[i];
+ i += 1;
+ blockNode = &nodes[i];
+ i += 1;
+ syncNode = &nodes[i];
+ i += 1;
+ termNode = &nodes[i];
+ i += 1;
+ if (nfaults == 2) {
+ wnqNode = &nodes[i];
+ i += 1;
+ } else {
+ wnqNode = NULL;
+ }
+ rf_MapUnaccessedPortionOfStripe(raidPtr, layoutPtr, asmap, dag_h, new_asm_h, &nRodNodes, &sosBuffer, &eosBuffer, allocList);
+ if (nRodNodes > 0) {
+ RF_CallocAndAdd(rodNodes, nRodNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
+ } else {
+ rodNodes = NULL;
+ }
+
+ /* begin node initialization */
+ if (nRodNodes > 0) {
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nRodNodes, 0, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(syncNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nWndNodes + 1, nRodNodes, 0, 0, dag_h, "Nil", allocList);
+ } else {
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, 0, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(syncNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nWndNodes + 1, 1, 0, 0, dag_h, "Nil", allocList);
+ }
+
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, nWndNodes + nfaults, 0, 0, dag_h, "Trm", allocList);
+
+ /* initialize the Rod nodes (read the unaccessed portion of the stripe) */
+ for (nodeNum = asmNum = 0; asmNum < 2; asmNum++) {
+ if (new_asm_h[asmNum]) {
+ pda = new_asm_h[asmNum]->stripeMap->physInfo;
+ while (pda) {
+ rf_InitNode(&rodNodes[nodeNum], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rod", allocList);
+ rodNodes[nodeNum].params[0].p = pda;
+ rodNodes[nodeNum].params[1].p = pda->bufPtr;
+ rodNodes[nodeNum].params[2].v = parityStripeID;
+ rodNodes[nodeNum].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ nodeNum++;
+ pda = pda->next;
+ }
+ }
+ }
+ RF_ASSERT(nodeNum == nRodNodes);
+
+ /* initialize the wnd nodes */
+ pda = asmap->physInfo;
+ for (i = 0; i < nWndNodes; i++) {
+ rf_InitNode(&wndNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
+ RF_ASSERT(pda != NULL);
+ wndNodes[i].params[0].p = pda;
+ wndNodes[i].params[1].p = pda->bufPtr;
+ wndNodes[i].params[2].v = parityStripeID;
+ wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ pda = pda->next;
+ }
+
+ /* initialize the redundancy node */
+ rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, rf_NullNodeUndoFunc, NULL, 1, nfaults, 2 * (nWndNodes + nRodNodes) + 1, nfaults, dag_h, "Xr ", allocList);
+ xorNode->flags |= RF_DAGNODE_FLAG_YIELD;
+ /* xor params are {pda,buf} pairs for every Wnd and Rod node, plus raidPtr */
+ for (i = 0; i < nWndNodes; i++) {
+ xorNode->params[2 * i + 0] = wndNodes[i].params[0]; /* pda */
+ xorNode->params[2 * i + 1] = wndNodes[i].params[1]; /* buf ptr */
+ }
+ for (i = 0; i < nRodNodes; i++) {
+ xorNode->params[2 * (nWndNodes + i) + 0] = rodNodes[i].params[0]; /* pda */
+ xorNode->params[2 * (nWndNodes + i) + 1] = rodNodes[i].params[1]; /* buf ptr */
+ }
+ xorNode->params[2 * (nWndNodes + nRodNodes)].p = raidPtr; /* xor node needs to get
+ * at RAID information */
+
+ /* look for an Rod node that reads a complete SU. If none, alloc a
+ * buffer to receive the parity info. Note that we can't use a new
+ * data buffer because it will not have gotten written when the xor
+ * occurs. */
+ if (allowBufferRecycle) {
+ for (i = 0; i < nRodNodes; i++)
+ if (((RF_PhysDiskAddr_t *) rodNodes[i].params[0].p)->numSector == raidPtr->Layout.sectorsPerStripeUnit)
+ break;
+ }
+ if ((!allowBufferRecycle) || (i == nRodNodes)) {
+ RF_CallocAndAdd(xorNode->results[0], 1, rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit), (void *), allocList);
+ } else
+ xorNode->results[0] = rodNodes[i].params[1].p;
+
+ /* initialize the Wnp node */
+ rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnp", allocList);
+ wnpNode->params[0].p = asmap->parityInfo;
+ wnpNode->params[1].p = xorNode->results[0];
+ wnpNode->params[2].v = parityStripeID;
+ wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ RF_ASSERT(asmap->parityInfo->next == NULL); /* parityInfo must
+ * describe entire
+ * parity unit */
+
+ if (nfaults == 2) {
+ /* we never try to recycle a buffer for the Q calculation in
+ * addition to the parity. This would cause two buffers to get
+ * smashed during the P and Q calculation, guaranteeing one
+ * would be wrong. */
+ RF_CallocAndAdd(xorNode->results[1], 1, rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit), (void *), allocList);
+ rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnq", allocList);
+ wnqNode->params[0].p = asmap->qInfo;
+ wnqNode->params[1].p = xorNode->results[1];
+ wnqNode->params[2].v = parityStripeID;
+ wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ /* NOTE(review): asserts parityInfo->next, not qInfo->next --
+ * presumably qInfo likewise spans a single unit; confirm */
+ RF_ASSERT(asmap->parityInfo->next == NULL); /* parityInfo must
+ * describe entire
+ * parity unit */
+ }
+ /* connect nodes to form graph */
+
+ /* connect dag header to block node */
+ RF_ASSERT(blockNode->numAntecedents == 0);
+ dag_h->succedents[0] = blockNode;
+
+ if (nRodNodes > 0) {
+ /* connect the block node to the Rod nodes */
+ RF_ASSERT(blockNode->numSuccedents == nRodNodes);
+ RF_ASSERT(syncNode->numAntecedents == nRodNodes);
+ for (i = 0; i < nRodNodes; i++) {
+ RF_ASSERT(rodNodes[i].numAntecedents == 1);
+ blockNode->succedents[i] = &rodNodes[i];
+ rodNodes[i].antecedents[0] = blockNode;
+ rodNodes[i].antType[0] = rf_control;
+
+ /* connect the Rod nodes to the Nil node */
+ RF_ASSERT(rodNodes[i].numSuccedents == 1);
+ rodNodes[i].succedents[0] = syncNode;
+ syncNode->antecedents[i] = &rodNodes[i];
+ syncNode->antType[i] = rf_trueData;
+ }
+ } else {
+ /* connect the block node to the Nil node */
+ RF_ASSERT(blockNode->numSuccedents == 1);
+ RF_ASSERT(syncNode->numAntecedents == 1);
+ blockNode->succedents[0] = syncNode;
+ syncNode->antecedents[0] = blockNode;
+ syncNode->antType[0] = rf_control;
+ }
+
+ /* connect the sync node to the Wnd nodes */
+ RF_ASSERT(syncNode->numSuccedents == (1 + nWndNodes));
+ for (i = 0; i < nWndNodes; i++) {
+ /* fix: assert each element, not just wndNodes[0] */
+ RF_ASSERT(wndNodes[i].numAntecedents == 1);
+ syncNode->succedents[i] = &wndNodes[i];
+ wndNodes[i].antecedents[0] = syncNode;
+ wndNodes[i].antType[0] = rf_control;
+ }
+
+ /* connect the sync node to the Xor node */
+ RF_ASSERT(xorNode->numAntecedents == 1);
+ syncNode->succedents[nWndNodes] = xorNode;
+ xorNode->antecedents[0] = syncNode;
+ xorNode->antType[0] = rf_control;
+
+ /* connect the xor node to the write parity node */
+ RF_ASSERT(xorNode->numSuccedents == nfaults);
+ RF_ASSERT(wnpNode->numAntecedents == 1);
+ xorNode->succedents[0] = wnpNode;
+ wnpNode->antecedents[0] = xorNode;
+ wnpNode->antType[0] = rf_trueData;
+ if (nfaults == 2) {
+ RF_ASSERT(wnqNode->numAntecedents == 1);
+ xorNode->succedents[1] = wnqNode;
+ wnqNode->antecedents[0] = xorNode;
+ wnqNode->antType[0] = rf_trueData;
+ }
+ /* connect the write nodes to the term node */
+ RF_ASSERT(termNode->numAntecedents == nWndNodes + nfaults);
+ RF_ASSERT(termNode->numSuccedents == 0);
+ for (i = 0; i < nWndNodes; i++) {
+ /* fix: assert each element, not just wndNodes[0] */
+ RF_ASSERT(wndNodes[i].numSuccedents == 1);
+ wndNodes[i].succedents[0] = termNode;
+ termNode->antecedents[i] = &wndNodes[i];
+ termNode->antType[i] = rf_control;
+ }
+ RF_ASSERT(wnpNode->numSuccedents == 1);
+ wnpNode->succedents[0] = termNode;
+ termNode->antecedents[nWndNodes] = wnpNode;
+ termNode->antType[nWndNodes] = rf_control;
+ if (nfaults == 2) {
+ RF_ASSERT(wnqNode->numSuccedents == 1);
+ wnqNode->succedents[0] = termNode;
+ termNode->antecedents[nWndNodes + 1] = wnqNode;
+ termNode->antType[nWndNodes + 1] = rf_control;
+ }
+}
+
+
+/******************************************************************************
+ *
+ * creates a DAG to perform a small-write operation (either raid 5 or pq),
+ * which is as follows:
+ *
+ * Hdr -> Nil -> Rop - Xor - Wnp [Unp] -- Trm
+ * \- Rod X- Wnd [Und] -------/
+ * [\- Rod X- Wnd [Und] ------/]
+ * [\- Roq - Q --> Wnq [Unq]-/]
+ *
+ * Rop = read old parity
+ * Rod = read old data
+ * Roq = read old "q"
+ * Cmt = commit node
+ * Und = unlock data disk
+ * Unp = unlock parity disk
+ * Unq = unlock q disk
+ * Wnp = write new parity
+ * Wnd = write new data
+ * Wnq = write new "q"
+ * [ ] denotes optional segments in the graph
+ *
+ * Parameters: raidPtr - description of the physical array
+ * asmap - logical & physical addresses for this access
+ * bp - buffer ptr (holds write data)
+ * flags - general flags (e.g. disk locking)
+ * allocList - list of memory allocated in DAG creation
+ * pfuncs - list of parity generating functions
+ * qfuncs - list of q generating functions
+ *
+ * A null qfuncs indicates single fault tolerant
+ *****************************************************************************/
+
+void
+rf_CommonCreateSmallWriteDAGFwd(
+ RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList,
+ RF_RedFuncs_t * pfuncs,
+ RF_RedFuncs_t * qfuncs)
+{
+ RF_DagNode_t *readDataNodes, *readParityNodes, *readQNodes, *termNode;
+ RF_DagNode_t *unlockDataNodes, *unlockParityNodes, *unlockQNodes;
+ RF_DagNode_t *xorNodes, *qNodes, *blockNode, *nodes;
+ RF_DagNode_t *writeDataNodes, *writeParityNodes, *writeQNodes;
+ int i, j, nNodes, totalNumNodes, lu_flag;
+ RF_ReconUnitNum_t which_ru;
+ int (*func) (RF_DagNode_t *), (*undoFunc) (RF_DagNode_t *);
+ int (*qfunc) (RF_DagNode_t *);
+ int numDataNodes, numParityNodes;
+ RF_StripeNum_t parityStripeID;
+ RF_PhysDiskAddr_t *pda;
+ char *name, *qname;
+ long nfaults;
+
+ nfaults = qfuncs ? 2 : 1;
+ lu_flag = (rf_enableAtomicRMW) ? 1 : 0; /* lock/unlock flag */
+
+ parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru);
+ pda = asmap->physInfo;
+ numDataNodes = asmap->numStripeUnitsAccessed;
+ numParityNodes = (asmap->parityInfo->next) ? 2 : 1;
+
+ if (rf_dagDebug)
+ printf("[Creating small-write DAG]\n");
+ RF_ASSERT(numDataNodes > 0);
+ dag_h->creator = "SmallWriteDAGFwd";
+
+ dag_h->numCommitNodes = 0;
+ dag_h->numCommits = 0;
+ dag_h->numSuccedents = 1;
+
+ qfunc = NULL;
+ qname = NULL;
+
+ /* DAG creation occurs in four steps: 1. count the number of nodes in
+ * the DAG 2. create the nodes 3. initialize the nodes 4. connect the
+ * nodes */
+
+ /* Step 1. compute number of nodes in the graph */
+
+ /* number of nodes: a read and write for each data unit a redundancy
+ * computation node for each parity node (nfaults * nparity) a read
+ * and write for each parity unit a block node a terminate node if
+ * atomic RMW an unlock node for each data unit, redundancy unit */
+ totalNumNodes = (2 * numDataNodes) + (nfaults * numParityNodes) + (nfaults * 2 * numParityNodes) + 2;
+ if (lu_flag)
+ totalNumNodes += (numDataNodes + (nfaults * numParityNodes));
+
+
+ /* Step 2. create the nodes */
+ RF_CallocAndAdd(nodes, totalNumNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
+ i = 0;
+ blockNode = &nodes[i];
+ i += 1;
+ readDataNodes = &nodes[i];
+ i += numDataNodes;
+ readParityNodes = &nodes[i];
+ i += numParityNodes;
+ writeDataNodes = &nodes[i];
+ i += numDataNodes;
+ writeParityNodes = &nodes[i];
+ i += numParityNodes;
+ xorNodes = &nodes[i];
+ i += numParityNodes;
+ termNode = &nodes[i];
+ i += 1;
+ if (lu_flag) {
+ unlockDataNodes = &nodes[i];
+ i += numDataNodes;
+ unlockParityNodes = &nodes[i];
+ i += numParityNodes;
+ } else {
+ unlockDataNodes = unlockParityNodes = NULL;
+ }
+ if (nfaults == 2) {
+ readQNodes = &nodes[i];
+ i += numParityNodes;
+ writeQNodes = &nodes[i];
+ i += numParityNodes;
+ qNodes = &nodes[i];
+ i += numParityNodes;
+ if (lu_flag) {
+ unlockQNodes = &nodes[i];
+ i += numParityNodes;
+ } else {
+ unlockQNodes = NULL;
+ }
+ } else {
+ readQNodes = writeQNodes = qNodes = unlockQNodes = NULL;
+ }
+ RF_ASSERT(i == totalNumNodes);
+
+ /* Step 3. initialize the nodes */
+ /* initialize block node (Nil) */
+ nNodes = numDataNodes + (nfaults * numParityNodes);
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nNodes, 0, 0, 0, dag_h, "Nil", allocList);
+
+ /* initialize terminate node (Trm) */
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, nNodes, 0, 0, dag_h, "Trm", allocList);
+
+ /* initialize nodes which read old data (Rod) */
+ for (i = 0; i < numDataNodes; i++) {
+ rf_InitNode(&readDataNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, (numParityNodes * nfaults) + 1, 1, 4, 0, dag_h, "Rod", allocList);
+ RF_ASSERT(pda != NULL);
+ readDataNodes[i].params[0].p = pda; /* physical disk addr
+ * desc */
+ readDataNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda, allocList); /* buffer to hold old
+ * data */
+ readDataNodes[i].params[2].v = parityStripeID;
+ readDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, lu_flag, 0, which_ru);
+ pda = pda->next;
+ for (j = 0; j < readDataNodes[i].numSuccedents; j++)
+ readDataNodes[i].propList[j] = NULL;
+ }
+
+ /* initialize nodes which read old parity (Rop) */
+ pda = asmap->parityInfo;
+ i = 0;
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(pda != NULL);
+ rf_InitNode(&readParityNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, numParityNodes, 1, 4, 0, dag_h, "Rop", allocList);
+ readParityNodes[i].params[0].p = pda;
+ readParityNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda, allocList); /* buffer to hold old
+ * parity */
+ readParityNodes[i].params[2].v = parityStripeID;
+ readParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, lu_flag, 0, which_ru);
+ for (j = 0; j < readParityNodes[i].numSuccedents; j++)
+ readParityNodes[i].propList[0] = NULL;
+ pda = pda->next;
+ }
+
+ /* initialize nodes which read old Q (Roq) */
+ if (nfaults == 2) {
+ pda = asmap->qInfo;
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(pda != NULL);
+ rf_InitNode(&readQNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, numParityNodes, 1, 4, 0, dag_h, "Roq", allocList);
+ readQNodes[i].params[0].p = pda;
+ readQNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda, allocList); /* buffer to hold old Q */
+ readQNodes[i].params[2].v = parityStripeID;
+ readQNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, lu_flag, 0, which_ru);
+ for (j = 0; j < readQNodes[i].numSuccedents; j++)
+ readQNodes[i].propList[0] = NULL;
+ pda = pda->next;
+ }
+ }
+ /* initialize nodes which write new data (Wnd) */
+ pda = asmap->physInfo;
+ for (i = 0; i < numDataNodes; i++) {
+ RF_ASSERT(pda != NULL);
+ rf_InitNode(&writeDataNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
+ writeDataNodes[i].params[0].p = pda; /* physical disk addr
+ * desc */
+ writeDataNodes[i].params[1].p = pda->bufPtr; /* buffer holding new
+ * data to be written */
+ writeDataNodes[i].params[2].v = parityStripeID;
+ writeDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+
+ if (lu_flag) {
+ /* initialize node to unlock the disk queue */
+ rf_InitNode(&unlockDataNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc, rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Und", allocList);
+ unlockDataNodes[i].params[0].p = pda; /* physical disk addr
+ * desc */
+ unlockDataNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, lu_flag, which_ru);
+ }
+ pda = pda->next;
+ }
+
+
+ /* initialize nodes which compute new parity and Q */
+ /* we use the simple XOR func in the double-XOR case, and when we're
+ * accessing only a portion of one stripe unit. the distinction
+ * between the two is that the regular XOR func assumes that the
+ * targbuf is a full SU in size, and examines the pda associated with
+ * the buffer to decide where within the buffer to XOR the data,
+ * whereas the simple XOR func just XORs the data into the start of
+ * the buffer. */
+ if ((numParityNodes == 2) || ((numDataNodes == 1) && (asmap->totalSectorsAccessed < raidPtr->Layout.sectorsPerStripeUnit))) {
+ func = pfuncs->simple;
+ undoFunc = rf_NullNodeUndoFunc;
+ name = pfuncs->SimpleName;
+ if (qfuncs) {
+ qfunc = qfuncs->simple;
+ qname = qfuncs->SimpleName;
+ }
+ } else {
+ func = pfuncs->regular;
+ undoFunc = rf_NullNodeUndoFunc;
+ name = pfuncs->RegularName;
+ if (qfuncs) {
+ qfunc = qfuncs->regular;
+ qname = qfuncs->RegularName;
+ }
+ }
+ /* initialize the xor nodes: params are {pda,buf} from {Rod,Wnd,Rop}
+ * nodes, and raidPtr */
+ if (numParityNodes == 2) { /* double-xor case */
+ for (i = 0; i < numParityNodes; i++) {
+ rf_InitNode(&xorNodes[i], rf_wait, RF_FALSE, func, undoFunc, NULL, numParityNodes, numParityNodes + numDataNodes, 7, 1, dag_h, name, allocList); /* no wakeup func for
+ * xor */
+ xorNodes[i].flags |= RF_DAGNODE_FLAG_YIELD;
+ xorNodes[i].params[0] = readDataNodes[i].params[0];
+ xorNodes[i].params[1] = readDataNodes[i].params[1];
+ xorNodes[i].params[2] = readParityNodes[i].params[0];
+ xorNodes[i].params[3] = readParityNodes[i].params[1];
+ xorNodes[i].params[4] = writeDataNodes[i].params[0];
+ xorNodes[i].params[5] = writeDataNodes[i].params[1];
+ xorNodes[i].params[6].p = raidPtr;
+ xorNodes[i].results[0] = readParityNodes[i].params[1].p; /* use old parity buf as
+ * target buf */
+ if (nfaults == 2) {
+ rf_InitNode(&qNodes[i], rf_wait, RF_FALSE, qfunc, undoFunc, NULL, numParityNodes, numParityNodes + numDataNodes, 7, 1, dag_h, qname, allocList); /* no wakeup func for
+ * xor */
+ qNodes[i].params[0] = readDataNodes[i].params[0];
+ qNodes[i].params[1] = readDataNodes[i].params[1];
+ qNodes[i].params[2] = readQNodes[i].params[0];
+ qNodes[i].params[3] = readQNodes[i].params[1];
+ qNodes[i].params[4] = writeDataNodes[i].params[0];
+ qNodes[i].params[5] = writeDataNodes[i].params[1];
+ qNodes[i].params[6].p = raidPtr;
+ qNodes[i].results[0] = readQNodes[i].params[1].p; /* use old Q buf as
+ * target buf */
+ }
+ }
+ } else {
+ /* there is only one xor node in this case */
+ rf_InitNode(&xorNodes[0], rf_wait, RF_FALSE, func, undoFunc, NULL, numParityNodes, numParityNodes + numDataNodes, (2 * (numDataNodes + numDataNodes + 1) + 1), 1, dag_h, name, allocList);
+ xorNodes[0].flags |= RF_DAGNODE_FLAG_YIELD;
+ for (i = 0; i < numDataNodes + 1; i++) {
+ /* set up params related to Rod and Rop nodes */
+ xorNodes[0].params[2 * i + 0] = readDataNodes[i].params[0]; /* pda */
+ xorNodes[0].params[2 * i + 1] = readDataNodes[i].params[1]; /* buffer pointer */
+ }
+ for (i = 0; i < numDataNodes; i++) {
+ /* set up params related to Wnd and Wnp nodes */
+ xorNodes[0].params[2 * (numDataNodes + 1 + i) + 0] = writeDataNodes[i].params[0]; /* pda */
+ xorNodes[0].params[2 * (numDataNodes + 1 + i) + 1] = writeDataNodes[i].params[1]; /* buffer pointer */
+ }
+ xorNodes[0].params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr; /* xor node needs to get
+ * at RAID information */
+ xorNodes[0].results[0] = readParityNodes[0].params[1].p;
+ if (nfaults == 2) {
+ rf_InitNode(&qNodes[0], rf_wait, RF_FALSE, qfunc, undoFunc, NULL, numParityNodes, numParityNodes + numDataNodes, (2 * (numDataNodes + numDataNodes + 1) + 1), 1, dag_h, qname, allocList);
+ for (i = 0; i < numDataNodes; i++) {
+ /* set up params related to Rod */
+ qNodes[0].params[2 * i + 0] = readDataNodes[i].params[0]; /* pda */
+ qNodes[0].params[2 * i + 1] = readDataNodes[i].params[1]; /* buffer pointer */
+ }
+ /* and read old q */
+ qNodes[0].params[2 * numDataNodes + 0] = readQNodes[0].params[0]; /* pda */
+ qNodes[0].params[2 * numDataNodes + 1] = readQNodes[0].params[1]; /* buffer pointer */
+ for (i = 0; i < numDataNodes; i++) {
+ /* set up params related to Wnd nodes */
+ qNodes[0].params[2 * (numDataNodes + 1 + i) + 0] = writeDataNodes[i].params[0]; /* pda */
+ qNodes[0].params[2 * (numDataNodes + 1 + i) + 1] = writeDataNodes[i].params[1]; /* buffer pointer */
+ }
+ qNodes[0].params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr; /* xor node needs to get
+ * at RAID information */
+ qNodes[0].results[0] = readQNodes[0].params[1].p;
+ }
+ }
+
+ /* initialize nodes which write new parity (Wnp) */
+ pda = asmap->parityInfo;
+ for (i = 0; i < numParityNodes; i++) {
+ rf_InitNode(&writeParityNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, numParityNodes, 4, 0, dag_h, "Wnp", allocList);
+ RF_ASSERT(pda != NULL);
+ writeParityNodes[i].params[0].p = pda; /* param 1 (bufPtr)
+ * filled in by xor node */
+ writeParityNodes[i].params[1].p = xorNodes[i].results[0]; /* buffer pointer for
+ * parity write
+ * operation */
+ writeParityNodes[i].params[2].v = parityStripeID;
+ writeParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+
+ if (lu_flag) {
+ /* initialize node to unlock the disk queue */
+ rf_InitNode(&unlockParityNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc, rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Unp", allocList);
+ unlockParityNodes[i].params[0].p = pda; /* physical disk addr
+ * desc */
+ unlockParityNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, lu_flag, which_ru);
+ }
+ pda = pda->next;
+ }
+
+ /* initialize nodes which write new Q (Wnq) */
+ if (nfaults == 2) {
+ pda = asmap->qInfo;
+ for (i = 0; i < numParityNodes; i++) {
+ rf_InitNode(&writeQNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, numParityNodes, 4, 0, dag_h, "Wnq", allocList);
+ RF_ASSERT(pda != NULL);
+ writeQNodes[i].params[0].p = pda; /* param 1 (bufPtr)
+ * filled in by xor node */
+ writeQNodes[i].params[1].p = qNodes[i].results[0]; /* buffer pointer for
+ * parity write
+ * operation */
+ writeQNodes[i].params[2].v = parityStripeID;
+ writeQNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+
+ if (lu_flag) {
+ /* initialize node to unlock the disk queue */
+ rf_InitNode(&unlockQNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc, rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Unq", allocList);
+ unlockQNodes[i].params[0].p = pda; /* physical disk addr
+ * desc */
+ unlockQNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, lu_flag, which_ru);
+ }
+ pda = pda->next;
+ }
+ }
+ /* Step 4. connect the nodes */
+
+ /* connect header to block node */
+ dag_h->succedents[0] = blockNode;
+
+ /* connect block node to read old data nodes */
+ RF_ASSERT(blockNode->numSuccedents == (numDataNodes + (numParityNodes * nfaults)));
+ for (i = 0; i < numDataNodes; i++) {
+ blockNode->succedents[i] = &readDataNodes[i];
+ RF_ASSERT(readDataNodes[i].numAntecedents == 1);
+ readDataNodes[i].antecedents[0] = blockNode;
+ readDataNodes[i].antType[0] = rf_control;
+ }
+
+ /* connect block node to read old parity nodes */
+ for (i = 0; i < numParityNodes; i++) {
+ blockNode->succedents[numDataNodes + i] = &readParityNodes[i];
+ RF_ASSERT(readParityNodes[i].numAntecedents == 1);
+ readParityNodes[i].antecedents[0] = blockNode;
+ readParityNodes[i].antType[0] = rf_control;
+ }
+
+ /* connect block node to read old Q nodes */
+ if (nfaults == 2)
+ for (i = 0; i < numParityNodes; i++) {
+ blockNode->succedents[numDataNodes + numParityNodes + i] = &readQNodes[i];
+ RF_ASSERT(readQNodes[i].numAntecedents == 1);
+ readQNodes[i].antecedents[0] = blockNode;
+ readQNodes[i].antType[0] = rf_control;
+ }
+
+ /* connect read old data nodes to write new data nodes */
+ for (i = 0; i < numDataNodes; i++) {
+ RF_ASSERT(readDataNodes[i].numSuccedents == ((nfaults * numParityNodes) + 1));
+ RF_ASSERT(writeDataNodes[i].numAntecedents == 1);
+ readDataNodes[i].succedents[0] = &writeDataNodes[i];
+ writeDataNodes[i].antecedents[0] = &readDataNodes[i];
+ writeDataNodes[i].antType[0] = rf_antiData;
+ }
+
+ /* connect read old data nodes to xor nodes */
+ for (i = 0; i < numDataNodes; i++) {
+ for (j = 0; j < numParityNodes; j++) {
+ RF_ASSERT(xorNodes[j].numAntecedents == numDataNodes + numParityNodes);
+ readDataNodes[i].succedents[1 + j] = &xorNodes[j];
+ xorNodes[j].antecedents[i] = &readDataNodes[i];
+ xorNodes[j].antType[i] = rf_trueData;
+ }
+ }
+
+ /* connect read old data nodes to q nodes */
+ if (nfaults == 2)
+ for (i = 0; i < numDataNodes; i++)
+ for (j = 0; j < numParityNodes; j++) {
+ RF_ASSERT(qNodes[j].numAntecedents == numDataNodes + numParityNodes);
+ readDataNodes[i].succedents[1 + numParityNodes + j] = &qNodes[j];
+ qNodes[j].antecedents[i] = &readDataNodes[i];
+ qNodes[j].antType[i] = rf_trueData;
+ }
+
+ /* connect read old parity nodes to xor nodes */
+ for (i = 0; i < numParityNodes; i++) {
+ for (j = 0; j < numParityNodes; j++) {
+ RF_ASSERT(readParityNodes[i].numSuccedents == numParityNodes);
+ readParityNodes[i].succedents[j] = &xorNodes[j];
+ xorNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i];
+ xorNodes[j].antType[numDataNodes + i] = rf_trueData;
+ }
+ }
+
+ /* connect read old q nodes to q nodes */
+ if (nfaults == 2)
+ for (i = 0; i < numParityNodes; i++) {
+ for (j = 0; j < numParityNodes; j++) {
+ RF_ASSERT(readQNodes[i].numSuccedents == numParityNodes);
+ readQNodes[i].succedents[j] = &qNodes[j];
+ qNodes[j].antecedents[numDataNodes + i] = &readQNodes[i];
+ qNodes[j].antType[numDataNodes + i] = rf_trueData;
+ }
+ }
+
+ /* connect xor nodes to the write new parity nodes */
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(writeParityNodes[i].numAntecedents == numParityNodes);
+ for (j = 0; j < numParityNodes; j++) {
+ RF_ASSERT(xorNodes[j].numSuccedents == numParityNodes);
+ xorNodes[i].succedents[j] = &writeParityNodes[j];
+ writeParityNodes[j].antecedents[i] = &xorNodes[i];
+ writeParityNodes[j].antType[i] = rf_trueData;
+ }
+ }
+
+ /* connect q nodes to the write new q nodes */
+ if (nfaults == 2)
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(writeQNodes[i].numAntecedents == numParityNodes);
+ for (j = 0; j < numParityNodes; j++) {
+ RF_ASSERT(qNodes[j].numSuccedents == 1);
+ qNodes[i].succedents[j] = &writeQNodes[j];
+ writeQNodes[j].antecedents[i] = &qNodes[i];
+ writeQNodes[j].antType[i] = rf_trueData;
+ }
+ }
+
+ RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
+ RF_ASSERT(termNode->numSuccedents == 0);
+ for (i = 0; i < numDataNodes; i++) {
+ if (lu_flag) {
+ /* connect write new data nodes to unlock nodes */
+ RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
+ RF_ASSERT(unlockDataNodes[i].numAntecedents == 1);
+ writeDataNodes[i].succedents[0] = &unlockDataNodes[i];
+ unlockDataNodes[i].antecedents[0] = &writeDataNodes[i];
+ unlockDataNodes[i].antType[0] = rf_control;
+
+ /* connect unlock nodes to term node */
+ RF_ASSERT(unlockDataNodes[i].numSuccedents == 1);
+ unlockDataNodes[i].succedents[0] = termNode;
+ termNode->antecedents[i] = &unlockDataNodes[i];
+ termNode->antType[i] = rf_control;
+ } else {
+ /* connect write new data nodes to term node */
+ RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
+ RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
+ writeDataNodes[i].succedents[0] = termNode;
+ termNode->antecedents[i] = &writeDataNodes[i];
+ termNode->antType[i] = rf_control;
+ }
+ }
+
+ for (i = 0; i < numParityNodes; i++) {
+ if (lu_flag) {
+ /* connect write new parity nodes to unlock nodes */
+ RF_ASSERT(writeParityNodes[i].numSuccedents == 1);
+ RF_ASSERT(unlockParityNodes[i].numAntecedents == 1);
+ writeParityNodes[i].succedents[0] = &unlockParityNodes[i];
+ unlockParityNodes[i].antecedents[0] = &writeParityNodes[i];
+ unlockParityNodes[i].antType[0] = rf_control;
+
+ /* connect unlock nodes to term node */
+ RF_ASSERT(unlockParityNodes[i].numSuccedents == 1);
+ unlockParityNodes[i].succedents[0] = termNode;
+ termNode->antecedents[numDataNodes + i] = &unlockParityNodes[i];
+ termNode->antType[numDataNodes + i] = rf_control;
+ } else {
+ RF_ASSERT(writeParityNodes[i].numSuccedents == 1);
+ writeParityNodes[i].succedents[0] = termNode;
+ termNode->antecedents[numDataNodes + i] = &writeParityNodes[i];
+ termNode->antType[numDataNodes + i] = rf_control;
+ }
+ }
+
+ if (nfaults == 2)
+ for (i = 0; i < numParityNodes; i++) {
+ if (lu_flag) {
+ /* connect write new Q nodes to unlock nodes */
+ RF_ASSERT(writeQNodes[i].numSuccedents == 1);
+ RF_ASSERT(unlockQNodes[i].numAntecedents == 1);
+ writeQNodes[i].succedents[0] = &unlockQNodes[i];
+ unlockQNodes[i].antecedents[0] = &writeQNodes[i];
+ unlockQNodes[i].antType[0] = rf_control;
+
+ /* connect unlock nodes to unblock node */
+ RF_ASSERT(unlockQNodes[i].numSuccedents == 1);
+ unlockQNodes[i].succedents[0] = termNode;
+ termNode->antecedents[numDataNodes + numParityNodes + i] = &unlockQNodes[i];
+ termNode->antType[numDataNodes + numParityNodes + i] = rf_control;
+ } else {
+ RF_ASSERT(writeQNodes[i].numSuccedents == 1);
+ writeQNodes[i].succedents[0] = termNode;
+ termNode->antecedents[numDataNodes + numParityNodes + i] = &writeQNodes[i];
+ termNode->antType[numDataNodes + numParityNodes + i] = rf_control;
+ }
+ }
+}
+
+
+
+/******************************************************************************
+ * create a write graph (fault-free or degraded) for RAID level 1
+ *
+ * Hdr Nil -> Wpd -> Nil -> Trm
+ * Nil -> Wsd ->
+ *
+ * The "Wpd" node writes data to the primary copy in the mirror pair
+ * The "Wsd" node writes data to the secondary copy in the mirror pair
+ *
+ * Parameters: raidPtr - description of the physical array
+ * asmap - logical & physical addresses for this access
+ * bp - buffer ptr (holds write data)
+ * flags - general flags (e.g. disk locking)
+ * allocList - list of memory allocated in DAG creation
+ *****************************************************************************/
+
+/*
+ * Build a forward write DAG for a RAID level 1 array: a block node
+ * fans out to the primary-copy writes (Wpd) and mirror-copy writes
+ * (Wsd), which all funnel into an unblock node and then the
+ * terminator (see the DAG picture in the comment above).
+ *
+ * Note: bp and flags are never referenced in this function; they are
+ * kept so all DAG-creation routines share a uniform signature.
+ */
+void
+rf_CreateRaidOneWriteDAGFwd(
+ RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList)
+{
+ RF_DagNode_t *blockNode, *unblockNode, *termNode;
+ RF_DagNode_t *nodes, *wndNode, *wmirNode;
+ int nWndNodes, nWmirNodes, i;
+ RF_ReconUnitNum_t which_ru;
+ RF_PhysDiskAddr_t *pda, *pdaP;
+ RF_StripeNum_t parityStripeID;
+
+ /* map the RAID address onto a parity stripe / reconstruction unit */
+ parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
+ asmap->raidAddress, &which_ru);
+ if (rf_dagDebug) {
+ printf("[Creating RAID level 1 write DAG]\n");
+ }
+ /* a second pda on either chain means the access straddles a stripe
+ * unit boundary, so two write nodes are needed for that copy */
+ nWmirNodes = (asmap->parityInfo->next) ? 2 : 1; /* 2 implies access not
+ * SU aligned */
+ nWndNodes = (asmap->physInfo->next) ? 2 : 1;
+
+ /* alloc the Wnd nodes and the Wmir node */
+ /* in degraded mode, drop the write to the failed half of the pair */
+ if (asmap->numDataFailed == 1)
+ nWndNodes--;
+ if (asmap->numParityFailed == 1)
+ nWmirNodes--;
+
+ /* total number of nodes = nWndNodes + nWmirNodes + (block + unblock +
+ * terminator) */
+ RF_CallocAndAdd(nodes, nWndNodes + nWmirNodes + 3, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
+ /* carve the single allocation into the individual node arrays */
+ i = 0;
+ wndNode = &nodes[i];
+ i += nWndNodes;
+ wmirNode = &nodes[i];
+ i += nWmirNodes;
+ blockNode = &nodes[i];
+ i += 1;
+ unblockNode = &nodes[i];
+ i += 1;
+ termNode = &nodes[i];
+ i += 1;
+ RF_ASSERT(i == (nWndNodes + nWmirNodes + 3));
+
+ /* this dag can commit immediately */
+ dag_h->numCommitNodes = 0;
+ dag_h->numCommits = 0;
+ dag_h->numSuccedents = 1;
+
+ /* initialize the block, unblock, and term nodes */
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, (nWndNodes + nWmirNodes), 0, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, (nWndNodes + nWmirNodes), 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
+
+ /* initialize the wnd nodes (writes to the primary copy) */
+ if (nWndNodes > 0) {
+ pda = asmap->physInfo;
+ for (i = 0; i < nWndNodes; i++) {
+ rf_InitNode(&wndNode[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wpd", allocList);
+ RF_ASSERT(pda != NULL);
+ wndNode[i].params[0].p = pda; /* target physical address */
+ wndNode[i].params[1].p = pda->bufPtr; /* data to write */
+ wndNode[i].params[2].v = parityStripeID;
+ wndNode[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ pda = pda->next;
+ }
+ RF_ASSERT(pda == NULL);
+ }
+ /* initialize the mirror nodes (writes to the secondary copy): the
+ * target addresses come from the parityInfo chain, but the data
+ * buffers come from the corresponding physInfo pda, so both chains
+ * are walked in lockstep */
+ if (nWmirNodes > 0) {
+ pda = asmap->physInfo;
+ pdaP = asmap->parityInfo;
+ for (i = 0; i < nWmirNodes; i++) {
+ rf_InitNode(&wmirNode[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wsd", allocList);
+ RF_ASSERT(pda != NULL);
+ wmirNode[i].params[0].p = pdaP; /* target address on the mirror */
+ wmirNode[i].params[1].p = pda->bufPtr; /* same data as the primary */
+ wmirNode[i].params[2].v = parityStripeID;
+ wmirNode[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ pda = pda->next;
+ pdaP = pdaP->next;
+ }
+ RF_ASSERT(pda == NULL);
+ RF_ASSERT(pdaP == NULL);
+ }
+ /* link the header node to the block node */
+ RF_ASSERT(dag_h->numSuccedents == 1);
+ RF_ASSERT(blockNode->numAntecedents == 0);
+ dag_h->succedents[0] = blockNode;
+
+ /* link the block node to the write nodes */
+ RF_ASSERT(blockNode->numSuccedents == (nWndNodes + nWmirNodes));
+ for (i = 0; i < nWndNodes; i++) {
+ RF_ASSERT(wndNode[i].numAntecedents == 1);
+ blockNode->succedents[i] = &wndNode[i];
+ wndNode[i].antecedents[0] = blockNode;
+ wndNode[i].antType[0] = rf_control;
+ }
+ /* mirror writes occupy the successor slots after the primary writes */
+ for (i = 0; i < nWmirNodes; i++) {
+ RF_ASSERT(wmirNode[i].numAntecedents == 1);
+ blockNode->succedents[i + nWndNodes] = &wmirNode[i];
+ wmirNode[i].antecedents[0] = blockNode;
+ wmirNode[i].antType[0] = rf_control;
+ }
+
+ /* link the write nodes to the unblock node */
+ RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nWmirNodes));
+ for (i = 0; i < nWndNodes; i++) {
+ RF_ASSERT(wndNode[i].numSuccedents == 1);
+ wndNode[i].succedents[0] = unblockNode;
+ unblockNode->antecedents[i] = &wndNode[i];
+ unblockNode->antType[i] = rf_control;
+ }
+ for (i = 0; i < nWmirNodes; i++) {
+ RF_ASSERT(wmirNode[i].numSuccedents == 1);
+ wmirNode[i].succedents[0] = unblockNode;
+ unblockNode->antecedents[i + nWndNodes] = &wmirNode[i];
+ unblockNode->antType[i + nWndNodes] = rf_control;
+ }
+
+ /* link the unblock node to the term node */
+ RF_ASSERT(unblockNode->numSuccedents == 1);
+ RF_ASSERT(termNode->numAntecedents == 1);
+ RF_ASSERT(termNode->numSuccedents == 0);
+ unblockNode->succedents[0] = termNode;
+ termNode->antecedents[0] = unblockNode;
+ termNode->antType[0] = rf_control;
+
+ return;
+}
diff --git a/sys/dev/raidframe/rf_dagffwr.h b/sys/dev/raidframe/rf_dagffwr.h
new file mode 100644
index 0000000..f65875e
--- /dev/null
+++ b/sys/dev/raidframe/rf_dagffwr.h
@@ -0,0 +1,77 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_dagffwr.h,v 1.3 1999/02/05 00:06:08 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#ifndef _RF__RF_DAGFFWR_H_
+#define _RF__RF_DAGFFWR_H_
+
+#include <dev/raidframe/rf_types.h>
+
+/* fault-free write DAG creation routines */
+
+/* writes for arrays with no redundancy to maintain */
+void
+rf_CreateNonRedundantWriteDAG(RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
+ RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList,
+ RF_IoType_t type);
+void
+rf_CreateRAID0WriteDAG(RF_Raid_t * raidPtr, RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h, void *bp, RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList, RF_IoType_t type);
+/* small write: presumably the read-modify-write path; large write:
+ * presumably the reconstruct-write path -- confirm in rf_dagffwr.c */
+void
+rf_CreateSmallWriteDAG(RF_Raid_t * raidPtr, RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h, void *bp, RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList);
+void
+rf_CreateLargeWriteDAG(RF_Raid_t * raidPtr, RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h, void *bp, RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList);
+/* common helpers parameterized by fault count and redundancy functions;
+ * the "Fwd" variants build forward (rollforward) DAGs.
+ * NOTE(review): the prototypes below carry a stray leading space --
+ * cosmetic only, left untouched here */
+void
+rf_CommonCreateLargeWriteDAG(RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
+ RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList, int nfaults,
+ int (*redFunc) (RF_DagNode_t *), int allowBufferRecycle);
+ void rf_CommonCreateLargeWriteDAGFwd(RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
+ RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList, int nfaults,
+ int (*redFunc) (RF_DagNode_t *), int allowBufferRecycle);
+ void rf_CommonCreateSmallWriteDAG(RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
+ RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList,
+ RF_RedFuncs_t * pfuncs, RF_RedFuncs_t * qfuncs);
+ void rf_CommonCreateSmallWriteDAGFwd(RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
+ RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList,
+ RF_RedFuncs_t * pfuncs, RF_RedFuncs_t * qfuncs);
+ void rf_CreateRaidOneWriteDAG(RF_Raid_t * raidPtr, RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h, void *bp, RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList);
+ void rf_CreateRaidOneWriteDAGFwd(RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h, void *bp,
+ RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList);
+
+#endif /* !_RF__RF_DAGFFWR_H_ */
diff --git a/sys/dev/raidframe/rf_dagflags.h b/sys/dev/raidframe/rf_dagflags.h
new file mode 100644
index 0000000..b0777bd
--- /dev/null
+++ b/sys/dev/raidframe/rf_dagflags.h
@@ -0,0 +1,68 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_dagflags.h,v 1.3 1999/02/05 00:06:08 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/**************************************************************************************
+ *
+ * dagflags.h -- flags that can be given to DoAccess
+ * I pulled these out of dag.h because routines that call DoAccess may need these flags,
+ * but certainly do not need the declarations related to the DAG data structures.
+ *
+ **************************************************************************************/
+
+
+#ifndef _RF__RF_DAGFLAGS_H_
+#define _RF__RF_DAGFLAGS_H_
+
+/*
+ * Bitmasks for the "flags" parameter (RF_RaidAccessFlags_t) used
+ * by DoAccess, SelectAlgorithm, and the DAG creation routines.
+ *
+ * If USE_DAG or USE_ASM is specified, neither the DAG nor the ASM
+ * will be modified, which means that you can't SUPPRESS if you
+ * specify USE_DAG.
+ *
+ * NOTE(review): "USE_DAG"/"USE_ASM" above appear to refer to the
+ * RF_DAG_RETURN_DAG/RF_DAG_RETURN_ASM flags below under older
+ * names -- confirm against callers before relying on this text.
+ */
+
+#define RF_DAG_FLAGS_NONE 0 /* no flags */
+#define RF_DAG_SUPPRESS_LOCKS (1<<0) /* suppress all stripe locks in
+ * the DAG */
+#define RF_DAG_RETURN_ASM (1<<1) /* create an ASM and return it
+ * instead of freeing it */
+#define RF_DAG_RETURN_DAG (1<<2) /* create a DAG and return it
+ * instead of freeing it */
+#define RF_DAG_NONBLOCKING_IO (1<<3) /* cause DoAccess to be
+ * non-blocking */
+#define RF_DAG_ACCESS_COMPLETE (1<<4) /* the access is complete */
+#define RF_DAG_DISPATCH_RETURNED (1<<5) /* used to handle the case
+ * where the dag invokes no
+ * I/O */
+#define RF_DAG_TEST_ACCESS (1<<6) /* this access came through
+ * rf_ioctl instead of
+ * rf_strategy */
+
+#endif /* !_RF__RF_DAGFLAGS_H_ */
diff --git a/sys/dev/raidframe/rf_dagfuncs.c b/sys/dev/raidframe/rf_dagfuncs.c
new file mode 100644
index 0000000..09ee274
--- /dev/null
+++ b/sys/dev/raidframe/rf_dagfuncs.c
@@ -0,0 +1,904 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_dagfuncs.c,v 1.7 2001/02/03 12:51:10 mrg Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * dagfuncs.c -- DAG node execution routines
+ *
+ * Rules:
+ * 1. Every DAG execution function must eventually cause node->status to
+ * get set to "good" or "bad", and "FinishNode" to be called. In the
+ * case of nodes that complete immediately (xor, NullNodeFunc, etc),
+ * the node execution function can do these two things directly. In
+ * the case of nodes that have to wait for some event (a disk read to
+ * complete, a lock to be released, etc) to occur before they can
+ * complete, this is typically achieved by having whatever module
+ * is doing the operation call GenericWakeupFunc upon completion.
+ * 2. DAG execution functions should check the status in the DAG header
+ * and NOP out their operations if the status is not "enable". However,
+ * execution functions that release resources must be sure to release
+ * them even when they NOP out the function that would use them.
+ * Functions that acquire resources should go ahead and acquire them
+ * even when they NOP, so that a downstream release node will not have
+ * to check to find out whether or not the acquire was suppressed.
+ */
+
+#include <sys/param.h>
+#if defined(__NetBSD__)
+#include <sys/ioctl.h>
+#elif defined(__FreeBSD__)
+#include <sys/ioccom.h>
+#include <sys/filio.h>
+#endif
+
+#include <dev/raidframe/rf_archs.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_layout.h>
+#include <dev/raidframe/rf_etimer.h>
+#include <dev/raidframe/rf_acctrace.h>
+#include <dev/raidframe/rf_diskqueue.h>
+#include <dev/raidframe/rf_dagfuncs.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_engine.h>
+#include <dev/raidframe/rf_dagutils.h>
+
+#include <dev/raidframe/rf_kintf.h>
+
+#if RF_INCLUDE_PARITYLOGGING > 0
+#include <dev/raidframe/rf_paritylog.h>
+#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
+
+/*
+ * Indirection table for disk I/O and undo operations; bound to the
+ * thread-context implementations below by rf_ConfigureDAGFuncs().
+ */
+int (*rf_DiskReadFunc) (RF_DagNode_t *);
+int (*rf_DiskWriteFunc) (RF_DagNode_t *);
+int (*rf_DiskReadUndoFunc) (RF_DagNode_t *);
+int (*rf_DiskWriteUndoFunc) (RF_DagNode_t *);
+int (*rf_DiskUnlockFunc) (RF_DagNode_t *);
+int (*rf_DiskUnlockUndoFunc) (RF_DagNode_t *);
+int (*rf_RegularXorUndoFunc) (RF_DagNode_t *);
+int (*rf_SimpleXorUndoFunc) (RF_DagNode_t *);
+int (*rf_RecoveryXorUndoFunc) (RF_DagNode_t *);
+
+/*****************************************************************************************
+ * main (only) configuration routine for this module
+ ****************************************************************************************/
+/*
+ * Bind the function-pointer dispatch table to the thread-context
+ * implementations.  The assertion verifies that RF_LONGSHIFT agrees
+ * with the machine's long word size (used by the xor routines to
+ * convert byte lengths to longword counts).
+ */
+int
+rf_ConfigureDAGFuncs(listp)
+	RF_ShutdownList_t **listp;
+{
+	RF_ASSERT(((sizeof(long) == 8) && RF_LONGSHIFT == 3) || ((sizeof(long) == 4) && RF_LONGSHIFT == 2));
+	rf_DiskReadFunc = rf_DiskReadFuncForThreads;
+	rf_DiskReadUndoFunc = rf_DiskUndoFunc;
+	rf_DiskWriteFunc = rf_DiskWriteFuncForThreads;
+	rf_DiskWriteUndoFunc = rf_DiskUndoFunc;
+	rf_DiskUnlockFunc = rf_DiskUnlockFuncForThreads;
+	rf_DiskUnlockUndoFunc = rf_NullNodeUndoFunc;
+	rf_RegularXorUndoFunc = rf_NullNodeUndoFunc;
+	rf_SimpleXorUndoFunc = rf_NullNodeUndoFunc;
+	rf_RecoveryXorUndoFunc = rf_NullNodeUndoFunc;
+	return (0);
+}
+
+
+
+/*****************************************************************************************
+ * the execution function associated with a terminate node
+ ****************************************************************************************/
+/*
+ * Terminate node: by the time the DAG reaches its terminate node every
+ * commit node must have fired, hence the assertion.  Marks the node
+ * good and notifies the engine.
+ */
+int
+rf_TerminateFunc(node)
+	RF_DagNode_t *node;
+{
+	RF_ASSERT(node->dagHdr->numCommits == node->dagHdr->numCommitNodes);
+	node->status = rf_good;
+	return (rf_FinishNode(node, RF_THREAD_CONTEXT));
+}
+
+/* Undo for a terminate node: nothing to roll back. */
+int
+rf_TerminateUndoFunc(node)
+	RF_DagNode_t *node;
+{
+	return (0);
+}
+
+
+/*****************************************************************************************
+ * execution functions associated with a mirror node
+ *
+ * parameters:
+ *
+ * 0 - physical disk address of data
+ * 1 - buffer for holding read data
+ * 2 - parity stripe ID
+ * 3 - flags
+ * 4 - physical disk address of mirror (parity)
+ *
+ ****************************************************************************************/
+
+/*
+ * Mirror read (idle policy): pick the copy whose disk queue is
+ * shortest, then issue an ordinary disk read against it.
+ */
+int
+rf_DiskReadMirrorIdleFunc(node)
+	RF_DagNode_t *node;
+{
+	/* select the mirror copy with the shortest queue and fill in node
+	 * parameters with physical disk address */
+
+	rf_SelectMirrorDiskIdle(node);
+	return (rf_DiskReadFunc(node));
+}
+
+/*
+ * Mirror read (partition policy): choose the copy by address-range
+ * partitioning, then issue an ordinary disk read against it.
+ */
+int
+rf_DiskReadMirrorPartitionFunc(node)
+	RF_DagNode_t *node;
+{
+	/* select the mirror copy with the shortest queue and fill in node
+	 * parameters with physical disk address */
+
+	rf_SelectMirrorDiskPartition(node);
+	return (rf_DiskReadFunc(node));
+}
+
+/* Undo for a mirror read: reads have no on-disk effect to roll back. */
+int
+rf_DiskReadMirrorUndoFunc(node)
+	RF_DagNode_t *node;
+{
+	return (0);
+}
+
+
+
+#if RF_INCLUDE_PARITYLOGGING > 0
+/*****************************************************************************************
+ * the execution function associated with a parity log update node
+ ****************************************************************************************/
+/*
+ * Append an UPDATE record (pda/buf) to the parity log.  On allocation
+ * failure the elapsed time is charged to the access trace and the node
+ * is woken with ENOMEM; on success the log code invokes node->wakeFunc
+ * when the append completes.  NOPs (except for the wakeup path) when
+ * the DAG is not enabled.
+ */
+int
+rf_ParityLogUpdateFunc(node)
+	RF_DagNode_t *node;
+{
+	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
+	caddr_t buf = (caddr_t) node->params[1].p;
+	RF_ParityLogData_t *logData;
+	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+	RF_Etimer_t timer;
+
+	if (node->dagHdr->status == rf_enable) {
+		RF_ETIMER_START(timer);
+		logData = rf_CreateParityLogData(RF_UPDATE, pda, buf,
+		    (RF_Raid_t *) (node->dagHdr->raidPtr),
+		    node->wakeFunc, (void *) node,
+		    node->dagHdr->tracerec, timer);
+		if (logData)
+			rf_ParityLogAppend(logData, RF_FALSE, NULL, RF_FALSE);
+		else {
+			RF_ETIMER_STOP(timer);
+			RF_ETIMER_EVAL(timer);
+			tracerec->plog_us += RF_ETIMER_VAL_US(timer);
+			(node->wakeFunc) (node, ENOMEM);
+		}
+	}
+	return (0);
+}
+
+
+/*****************************************************************************************
+ * the execution function associated with a parity log overwrite node
+ ****************************************************************************************/
+/*
+ * Append an OVERWRITE record to the parity log.  Same structure and
+ * error handling as rf_ParityLogUpdateFunc above, differing only in
+ * the RF_OVERWRITE record type.
+ */
+int
+rf_ParityLogOverwriteFunc(node)
+	RF_DagNode_t *node;
+{
+	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
+	caddr_t buf = (caddr_t) node->params[1].p;
+	RF_ParityLogData_t *logData;
+	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+	RF_Etimer_t timer;
+
+	if (node->dagHdr->status == rf_enable) {
+		RF_ETIMER_START(timer);
+		logData = rf_CreateParityLogData(RF_OVERWRITE, pda, buf, (RF_Raid_t *) (node->dagHdr->raidPtr),
+		    node->wakeFunc, (void *) node, node->dagHdr->tracerec, timer);
+		if (logData)
+			rf_ParityLogAppend(logData, RF_FALSE, NULL, RF_FALSE);
+		else {
+			RF_ETIMER_STOP(timer);
+			RF_ETIMER_EVAL(timer);
+			tracerec->plog_us += RF_ETIMER_VAL_US(timer);
+			(node->wakeFunc) (node, ENOMEM);
+		}
+	}
+	return (0);
+}
+#else /* RF_INCLUDE_PARITYLOGGING > 0 */
+
+/* Stub when parity logging is compiled out. */
+int
+rf_ParityLogUpdateFunc(node)
+	RF_DagNode_t *node;
+{
+	return (0);
+}
+/* Stub when parity logging is compiled out. */
+int
+rf_ParityLogOverwriteFunc(node)
+	RF_DagNode_t *node;
+{
+	return (0);
+}
+#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
+
+/* Undo for a parity log update: nothing to roll back. */
+int
+rf_ParityLogUpdateUndoFunc(node)
+	RF_DagNode_t *node;
+{
+	return (0);
+}
+
+/* Undo for a parity log overwrite: nothing to roll back. */
+int
+rf_ParityLogOverwriteUndoFunc(node)
+	RF_DagNode_t *node;
+{
+	return (0);
+}
+/*****************************************************************************************
+ * the execution function associated with a NOP node
+ ****************************************************************************************/
+/* NOP node: completes immediately with status good. */
+int
+rf_NullNodeFunc(node)
+	RF_DagNode_t *node;
+{
+	node->status = rf_good;
+	return (rf_FinishNode(node, RF_THREAD_CONTEXT));
+}
+
+/* NOP undo: completes immediately with status undone. */
+int
+rf_NullNodeUndoFunc(node)
+	RF_DagNode_t *node;
+{
+	node->status = rf_undone;
+	return (rf_FinishNode(node, RF_THREAD_CONTEXT));
+}
+
+
+/*****************************************************************************************
+ * the execution function associated with a disk-read node
+ ****************************************************************************************/
+/*
+ * Disk-read execution function.  Decodes the packed param[3] word
+ * (priority, lock/unlock flags, reconstruction unit), builds a disk
+ * queue request, and enqueues it on the target disk's queue.  If the
+ * DAG is not enabled the request is issued as a NOP so that any
+ * lock/unlock side effects still take place.  On allocation failure
+ * the node is woken with ENOMEM.
+ */
+int
+rf_DiskReadFuncForThreads(node)
+	RF_DagNode_t *node;
+{
+	RF_DiskQueueData_t *req;
+	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
+	caddr_t buf = (caddr_t) node->params[1].p;
+	RF_StripeNum_t parityStripeID = (RF_StripeNum_t) node->params[2].v;
+	unsigned priority = RF_EXTRACT_PRIORITY(node->params[3].v);
+	unsigned lock = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
+	unsigned unlock = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
+	unsigned which_ru = RF_EXTRACT_RU(node->params[3].v);
+	RF_DiskQueueDataFlags_t flags = 0;
+	RF_IoType_t iotype = (node->dagHdr->status == rf_enable) ? RF_IO_TYPE_READ : RF_IO_TYPE_NOP;
+	RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
+	void *b_proc = NULL;
+
+#if defined(__NetBSD__)
+	if (node->dagHdr->bp)
+		b_proc = (void *) ((RF_Buf_t) node->dagHdr->bp)->b_proc;
+#endif
+
+	/* a node may lock or unlock its disk queue, but never both */
+	RF_ASSERT(!(lock && unlock));
+	flags |= (lock) ? RF_LOCK_DISK_QUEUE : 0;
+	flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0;
+
+	req = rf_CreateDiskQueueData(iotype, pda->startSector, pda->numSector,
+	    buf, parityStripeID, which_ru,
+	    (int (*) (void *, int)) node->wakeFunc,
+	    node, NULL, node->dagHdr->tracerec,
+	    (void *) (node->dagHdr->raidPtr), flags, b_proc);
+	if (!req) {
+		(node->wakeFunc) (node, ENOMEM);
+	} else {
+		node->dagFuncData = (void *) req;
+		rf_DiskIOEnqueue(&(dqs[pda->row][pda->col]), req, priority);
+	}
+	return (0);
+}
+
+
+/*****************************************************************************************
+ * the execution function associated with a disk-write node
+ ****************************************************************************************/
+/*
+ * Disk-write execution function.  Structurally identical to the read
+ * case above, but issues RF_IO_TYPE_WRITE.  On allocation failure the
+ * node is woken with ENOMEM instead of being enqueued.
+ */
+int
+rf_DiskWriteFuncForThreads(node)
+	RF_DagNode_t *node;
+{
+	RF_DiskQueueData_t *req;
+	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
+	caddr_t buf = (caddr_t) node->params[1].p;
+	RF_StripeNum_t parityStripeID = (RF_StripeNum_t) node->params[2].v;
+	unsigned priority = RF_EXTRACT_PRIORITY(node->params[3].v);
+	unsigned lock = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
+	unsigned unlock = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
+	unsigned which_ru = RF_EXTRACT_RU(node->params[3].v);
+	RF_DiskQueueDataFlags_t flags = 0;
+	RF_IoType_t iotype = (node->dagHdr->status == rf_enable) ? RF_IO_TYPE_WRITE : RF_IO_TYPE_NOP;
+	RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
+	void *b_proc = NULL;
+
+#if defined(__NetBSD__)
+	if (node->dagHdr->bp)
+		b_proc = (void *) ((RF_Buf_t) node->dagHdr->bp)->b_proc;
+#endif
+
+	/* normal processing (rollaway or forward recovery) begins here */
+	RF_ASSERT(!(lock && unlock));
+	flags |= (lock) ? RF_LOCK_DISK_QUEUE : 0;
+	flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0;
+	req = rf_CreateDiskQueueData(iotype, pda->startSector, pda->numSector,
+	    buf, parityStripeID, which_ru,
+	    (int (*) (void *, int)) node->wakeFunc,
+	    (void *) node, NULL,
+	    node->dagHdr->tracerec,
+	    (void *) (node->dagHdr->raidPtr),
+	    flags, b_proc);
+
+	if (!req) {
+		(node->wakeFunc) (node, ENOMEM);
+	} else {
+		node->dagFuncData = (void *) req;
+		rf_DiskIOEnqueue(&(dqs[pda->row][pda->col]), req, priority);
+	}
+
+	return (0);
+}
+/*****************************************************************************************
+ * the undo function for disk nodes
+ * Note: this is not a proper undo of a write node, only locks are released.
+ * old data is not restored to disk!
+ ****************************************************************************************/
+/*
+ * Undo a disk node by enqueueing a NOP request carrying the
+ * RF_UNLOCK_DISK_QUEUE flag, which releases any queue lock the
+ * original node acquired.  Note (per the banner above): data written
+ * by a write node is NOT restored.
+ */
+int
+rf_DiskUndoFunc(node)
+	RF_DagNode_t *node;
+{
+	RF_DiskQueueData_t *req;
+	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
+	RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
+
+	req = rf_CreateDiskQueueData(RF_IO_TYPE_NOP,
+	    0L, 0, NULL, 0L, 0,
+	    (int (*) (void *, int)) node->wakeFunc,
+	    (void *) node,
+	    NULL, node->dagHdr->tracerec,
+	    (void *) (node->dagHdr->raidPtr),
+	    RF_UNLOCK_DISK_QUEUE, NULL);
+	if (!req)
+		(node->wakeFunc) (node, ENOMEM);
+	else {
+		node->dagFuncData = (void *) req;
+		rf_DiskIOEnqueue(&(dqs[pda->row][pda->col]), req, RF_IO_NORMAL_PRIORITY);
+	}
+
+	return (0);
+}
+/*****************************************************************************************
+ * the execution function associated with an "unlock disk queue" node
+ ****************************************************************************************/
+/*
+ * Unlock-disk-queue node: enqueues a NOP request with
+ * RF_UNLOCK_DISK_QUEUE set.  Body is currently identical to
+ * rf_DiskUndoFunc above.
+ */
+int
+rf_DiskUnlockFuncForThreads(node)
+	RF_DagNode_t *node;
+{
+	RF_DiskQueueData_t *req;
+	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
+	RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
+
+	req = rf_CreateDiskQueueData(RF_IO_TYPE_NOP,
+	    0L, 0, NULL, 0L, 0,
+	    (int (*) (void *, int)) node->wakeFunc,
+	    (void *) node,
+	    NULL, node->dagHdr->tracerec,
+	    (void *) (node->dagHdr->raidPtr),
+	    RF_UNLOCK_DISK_QUEUE, NULL);
+	if (!req)
+		(node->wakeFunc) (node, ENOMEM);
+	else {
+		node->dagFuncData = (void *) req;
+		rf_DiskIOEnqueue(&(dqs[pda->row][pda->col]), req, RF_IO_NORMAL_PRIORITY);
+	}
+
+	return (0);
+}
+/*****************************************************************************************
+ * Callback routine for DiskRead and DiskWrite nodes. When the disk op completes,
+ * the routine is called to set the node status and inform the execution engine that
+ * the node has fired.
+ ****************************************************************************************/
+/*
+ * I/O completion callback: translate the completion status into a node
+ * status and fire FinishNode.  The rf_bwd1 case is backward recovery:
+ * the just-finished request is freed and the node is re-issued as a
+ * write (note the early return; the trailing break is unreachable).
+ * In all other cases the queue data is freed and the node finished in
+ * interrupt context.
+ */
+int
+rf_GenericWakeupFunc(node, status)
+	RF_DagNode_t *node;
+	int status;
+{
+	switch (node->status) {
+	case rf_bwd1:
+		node->status = rf_bwd2;
+		if (node->dagFuncData)
+			rf_FreeDiskQueueData((RF_DiskQueueData_t *) node->dagFuncData);
+		return (rf_DiskWriteFuncForThreads(node));
+		break;
+	case rf_fired:
+		if (status)
+			node->status = rf_bad;
+		else
+			node->status = rf_good;
+		break;
+	case rf_recover:
+		/* probably should never reach this case */
+		if (status)
+			node->status = rf_panic;
+		else
+			node->status = rf_undone;
+		break;
+	default:
+		printf("rf_GenericWakeupFunc:");
+		printf("node->status is %d,", node->status);
+		printf("status is %d \n", status);
+		RF_PANIC();
+		break;
+	}
+	if (node->dagFuncData)
+		rf_FreeDiskQueueData((RF_DiskQueueData_t *) node->dagFuncData);
+	return (rf_FinishNode(node, RF_INTR_CONTEXT));
+}
+
+
+/*****************************************************************************************
+ * there are three distinct types of xor nodes
+ * A "regular xor" is used in the fault-free case where the access spans a complete
+ * stripe unit. It assumes that the result buffer is one full stripe unit in size,
+ * and uses the stripe-unit-offset values that it computes from the PDAs to determine
+ * where within the stripe unit to XOR each argument buffer.
+ *
+ * A "simple xor" is used in the fault-free case where the access touches only a portion
+ * of one (or two, in some cases) stripe unit(s). It assumes that all the argument
+ * buffers are of the same size and have the same stripe unit offset.
+ *
+ * A "recovery xor" is used in the degraded-mode case. It's similar to the regular
+ * xor function except that it takes the failed PDA as an additional parameter, and
+ * uses it to determine what portions of the argument buffers need to be xor'd into
+ * the result buffer, and where in the result buffer they should go.
+ ****************************************************************************************/
+
+/* xor the params together and store the result in the result field.
+ * assume the result field points to a buffer that is the size of one SU,
+ * and use the pda params to determine where within the buffer to XOR
+ * the input buffers.
+ */
+/*
+ * Regular xor node: params come in (pda, buf) pairs with the raidPtr
+ * last; each source buffer is xor'd into the full-SU result buffer at
+ * the offset given by its pda.  Skips pairs whose buffer IS the result
+ * buffer.  Time spent is charged to tracerec->xor_us.
+ */
+int
+rf_RegularXorFunc(node)
+	RF_DagNode_t *node;
+{
+	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
+	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+	RF_Etimer_t timer;
+	int i, retcode;
+
+	retcode = 0;
+	if (node->dagHdr->status == rf_enable) {
+		/* don't do the XOR if the input is the same as the output */
+		RF_ETIMER_START(timer);
+		for (i = 0; i < node->numParams - 1; i += 2)
+			if (node->params[i + 1].p != node->results[0]) {
+				retcode = rf_XorIntoBuffer(raidPtr, (RF_PhysDiskAddr_t *) node->params[i].p,
+				    (char *) node->params[i + 1].p, (char *) node->results[0], node->dagHdr->bp);
+			}
+		RF_ETIMER_STOP(timer);
+		RF_ETIMER_EVAL(timer);
+		tracerec->xor_us += RF_ETIMER_VAL_US(timer);
+	}
+	return (rf_GenericWakeupFunc(node, retcode));	/* call wake func
+							 * explicitly since no
+							 * I/O in this node */
+}
+/* xor the inputs into the result buffer, ignoring placement issues */
+/*
+ * Simple xor node: like the regular xor but all buffers share the same
+ * stripe-unit offset, so each source is xor'd straight onto the result
+ * buffer with rf_bxor, sized by its pda's sector count.
+ */
+int
+rf_SimpleXorFunc(node)
+	RF_DagNode_t *node;
+{
+	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
+	int i, retcode = 0;
+	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+	RF_Etimer_t timer;
+
+	if (node->dagHdr->status == rf_enable) {
+		RF_ETIMER_START(timer);
+		/* don't do the XOR if the input is the same as the output */
+		for (i = 0; i < node->numParams - 1; i += 2)
+			if (node->params[i + 1].p != node->results[0]) {
+				retcode = rf_bxor((char *)node->params[i + 1].p,
+						  (char *)node->results[0],
+						  rf_RaidAddressToByte(raidPtr,
+					((RF_PhysDiskAddr_t *)node->params[i].p)->
+					numSector), (RF_Buf_t)node->dagHdr->bp);
+			}
+		RF_ETIMER_STOP(timer);
+		RF_ETIMER_EVAL(timer);
+		tracerec->xor_us += RF_ETIMER_VAL_US(timer);
+	}
+	return (rf_GenericWakeupFunc(node, retcode));	/* call wake func
+							 * explicitly since no
+							 * I/O in this node */
+}
+/* this xor is used by the degraded-mode dag functions to recover lost data.
+ * the second-to-last parameter is the PDA for the failed portion of the access.
+ * the code here looks at this PDA and assumes that the xor target buffer is
+ * equal in size to the number of sectors in the failed PDA. It then uses
+ * the other PDAs in the parameter list to determine where within the target
+ * buffer the corresponding data should be xored.
+ */
+/*
+ * Recovery xor node (degraded mode): the failed PDA (second-to-last
+ * param) fixes the reference offset; each surviving buffer is xor'd
+ * into the result at (its SU offset - failed SU offset), rebuilding
+ * the lost data in place.
+ */
+int
+rf_RecoveryXorFunc(node)
+	RF_DagNode_t *node;
+{
+	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
+	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
+	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
+	int i, retcode = 0;
+	RF_PhysDiskAddr_t *pda;
+	int suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
+	char *srcbuf, *destbuf;
+	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+	RF_Etimer_t timer;
+
+	if (node->dagHdr->status == rf_enable) {
+		RF_ETIMER_START(timer);
+		for (i = 0; i < node->numParams - 2; i += 2)
+			if (node->params[i + 1].p != node->results[0]) {
+				pda = (RF_PhysDiskAddr_t *) node->params[i].p;
+				srcbuf = (char *) node->params[i + 1].p;
+				suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
+				destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
+				retcode = rf_bxor(srcbuf, destbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), node->dagHdr->bp);
+			}
+		RF_ETIMER_STOP(timer);
+		RF_ETIMER_EVAL(timer);
+		tracerec->xor_us += RF_ETIMER_VAL_US(timer);
+	}
+	return (rf_GenericWakeupFunc(node, retcode));
+}
+/*****************************************************************************************
+ * The next three functions are utilities used by the above xor-execution functions.
+ ****************************************************************************************/
+
+
+/*
+ * this is just a glorified buffer xor. targbuf points to a buffer that is one full stripe unit
+ * in size. srcbuf points to a buffer that may be less than 1 SU, but never more. When the
+ * access described by pda is one SU in size (which by implication means it's SU-aligned),
+ * all that happens is (targbuf) <- (srcbuf ^ targbuf). When the access is less than one
+ * SU in size the XOR occurs on only the portion of targbuf identified in the pda.
+ */
+
+/*
+ * Xor srcbuf into the SU-sized targbuf at the byte offset implied by
+ * pda's position within its stripe unit.  srcbuf must describe at most
+ * one SU's worth of sectors (asserted).
+ */
+int
+rf_XorIntoBuffer(raidPtr, pda, srcbuf, targbuf, bp)
+	RF_Raid_t *raidPtr;
+	RF_PhysDiskAddr_t *pda;
+	char *srcbuf;
+	char *targbuf;
+	void *bp;
+{
+	char *targptr;
+	int sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
+	int SUOffset = pda->startSector % sectPerSU;
+	int length, retcode = 0;
+
+	RF_ASSERT(pda->numSector <= sectPerSU);
+
+	targptr = targbuf + rf_RaidAddressToByte(raidPtr, SUOffset);
+	length = rf_RaidAddressToByte(raidPtr, pda->numSector);
+	retcode = rf_bxor(srcbuf, targptr, length, bp);
+	return (retcode);
+}
+/* it really should be the case that the buffer pointers (returned by malloc)
+ * are aligned to the natural word size of the machine, so this is the only
+ * case we optimize for. The length should always be a multiple of the sector
+ * size, so there should be no problem with leftover bytes at the end.
+ */
+/*
+ * Byte-buffer xor front end: requires both pointers and the length to
+ * be long-aligned (asserts otherwise) and dispatches to the longword
+ * implementation.
+ */
+int
+rf_bxor(src, dest, len, bp)
+	char *src;
+	char *dest;
+	int len;
+	void *bp;
+{
+	unsigned mask = sizeof(long) - 1, retcode = 0;
+
+	if (!(((unsigned long) src) & mask) && !(((unsigned long) dest) & mask) && !(len & mask)) {
+		retcode = rf_longword_bxor((unsigned long *) src, (unsigned long *) dest, len >> RF_LONGSHIFT, bp);
+	} else {
+		RF_ASSERT(0);
+	}
+	return (retcode);
+}
+/* map a user buffer into kernel space, if necessary */
+#define REMAP_VA(_bp,x,y) (y) = (x)
+
+/* When XORing in kernel mode, we need to map each user page to kernel space before we can access it.
+ * We don't want to assume anything about which input buffers are in kernel/user
+ * space, nor about their alignment, so in each loop we compute the maximum number
+ * of bytes that we can xor without crossing any page boundaries, and do only this many
+ * bytes before the next remap.
+ */
+/*
+ * Longword xor: dest ^= src for len longwords, unrolled 4-wide within
+ * page-sized chunks.  REMAP_VA is an identity mapping in this port, so
+ * the pg_src/pg_dest pointers track src/dest directly; the page-
+ * boundary re-maps are kept for ports where user pages must be mapped
+ * into kernel space (see the comment above REMAP_VA).
+ */
+int
+rf_longword_bxor(src, dest, len, bp)
+	unsigned long *src;
+	unsigned long *dest;
+	int len;		/* longwords */
+	void *bp;
+{
+	unsigned long *end = src + len;
+	unsigned long d0, d1, d2, d3, s0, s1, s2, s3;	/* temps */
+	unsigned long *pg_src, *pg_dest;	/* per-page source/dest
+						 * pointers */
+	int longs_this_time;/* # longwords to xor in the current iteration */
+
+	REMAP_VA(bp, src, pg_src);
+	REMAP_VA(bp, dest, pg_dest);
+	if (!pg_src || !pg_dest)
+		return (EFAULT);
+
+	while (len >= 4) {
+		longs_this_time = RF_MIN(len, RF_MIN(RF_BLIP(pg_src), RF_BLIP(pg_dest)) >> RF_LONGSHIFT);	/* note len in longwords */
+		src += longs_this_time;
+		dest += longs_this_time;
+		len -= longs_this_time;
+		while (longs_this_time >= 4) {
+			d0 = pg_dest[0];
+			d1 = pg_dest[1];
+			d2 = pg_dest[2];
+			d3 = pg_dest[3];
+			s0 = pg_src[0];
+			s1 = pg_src[1];
+			s2 = pg_src[2];
+			s3 = pg_src[3];
+			pg_dest[0] = d0 ^ s0;
+			pg_dest[1] = d1 ^ s1;
+			pg_dest[2] = d2 ^ s2;
+			pg_dest[3] = d3 ^ s3;
+			pg_src += 4;
+			pg_dest += 4;
+			longs_this_time -= 4;
+		}
+		while (longs_this_time > 0) {	/* cannot cross any page
+						 * boundaries here */
+			*pg_dest++ ^= *pg_src++;
+			longs_this_time--;
+		}
+
+		/* either we're done, or we've reached a page boundary on one
+		 * (or possibly both) of the pointers */
+		if (len) {
+			if (RF_PAGE_ALIGNED(src))
+				REMAP_VA(bp, src, pg_src);
+			if (RF_PAGE_ALIGNED(dest))
+				REMAP_VA(bp, dest, pg_dest);
+			if (!pg_src || !pg_dest)
+				return (EFAULT);
+		}
+	}
+	/* tail: fewer than 4 longwords remain */
+	while (src < end) {
+		*pg_dest++ ^= *pg_src++;
+		src++;
+		dest++;
+		len--;
+		if (RF_PAGE_ALIGNED(src))
+			REMAP_VA(bp, src, pg_src);
+		if (RF_PAGE_ALIGNED(dest))
+			REMAP_VA(bp, dest, pg_dest);
+	}
+	RF_ASSERT(len == 0);
+	return (0);
+}
+
+
+/*
+ dst = a ^ b ^ c;
+ a may equal dst
+ see comment above longword_bxor
+*/
+/*
+ * Three-way longword xor: dst = a ^ b ^ c, with dst possibly aliasing
+ * a (dst_is_a).  Structure mirrors rf_longword_bxor: align dst to a
+ * 32-byte cache line, run a 4-wide unrolled main loop within page-
+ * sized chunks, then a scalar tail; page-boundary crossings re-map the
+ * per-page pointers via REMAP_VA (identity in this port).
+ *
+ * Fix vs. the imported NetBSD revision: in the alignment loop the
+ * remaps for b and c erroneously passed `a' (REMAP_VA(bp, a, pg_b) /
+ * REMAP_VA(bp, a, pg_c)), which would point pg_b/pg_c at a's data when
+ * b or c crossed a page boundary there, corrupting the xor result.
+ */
+int
+rf_longword_bxor3(dst, a, b, c, len, bp)
+	unsigned long *dst;
+	unsigned long *a;
+	unsigned long *b;
+	unsigned long *c;
+	int len;		/* length in longwords */
+	void *bp;
+{
+	unsigned long a0, a1, a2, a3, b0, b1, b2, b3;
+	unsigned long *pg_a, *pg_b, *pg_c, *pg_dst;	/* per-page source/dest
+							 * pointers */
+	int longs_this_time;/* # longs to xor in the current iteration */
+	char dst_is_a = 0;
+
+	REMAP_VA(bp, a, pg_a);
+	REMAP_VA(bp, b, pg_b);
+	REMAP_VA(bp, c, pg_c);
+	if (a == dst) {
+		pg_dst = pg_a;
+		dst_is_a = 1;
+	} else {
+		REMAP_VA(bp, dst, pg_dst);
+	}
+
+	/* align dest to cache line. Can't cross a pg boundary on dst here. */
+	while ((((unsigned long) pg_dst) & 0x1f)) {
+		*pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
+		dst++;
+		a++;
+		b++;
+		c++;
+		if (RF_PAGE_ALIGNED(a)) {
+			REMAP_VA(bp, a, pg_a);
+			if (!pg_a)
+				return (EFAULT);
+		}
+		if (RF_PAGE_ALIGNED(b)) {
+			REMAP_VA(bp, b, pg_b);
+			if (!pg_b)
+				return (EFAULT);
+		}
+		if (RF_PAGE_ALIGNED(c)) {
+			REMAP_VA(bp, c, pg_c);
+			if (!pg_c)
+				return (EFAULT);
+		}
+		len--;
+	}
+
+	while (len > 4) {
+		longs_this_time = RF_MIN(len, RF_MIN(RF_BLIP(a), RF_MIN(RF_BLIP(b), RF_MIN(RF_BLIP(c), RF_BLIP(dst)))) >> RF_LONGSHIFT);
+		a += longs_this_time;
+		b += longs_this_time;
+		c += longs_this_time;
+		dst += longs_this_time;
+		len -= longs_this_time;
+		/* 4-wide unrolled body, interleaved for dual issue */
+		while (longs_this_time >= 4) {
+			a0 = pg_a[0];
+			longs_this_time -= 4;
+
+			a1 = pg_a[1];
+			a2 = pg_a[2];
+
+			a3 = pg_a[3];
+			pg_a += 4;
+
+			b0 = pg_b[0];
+			b1 = pg_b[1];
+
+			b2 = pg_b[2];
+			b3 = pg_b[3];
+			/* start dual issue */
+			a0 ^= b0;
+			b0 = pg_c[0];
+
+			pg_b += 4;
+			a1 ^= b1;
+
+			a2 ^= b2;
+			a3 ^= b3;
+
+			b1 = pg_c[1];
+			a0 ^= b0;
+
+			b2 = pg_c[2];
+			a1 ^= b1;
+
+			b3 = pg_c[3];
+			a2 ^= b2;
+
+			pg_dst[0] = a0;
+			a3 ^= b3;
+			pg_dst[1] = a1;
+			pg_c += 4;
+			pg_dst[2] = a2;
+			pg_dst[3] = a3;
+			pg_dst += 4;
+		}
+		while (longs_this_time > 0) {	/* cannot cross any page
+						 * boundaries here */
+			*pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
+			longs_this_time--;
+		}
+
+		/* remap whichever pointers just reached a page boundary */
+		if (len) {
+			if (RF_PAGE_ALIGNED(a)) {
+				REMAP_VA(bp, a, pg_a);
+				if (!pg_a)
+					return (EFAULT);
+				if (dst_is_a)
+					pg_dst = pg_a;
+			}
+			if (RF_PAGE_ALIGNED(b)) {
+				REMAP_VA(bp, b, pg_b);
+				if (!pg_b)
+					return (EFAULT);
+			}
+			if (RF_PAGE_ALIGNED(c)) {
+				REMAP_VA(bp, c, pg_c);
+				if (!pg_c)
+					return (EFAULT);
+			}
+			if (!dst_is_a)
+				if (RF_PAGE_ALIGNED(dst)) {
+					REMAP_VA(bp, dst, pg_dst);
+					if (!pg_dst)
+						return (EFAULT);
+				}
+		}
+	}
+	/* scalar tail */
+	while (len) {
+		*pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
+		dst++;
+		a++;
+		b++;
+		c++;
+		if (RF_PAGE_ALIGNED(a)) {
+			REMAP_VA(bp, a, pg_a);
+			if (!pg_a)
+				return (EFAULT);
+			if (dst_is_a)
+				pg_dst = pg_a;
+		}
+		if (RF_PAGE_ALIGNED(b)) {
+			REMAP_VA(bp, b, pg_b);
+			if (!pg_b)
+				return (EFAULT);
+		}
+		if (RF_PAGE_ALIGNED(c)) {
+			REMAP_VA(bp, c, pg_c);
+			if (!pg_c)
+				return (EFAULT);
+		}
+		if (!dst_is_a)
+			if (RF_PAGE_ALIGNED(dst)) {
+				REMAP_VA(bp, dst, pg_dst);
+				if (!pg_dst)
+					return (EFAULT);
+			}
+		len--;
+	}
+	return (0);
+}
+
+/*
+ * Byte-level front end for the three-way xor: asserts 8-byte alignment
+ * of all pointers and the length, then runs the longword version.
+ */
+int
+rf_bxor3(dst, a, b, c, len, bp)
+	unsigned char *dst;
+	unsigned char *a;
+	unsigned char *b;
+	unsigned char *c;
+	unsigned long len;
+	void *bp;
+{
+	RF_ASSERT(((RF_UL(dst) | RF_UL(a) | RF_UL(b) | RF_UL(c) | len) & 0x7) == 0);
+
+	return (rf_longword_bxor3((unsigned long *) dst, (unsigned long *) a,
+	    (unsigned long *) b, (unsigned long *) c, len >> RF_LONGSHIFT, bp));
+}
diff --git a/sys/dev/raidframe/rf_dagfuncs.h b/sys/dev/raidframe/rf_dagfuncs.h
new file mode 100644
index 0000000..da7e8b2
--- /dev/null
+++ b/sys/dev/raidframe/rf_dagfuncs.h
@@ -0,0 +1,90 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_dagfuncs.h,v 1.4 2000/03/30 13:39:07 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, William V. Courtright II, Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************************************************
+ *
+ * dagfuncs.h -- header file for DAG node execution routines
+ *
+ ****************************************************************************************/
+
+#ifndef _RF__RF_DAGFUNCS_H_
+#define _RF__RF_DAGFUNCS_H_
+
+int rf_ConfigureDAGFuncs(RF_ShutdownList_t ** listp);
+int rf_TerminateFunc(RF_DagNode_t * node);
+int rf_TerminateUndoFunc(RF_DagNode_t * node);
+int rf_DiskReadMirrorIdleFunc(RF_DagNode_t * node);
+int rf_DiskReadMirrorPartitionFunc(RF_DagNode_t * node);
+int rf_DiskReadMirrorUndoFunc(RF_DagNode_t * node);
+int rf_ParityLogUpdateFunc(RF_DagNode_t * node);
+int rf_ParityLogOverwriteFunc(RF_DagNode_t * node);
+int rf_ParityLogUpdateUndoFunc(RF_DagNode_t * node);
+int rf_ParityLogOverwriteUndoFunc(RF_DagNode_t * node);
+int rf_NullNodeFunc(RF_DagNode_t * node);
+int rf_NullNodeUndoFunc(RF_DagNode_t * node);
+int rf_DiskReadFuncForThreads(RF_DagNode_t * node);
+int rf_DiskWriteFuncForThreads(RF_DagNode_t * node);
+int rf_DiskUndoFunc(RF_DagNode_t * node);
+int rf_DiskUnlockFuncForThreads(RF_DagNode_t * node);
+int rf_GenericWakeupFunc(RF_DagNode_t * node, int status);
+int rf_RegularXorFunc(RF_DagNode_t * node);
+int rf_SimpleXorFunc(RF_DagNode_t * node);
+int rf_RecoveryXorFunc(RF_DagNode_t * node);
+int
+rf_XorIntoBuffer(RF_Raid_t * raidPtr, RF_PhysDiskAddr_t * pda, char *srcbuf,
+ char *targbuf, void *bp);
+int rf_bxor(char *src, char *dest, int len, void *bp);
+int
+rf_longword_bxor(unsigned long *src, unsigned long *dest, int len, void *bp);
+int
+rf_longword_bxor3(unsigned long *dest, unsigned long *a, unsigned long *b,
+ unsigned long *c, int len, void *bp);
+int
+rf_bxor3(unsigned char *dst, unsigned char *a, unsigned char *b,
+ unsigned char *c, unsigned long len, void *bp);
+
+/* function ptrs defined in ConfigureDAGFuncs() */
+extern int (*rf_DiskReadFunc) (RF_DagNode_t *);
+extern int (*rf_DiskWriteFunc) (RF_DagNode_t *);
+extern int (*rf_DiskReadUndoFunc) (RF_DagNode_t *);
+extern int (*rf_DiskWriteUndoFunc) (RF_DagNode_t *);
+extern int (*rf_DiskUnlockFunc) (RF_DagNode_t *);
+extern int (*rf_DiskUnlockUndoFunc) (RF_DagNode_t *);
+extern int (*rf_SimpleXorUndoFunc) (RF_DagNode_t *);
+extern int (*rf_RegularXorUndoFunc) (RF_DagNode_t *);
+extern int (*rf_RecoveryXorUndoFunc) (RF_DagNode_t *);
+
+/* macros for manipulating the param[3] in a read or write node */
+#define RF_CREATE_PARAM3(pri, lk, unlk, wru) (((RF_uint64)(((wru&0xFFFFFF)<<8)|((lk)?0x10:0)|((unlk)?0x20:0)|((pri)&0xF)) ))
+#define RF_EXTRACT_PRIORITY(_x_) ((((unsigned) ((unsigned long)(_x_))) >> 0) & 0x0F)
+#define RF_EXTRACT_LOCK_FLAG(_x_) ((((unsigned) ((unsigned long)(_x_))) >> 4) & 0x1)
+#define RF_EXTRACT_UNLOCK_FLAG(_x_) ((((unsigned) ((unsigned long)(_x_))) >> 5) & 0x1)
+#define RF_EXTRACT_RU(_x_) ((((unsigned) ((unsigned long)(_x_))) >> 8) & 0xFFFFFF)
+
+#endif /* !_RF__RF_DAGFUNCS_H_ */
diff --git a/sys/dev/raidframe/rf_dagutils.c b/sys/dev/raidframe/rf_dagutils.c
new file mode 100644
index 0000000..dd851a4
--- /dev/null
+++ b/sys/dev/raidframe/rf_dagutils.c
@@ -0,0 +1,1297 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_dagutils.c,v 1.6 1999/12/09 02:26:09 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Authors: Mark Holland, William V. Courtright II, Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/******************************************************************************
+ *
+ * rf_dagutils.c -- utility routines for manipulating dags
+ *
+ *****************************************************************************/
+
+#include <dev/raidframe/rf_archs.h>
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_threadstuff.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_dagutils.h>
+#include <dev/raidframe/rf_dagfuncs.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_freelist.h>
+#include <dev/raidframe/rf_map.h>
+#include <dev/raidframe/rf_shutdown.h>
+
+#define SNUM_DIFF(_a_,_b_) (((_a_)>(_b_))?((_a_)-(_b_)):((_b_)-(_a_)))
+
+/*
+ * Function/name pairs installed into XOR nodes.  rf_xorFuncs holds the
+ * normal routines; rf_xorRecoveryFuncs deliberately uses the recovery
+ * XOR routine in both the "regular" and "simple" slots.
+ */
+RF_RedFuncs_t rf_xorFuncs = {
+	rf_RegularXorFunc, "Reg Xr",
+rf_SimpleXorFunc, "Simple Xr"};
+
+RF_RedFuncs_t rf_xorRecoveryFuncs = {
+	rf_RecoveryXorFunc, "Recovery Xr",
+rf_RecoveryXorFunc, "Recovery Xr"};
+
+static void rf_RecurPrintDAG(RF_DagNode_t *, int, int);
+static void rf_PrintDAG(RF_DagHeader_t *);
+static int
+rf_ValidateBranch(RF_DagNode_t *, int *, int *,
+ RF_DagNode_t **, int);
+static void rf_ValidateBranchVisitedBits(RF_DagNode_t *, int, int);
+static void rf_ValidateVisitedBits(RF_DagHeader_t *);
+
+/******************************************************************************
+ *
+ * InitNode - initialize a dag node
+ *
+ * the size of the propList array is always the same as that of the
+ * successors array.
+ *
+ *****************************************************************************/
+void
+rf_InitNode(
+	RF_DagNode_t * node,
+	RF_NodeStatus_t initstatus,
+	int commit,
+	int (*doFunc) (RF_DagNode_t * node),
+	int (*undoFunc) (RF_DagNode_t * node),
+	int (*wakeFunc) (RF_DagNode_t * node, int status),
+	int nSucc,
+	int nAnte,
+	int nParam,
+	int nResult,
+	RF_DagHeader_t * hdr,
+	char *name,
+	RF_AllocListElem_t * alist)
+{
+	void **ptrs;
+	int nptrs;
+
+	if (nAnte > RF_MAX_ANTECEDENTS)
+		RF_PANIC();
+	node->status = initstatus;
+	node->commitNode = commit;
+	node->doFunc = doFunc;
+	node->undoFunc = undoFunc;
+	node->wakeFunc = wakeFunc;
+	node->numParams = nParam;
+	node->numResults = nResult;
+	node->numAntecedents = nAnte;
+	node->numAntDone = 0;
+	node->next = NULL;
+	node->numSuccedents = nSucc;
+	node->name = name;
+	node->dagHdr = hdr;
+	node->visited = 0;
+
+	/* allocate all the pointers with one call to malloc */
+	/* layout of the single allocation: [succedents][antecedents][results]
+	 * [propList]; propList is always the same length as succedents, which
+	 * is why nSucc appears twice in the sum */
+	nptrs = nSucc + nAnte + nResult + nSucc;
+
+	if (nptrs <= RF_DAG_PTRCACHESIZE) {
+		/*
+		 * The dag_ptrs field of the node is basically some scribble
+		 * space to be used here. We could get rid of it, and always
+		 * allocate the range of pointers, but that's expensive. So,
+		 * we pick a "common case" size for the pointer cache. Hopefully,
+		 * we'll find that:
+		 * (1) Generally, nptrs doesn't exceed RF_DAG_PTRCACHESIZE by
+		 *     only a little bit (least efficient case)
+		 * (2) Generally, ntprs isn't a lot less than RF_DAG_PTRCACHESIZE
+		 *     (wasted memory)
+		 */
+		ptrs = (void **) node->dag_ptrs;
+	} else {
+		RF_CallocAndAdd(ptrs, nptrs, sizeof(void *), (void **), alist);
+	}
+	/* carve the single pointer block into the four arrays */
+	node->succedents = (nSucc) ? (RF_DagNode_t **) ptrs : NULL;
+	node->antecedents = (nAnte) ? (RF_DagNode_t **) (ptrs + nSucc) : NULL;
+	node->results = (nResult) ? (void **) (ptrs + nSucc + nAnte) : NULL;
+	node->propList = (nSucc) ? (RF_PropHeader_t **) (ptrs + nSucc + nAnte + nResult) : NULL;
+
+	if (nParam) {
+		/* same trick: small parameter lists live inside the node */
+		if (nParam <= RF_DAG_PARAMCACHESIZE) {
+			node->params = (RF_DagParam_t *) node->dag_params;
+		} else {
+			RF_CallocAndAdd(node->params, nParam, sizeof(RF_DagParam_t), (RF_DagParam_t *), alist);
+		}
+	} else {
+		node->params = NULL;
+	}
+}
+
+
+
+/******************************************************************************
+ *
+ * allocation and deallocation routines
+ *
+ *****************************************************************************/
+
+/*
+ * Free a list of DAG headers: for each DAG, release its memory chunks
+ * (the fixed memChunk array and, if present, the overflow xtraMemChunk
+ * array), its allocation list, its access-stripe-map list, and finally
+ * the header itself.
+ */
+void
+rf_FreeDAG(dag_h)
+	RF_DagHeader_t *dag_h;
+{
+	RF_AccessStripeMapHeader_t *asmap, *t_asmap;
+	RF_DagHeader_t *nextDag;
+	int i;
+
+	while (dag_h) {
+		nextDag = dag_h->next;
+		/*
+		 * Test the index bound BEFORE dereferencing memChunk[i]:
+		 * the old operand order read memChunk[RF_MAXCHUNKS] (one
+		 * past the end) whenever the chunk array was full.
+		 */
+		for (i = 0; i < RF_MAXCHUNKS && dag_h->memChunk[i]; i++) {
+			/* release mem chunks */
+			rf_ReleaseMemChunk(dag_h->memChunk[i]);
+			dag_h->memChunk[i] = NULL;
+		}
+
+		RF_ASSERT(i == dag_h->chunkIndex);
+		if (dag_h->xtraChunkCnt > 0) {
+			/* free xtraMemChunks; bounds check first here too */
+			for (i = 0; i < dag_h->xtraChunkIndex && dag_h->xtraMemChunk[i]; i++) {
+				rf_ReleaseMemChunk(dag_h->xtraMemChunk[i]);
+				dag_h->xtraMemChunk[i] = NULL;
+			}
+			RF_ASSERT(i == dag_h->xtraChunkIndex);
+			/* free ptrs to xtraMemChunks */
+			RF_Free(dag_h->xtraMemChunk, dag_h->xtraChunkCnt * sizeof(RF_ChunkDesc_t *));
+		}
+		rf_FreeAllocList(dag_h->allocList);
+		for (asmap = dag_h->asmList; asmap;) {
+			t_asmap = asmap;
+			asmap = asmap->next;
+			rf_FreeAccessStripeMap(t_asmap);
+		}
+		rf_FreeDAGHeader(dag_h);
+		dag_h = nextDag;
+	}
+}
+
+/*
+ * Allocate (on the DAG's allocation list) and fill in one propagation
+ * list entry linking result "resultNum" to parameter "paramNum"; the
+ * new entry is chained in front of "next".
+ */
+RF_PropHeader_t *
+rf_MakePropListEntry(
+	RF_DagHeader_t * dag_h,
+	int resultNum,
+	int paramNum,
+	RF_PropHeader_t * next,
+	RF_AllocListElem_t * allocList)
+{
+	RF_PropHeader_t *ph;
+
+	RF_CallocAndAdd(ph, 1, sizeof(RF_PropHeader_t),
+	    (RF_PropHeader_t *), allocList);
+	ph->next = next;
+	ph->paramNum = paramNum;
+	ph->resultNum = resultNum;
+	return (ph);
+}
+
+/* Freelist of preallocated DAG headers, sized by the tunables below. */
+static RF_FreeList_t *rf_dagh_freelist;
+
+#define RF_MAX_FREE_DAGH 128	/* cap on headers kept on the freelist */
+#define RF_DAGH_INC 16		/* growth increment */
+#define RF_DAGH_INITIAL 32	/* headers primed at configure time */
+
+static void rf_ShutdownDAGs(void *);
+/* Shutdown hook: tear down the DAG-header freelist ("ignored" unused). */
+static void
+rf_ShutdownDAGs(ignored)
+	void *ignored;
+{
+	RF_FREELIST_DESTROY(rf_dagh_freelist, next, (RF_DagHeader_t *));
+}
+
+/*
+ * Configure-time setup: create and prime the DAG-header freelist and
+ * register rf_ShutdownDAGs on the shutdown list.  Returns 0 on success,
+ * ENOMEM if the freelist cannot be created, or the rf_ShutdownCreate
+ * error code (after tearing the freelist back down).
+ */
+int
+rf_ConfigureDAGs(listp)
+	RF_ShutdownList_t **listp;
+{
+	int rc;
+
+	RF_FREELIST_CREATE(rf_dagh_freelist, RF_MAX_FREE_DAGH,
+	    RF_DAGH_INC, sizeof(RF_DagHeader_t));
+	if (rf_dagh_freelist == NULL)
+		return (ENOMEM);
+	rc = rf_ShutdownCreate(listp, rf_ShutdownDAGs, NULL);
+	if (rc) {
+		RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
+		    __FILE__, __LINE__, rc);
+		/* undo the freelist creation before failing */
+		rf_ShutdownDAGs(NULL);
+		return (rc);
+	}
+	RF_FREELIST_PRIME(rf_dagh_freelist, RF_DAGH_INITIAL, next,
+	    (RF_DagHeader_t *));
+	return (0);
+}
+
+/*
+ * Grab a DAG header from the freelist and zero it.  Returns NULL when
+ * the freelist cannot supply one.
+ */
+RF_DagHeader_t *
+rf_AllocDAGHeader()
+{
+	RF_DagHeader_t *dh;
+
+	RF_FREELIST_GET(rf_dagh_freelist, dh, next, (RF_DagHeader_t *));
+	if (dh == NULL)
+		return (NULL);
+	bzero((char *) dh, sizeof(RF_DagHeader_t));
+	return (dh);
+}
+
+/* Return a DAG header to the freelist. */
+void
+rf_FreeDAGHeader(RF_DagHeader_t * dh)
+{
+	RF_FREELIST_FREE(rf_dagh_freelist, dh, next);
+}
+/* allocates a buffer big enough to hold the data described by pda;
+ * the buffer is tracked on allocList (dag_h is unused here). */
+void *
+rf_AllocBuffer(
+	RF_Raid_t * raidPtr,
+	RF_DagHeader_t * dag_h,
+	RF_PhysDiskAddr_t * pda,
+	RF_AllocListElem_t * allocList)
+{
+	char *p;
+
+	/* size in bytes = sectors << log2(bytes per sector) */
+	RF_MallocAndAdd(p, pda->numSector << raidPtr->logBytesPerSector,
+	    (char *), allocList);
+	return ((void *) p);
+}
+/******************************************************************************
+ *
+ * debug routines
+ *
+ *****************************************************************************/
+
+/*
+ * Map a node's status code to a short printable string; any
+ * unrecognized status yields "?".
+ */
+char *
+rf_NodeStatusString(RF_DagNode_t * node)
+{
+	char *s;
+
+	switch (node->status) {
+	case rf_wait:
+		s = "wait";
+		break;
+	case rf_fired:
+		s = "fired";
+		break;
+	case rf_good:
+		s = "good";
+		break;
+	case rf_bad:
+		s = "bad";
+		break;
+	default:
+		s = "?";
+		break;
+	}
+	return (s);
+}
+
+/*
+ * Print a one-line, node-type-specific summary of a node's parameters:
+ * disk read/write nodes show the physical address, buffer and lock
+ * flags; unlock nodes the lock flags; XOR nodes each (pda, buffer)
+ * parameter pair; terminator/noop nodes print nothing; anything else
+ * prints "?".
+ */
+void
+rf_PrintNodeInfoString(RF_DagNode_t * node)
+{
+	RF_PhysDiskAddr_t *pda;
+	int (*df) (RF_DagNode_t *) = node->doFunc;
+	int i, lk, unlk;
+	void *bufPtr;
+
+	if ((df == rf_DiskReadFunc) || (df == rf_DiskWriteFunc)
+	    || (df == rf_DiskReadMirrorIdleFunc)
+	    || (df == rf_DiskReadMirrorPartitionFunc)) {
+		pda = (RF_PhysDiskAddr_t *) node->params[0].p;
+		bufPtr = (void *) node->params[1].p;
+		/* params[3] packs priority/lock/unlock/RU (rf_dagfuncs.h) */
+		lk = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
+		unlk = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
+		RF_ASSERT(!(lk && unlk));
+		printf("r %d c %d offs %ld nsect %d buf 0x%lx %s\n", pda->row, pda->col,
+		    (long) pda->startSector, (int) pda->numSector, (long) bufPtr,
+		    (lk) ? "LOCK" : ((unlk) ? "UNLK" : " "));
+		return;
+	}
+	if (df == rf_DiskUnlockFunc) {
+		pda = (RF_PhysDiskAddr_t *) node->params[0].p;
+		lk = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
+		unlk = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
+		RF_ASSERT(!(lk && unlk));
+		printf("r %d c %d %s\n", pda->row, pda->col,
+		    (lk) ? "LOCK" : ((unlk) ? "UNLK" : "nop"));
+		return;
+	}
+	if ((df == rf_SimpleXorFunc) || (df == rf_RegularXorFunc)
+	    || (df == rf_RecoveryXorFunc)) {
+		printf("result buf 0x%lx\n", (long) node->results[0]);
+		/* XOR parameters come in (pda, buffer) pairs */
+		for (i = 0; i < node->numParams - 1; i += 2) {
+			pda = (RF_PhysDiskAddr_t *) node->params[i].p;
+			bufPtr = (RF_PhysDiskAddr_t *) node->params[i + 1].p;
+			printf(" buf 0x%lx r%d c%d offs %ld nsect %d\n",
+			    (long) bufPtr, pda->row, pda->col,
+			    (long) pda->startSector, (int) pda->numSector);
+		}
+		return;
+	}
+#if RF_INCLUDE_PARITYLOGGING > 0
+	if (df == rf_ParityLogOverwriteFunc || df == rf_ParityLogUpdateFunc) {
+		for (i = 0; i < node->numParams - 1; i += 2) {
+			pda = (RF_PhysDiskAddr_t *) node->params[i].p;
+			bufPtr = (RF_PhysDiskAddr_t *) node->params[i + 1].p;
+			printf(" r%d c%d offs %ld nsect %d buf 0x%lx\n",
+			    pda->row, pda->col, (long) pda->startSector,
+			    (int) pda->numSector, (long) bufPtr);
+		}
+		return;
+	}
+#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
+
+	if ((df == rf_TerminateFunc) || (df == rf_NullNodeFunc)) {
+		printf("\n");
+		return;
+	}
+	printf("?\n");
+}
+
+/*
+ * Depth-first printout of a branch.  Flips each node's visited bit (to
+ * the opposite of "unvisited"), prints the node's bookkeeping counters
+ * and its succedent/antecedent lists, then recurses into any children
+ * still carrying the old polarity.
+ */
+static void
+rf_RecurPrintDAG(node, depth, unvisited)
+	RF_DagNode_t *node;
+	int depth;
+	int unvisited;
+{
+	char *anttype;
+	int i;
+
+	node->visited = (unvisited) ? 0 : 1;
+	printf("(%d) %d C%d %s: %s,s%d %d/%d,a%d/%d,p%d,r%d S{", depth,
+	    node->nodeNum, node->commitNode, node->name, rf_NodeStatusString(node),
+	    node->numSuccedents, node->numSuccFired, node->numSuccDone,
+	    node->numAntecedents, node->numAntDone, node->numParams, node->numResults);
+	for (i = 0; i < node->numSuccedents; i++) {
+		/* "\0" instead of " " suppresses the trailing separator */
+		printf("%d%s", node->succedents[i]->nodeNum,
+		    ((i == node->numSuccedents - 1) ? "\0" : " "));
+	}
+	printf("} A{");
+	for (i = 0; i < node->numAntecedents; i++) {
+		/* T=true data, A=anti data, O=output data, C=control dep */
+		switch (node->antType[i]) {
+		case rf_trueData:
+			anttype = "T";
+			break;
+		case rf_antiData:
+			anttype = "A";
+			break;
+		case rf_outputData:
+			anttype = "O";
+			break;
+		case rf_control:
+			anttype = "C";
+			break;
+		default:
+			anttype = "?";
+			break;
+		}
+		printf("%d(%s)%s", node->antecedents[i]->nodeNum, anttype, (i == node->numAntecedents - 1) ? "\0" : " ");
+	}
+	printf("}; ");
+	rf_PrintNodeInfoString(node);
+	for (i = 0; i < node->numSuccedents; i++) {
+		if (node->succedents[i]->visited == unvisited)
+			rf_RecurPrintDAG(node->succedents[i], depth + 1, unvisited);
+	}
+}
+
+/*
+ * Print an entire DAG starting at its header: decode the DAG status,
+ * read the current visited-bit polarity off the first succedent, print
+ * the header line, then recursively print each branch (the recursion
+ * flips the visited bits as it goes).
+ */
+static void
+rf_PrintDAG(dag_h)
+	RF_DagHeader_t *dag_h;
+{
+	int unvisited, i;
+	char *status;
+
+	/* set dag status */
+	switch (dag_h->status) {
+	case rf_enable:
+		status = "enable";
+		break;
+	case rf_rollForward:
+		status = "rollForward";
+		break;
+	case rf_rollBackward:
+		status = "rollBackward";
+		break;
+	default:
+		status = "illegal!";
+		break;
+	}
+	/* find out if visited bits are currently set or clear */
+	unvisited = dag_h->succedents[0]->visited;
+
+	printf("DAG type: %s\n", dag_h->creator);
+	printf("format is (depth) num commit type: status,nSucc nSuccFired/nSuccDone,nAnte/nAnteDone,nParam,nResult S{x} A{x(type)}; info\n");
+	printf("(0) %d Hdr: %s, s%d, (commit %d/%d) S{", dag_h->nodeNum,
+	    status, dag_h->numSuccedents, dag_h->numCommitNodes, dag_h->numCommits);
+	for (i = 0; i < dag_h->numSuccedents; i++) {
+		printf("%d%s", dag_h->succedents[i]->nodeNum,
+		    ((i == dag_h->numSuccedents - 1) ? "\0" : " "));
+	}
+	printf("};\n");
+	for (i = 0; i < dag_h->numSuccedents; i++) {
+		if (dag_h->succedents[i]->visited == unvisited)
+			rf_RecurPrintDAG(dag_h->succedents[i], 1, unvisited);
+	}
+}
+/* assigns node numbers */
+/*
+ * Number every node reachable from the header, flipping each node's
+ * visited bit along the way.  The header gets number 0; returns the
+ * total count of numbers handed out.
+ */
+int
+rf_AssignNodeNums(RF_DagHeader_t * dag_h)
+{
+	RF_DagNode_t *child;
+	int unvisited, i, nextNum;
+
+	/* the first succedent's current bit value means "not yet visited"
+	 * for this traversal */
+	unvisited = dag_h->succedents[0]->visited;
+
+	nextNum = 0;
+	dag_h->nodeNum = nextNum++;
+	for (i = 0; i < dag_h->numSuccedents; i++) {
+		child = dag_h->succedents[i];
+		if (child->visited != unvisited)
+			continue;
+		nextNum = rf_RecurAssignNodeNums(child, nextNum, unvisited);
+	}
+	return (nextNum);
+}
+
+/*
+ * Recursive worker for rf_AssignNodeNums: toggle this node's visited
+ * bit, give it the next number, then descend into still-unvisited
+ * succedents.  Returns the next unused node number.
+ */
+int
+rf_RecurAssignNodeNums(node, num, unvisited)
+	RF_DagNode_t *node;
+	int num;
+	int unvisited;
+{
+	int i;
+
+	node->visited = unvisited ? 0 : 1;
+	node->nodeNum = num++;
+	for (i = 0; i < node->numSuccedents; i++) {
+		if (node->succedents[i]->visited != unvisited)
+			continue;
+		num = rf_RecurAssignNodeNums(node->succedents[i], num, unvisited);
+	}
+	return (num);
+}
+/* set the header pointers in each node to "newptr" */
+void
+rf_ResetDAGHeaderPointers(dag_h, newptr)
+	RF_DagHeader_t *dag_h;
+	RF_DagHeader_t *newptr;
+{
+	int i;
+
+	/* nodes already pointing at newptr have been handled; skip them */
+	for (i = 0; i < dag_h->numSuccedents; i++) {
+		if (dag_h->succedents[i]->dagHdr == newptr)
+			continue;
+		rf_RecurResetDAGHeaderPointers(dag_h->succedents[i], newptr);
+	}
+}
+
+/*
+ * Recursive worker: point this node's dagHdr at newptr, then descend
+ * into succedents that have not been updated yet (their dagHdr still
+ * differs from newptr, which also terminates cycles).
+ */
+void
+rf_RecurResetDAGHeaderPointers(node, newptr)
+	RF_DagNode_t *node;
+	RF_DagHeader_t *newptr;
+{
+	int i;
+
+	node->dagHdr = newptr;
+	for (i = 0; i < node->numSuccedents; i++) {
+		if (node->succedents[i]->dagHdr == newptr)
+			continue;
+		rf_RecurResetDAGHeaderPointers(node->succedents[i], newptr);
+	}
+}
+
+
+/*
+ * Dump every DAG on the list, renumbering each one's nodes first so
+ * the printout is readable.
+ */
+void
+rf_PrintDAGList(RF_DagHeader_t * dag_h)
+{
+	RF_DagHeader_t *dh;
+	int count;
+
+	count = 0;
+	for (dh = dag_h; dh != NULL; dh = dh->next) {
+		rf_AssignNodeNums(dh);
+		printf("\n\nDAG %d IN LIST:\n", count);
+		count++;
+		rf_PrintDAG(dh);
+	}
+}
+
+/*
+ * Validate one branch of a DAG: check per-node invariants (next==NULL,
+ * status==wait, numAntDone==0, terminator/succedent consistency), tally
+ * succedent/antecedent reference counts into scount/acount, record each
+ * node in nodes[] by node number, and recurse into unvisited
+ * succedents.  Returns non-zero if anything is invalid.
+ */
+static int
+rf_ValidateBranch(node, scount, acount, nodes, unvisited)
+	RF_DagNode_t *node;
+	int *scount;
+	int *acount;
+	RF_DagNode_t **nodes;
+	int unvisited;
+{
+	int i, retcode = 0;
+
+	/* construct an array of node pointers indexed by node num */
+	node->visited = (unvisited) ? 0 : 1;
+	nodes[node->nodeNum] = node;
+
+	if (node->next != NULL) {
+		printf("INVALID DAG: next pointer in node is not NULL\n");
+		retcode = 1;
+	}
+	if (node->status != rf_wait) {
+		printf("INVALID DAG: Node status is not wait\n");
+		retcode = 1;
+	}
+	if (node->numAntDone != 0) {
+		printf("INVALID DAG: numAntDone is not zero\n");
+		retcode = 1;
+	}
+	if (node->doFunc == rf_TerminateFunc) {
+		if (node->numSuccedents != 0) {
+			printf("INVALID DAG: Terminator node has succedents\n");
+			retcode = 1;
+		}
+	} else {
+		if (node->numSuccedents == 0) {
+			printf("INVALID DAG: Non-terminator node has no succedents\n");
+			retcode = 1;
+		}
+	}
+	for (i = 0; i < node->numSuccedents; i++) {
+		if (!node->succedents[i]) {
+			printf("INVALID DAG: succedent %d of node %s is NULL\n", i, node->name);
+			retcode = 1;
+			/* must not dereference the NULL succedent below
+			 * (the old code did, and crashed instead of
+			 * reporting the invalid DAG) */
+			continue;
+		}
+		scount[node->succedents[i]->nodeNum]++;
+	}
+	for (i = 0; i < node->numAntecedents; i++) {
+		if (!node->antecedents[i]) {
+			printf("INVALID DAG: antecedent %d of node %s is NULL\n", i, node->name);
+			retcode = 1;
+			/* likewise, skip the NULL antecedent */
+			continue;
+		}
+		acount[node->antecedents[i]->nodeNum]++;
+	}
+	for (i = 0; i < node->numSuccedents; i++) {
+		if (node->succedents[i] &&
+		    node->succedents[i]->visited == unvisited) {
+			if (rf_ValidateBranch(node->succedents[i], scount,
+			    acount, nodes, unvisited)) {
+				retcode = 1;
+			}
+		}
+	}
+	return (retcode);
+}
+
+/*
+ * Walk a branch asserting every node's visited bit equals "unvisited".
+ * Shared nodes are re-walked on every path, so this is exponential in
+ * DAG size (see caller's warning); "rl" is the recursion level, carried
+ * only for debugging.
+ */
+static void
+rf_ValidateBranchVisitedBits(node, unvisited, rl)
+	RF_DagNode_t *node;
+	int unvisited;
+	int rl;
+{
+	int i;
+
+	RF_ASSERT(node->visited == unvisited);
+	for (i = 0; i < node->numSuccedents; i++) {
+		if (node->succedents[i] == NULL) {
+			printf("node=%lx node->succedents[%d] is NULL\n", (long) node, i);
+			RF_ASSERT(0);
+		}
+		rf_ValidateBranchVisitedBits(node->succedents[i], unvisited, rl + 1);
+	}
+}
+/* NOTE: never call this on a big dag, because it is exponential
+ * in execution time
+ */
+/* Assert that every node reachable from the header has the same visited
+ * polarity as the first succedent (i.e. no traversal is half-done). */
+static void
+rf_ValidateVisitedBits(dag)
+	RF_DagHeader_t *dag;
+{
+	int i, unvisited;
+
+	unvisited = dag->succedents[0]->visited;
+
+	for (i = 0; i < dag->numSuccedents; i++) {
+		if (dag->succedents[i] == NULL) {
+			printf("dag=%lx dag->succedents[%d] is NULL\n", (long) dag, i);
+			RF_ASSERT(0);
+		}
+		rf_ValidateBranchVisitedBits(dag->succedents[i], unvisited, 0);
+	}
+}
+/* validate a DAG. _at entry_ verify that:
+ * -- numNodesCompleted is zero
+ * -- node queue is null
+ * -- dag status is rf_enable
+ * -- next pointer is null on every node
+ * -- all nodes have status wait
+ * -- numAntDone is zero in all nodes
+ * -- terminator node has zero successors
+ * -- no other node besides terminator has zero successors
+ * -- no successor or antecedent pointer in a node is NULL
+ * -- number of times that each node appears as a successor of another node
+ * is equal to the antecedent count on that node
+ * -- number of times that each node appears as an antecedent of another node
+ * is equal to the succedent count on that node
+ * -- what else?
+ */
+/*
+ * Validate a DAG per the contract in the comment above.  Implementation
+ * notes: rf_AssignNodeNums doubles as the traversal that sizes the
+ * scratch arrays; scount/acount accumulate how often each node appears
+ * as a succedent/antecedent so the cross-counts can be checked.
+ * Returns non-zero (and prints the DAG list) when invalid.
+ */
+int
+rf_ValidateDAG(dag_h)
+	RF_DagHeader_t *dag_h;
+{
+	int i, nodecount;
+	int *scount, *acount;	/* per-node successor and antecedent counts */
+	RF_DagNode_t **nodes;	/* array of ptrs to nodes in dag */
+	int retcode = 0;
+	int unvisited;
+	int commitNodeCount = 0;
+
+	if (rf_validateVisitedDebug)
+		rf_ValidateVisitedBits(dag_h);
+
+	if (dag_h->numNodesCompleted != 0) {
+		printf("INVALID DAG: num nodes completed is %d, should be 0\n", dag_h->numNodesCompleted);
+		retcode = 1;
+		goto validate_dag_bad;
+	}
+	if (dag_h->status != rf_enable) {
+		printf("INVALID DAG: not enabled\n");
+		retcode = 1;
+		goto validate_dag_bad;
+	}
+	if (dag_h->numCommits != 0) {
+		printf("INVALID DAG: numCommits != 0 (%d)\n", dag_h->numCommits);
+		retcode = 1;
+		goto validate_dag_bad;
+	}
+	if (dag_h->numSuccedents != 1) {
+		/* currently, all dags must have only one succedent */
+		printf("INVALID DAG: numSuccedents !1 (%d)\n", dag_h->numSuccedents);
+		retcode = 1;
+		goto validate_dag_bad;
+	}
+	nodecount = rf_AssignNodeNums(dag_h);
+
+	unvisited = dag_h->succedents[0]->visited;
+
+	RF_Calloc(scount, nodecount, sizeof(int), (int *));
+	RF_Calloc(acount, nodecount, sizeof(int), (int *));
+	RF_Calloc(nodes, nodecount, sizeof(RF_DagNode_t *), (RF_DagNode_t **));
+	for (i = 0; i < dag_h->numSuccedents; i++) {
+		if ((dag_h->succedents[i]->visited == unvisited)
+		    && rf_ValidateBranch(dag_h->succedents[i], scount,
+			acount, nodes, unvisited)) {
+			retcode = 1;
+		}
+	}
+	/* start at 1 to skip the header node */
+	for (i = 1; i < nodecount; i++) {
+		if (nodes[i] == NULL) {
+			/* a numbered node was never recorded by
+			 * rf_ValidateBranch -- don't dereference it */
+			printf("INVALID DAG: node %d was never visited\n", i);
+			retcode = 1;
+			goto validate_dag_out;
+		}
+		if (nodes[i]->commitNode)
+			commitNodeCount++;
+		if (nodes[i]->doFunc == NULL) {
+			printf("INVALID DAG: node %s has an undefined doFunc\n", nodes[i]->name);
+			retcode = 1;
+			goto validate_dag_out;
+		}
+		if (nodes[i]->undoFunc == NULL) {
+			/* message previously said "doFunc" (copy/paste bug) */
+			printf("INVALID DAG: node %s has an undefined undoFunc\n", nodes[i]->name);
+			retcode = 1;
+			goto validate_dag_out;
+		}
+		if (nodes[i]->numAntecedents != scount[nodes[i]->nodeNum]) {
+			printf("INVALID DAG: node %s has %d antecedents but appears as a succedent %d times\n",
+			    nodes[i]->name, nodes[i]->numAntecedents, scount[nodes[i]->nodeNum]);
+			retcode = 1;
+			goto validate_dag_out;
+		}
+		if (nodes[i]->numSuccedents != acount[nodes[i]->nodeNum]) {
+			printf("INVALID DAG: node %s has %d succedents but appears as an antecedent %d times\n",
+			    nodes[i]->name, nodes[i]->numSuccedents, acount[nodes[i]->nodeNum]);
+			retcode = 1;
+			goto validate_dag_out;
+		}
+	}
+
+	if (dag_h->numCommitNodes != commitNodeCount) {
+		printf("INVALID DAG: incorrect commit node count.  hdr->numCommitNodes (%d) found (%d) commit nodes in graph\n",
+		    dag_h->numCommitNodes, commitNodeCount);
+		retcode = 1;
+		goto validate_dag_out;
+	}
+validate_dag_out:
+	RF_Free(scount, nodecount * sizeof(int));
+	RF_Free(acount, nodecount * sizeof(int));
+	RF_Free(nodes, nodecount * sizeof(RF_DagNode_t *));
+	if (retcode)
+		rf_PrintDAGList(dag_h);
+
+	if (rf_validateVisitedDebug)
+		rf_ValidateVisitedBits(dag_h);
+
+	return (retcode);
+
+validate_dag_bad:
+	rf_PrintDAGList(dag_h);
+	return (retcode);
+}
+
+
+/******************************************************************************
+ *
+ * misc construction routines
+ *
+ *****************************************************************************/
+
+/*
+ * Redirect a stripe map so accesses to the failed column go to the
+ * reconstructed data: with distributed sparing (RF_DISTRIBUTE_SPARE)
+ * each pda is remapped through the layout's MapSector/MapParity with
+ * RF_REMAP; otherwise it is pointed at the dedicated spare row/column.
+ * Only legal while reconstruction is in progress (asserted below).
+ */
+void
+rf_redirect_asm(
+	RF_Raid_t * raidPtr,
+	RF_AccessStripeMap_t * asmap)
+{
+	int ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) ? 1 : 0;
+	int row = asmap->physInfo->row;
+	int fcol = raidPtr->reconControl[row]->fcol;
+	int srow = raidPtr->reconControl[row]->spareRow;
+	int scol = raidPtr->reconControl[row]->spareCol;
+	RF_PhysDiskAddr_t *pda;
+
+	RF_ASSERT(raidPtr->status[row] == rf_rs_reconstructing);
+	for (pda = asmap->physInfo; pda; pda = pda->next) {
+		if (pda->col == fcol) {
+			if (rf_dagDebug) {
+				if (!rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap,
+					pda->startSector)) {
+					RF_PANIC();
+				}
+			}
+			/* printf("Remapped data for large write\n"); */
+			if (ds) {
+				raidPtr->Layout.map->MapSector(raidPtr, pda->raidAddress,
+				    &pda->row, &pda->col, &pda->startSector, RF_REMAP);
+			} else {
+				pda->row = srow;
+				pda->col = scol;
+			}
+		}
+	}
+	for (pda = asmap->parityInfo; pda; pda = pda->next) {
+		if (pda->col == fcol) {
+			if (rf_dagDebug) {
+				if (!rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap, pda->startSector)) {
+					RF_PANIC();
+				}
+			}
+		}
+		/*
+		 * NOTE(review): unlike the data loop above, this remap is
+		 * applied to EVERY parity pda, not only those on the failed
+		 * column -- confirm the asymmetry is intentional.
+		 */
+		if (ds) {
+			(raidPtr->Layout.map->MapParity) (raidPtr, pda->raidAddress, &pda->row, &pda->col, &pda->startSector, RF_REMAP);
+		} else {
+			pda->row = srow;
+			pda->col = scol;
+		}
+	}
+}
+
+
+/* this routine allocates read buffers and generates stripe maps for the
+ * regions of the array from the start of the stripe to the start of the
+ * access, and from the end of the access to the end of the stripe. It also
+ * computes and returns the number of DAG nodes needed to read all this data.
+ * Note that this routine does the wrong thing if the access is fully
+ * contained within one stripe unit, so we RF_ASSERT against this case at the
+ * start.
+ */
+void
+rf_MapUnaccessedPortionOfStripe(
+	RF_Raid_t * raidPtr,
+	RF_RaidLayout_t * layoutPtr,/* in: layout information */
+	RF_AccessStripeMap_t * asmap,	/* in: access stripe map */
+	RF_DagHeader_t * dag_h,	/* in: header of the dag to create */
+	RF_AccessStripeMapHeader_t ** new_asm_h,	/* in: ptr to array of 2
+							 * headers, to be filled in */
+	int *nRodNodes,		/* out: num nodes to be generated to read
+				 * unaccessed data */
+	char **sosBuffer,	/* out: pointers to newly allocated buffer */
+	char **eosBuffer,
+	RF_AllocListElem_t * allocList)
+{
+	RF_RaidAddr_t sosRaidAddress, eosRaidAddress;
+	RF_SectorNum_t sosNumSector, eosNumSector;
+
+	/* per the comment above, this routine is wrong for accesses fully
+	 * contained in one stripe unit -- rule that case out up front */
+	RF_ASSERT(asmap->numStripeUnitsAccessed > (layoutPtr->numDataCol / 2));
+	/* generate an access map for the region of the array from start of
+	 * stripe to start of access */
+	new_asm_h[0] = new_asm_h[1] = NULL;
+	*nRodNodes = 0;
+	if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->raidAddress)) {
+		sosRaidAddress = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
+		sosNumSector = asmap->raidAddress - sosRaidAddress;
+		RF_MallocAndAdd(*sosBuffer, rf_RaidAddressToByte(raidPtr, sosNumSector), (char *), allocList);
+		new_asm_h[0] = rf_MapAccess(raidPtr, sosRaidAddress, sosNumSector, *sosBuffer, RF_DONT_REMAP);
+		/* chain the new asm onto the dag's list so it is freed with
+		 * the dag */
+		new_asm_h[0]->next = dag_h->asmList;
+		dag_h->asmList = new_asm_h[0];
+		*nRodNodes += new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
+
+		RF_ASSERT(new_asm_h[0]->stripeMap->next == NULL);
+		/* we're totally within one stripe here */
+		if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE)
+			rf_redirect_asm(raidPtr, new_asm_h[0]->stripeMap);
+	}
+	/* generate an access map for the region of the array from end of
+	 * access to end of stripe */
+	if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->endRaidAddress)) {
+		eosRaidAddress = asmap->endRaidAddress;
+		eosNumSector = rf_RaidAddressOfNextStripeBoundary(layoutPtr, eosRaidAddress) - eosRaidAddress;
+		RF_MallocAndAdd(*eosBuffer, rf_RaidAddressToByte(raidPtr, eosNumSector), (char *), allocList);
+		new_asm_h[1] = rf_MapAccess(raidPtr, eosRaidAddress, eosNumSector, *eosBuffer, RF_DONT_REMAP);
+		new_asm_h[1]->next = dag_h->asmList;
+		dag_h->asmList = new_asm_h[1];
+		*nRodNodes += new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
+
+		RF_ASSERT(new_asm_h[1]->stripeMap->next == NULL);
+		/* we're totally within one stripe here */
+		if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE)
+			rf_redirect_asm(raidPtr, new_asm_h[1]->stripeMap);
+	}
+}
+
+
+
+/* returns non-zero if the indicated ranges of stripe unit offsets overlap */
+int
+rf_PDAOverlap(
+	RF_RaidLayout_t * layoutPtr,
+	RF_PhysDiskAddr_t * src,
+	RF_PhysDiskAddr_t * dest)
+{
+	RF_SectorNum_t sfirst, slast, dfirst, dlast;
+
+	sfirst = rf_StripeUnitOffset(layoutPtr, src->startSector);
+	dfirst = rf_StripeUnitOffset(layoutPtr, dest->startSector);
+	/* subtract one so the endpoints stay within the stripe unit */
+	slast = rf_StripeUnitOffset(layoutPtr, src->startSector + src->numSector - 1);
+	dlast = rf_StripeUnitOffset(layoutPtr, dest->startSector + dest->numSector - 1);
+	if (RF_MAX(sfirst, dfirst) <= RF_MIN(slast, dlast))
+		return (1);
+	return (0);
+}
+
+
+/* GenerateFailedAccessASMs
+ *
+ * this routine figures out what portion of the stripe needs to be read
+ * to effect the degraded read or write operation. It's primary function
+ * is to identify everything required to recover the data, and then
+ * eliminate anything that is already being accessed by the user.
+ *
+ * The main result is two new ASMs, one for the region from the start of the
+ * stripe to the start of the access, and one for the region from the end of
+ * the access to the end of the stripe. These ASMs describe everything that
+ * needs to be read to effect the degraded access. Other results are:
+ * nXorBufs -- the total number of buffers that need to be XORed together to
+ * recover the lost data,
+ * rpBufPtr -- ptr to a newly-allocated buffer to hold the parity. If NULL
+ * at entry, not allocated.
+ * overlappingPDAs --
+ * describes which of the non-failed PDAs in the user access
+ * overlap data that needs to be read to effect recovery.
+ * overlappingPDAs[i]==1 if and only if, neglecting the failed
+ * PDA, the ith pda in the input asm overlaps data that needs
+ * to be read for recovery.
+ */
+ /* in: asm - ASM for the actual access, one stripe only */
+ /* in: faildPDA - which component of the access has failed */
+ /* in: dag_h - header of the DAG we're going to create */
+ /* out: new_asm_h - the two new ASMs */
+ /* out: nXorBufs - the total number of xor bufs required */
+ /* out: rpBufPtr - a buffer for the parity read */
+void
+rf_GenerateFailedAccessASMs(
+	RF_Raid_t * raidPtr,
+	RF_AccessStripeMap_t * asmap,
+	RF_PhysDiskAddr_t * failedPDA,
+	RF_DagHeader_t * dag_h,
+	RF_AccessStripeMapHeader_t ** new_asm_h,
+	int *nXorBufs,
+	char **rpBufPtr,
+	char *overlappingPDAs,
+	RF_AllocListElem_t * allocList)
+{
+	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+
+	/* s=start, e=end, s=stripe, a=access, f=failed, su=stripe unit */
+	RF_RaidAddr_t sosAddr, sosEndAddr, eosStartAddr, eosAddr;
+
+	RF_SectorCount_t numSect[2], numParitySect;
+	RF_PhysDiskAddr_t *pda;
+	char *rdBuf, *bufP;
+	int foundit, i;
+
+	bufP = NULL;
+	foundit = 0;
+	/* first compute the following raid addresses: start of stripe,
+	 * (sosAddr) MIN(start of access, start of failed SU), (sosEndAddr)
+	 * MAX(end of access, end of failed SU), (eosStartAddr) end of
+	 * stripe (i.e. start of next stripe) (eosAddr) */
+	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
+	sosEndAddr = RF_MIN(asmap->raidAddress, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, failedPDA->raidAddress));
+	eosStartAddr = RF_MAX(asmap->endRaidAddress, rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, failedPDA->raidAddress));
+	eosAddr = rf_RaidAddressOfNextStripeBoundary(layoutPtr, asmap->raidAddress);
+
+	/* now generate access stripe maps for each of the above regions of
+	 * the stripe. Use a dummy (NULL) buf ptr for now */
+
+	new_asm_h[0] = (sosAddr != sosEndAddr) ? rf_MapAccess(raidPtr, sosAddr, sosEndAddr - sosAddr, NULL, RF_DONT_REMAP) : NULL;
+	new_asm_h[1] = (eosStartAddr != eosAddr) ? rf_MapAccess(raidPtr, eosStartAddr, eosAddr - eosStartAddr, NULL, RF_DONT_REMAP) : NULL;
+
+	/* walk through the PDAs and range-restrict each SU to the region of
+	 * the SU touched on the failed PDA. also compute total data buffer
+	 * space requirements in this step. Ignore the parity for now. */
+
+	numSect[0] = numSect[1] = 0;
+	if (new_asm_h[0]) {
+		/* chain onto the dag's asm list so it is freed with the dag */
+		new_asm_h[0]->next = dag_h->asmList;
+		dag_h->asmList = new_asm_h[0];
+		for (pda = new_asm_h[0]->stripeMap->physInfo; pda; pda = pda->next) {
+			rf_RangeRestrictPDA(raidPtr, failedPDA, pda, RF_RESTRICT_NOBUFFER, 0);
+			numSect[0] += pda->numSector;
+		}
+	}
+	if (new_asm_h[1]) {
+		new_asm_h[1]->next = dag_h->asmList;
+		dag_h->asmList = new_asm_h[1];
+		for (pda = new_asm_h[1]->stripeMap->physInfo; pda; pda = pda->next) {
+			rf_RangeRestrictPDA(raidPtr, failedPDA, pda, RF_RESTRICT_NOBUFFER, 0);
+			numSect[1] += pda->numSector;
+		}
+	}
+	numParitySect = failedPDA->numSector;
+
+	/* allocate buffer space for the data & parity we have to read to
+	 * recover from the failure */
+
+	if (numSect[0] + numSect[1] + ((rpBufPtr) ? numParitySect : 0)) {	/* don't allocate parity
+										 * buf if not needed */
+		/* one contiguous buffer serves all the reads; it is carved
+		 * up below by advancing bufP */
+		RF_MallocAndAdd(rdBuf, rf_RaidAddressToByte(raidPtr, numSect[0] + numSect[1] + numParitySect), (char *), allocList);
+		bufP = rdBuf;
+		if (rf_degDagDebug)
+			printf("Newly allocated buffer (%d bytes) is 0x%lx\n",
+			    (int) rf_RaidAddressToByte(raidPtr, numSect[0] + numSect[1] + numParitySect), (unsigned long) bufP);
+	}
+	/* now walk through the pdas one last time and assign buffer pointers
+	 * (ugh!). Again, ignore the parity. also, count nodes to find out
+	 * how many bufs need to be xored together */
+	(*nXorBufs) = 1;	/* in read case, 1 is for parity. In write
+				 * case, 1 is for failed data */
+	if (new_asm_h[0]) {
+		for (pda = new_asm_h[0]->stripeMap->physInfo; pda; pda = pda->next) {
+			pda->bufPtr = bufP;
+			bufP += rf_RaidAddressToByte(raidPtr, pda->numSector);
+		}
+		*nXorBufs += new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
+	}
+	if (new_asm_h[1]) {
+		for (pda = new_asm_h[1]->stripeMap->physInfo; pda; pda = pda->next) {
+			pda->bufPtr = bufP;
+			bufP += rf_RaidAddressToByte(raidPtr, pda->numSector);
+		}
+		(*nXorBufs) += new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
+	}
+	if (rpBufPtr)
+		*rpBufPtr = bufP;	/* the rest of the buffer is for
+					 * parity */
+
+	/* the last step is to figure out how many more distinct buffers need
+	 * to get xor'd to produce the missing unit. there's one for each
+	 * user-data read node that overlaps the portion of the failed unit
+	 * being accessed */
+
+	for (foundit = i = 0, pda = asmap->physInfo; pda; i++, pda = pda->next) {
+		if (pda == failedPDA) {
+			/* the failed pda gets no overlappingPDAs slot, so
+			 * back the index up one before continuing */
+			i--;
+			foundit = 1;
+			continue;
+		}
+		if (rf_PDAOverlap(layoutPtr, pda, failedPDA)) {
+			overlappingPDAs[i] = 1;
+			(*nXorBufs)++;
+		}
+	}
+	if (!foundit) {
+		RF_ERRORMSG("GenerateFailedAccessASMs: did not find failedPDA in asm list\n");
+		RF_ASSERT(0);
+	}
+	if (rf_degDagDebug) {
+		if (new_asm_h[0]) {
+			printf("First asm:\n");
+			rf_PrintFullAccessStripeMap(new_asm_h[0], 1);
+		}
+		if (new_asm_h[1]) {
+			printf("Second asm:\n");
+			rf_PrintFullAccessStripeMap(new_asm_h[1], 1);
+		}
+	}
+}
+
+
+/* adjusts the offset and number of sectors in the destination pda so that
+ * it covers at most the region of the SU covered by the source PDA. This
+ * is exclusively a restriction: the number of sectors indicated by the
+ * target PDA can only shrink.
+ *
+ * For example: s = sectors within SU indicated by source PDA
+ * d = sectors within SU indicated by dest PDA
+ * r = results, stored in dest PDA
+ *
+ * |--------------- one stripe unit ---------------------|
+ * | sssssssssssssssssssssssssssssssss |
+ * | ddddddddddddddddddddddddddddddddddddddddddddd |
+ * | rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr |
+ *
+ * Another example:
+ *
+ * |--------------- one stripe unit ---------------------|
+ * | sssssssssssssssssssssssssssssssss |
+ * | ddddddddddddddddddddddd |
+ * | rrrrrrrrrrrrrrrr |
+ *
+ */
+void
+rf_RangeRestrictPDA(
+ RF_Raid_t * raidPtr,
+ RF_PhysDiskAddr_t * src,
+ RF_PhysDiskAddr_t * dest,
+ int dobuffer, /* nonzero: also advance dest->bufPtr past clipped-off leading sectors */
+ int doraidaddr) /* nonzero: recompute dest->raidAddress for the new startSector */
+{
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ /* offsets of the src/dest starting sectors within their stripe unit */
+ RF_SectorNum_t soffs = rf_StripeUnitOffset(layoutPtr, src->startSector);
+ RF_SectorNum_t doffs = rf_StripeUnitOffset(layoutPtr, dest->startSector);
+ RF_SectorNum_t send = rf_StripeUnitOffset(layoutPtr, src->startSector + src->numSector - 1); /* use -1 to be sure we
+ * stay within SU */
+ RF_SectorNum_t dend = rf_StripeUnitOffset(layoutPtr, dest->startSector + dest->numSector - 1);
+ RF_SectorNum_t subAddr = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, dest->startSector); /* stripe unit boundary */
+
+ /* clip dest to the intersection of the two in-SU ranges; numSector can
+ * only shrink (see the block comment above) */
+ dest->startSector = subAddr + RF_MAX(soffs, doffs);
+ dest->numSector = subAddr + RF_MIN(send, dend) + 1 - dest->startSector;
+
+ /* skip the buffer forward over any sectors clipped off the front */
+ if (dobuffer)
+ dest->bufPtr += (soffs > doffs) ? rf_RaidAddressToByte(raidPtr, soffs - doffs) : 0;
+ if (doraidaddr) {
+ /* keep raidAddress consistent with the (possibly moved) startSector */
+ dest->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, dest->raidAddress) +
+ rf_StripeUnitOffset(layoutPtr, dest->startSector);
+ }
+}
+/*
+ * Want the highest of these primes to be the largest one
+ * less than the max expected number of columns (won't hurt
+ * to be too small or too large, but won't be optimal, either)
+ * --jimz
+ */
+#define NLOWPRIMES 8
+static int lowprimes[NLOWPRIMES] = {2, 3, 5, 7, 11, 13, 17, 19};
+/*****************************************************************************
+ * compute the workload shift factor. (chained declustering)
+ *
+ * return nonzero if access should shift to secondary, otherwise,
+ * access is to primary
+ *****************************************************************************/
+int
+rf_compute_workload_shift(
+ RF_Raid_t * raidPtr,
+ RF_PhysDiskAddr_t * pda)
+{
+ /*
+ * variables:
+ * d = column of disk containing primary
+ * f = column of failed disk
+ * n = number of disks in array
+ * sd = "shift distance" (number of columns that d is to the right of f)
+ * row = row of array the access is in
+ * v = numerator of redirection ratio
+ * k = denominator of redirection ratio
+ */
+ RF_RowCol_t d, f, sd, row, n;
+ int k, v, ret, i;
+
+ row = pda->row;
+ n = raidPtr->numCol;
+
+ /* assign column of primary copy to d */
+ d = pda->col;
+
+ /* assign column of dead disk to f. Note: the column bound must be
+ * tested BEFORE dereferencing Disks[row][f]; the previous ordering
+ * read one element past the end of the row whenever no dead disk
+ * was present. */
+ for (f = 0; ((f < n) && (!RF_DEAD_DISK(raidPtr->Disks[row][f].status))); f++);
+
+ RF_ASSERT(f < n);
+ RF_ASSERT(f != d);
+
+ sd = (f > d) ? (n + d - f) : (d - f);
+ RF_ASSERT(sd < n);
+
+ /*
+ * v of every k accesses should be redirected
+ *
+ * v/k := (n-1-sd)/(n-1)
+ */
+ v = (n - 1 - sd);
+ k = (n - 1);
+
+#if 1
+ /*
+ * XXX
+ * Is this worth it?
+ *
+ * Now reduce the fraction, by repeatedly factoring
+ * out primes (just like they teach in elementary school!)
+ */
+ for (i = 0; i < NLOWPRIMES; i++) {
+ if (lowprimes[i] > v)
+ break;
+ while (((v % lowprimes[i]) == 0) && ((k % lowprimes[i]) == 0)) {
+ v /= lowprimes[i];
+ k /= lowprimes[i];
+ }
+ }
+#endif
+
+ /* redirect the first v of every k accesses to this primary */
+ raidPtr->hist_diskreq[row][d]++;
+ if (raidPtr->hist_diskreq[row][d] > v) {
+ ret = 0; /* do not redirect */
+ } else {
+ ret = 1; /* redirect */
+ }
+
+#if 0
+ printf("d=%d f=%d sd=%d v=%d k=%d ret=%d h=%d\n", d, f, sd, v, k, ret,
+ raidPtr->hist_diskreq[row][d]);
+#endif
+
+ if (raidPtr->hist_diskreq[row][d] >= k) {
+ /* reset counter */
+ raidPtr->hist_diskreq[row][d] = 0;
+ }
+ return (ret);
+}
+/*
+ * Disk selection routines
+ */
+
+/*
+ * Selects the disk with the shortest queue from a mirror pair.
+ * Both the disk I/Os queued in RAIDframe as well as those at the physical
+ * disk are counted as members of the "queue"
+ */
+void
+rf_SelectMirrorDiskIdle(RF_DagNode_t * node)
+{
+ RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr;
+ RF_RowCol_t rowData, colData, rowMirror, colMirror;
+ int dataQueueLength, mirrorQueueLength, usemirror;
+ /* DAG convention (see the swap below): params[0] holds the data PDA,
+ * params[4] the mirror (parity) PDA */
+ RF_PhysDiskAddr_t *data_pda = (RF_PhysDiskAddr_t *) node->params[0].p;
+ RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *) node->params[4].p;
+ RF_PhysDiskAddr_t *tmp_pda;
+ RF_RaidDisk_t **disks = raidPtr->Disks;
+ RF_DiskQueue_t **dqs = raidPtr->Queues, *dataQueue, *mirrorQueue;
+
+ /* return the [row col] of the disk with the shortest queue */
+ rowData = data_pda->row;
+ colData = data_pda->col;
+ rowMirror = mirror_pda->row;
+ colMirror = mirror_pda->col;
+ dataQueue = &(dqs[rowData][colData]);
+ mirrorQueue = &(dqs[rowMirror][colMirror]);
+
+ /* snapshot each queue's depth (queued in RAIDframe + outstanding at
+ * the physical disk), optionally under the queue mutex */
+#ifdef RF_LOCK_QUEUES_TO_READ_LEN
+ RF_LOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle");
+#endif /* RF_LOCK_QUEUES_TO_READ_LEN */
+ dataQueueLength = dataQueue->queueLength + dataQueue->numOutstanding;
+#ifdef RF_LOCK_QUEUES_TO_READ_LEN
+ RF_UNLOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle");
+ RF_LOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle");
+#endif /* RF_LOCK_QUEUES_TO_READ_LEN */
+ mirrorQueueLength = mirrorQueue->queueLength + mirrorQueue->numOutstanding;
+#ifdef RF_LOCK_QUEUES_TO_READ_LEN
+ RF_UNLOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle");
+#endif /* RF_LOCK_QUEUES_TO_READ_LEN */
+
+ /* pick a side: dead disks and dirty parity force the choice; otherwise
+ * prefer the shorter queue, breaking ties by seek proximity below */
+ usemirror = 0;
+ if (RF_DEAD_DISK(disks[rowMirror][colMirror].status)) {
+ usemirror = 0;
+ } else
+ if (RF_DEAD_DISK(disks[rowData][colData].status)) {
+ usemirror = 1;
+ } else
+ if (raidPtr->parity_good == RF_RAID_DIRTY) {
+ /* Trust only the main disk */
+ usemirror = 0;
+ } else
+ if (dataQueueLength < mirrorQueueLength) {
+ usemirror = 0;
+ } else
+ if (mirrorQueueLength < dataQueueLength) {
+ usemirror = 1;
+ } else {
+ /* queues are equal length. attempt
+ * cleverness: pick the arm whose last
+ * dequeued sector is closer to this
+ * access's start sector */
+ if (SNUM_DIFF(dataQueue->last_deq_sector, data_pda->startSector)
+ <= SNUM_DIFF(mirrorQueue->last_deq_sector, mirror_pda->startSector)) {
+ usemirror = 0;
+ } else {
+ usemirror = 1;
+ }
+ }
+
+ if (usemirror) {
+ /* use mirror (parity) disk, swap params 0 & 4 */
+ tmp_pda = data_pda;
+ node->params[0].p = mirror_pda;
+ node->params[4].p = tmp_pda;
+ } else {
+ /* use data disk, leave param 0 unchanged */
+ }
+ /* printf("dataQueueLength %d, mirrorQueueLength
+ * %d\n",dataQueueLength, mirrorQueueLength); */
+}
+/*
+ * Do simple partitioning. This assumes that
+ * the data and parity disks are laid out identically.
+ */
+void
+rf_SelectMirrorDiskPartition(RF_DagNode_t * node)
+{
+ RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr;
+ RF_RowCol_t rowData, colData, rowMirror, colMirror;
+ /* DAG convention (see the swap below): params[0] holds the data PDA,
+ * params[4] the mirror (parity) PDA */
+ RF_PhysDiskAddr_t *data_pda = (RF_PhysDiskAddr_t *) node->params[0].p;
+ RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *) node->params[4].p;
+ RF_PhysDiskAddr_t *tmp_pda;
+ RF_RaidDisk_t **disks = raidPtr->Disks;
+ RF_DiskQueue_t **dqs = raidPtr->Queues, *dataQueue, *mirrorQueue;
+ int usemirror;
+
+ /* return the [row col] of the disk with the shortest queue */
+ rowData = data_pda->row;
+ colData = data_pda->col;
+ rowMirror = mirror_pda->row;
+ colMirror = mirror_pda->col;
+ dataQueue = &(dqs[rowData][colData]);
+ mirrorQueue = &(dqs[rowMirror][colMirror]);
+
+ /* dead disks and dirty parity force the choice; otherwise partition
+ * by sector: lower half of the disk -> data arm, upper half -> mirror
+ * (assumes data and parity disks are laid out identically) */
+ usemirror = 0;
+ if (RF_DEAD_DISK(disks[rowMirror][colMirror].status)) {
+ usemirror = 0;
+ } else
+ if (RF_DEAD_DISK(disks[rowData][colData].status)) {
+ usemirror = 1;
+ } else
+ if (raidPtr->parity_good == RF_RAID_DIRTY) {
+ /* Trust only the main disk */
+ usemirror = 0;
+ } else
+ if (data_pda->startSector <
+ (disks[rowData][colData].numBlocks / 2)) {
+ usemirror = 0;
+ } else {
+ usemirror = 1;
+ }
+
+ if (usemirror) {
+ /* use mirror (parity) disk, swap params 0 & 4 */
+ tmp_pda = data_pda;
+ node->params[0].p = mirror_pda;
+ node->params[4].p = tmp_pda;
+ } else {
+ /* use data disk, leave param 0 unchanged */
+ }
+}
diff --git a/sys/dev/raidframe/rf_dagutils.h b/sys/dev/raidframe/rf_dagutils.h
new file mode 100644
index 0000000..bad2c76
--- /dev/null
+++ b/sys/dev/raidframe/rf_dagutils.h
@@ -0,0 +1,121 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_dagutils.h,v 1.3 1999/02/05 00:06:08 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*************************************************************************
+ *
+ * rf_dagutils.h -- header file for utility routines for manipulating DAGs
+ *
+ *************************************************************************/
+
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_dagfuncs.h>
+#include <dev/raidframe/rf_general.h>
+
+#ifndef _RF__RF_DAGUTILS_H_
+#define _RF__RF_DAGUTILS_H_
+
+struct RF_RedFuncs_s {
+ int (*regular) (RF_DagNode_t *);
+ char *RegularName;
+ int (*simple) (RF_DagNode_t *);
+ char *SimpleName;
+};
+
+extern RF_RedFuncs_t rf_xorFuncs;
+extern RF_RedFuncs_t rf_xorRecoveryFuncs;
+
+void
+rf_InitNode(RF_DagNode_t * node, RF_NodeStatus_t initstatus,
+ int commit,
+ int (*doFunc) (RF_DagNode_t * node),
+ int (*undoFunc) (RF_DagNode_t * node),
+ int (*wakeFunc) (RF_DagNode_t * node, int status),
+ int nSucc, int nAnte, int nParam, int nResult,
+ RF_DagHeader_t * hdr, char *name, RF_AllocListElem_t * alist);
+
+ void rf_FreeDAG(RF_DagHeader_t * dag_h);
+
+ RF_PropHeader_t *rf_MakePropListEntry(RF_DagHeader_t * dag_h, int resultNum,
+ int paramNum, RF_PropHeader_t * next, RF_AllocListElem_t * allocList);
+
+ int rf_ConfigureDAGs(RF_ShutdownList_t ** listp);
+
+ RF_DagHeader_t *rf_AllocDAGHeader(void);
+
+ void rf_FreeDAGHeader(RF_DagHeader_t * dh);
+
+ void *rf_AllocBuffer(RF_Raid_t * raidPtr, RF_DagHeader_t * dag_h,
+ RF_PhysDiskAddr_t * pda, RF_AllocListElem_t * allocList);
+
+ char *rf_NodeStatusString(RF_DagNode_t * node);
+
+ void rf_PrintNodeInfoString(RF_DagNode_t * node);
+
+ int rf_AssignNodeNums(RF_DagHeader_t * dag_h);
+
+ int rf_RecurAssignNodeNums(RF_DagNode_t * node, int num, int unvisited);
+
+ void rf_ResetDAGHeaderPointers(RF_DagHeader_t * dag_h, RF_DagHeader_t * newptr);
+
+ void rf_RecurResetDAGHeaderPointers(RF_DagNode_t * node, RF_DagHeader_t * newptr);
+
+ void rf_PrintDAGList(RF_DagHeader_t * dag_h);
+
+ int rf_ValidateDAG(RF_DagHeader_t * dag_h);
+
+ void rf_redirect_asm(RF_Raid_t * raidPtr, RF_AccessStripeMap_t * asmap);
+
+ void rf_MapUnaccessedPortionOfStripe(RF_Raid_t * raidPtr,
+ RF_RaidLayout_t * layoutPtr,
+ RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h,
+ RF_AccessStripeMapHeader_t ** new_asm_h, int *nRodNodes, char **sosBuffer,
+ char **eosBuffer, RF_AllocListElem_t * allocList);
+
+ int rf_PDAOverlap(RF_RaidLayout_t * layoutPtr, RF_PhysDiskAddr_t * src,
+ RF_PhysDiskAddr_t * dest);
+
+ void rf_GenerateFailedAccessASMs(RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap, RF_PhysDiskAddr_t * failedPDA,
+ RF_DagHeader_t * dag_h, RF_AccessStripeMapHeader_t ** new_asm_h,
+ int *nXorBufs, char **rpBufPtr, char *overlappingPDAs,
+ RF_AllocListElem_t * allocList);
+
+/* flags used by RangeRestrictPDA */
+#define RF_RESTRICT_NOBUFFER 0
+#define RF_RESTRICT_DOBUFFER 1
+
+ void rf_RangeRestrictPDA(RF_Raid_t * raidPtr, RF_PhysDiskAddr_t * src,
+ RF_PhysDiskAddr_t * dest, int dobuffer, int doraidaddr);
+
+ int rf_compute_workload_shift(RF_Raid_t * raidPtr, RF_PhysDiskAddr_t * pda);
+ void rf_SelectMirrorDiskIdle(RF_DagNode_t * node);
+ void rf_SelectMirrorDiskPartition(RF_DagNode_t * node);
+
+#endif /* !_RF__RF_DAGUTILS_H_ */
diff --git a/sys/dev/raidframe/rf_debugMem.c b/sys/dev/raidframe/rf_debugMem.c
new file mode 100644
index 0000000..f754812
--- /dev/null
+++ b/sys/dev/raidframe/rf_debugMem.c
@@ -0,0 +1,206 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_debugMem.c,v 1.7 2000/01/07 03:40:59 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Daniel Stodolsky, Mark Holland, Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* debugMem.c: memory usage debugging stuff.
+ * Malloc, Calloc, and Free are #defined everywhere
+ * to do_malloc, do_calloc, and do_free.
+ *
+ * if RF_UTILITY is nonzero, it means we're compiling one of the
+ * raidframe utility programs, such as rfctrl or smd. In this
+ * case, we eliminate all references to the threads package
+ * and to the allocation list stuff.
+ */
+
+#include <dev/raidframe/rf_types.h>
+
+#include <dev/raidframe/rf_threadstuff.h>
+#include <dev/raidframe/rf_options.h>
+#include <dev/raidframe/rf_debugMem.h>
+#include <dev/raidframe/rf_general.h>
+
+#if defined(__FreeBSD__)
+#include <sys/kernel.h>
+MALLOC_DEFINE(M_RAIDFRAME, "rfbuf", "Buffers for RAIDframe operation");
+#endif
+
+static long tot_mem_in_use = 0;
+
+/* Hash table of information about memory allocations */
+#define RF_MH_TABLESIZE 1000
+
+/* one record per tracked allocation, chained per hash bucket */
+struct mh_struct {
+ void *address; /* address returned by the allocator */
+ int size; /* byte count recorded at allocation time */
+ int line; /* source line of the allocation site */
+ char *filen; /* source file of the allocation site */
+ char allocated; /* nonzero while the block is live */
+ struct mh_struct *next; /* next record in this hash bucket */
+};
+static struct mh_struct *mh_table[RF_MH_TABLESIZE];
+RF_DECLARE_MUTEX(rf_debug_mem_mutex)
+ static int mh_table_initialized = 0;
+
+ static void memory_hash_insert(void *addr, int size, int line, char *filen);
+ static int memory_hash_remove(void *addr, int sz);
+
+/* record an allocation in the hash table and the global byte counter.
+ * Called from the RF_Malloc macro when rf_memDebug is set.
+ * NOTE: locking here is deliberately commented out; see the IMPORTANT
+ * note in rf_debugMem.h. */
+void
+rf_record_malloc(p, size, line, filen)
+ void *p;
+ int size, line;
+ char *filen;
+{
+ RF_ASSERT(size != 0);
+
+ /* RF_LOCK_MUTEX(rf_debug_mem_mutex); */
+ memory_hash_insert(p, size, line, filen);
+ tot_mem_in_use += size;
+ /* RF_UNLOCK_MUTEX(rf_debug_mem_mutex); */
+ if ((long) p == rf_memDebugAddress) {
+ /* rf_memDebugAddress lets a developer trap a specific address */
+ printf("Allocate: debug address allocated from line %d file %s\n", line, filen);
+ }
+}
+
+/* remove an allocation record and decrement the global byte counter.
+ * sz is checked against the recorded size inside memory_hash_remove. */
+void
+rf_unrecord_malloc(p, sz)
+ void *p;
+ int sz;
+{
+ int size;
+
+ /* RF_LOCK_MUTEX(rf_debug_mem_mutex); */
+ size = memory_hash_remove(p, sz);
+ tot_mem_in_use -= size;
+ /* RF_UNLOCK_MUTEX(rf_debug_mem_mutex); */
+ if ((long) p == rf_memDebugAddress) {
+ printf("Free: Found debug address\n"); /* this is really only a
+ * flag line for gdb */
+ }
+}
+
+/* walk the whole hash table at shutdown and report every allocation
+ * still marked live, plus the total outstanding byte count */
+void
+rf_print_unfreed()
+{
+ int i, foundone = 0;
+ struct mh_struct *p;
+
+ for (i = 0; i < RF_MH_TABLESIZE; i++) {
+ for (p = mh_table[i]; p; p = p->next)
+ if (p->allocated) {
+ /* print the banner once, before the first leak */
+ if (!foundone)
+ printf("\n\nThere are unfreed memory locations at program shutdown:\n");
+ foundone = 1;
+ printf("Addr 0x%lx Size %d line %d file %s\n",
+ (long) p->address, p->size, p->line, p->filen);
+ }
+ }
+ if (tot_mem_in_use) {
+ printf("%ld total bytes in use\n", tot_mem_in_use);
+ }
+}
+
+/* initialize the memory-debugging module: create the (managed) mutex
+ * and, when rf_memDebug is enabled, clear the allocation hash table.
+ * Returns 0 on success, or the mutex-creation error code. */
+int
+rf_ConfigureDebugMem(listp)
+ RF_ShutdownList_t **listp;
+{
+ int i, rc;
+
+ rc = rf_create_managed_mutex(listp, &rf_debug_mem_mutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ return (rc);
+ }
+ if (rf_memDebug) {
+ for (i = 0; i < RF_MH_TABLESIZE; i++)
+ mh_table[i] = NULL;
+ mh_table_initialized = 1;
+ }
+ return (0);
+}
+#define HASHADDR(_a_) ( (((unsigned long) _a_)>>3) % RF_MH_TABLESIZE )
+
+/* insert (or re-use) the hash record for addr and mark it allocated.
+ * Panics (via RF_ASSERT) if addr is already marked live, i.e. a
+ * reallocation without an intervening free. */
+static void
+memory_hash_insert(addr, size, line, filen)
+ void *addr;
+ int size, line;
+ char *filen;
+{
+ unsigned long bucket = HASHADDR(addr);
+ struct mh_struct *p;
+
+ RF_ASSERT(mh_table_initialized);
+
+ /* search for this address in the hash table */
+ for (p = mh_table[bucket]; p && (p->address != addr); p = p->next);
+ if (!p) {
+ /* first time we see this address: prepend a new record */
+ RF_Malloc(p, sizeof(struct mh_struct), (struct mh_struct *));
+ RF_ASSERT(p);
+ p->next = mh_table[bucket];
+ mh_table[bucket] = p;
+ p->address = addr;
+ p->allocated = 0;
+ }
+ if (p->allocated) {
+ printf("ERROR: reallocated address 0x%lx from line %d, file %s without intervening free\n", (long) addr, line, filen);
+ printf(" last allocated from line %d file %s\n", p->line, p->filen);
+ RF_ASSERT(0);
+ }
+ p->size = size;
+ p->line = line;
+ p->filen = filen;
+ p->allocated = 1;
+}
+
+/* mark the record for addr as freed and return its recorded size.
+ * Panics on never-allocated addresses, double frees, and (when sz > 0)
+ * size mismatches. The record itself is kept for reuse. */
+static int
+memory_hash_remove(addr, sz)
+ void *addr;
+ int sz;
+{
+ unsigned long bucket = HASHADDR(addr);
+ struct mh_struct *p;
+
+ RF_ASSERT(mh_table_initialized);
+ for (p = mh_table[bucket]; p && (p->address != addr); p = p->next);
+ if (!p) {
+ printf("ERROR: freeing never-allocated address 0x%lx\n", (long) addr);
+ RF_PANIC();
+ }
+ if (!p->allocated) {
+ printf("ERROR: freeing unallocated address 0x%lx. Last allocation line %d file %s\n", (long) addr, p->line, p->filen);
+ RF_PANIC();
+ }
+ if (sz > 0 && p->size != sz) { /* you can suppress this error by
+ * using a negative value as the size
+ * to free */
+ printf("ERROR: incorrect size at free for address 0x%lx: is %d should be %d. Alloc at line %d of file %s\n", (unsigned long) addr, sz, p->size, p->line, p->filen);
+ RF_PANIC();
+ }
+ p->allocated = 0;
+ return (p->size);
+}
diff --git a/sys/dev/raidframe/rf_debugMem.h b/sys/dev/raidframe/rf_debugMem.h
new file mode 100644
index 0000000..e6d8c60
--- /dev/null
+++ b/sys/dev/raidframe/rf_debugMem.h
@@ -0,0 +1,88 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_debugMem.h,v 1.7 1999/09/05 01:58:11 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Daniel Stodolsky, Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * rf_debugMem.h -- memory leak debugging module
+ *
+ * IMPORTANT: if you put the lock/unlock mutex stuff back in here, you
+ * need to take it out of the routines in debugMem.c
+ *
+ */
+
+#ifndef _RF__RF_DEBUGMEM_H_
+#define _RF__RF_DEBUGMEM_H_
+
+#include <dev/raidframe/rf_alloclist.h>
+
+#ifdef _KERNEL
+#include <sys/types.h>
+#include <sys/malloc.h>
+
+#if defined(__FreeBSD__)
+MALLOC_DECLARE(M_RAIDFRAME);
+#endif
+
+/* allocate _size_ zeroed bytes into _p_; panics on failure and records
+ * the allocation when rf_memDebug is set */
+#define RF_Malloc(_p_, _size_, _cast_) \
+ { \
+ _p_ = _cast_ malloc((u_long)_size_, M_RAIDFRAME, M_NOWAIT | M_ZERO); \
+ if (_p_ == NULL) panic("out of memory\n"); \
+ if (rf_memDebug) rf_record_malloc(_p_, _size_, __LINE__, __FILE__); \
+ }
+
+/* RF_Malloc plus registration on an allocation list (if one is given)
+ * so the memory is released when the list is torn down */
+#define RF_MallocAndAdd(__p_, __size_, __cast_, __alist_) \
+ { \
+ RF_Malloc(__p_, __size_, __cast_); \
+ if (__alist_) rf_AddToAllocList(__alist_, __p_, __size_); \
+ }
+
+/* array allocation; zeroing already comes from M_ZERO in RF_Malloc */
+#define RF_Calloc(_p_, _nel_, _elsz_, _cast_) \
+ { \
+ RF_Malloc( _p_, (_nel_) * (_elsz_), _cast_); \
+ }
+
+/* RF_Calloc plus allocation-list registration */
+#define RF_CallocAndAdd(__p,__nel,__elsz,__cast,__alist) \
+ { \
+ RF_Calloc(__p, __nel, __elsz, __cast); \
+ if (__alist) rf_AddToAllocList(__alist, __p, (__nel)*(__elsz)); \
+ }
+
+/* free a block and unrecord it; _sz_ is checked against the recorded
+ * size when memory debugging is on */
+#define RF_Free(_p_, _sz_) \
+ { \
+ free((void *)(_p_), M_RAIDFRAME); \
+ if (rf_memDebug) rf_unrecord_malloc(_p_, (u_int32_t) (_sz_)); \
+ }
+
+#endif /* _KERNEL */
+
+void rf_record_malloc(void *p, int size, int line, char *filen);
+void rf_unrecord_malloc(void *p, int sz);
+void rf_print_unfreed(void);
+int rf_ConfigureDebugMem(RF_ShutdownList_t ** listp);
+
+#endif /* !_RF__RF_DEBUGMEM_H_ */
diff --git a/sys/dev/raidframe/rf_debugprint.c b/sys/dev/raidframe/rf_debugprint.c
new file mode 100644
index 0000000..6c96023
--- /dev/null
+++ b/sys/dev/raidframe/rf_debugprint.c
@@ -0,0 +1,134 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_debugprint.c,v 1.3 1999/02/05 00:06:08 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * Code to do debug printfs. Calls to rf_debug_printf cause the corresponding
+ * information to be printed to a circular buffer rather than the screen.
+ * The point is to try and minimize the timing variations induced by the
+ * printfs, and to capture only the printf's immediately preceding a failure.
+ */
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_threadstuff.h>
+#include <dev/raidframe/rf_debugprint.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_options.h>
+
+#include <sys/param.h>
+
+struct RF_Entry_s {
+ char *cstring;
+ void *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8;
+};
+/* space for 1k lines */
+#define BUFSHIFT 10
+#define BUFSIZE (1<<BUFSHIFT)
+#define BUFMASK (BUFSIZE-1)
+
+static struct RF_Entry_s rf_debugprint_buf[BUFSIZE];
+static int rf_debugprint_index = 0;
+RF_DECLARE_STATIC_MUTEX(rf_debug_print_mutex)
+ /* initialize the debug-print module: create the managed mutex and
+ * clear the circular print buffer. Returns 0 on success. */
+ int rf_ConfigureDebugPrint(listp)
+ RF_ShutdownList_t **listp;
+{
+ int rc;
+
+ rc = rf_create_managed_mutex(listp, &rf_debug_print_mutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ return (rc);
+ }
+ rf_clear_debug_print_buffer();
+ return (0);
+}
+
+/* empty the circular buffer: NULL every format-string slot (the printers
+ * treat a NULL cstring as "unused") and rewind the write index */
+void
+rf_clear_debug_print_buffer()
+{
+ int i;
+
+ for (i = 0; i < BUFSIZE; i++)
+ rf_debugprint_buf[i].cstring = NULL;
+ rf_debugprint_index = 0;
+}
+
+/* log a printf-style message. When rf_debugPrintUseBuffer is set the
+ * format string and up to eight argument words are saved into the
+ * circular buffer (to minimize timing disturbance); otherwise the
+ * message goes straight to printf. NOTE: s is used directly as the
+ * format string, so callers must pass trusted (literal) formats. */
+void
+rf_debug_printf(s, a1, a2, a3, a4, a5, a6, a7, a8)
+ char *s;
+ void *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8;
+{
+ int idx;
+
+ if (rf_debugPrintUseBuffer) {
+
+ /* only the index update is under the mutex; the slot fill
+ * below is unlocked and could race with a wrapping writer */
+ RF_LOCK_MUTEX(rf_debug_print_mutex);
+ idx = rf_debugprint_index;
+ rf_debugprint_index = (rf_debugprint_index + 1) & BUFMASK;
+ RF_UNLOCK_MUTEX(rf_debug_print_mutex);
+
+ rf_debugprint_buf[idx].cstring = s;
+ rf_debugprint_buf[idx].a1 = a1;
+ rf_debugprint_buf[idx].a2 = a2;
+ rf_debugprint_buf[idx].a3 = a3;
+ rf_debugprint_buf[idx].a4 = a4;
+ rf_debugprint_buf[idx].a5 = a5;
+ rf_debugprint_buf[idx].a6 = a6;
+ rf_debugprint_buf[idx].a7 = a7;
+ rf_debugprint_buf[idx].a8 = a8;
+ } else {
+ printf(s, a1, a2, a3, a4, a5, a6, a7, a8);
+ }
+}
+
+/* dump the buffered debug messages to the console (NULL fname means
+ * "standard output" to rf_spill_debug_buffer) */
+void
+rf_print_debug_buffer()
+{
+ rf_spill_debug_buffer(NULL);
+}
+
+/* replay the circular debug buffer, oldest entry first. The fname
+ * argument is currently unused in the kernel build (output always goes
+ * to printf); it is kept for interface compatibility.
+ *
+ * Fixes over the previous version: the starting index is masked so we
+ * never read rf_debugprint_buf[BUFSIZE] when the write index sits at
+ * the end of the buffer, and the final slot is only printed when its
+ * format string is non-NULL (the buffer may not have wrapped yet). */
+void
+rf_spill_debug_buffer(fname)
+ char *fname;
+{
+ int i;
+
+ if (!rf_debugPrintUseBuffer)
+ return;
+
+ RF_LOCK_MUTEX(rf_debug_print_mutex);
+
+ for (i = (rf_debugprint_index + 1) & BUFMASK; i != rf_debugprint_index; i = (i + 1) & BUFMASK)
+ if (rf_debugprint_buf[i].cstring)
+ printf(rf_debugprint_buf[i].cstring, rf_debugprint_buf[i].a1, rf_debugprint_buf[i].a2, rf_debugprint_buf[i].a3,
+ rf_debugprint_buf[i].a4, rf_debugprint_buf[i].a5, rf_debugprint_buf[i].a6, rf_debugprint_buf[i].a7, rf_debugprint_buf[i].a8);
+ /* the loop above stops just short of the slot at the write index;
+ * print it too, but only if it actually holds an entry */
+ i = rf_debugprint_index;
+ if (rf_debugprint_buf[i].cstring)
+ printf(rf_debugprint_buf[i].cstring, rf_debugprint_buf[i].a1, rf_debugprint_buf[i].a2, rf_debugprint_buf[i].a3,
+ rf_debugprint_buf[i].a4, rf_debugprint_buf[i].a5, rf_debugprint_buf[i].a6, rf_debugprint_buf[i].a7, rf_debugprint_buf[i].a8);
+ RF_UNLOCK_MUTEX(rf_debug_print_mutex);
+}
diff --git a/sys/dev/raidframe/rf_debugprint.h b/sys/dev/raidframe/rf_debugprint.h
new file mode 100644
index 0000000..318f620
--- /dev/null
+++ b/sys/dev/raidframe/rf_debugprint.h
@@ -0,0 +1,44 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_debugprint.h,v 1.3 1999/02/05 00:06:08 oster Exp $ */
+/*
+ * rf_debugprint.h
+ */
+/*
+ * Copyright (c) 1996 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#ifndef _RF__RF_DEBUGPRINT_H_
+#define _RF__RF_DEBUGPRINT_H_
+
+int rf_ConfigureDebugPrint(RF_ShutdownList_t ** listp);
+void rf_clear_debug_print_buffer(void);
+void
+rf_debug_printf(char *s, void *a1, void *a2, void *a3, void *a4,
+ void *a5, void *a6, void *a7, void *a8);
+void rf_print_debug_buffer(void);
+void rf_spill_debug_buffer(char *fname);
+
+#endif /* !_RF__RF_DEBUGPRINT_H_ */
diff --git a/sys/dev/raidframe/rf_decluster.c b/sys/dev/raidframe/rf_decluster.c
new file mode 100644
index 0000000..3a02519
--- /dev/null
+++ b/sys/dev/raidframe/rf_decluster.c
@@ -0,0 +1,745 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_decluster.c,v 1.6 2001/01/26 04:40:03 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*----------------------------------------------------------------------
+ *
+ * rf_decluster.c -- code related to the declustered layout
+ *
+ * Created 10-21-92 (MCH)
+ *
+ * Nov 93: adding support for distributed sparing. This code is a little
+ * complex: the basic layout used is as follows:
+ * let F = (v-1)/GCD(r,v-1). The spare space for each set of
+ * F consecutive fulltables is grouped together and placed after
+ * that set of tables.
+ * +------------------------------+
+ * | F fulltables |
+ * | Spare Space |
+ * | F fulltables |
+ * | Spare Space |
+ * | ... |
+ * +------------------------------+
+ *
+ *--------------------------------------------------------------------*/
+
+#include <dev/raidframe/rf_archs.h>
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_raidframe.h>
+#include <dev/raidframe/rf_configure.h>
+#include <dev/raidframe/rf_decluster.h>
+#include <dev/raidframe/rf_debugMem.h>
+#include <dev/raidframe/rf_utils.h>
+#include <dev/raidframe/rf_alloclist.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_shutdown.h>
+
+
+extern int rf_copyback_in_progress; /* debug only */
+
+/* found in rf_kintf.c */
+int rf_GetSpareTableFromDaemon(RF_SparetWait_t * req);
+
+#if (RF_INCLUDE_PARITY_DECLUSTERING > 0) || (RF_INCLUDE_PARITY_DECLUSTERING_PQ > 0)
+
+/* configuration code */
+
+/*
+ * rf_ConfigureDeclustered -- layout-configuration entry point for the
+ * declustered parity layout.
+ *
+ * Decodes the layout-specific config blob (sparemap filename, then the
+ * block design parameters b, v, k, r, lambda, the no-rotate flag, and
+ * finally the b x k layout table), sizes the per-disk data/parity and
+ * (optionally) distributed-spare regions, and builds the Layout/Offset/
+ * Block lookup tables used by the mapping routines below.
+ *
+ * Returns 0 on success, ENOMEM/EINVAL on failure.  Allocations are tied
+ * to raidPtr->cleanupList, so nothing is explicitly freed on the error
+ * paths here.
+ */
+int
+rf_ConfigureDeclustered(
+    RF_ShutdownList_t ** listp,
+    RF_Raid_t * raidPtr,
+    RF_Config_t * cfgPtr)
+{
+	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+	int b, v, k, r, lambda;	/* block design params */
+	int i, j;
+	RF_RowCol_t *first_avail_slot;
+	RF_StripeCount_t complete_FT_count, numCompleteFullTablesPerDisk;
+	RF_DeclusteredConfigInfo_t *info;
+	RF_StripeCount_t PUsPerDisk, spareRegionDepthInPUs, numCompleteSpareRegionsPerDisk,
+	        extraPUsPerDisk;
+	RF_StripeCount_t totSparePUsPerDisk;
+	RF_SectorNum_t diskOffsetOfLastFullTableInSUs;
+	RF_SectorCount_t SpareSpaceInSUs;
+	char *cfgBuf = (char *) (cfgPtr->layoutSpecific);
+	RF_StripeNum_t l, SUID;
+
+	SUID = l = 0;
+	numCompleteSpareRegionsPerDisk = 0;
+
+	/* 1. create layout specific structure */
+	RF_MallocAndAdd(info, sizeof(RF_DeclusteredConfigInfo_t), (RF_DeclusteredConfigInfo_t *), raidPtr->cleanupList);
+	if (info == NULL)
+		return (ENOMEM);
+	layoutPtr->layoutSpecificInfo = (void *) info;
+	info->SpareTable = NULL;
+
+	/* 2. extract parameters from the config structure */
+	/* NOTE(review): the blob layout (name, then six ints, then the
+	 * layout table bytes) must match what the userland configuration
+	 * code emits -- confirm against rf_configure.c. */
+	if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {
+		(void) bcopy(cfgBuf, info->sparemap_fname, RF_SPAREMAP_NAME_LEN);
+	}
+	/* name field is always present in the blob, even without sparing */
+	cfgBuf += RF_SPAREMAP_NAME_LEN;
+
+	b = *((int *) cfgBuf);
+	cfgBuf += sizeof(int);
+	v = *((int *) cfgBuf);
+	cfgBuf += sizeof(int);
+	k = *((int *) cfgBuf);
+	cfgBuf += sizeof(int);
+	r = *((int *) cfgBuf);
+	cfgBuf += sizeof(int);
+	lambda = *((int *) cfgBuf);
+	cfgBuf += sizeof(int);
+	raidPtr->noRotate = *((int *) cfgBuf);
+	cfgBuf += sizeof(int);
+
+	/* the sparemaps are generated assuming that parity is rotated, so we
+	 * issue a warning if both distributed sparing and no-rotate are on at
+	 * the same time */
+	if ((layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) && raidPtr->noRotate) {
+		RF_ERRORMSG("Warning: distributed sparing specified without parity rotation.\n");
+	}
+	if (raidPtr->numCol != v) {
+		RF_ERRORMSG2("RAID: config error: table element count (%d) not equal to no. of cols (%d)\n", v, raidPtr->numCol);
+		return (EINVAL);
+	}
+	/* 3. set up the values used in the mapping code */
+	info->BlocksPerTable = b;
+	info->Lambda = lambda;
+	info->NumParityReps = info->groupSize = k;
+	info->SUsPerTable = b * (k - 1) * layoutPtr->SUsPerPU;	/* b blks, k-1 SUs each */
+	info->SUsPerFullTable = k * info->SUsPerTable;	/* rot k times */
+	info->PUsPerBlock = k - 1;
+	info->SUsPerBlock = info->PUsPerBlock * layoutPtr->SUsPerPU;
+	info->TableDepthInPUs = (b * k) / v;
+	info->FullTableDepthInPUs = info->TableDepthInPUs * k;	/* k repetitions */
+
+	/* used only in distributed sparing case */
+	info->FullTablesPerSpareRegion = (v - 1) / rf_gcd(r, v - 1);	/* (v-1)/gcd fulltables */
+	info->TablesPerSpareRegion = k * info->FullTablesPerSpareRegion;
+	info->SpareSpaceDepthPerRegionInSUs = (r * info->TablesPerSpareRegion / (v - 1)) * layoutPtr->SUsPerPU;
+
+	/* check to make sure the block design is sufficiently small */
+	if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
+		if (info->FullTableDepthInPUs * layoutPtr->SUsPerPU + info->SpareSpaceDepthPerRegionInSUs > layoutPtr->stripeUnitsPerDisk) {
+			RF_ERRORMSG3("RAID: config error: Full Table depth (%d) + Spare Space (%d) larger than disk size (%d) (BD too big)\n",
+			    (int) info->FullTableDepthInPUs,
+			    (int) info->SpareSpaceDepthPerRegionInSUs,
+			    (int) layoutPtr->stripeUnitsPerDisk);
+			return (EINVAL);
+		}
+	} else {
+		if (info->TableDepthInPUs * layoutPtr->SUsPerPU > layoutPtr->stripeUnitsPerDisk) {
+			RF_ERRORMSG2("RAID: config error: Table depth (%d) larger than disk size (%d) (BD too big)\n",
+			    (int) (info->TableDepthInPUs * layoutPtr->SUsPerPU), \
+			    (int) layoutPtr->stripeUnitsPerDisk);
+			return (EINVAL);
+		}
+	}
+
+
+	/* compute the size of each disk, and the number of tables in the last
+	 * fulltable (which need not be complete) */
+	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
+
+		PUsPerDisk = layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU;
+		spareRegionDepthInPUs = (info->TablesPerSpareRegion * info->TableDepthInPUs +
+		    (info->TablesPerSpareRegion * info->TableDepthInPUs) / (v - 1));
+		info->SpareRegionDepthInSUs = spareRegionDepthInPUs * layoutPtr->SUsPerPU;
+
+		numCompleteSpareRegionsPerDisk = PUsPerDisk / spareRegionDepthInPUs;
+		info->NumCompleteSRs = numCompleteSpareRegionsPerDisk;
+		extraPUsPerDisk = PUsPerDisk % spareRegionDepthInPUs;
+
+		/* assume conservatively that we need the full amount of spare
+		 * space in one region in order to provide spares for the
+		 * partial spare region at the end of the array. We set "i"
+		 * to the number of tables in the partial spare region. This
+		 * may actually include some fulltables. */
+		extraPUsPerDisk -= (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU);
+		if (extraPUsPerDisk <= 0)
+			i = 0;
+		else
+			i = extraPUsPerDisk / info->TableDepthInPUs;
+
+		complete_FT_count = raidPtr->numRow * (numCompleteSpareRegionsPerDisk * (info->TablesPerSpareRegion / k) + i / k);
+		info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable;
+		info->ExtraTablesPerDisk = i % k;
+
+		/* note that in the last spare region, the spare space is
+		 * complete even though data/parity space is not */
+		totSparePUsPerDisk = (numCompleteSpareRegionsPerDisk + 1) * (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU);
+		info->TotSparePUsPerDisk = totSparePUsPerDisk;
+
+		layoutPtr->stripeUnitsPerDisk =
+		    ((complete_FT_count / raidPtr->numRow) * info->FullTableDepthInPUs +	/* data & parity space */
+		    info->ExtraTablesPerDisk * info->TableDepthInPUs +
+		    totSparePUsPerDisk	/* spare space */
+		    ) * layoutPtr->SUsPerPU;
+		layoutPtr->dataStripeUnitsPerDisk =
+		    (complete_FT_count * info->FullTableDepthInPUs + info->ExtraTablesPerDisk * info->TableDepthInPUs)
+		    * layoutPtr->SUsPerPU * (k - 1) / k;
+
+	} else {
+		/* non-dist spare case: force each disk to contain an
+		 * integral number of tables */
+		layoutPtr->stripeUnitsPerDisk /= (info->TableDepthInPUs * layoutPtr->SUsPerPU);
+		layoutPtr->stripeUnitsPerDisk *= (info->TableDepthInPUs * layoutPtr->SUsPerPU);
+
+		/* compute the number of tables in the last fulltable, which
+		 * need not be complete */
+		complete_FT_count =
+		    ((layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU) / info->FullTableDepthInPUs) * raidPtr->numRow;
+
+		info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable;
+		info->ExtraTablesPerDisk =
+		    ((layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU) / info->TableDepthInPUs) % k;
+	}
+
+	raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit;
+
+	/* find the disk offset of the stripe unit where the last fulltable
+	 * starts */
+	numCompleteFullTablesPerDisk = complete_FT_count / raidPtr->numRow;
+	diskOffsetOfLastFullTableInSUs = numCompleteFullTablesPerDisk * info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
+	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
+		SpareSpaceInSUs = numCompleteSpareRegionsPerDisk * info->SpareSpaceDepthPerRegionInSUs;
+		diskOffsetOfLastFullTableInSUs += SpareSpaceInSUs;
+		info->DiskOffsetOfLastSpareSpaceChunkInSUs =
+		    diskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU;
+	}
+	info->DiskOffsetOfLastFullTableInSUs = diskOffsetOfLastFullTableInSUs;
+	info->numCompleteFullTablesPerDisk = numCompleteFullTablesPerDisk;
+
+	/* 4. create and initialize the lookup tables */
+	info->LayoutTable = rf_make_2d_array(b, k, raidPtr->cleanupList);
+	if (info->LayoutTable == NULL)
+		return (ENOMEM);
+	info->OffsetTable = rf_make_2d_array(b, k, raidPtr->cleanupList);
+	if (info->OffsetTable == NULL)
+		return (ENOMEM);
+	info->BlockTable = rf_make_2d_array(info->TableDepthInPUs * layoutPtr->SUsPerPU, raidPtr->numCol, raidPtr->cleanupList);
+	if (info->BlockTable == NULL)
+		return (ENOMEM);
+
+	first_avail_slot = rf_make_1d_array(v, NULL);
+	if (first_avail_slot == NULL)
+		return (ENOMEM);
+
+	/* layout table is the remainder of the config blob: b rows of k
+	 * disk numbers, one byte each */
+	for (i = 0; i < b; i++)
+		for (j = 0; j < k; j++)
+			info->LayoutTable[i][j] = *cfgBuf++;
+
+	/* initialize offset table */
+	for (i = 0; i < b; i++)
+		for (j = 0; j < k; j++) {
+			info->OffsetTable[i][j] = first_avail_slot[info->LayoutTable[i][j]];
+			first_avail_slot[info->LayoutTable[i][j]]++;
+		}
+
+	/* initialize block table */
+	for (SUID = l = 0; l < layoutPtr->SUsPerPU; l++) {
+		for (i = 0; i < b; i++) {
+			for (j = 0; j < k; j++) {
+				info->BlockTable[(info->OffsetTable[i][j] * layoutPtr->SUsPerPU) + l]
+				    [info->LayoutTable[i][j]] = SUID;
+			}
+			SUID++;
+		}
+	}
+
+	rf_free_1d_array(first_avail_slot, v);
+
+	/* 5. set up the remaining redundant-but-useful parameters */
+
+	raidPtr->totalSectors = (k * complete_FT_count + raidPtr->numRow * info->ExtraTablesPerDisk) *
+	    info->SUsPerTable * layoutPtr->sectorsPerStripeUnit;
+	layoutPtr->numStripe = (raidPtr->totalSectors / layoutPtr->sectorsPerStripeUnit) / (k - 1);
+
+	/* strange evaluation order below to try and minimize overflow
+	 * problems */
+
+	layoutPtr->dataSectorsPerStripe = (k - 1) * layoutPtr->sectorsPerStripeUnit;
+	layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
+	layoutPtr->numDataCol = k - 1;
+	layoutPtr->numParityCol = 1;
+
+	return (0);
+}
+/* declustering with distributed sparing */
+static void rf_ShutdownDeclusteredDS(RF_ThreadArg_t);
+static void
+rf_ShutdownDeclusteredDS(arg)
+ RF_ThreadArg_t arg;
+{
+ RF_DeclusteredConfigInfo_t *info;
+ RF_Raid_t *raidPtr;
+
+ raidPtr = (RF_Raid_t *) arg;
+ info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+ if (info->SpareTable)
+ rf_FreeSpareTable(raidPtr);
+}
+
+/*
+ * Configure the declustered layout with distributed sparing: defers the
+ * real work to rf_ConfigureDeclustered() and then registers the
+ * spare-table cleanup handler.  If the handler cannot be registered,
+ * run it immediately so a partially installed spare table is released.
+ * Returns 0 on success or an errno value.
+ */
+int
+rf_ConfigureDeclusteredDS(
+    RF_ShutdownList_t ** listp,
+    RF_Raid_t * raidPtr,
+    RF_Config_t * cfgPtr)
+{
+	int rc;
+
+	rc = rf_ConfigureDeclustered(listp, raidPtr, cfgPtr);
+	if (rc)
+		return (rc);
+	rc = rf_ShutdownCreate(listp, rf_ShutdownDeclusteredDS, raidPtr);
+	if (rc) {
+		RF_ERRORMSG1("Got %d adding shutdown event for DeclusteredDS\n", rc);
+		rf_ShutdownDeclusteredDS(raidPtr);
+		return (rc);
+	}
+	return (0);
+}
+
+/*
+ * Map a raid address (sector) to its physical location: the row, column
+ * (disk) and sector-on-disk are returned through the out parameters.
+ * Walks down the layout hierarchy -- fulltable, table, block, block
+ * offset -- using the tables built at configure time.  When "remap" is
+ * nonzero the access is retargeted at the distributed spare space
+ * instead of the original data unit.
+ */
+void
+rf_MapSectorDeclustered(raidPtr, raidSector, row, col, diskSector, remap)
+	RF_Raid_t *raidPtr;
+	RF_RaidAddr_t raidSector;
+	RF_RowCol_t *row;
+	RF_RowCol_t *col;
+	RF_SectorNum_t *diskSector;
+	int remap;
+{
+	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+	RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
+	RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
+	RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
+	RF_StripeNum_t BlockID, BlockOffset, RepIndex;
+	RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
+	RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
+	RF_StripeNum_t base_suid = 0, outSU, SpareRegion = 0, SpareSpace = 0;
+
+	/* may rewrite SUID/sizes if the address falls in the (possibly
+	 * incomplete) last fulltable; base_suid becomes nonzero then */
+	rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);
+
+	FullTableID = SUID / sus_per_fulltable;	/* fulltable ID within array
+						 * (across rows) */
+	if (raidPtr->numRow == 1)
+		*row = 0;	/* avoid a mod and a div in the common case */
+	else {
+		*row = FullTableID % raidPtr->numRow;
+		FullTableID /= raidPtr->numRow;	/* convert to fulltable ID on
+						 * this disk */
+	}
+	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
+		SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
+		SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
+	}
+	FullTableOffset = SUID % sus_per_fulltable;
+	TableID = FullTableOffset / info->SUsPerTable;
+	TableOffset = FullTableOffset - TableID * info->SUsPerTable;
+	BlockID = TableOffset / info->PUsPerBlock;
+	BlockOffset = TableOffset - BlockID * info->PUsPerBlock;
+	BlockID %= info->BlocksPerTable;
+	RepIndex = info->PUsPerBlock - TableID;
+	/* skip over the parity position when parity is rotated */
+	if (!raidPtr->noRotate)
+		BlockOffset += ((BlockOffset >= RepIndex) ? 1 : 0);
+	*col = info->LayoutTable[BlockID][BlockOffset];
+
+	/* remap to distributed spare space if indicated */
+	if (remap) {
+		RF_ASSERT(raidPtr->Disks[*row][*col].status == rf_ds_reconstructing || raidPtr->Disks[*row][*col].status == rf_ds_dist_spared ||
+		    (rf_copyback_in_progress && raidPtr->Disks[*row][*col].status == rf_ds_optimal));
+		rf_remap_to_spare_space(layoutPtr, info, *row, FullTableID, TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col, &outSU);
+	} else {
+
+		outSU = base_suid;
+		outSU += FullTableID * fulltable_depth;	/* offs to strt of FT */
+		outSU += SpareSpace;	/* skip rsvd spare space */
+		outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU;	/* offs to strt of tble */
+		outSU += info->OffsetTable[BlockID][BlockOffset] * layoutPtr->SUsPerPU;	/* offs to the PU */
+	}
+	outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock);	/* offs to the SU within
+									 * a PU */
+
+	/* convert SUs to sectors, and, if not aligned to SU boundary, add in
+	 * offset to sector. */
+	*diskSector = outSU * layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit);
+
+	RF_ASSERT(*col != -1);
+}
+
+
+/* prototyping this inexplicably causes the compile of the layout table (rf_layout.c) to fail */
+/*
+ * Map a raid address to the physical location of its PARITY unit.
+ * Mirrors rf_MapSectorDeclustered() except that the column is taken
+ * from the parity position (RepIndex) of the block rather than a data
+ * position.  With "remap" set, retargets at distributed spare space.
+ */
+/* prototyping this inexplicably causes the compile of the layout table (rf_layout.c) to fail */
+void
+rf_MapParityDeclustered(
+    RF_Raid_t * raidPtr,
+    RF_RaidAddr_t raidSector,
+    RF_RowCol_t * row,
+    RF_RowCol_t * col,
+    RF_SectorNum_t * diskSector,
+    int remap)
+{
+	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+	RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
+	RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
+	RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
+	RF_StripeNum_t BlockID, BlockOffset, RepIndex;
+	RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
+	RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
+	RF_StripeNum_t base_suid = 0, outSU, SpareRegion = 0, SpareSpace = 0;
+
+	rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);
+
+	/* compute row & (possibly) spare space exactly as before */
+	FullTableID = SUID / sus_per_fulltable;
+	if (raidPtr->numRow == 1)
+		*row = 0;	/* avoid a mod and a div in the common case */
+	else {
+		*row = FullTableID % raidPtr->numRow;
+		FullTableID /= raidPtr->numRow;	/* convert to fulltable ID on
+						 * this disk */
+	}
+	if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
+		SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
+		SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
+	}
+	/* compute BlockID and RepIndex exactly as before */
+	FullTableOffset = SUID % sus_per_fulltable;
+	TableID = FullTableOffset / info->SUsPerTable;
+	TableOffset = FullTableOffset - TableID * info->SUsPerTable;
+	/* TableOffset = FullTableOffset % info->SUsPerTable; */
+	/* BlockID = (TableOffset / info->PUsPerBlock) %
+	 * info->BlocksPerTable; */
+	BlockID = TableOffset / info->PUsPerBlock;
+	/* BlockOffset = TableOffset % info->PUsPerBlock; */
+	BlockOffset = TableOffset - BlockID * info->PUsPerBlock;
+	BlockID %= info->BlocksPerTable;
+
+	/* the parity block is in the position indicated by RepIndex */
+	RepIndex = (raidPtr->noRotate) ? info->PUsPerBlock : info->PUsPerBlock - TableID;
+	*col = info->LayoutTable[BlockID][RepIndex];
+
+	if (remap) {
+		RF_ASSERT(raidPtr->Disks[*row][*col].status == rf_ds_reconstructing || raidPtr->Disks[*row][*col].status == rf_ds_dist_spared ||
+		    (rf_copyback_in_progress && raidPtr->Disks[*row][*col].status == rf_ds_optimal));
+		rf_remap_to_spare_space(layoutPtr, info, *row, FullTableID, TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col, &outSU);
+	} else {
+
+		/* compute sector as before, except use RepIndex instead of
+		 * BlockOffset */
+		outSU = base_suid;
+		outSU += FullTableID * fulltable_depth;
+		outSU += SpareSpace;	/* skip rsvd spare space */
+		outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU;
+		outSU += info->OffsetTable[BlockID][RepIndex] * layoutPtr->SUsPerPU;
+	}
+
+	outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock);
+	*diskSector = outSU * layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit);
+
+	RF_ASSERT(*col != -1);
+}
+/* returns an array of ints identifying the disks that comprise the stripe containing the indicated address.
+ * the caller must _never_ attempt to modify this array.
+ */
+/* returns an array of ints identifying the disks that comprise the stripe containing the indicated address.
+ * the caller must _never_ attempt to modify this array.
+ */
+void
+rf_IdentifyStripeDeclustered(
+    RF_Raid_t * raidPtr,
+    RF_RaidAddr_t addr,
+    RF_RowCol_t ** diskids,
+    RF_RowCol_t * outRow)
+{
+	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+	RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
+	RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
+	RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
+	RF_StripeNum_t base_suid = 0;
+	RF_StripeNum_t SUID = rf_RaidAddressToStripeUnitID(layoutPtr, addr);
+	RF_StripeNum_t stripeID, FullTableID;
+	int tableOffset;
+
+	rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);
+	FullTableID = SUID / sus_per_fulltable;	/* fulltable ID within array
+						 * (across rows) */
+	*outRow = FullTableID % raidPtr->numRow;
+	stripeID = rf_StripeUnitIDToStripeID(layoutPtr, SUID);	/* find stripe offset
+								 * into array */
+	tableOffset = (stripeID % info->BlocksPerTable);	/* find offset into
+								 * block design table */
+	/* hand back a pointer directly into the shared layout table --
+	 * hence the "never modify" warning above */
+	*diskids = info->LayoutTable[tableOffset];
+}
+/* This returns the default head-separation limit, which is measured
+ * in "required units for reconstruction". Each time a disk fetches
+ * a unit, it bumps a counter. The head-sep code prohibits any disk
+ * from getting more than headSepLimit counter values ahead of any
+ * other.
+ *
+ * We assume here that the number of floating recon buffers is already
+ * set. There are r stripes to be reconstructed in each table, and so
+ * if we have a total of B buffers, we can have at most B/r tables
+ * under recon at any one time. In each table, lambda units are required
+ * from each disk, so given B buffers, the head sep limit has to be
+ * (lambda*B)/r units. We subtract one to avoid weird boundary cases.
+ *
+ * for example, suppose were given 50 buffers, r=19, and lambda=4 as in
+ * the 20.5 design. There are 19 stripes/table to be reconstructed, so
+ * we can have 50/19 tables concurrently under reconstruction, which means
+ * we can allow the fastest disk to get 50/19 tables ahead of the slower
+ * disk. There are lambda "required units" for each disk, so the fastest
+ * disk can get 4*50/19 = 10 counter values ahead of the slowest.
+ *
+ * If numBufsToAccumulate is not 1, we need to limit the head sep further
+ * because multiple bufs will be required for each stripe under recon.
+ */
+/*
+ * Compute the default head-separation limit from lambda, the number of
+ * floating recon buffers, and the table depth (see the long comment
+ * above for the motivation).
+ *
+ * NOTE(review): the derivation above says (lambda*B)/r, but the code
+ * divides by TableDepthInPUs rather than r -- confirm which is intended.
+ */
+RF_HeadSepLimit_t
+rf_GetDefaultHeadSepLimitDeclustered(
+    RF_Raid_t * raidPtr)
+{
+	RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+
+	return (info->Lambda * raidPtr->numFloatingReconBufs / info->TableDepthInPUs / rf_numBufsToAccumulate);
+}
+/* returns the default number of recon buffers to use. The value
+ * is somewhat arbitrary...it's intended to be large enough to allow
+ * for a reasonably large head-sep limit, but small enough that you
+ * don't use up all your system memory with buffers.
+ */
+/*
+ * Default number of floating reconstruction buffers for this layout.
+ * The factor of 100 is arbitrary (see comment above): big enough for a
+ * reasonable head-sep limit, small enough not to exhaust memory.
+ */
+int
+rf_GetDefaultNumFloatingReconBuffersDeclustered(RF_Raid_t * raidPtr)
+{
+	return (100 * rf_numBufsToAccumulate);
+}
+/* sectors in the last fulltable of the array need to be handled
+ * specially since this fulltable can be incomplete. this function
+ * changes the values of certain params to handle this.
+ *
+ * the idea here is that MapSector et. al. figure out which disk the
+ * addressed unit lives on by computing the modulos of the unit number
+ * with the number of units per fulltable, table, etc. In the last
+ * fulltable, there are fewer units per fulltable, so we need to adjust
+ * the number of user data units per fulltable to reflect this.
+ *
+ * so, we (1) convert the fulltable size and depth parameters to
+ * the size of the partial fulltable at the end, (2) compute the
+ * disk sector offset where this fulltable starts, and (3) convert
+ * the users stripe unit number from an offset into the array to
+ * an offset into the last fulltable.
+ */
+/*
+ * Adjust the mapping parameters when the addressed unit falls in the
+ * last (possibly incomplete) fulltable -- see the block comment above.
+ * On that path *base_suid becomes nonzero, which callers use as the
+ * "we are in the partial fulltable" flag.
+ */
+void
+rf_decluster_adjust_params(
+    RF_RaidLayout_t * layoutPtr,
+    RF_StripeNum_t * SUID,
+    RF_StripeCount_t * sus_per_fulltable,
+    RF_StripeCount_t * fulltable_depth,
+    RF_StripeNum_t * base_suid)
+{
+	RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
+
+	if (*SUID >= info->FullTableLimitSUID) {
+		/* new full table size is size of last full table on disk */
+		*sus_per_fulltable = info->ExtraTablesPerDisk * info->SUsPerTable;
+
+		/* new full table depth is corresponding depth */
+		*fulltable_depth = info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU;
+
+		/* set up the new base offset */
+		*base_suid = info->DiskOffsetOfLastFullTableInSUs;
+
+		/* convert users array address to an offset into the last
+		 * fulltable */
+		*SUID -= info->FullTableLimitSUID;
+	}
+}
+/*
+ * map a stripe ID to a parity stripe ID.
+ * See comment above RaidAddressToParityStripeID in layout.c.
+ */
+/*
+ * map a stripe ID to a parity stripe ID.
+ * See comment above RaidAddressToParityStripeID in layout.c.
+ * Also returns, via which_ru, the reconstruction unit within the
+ * parity stripe that the stripe ID falls in.
+ */
+void
+rf_MapSIDToPSIDDeclustered(
+    RF_RaidLayout_t * layoutPtr,
+    RF_StripeNum_t stripeID,
+    RF_StripeNum_t * psID,
+    RF_ReconUnitNum_t * which_ru)
+{
+	RF_DeclusteredConfigInfo_t *info;
+
+	info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
+
+	*psID = (stripeID / (layoutPtr->SUsPerPU * info->BlocksPerTable))
+	    * info->BlocksPerTable + (stripeID % info->BlocksPerTable);
+	*which_ru = (stripeID % (info->BlocksPerTable * layoutPtr->SUsPerPU))
+	    / info->BlocksPerTable;
+	RF_ASSERT((*which_ru) < layoutPtr->SUsPerPU / layoutPtr->SUsPerRU);
+}
+/*
+ * Called from MapSector and MapParity to retarget an access at the spare unit.
+ * Modifies the "col" and "outSU" parameters only.
+ */
+/*
+ * Called from MapSector and MapParity to retarget an access at the spare unit.
+ * Modifies the "col" and "outSU" parameters only.
+ */
+void
+rf_remap_to_spare_space(
+    RF_RaidLayout_t * layoutPtr,
+    RF_DeclusteredConfigInfo_t * info,
+    RF_RowCol_t row,
+    RF_StripeNum_t FullTableID,
+    RF_StripeNum_t TableID,
+    RF_SectorNum_t BlockID,
+    RF_StripeNum_t base_suid,
+    RF_StripeNum_t SpareRegion,
+    RF_RowCol_t * outCol,
+    RF_StripeNum_t * outSU)
+{
+	RF_StripeNum_t ftID, spareTableStartSU, TableInSpareRegion, lastSROffset,
+	        which_ft;
+
+	/*
+	 * note that FullTableID and hence SpareRegion may have gotten
+	 * tweaked by rf_decluster_adjust_params. We detect this by
+	 * noticing that base_suid is not 0.
+	 */
+	if (base_suid == 0) {
+		ftID = FullTableID;
+	} else {
+		/*
+		 * There may be > 1.0 full tables in the last (i.e. partial)
+		 * spare region. find out which of these we're in.
+		 */
+		lastSROffset = info->NumCompleteSRs * info->SpareRegionDepthInSUs;
+		which_ft = (info->DiskOffsetOfLastFullTableInSUs - lastSROffset) / (info->FullTableDepthInPUs * layoutPtr->SUsPerPU);
+
+		/* compute the actual full table ID */
+		ftID = info->DiskOffsetOfLastFullTableInSUs / (info->FullTableDepthInPUs * layoutPtr->SUsPerPU) + which_ft;
+		SpareRegion = info->NumCompleteSRs;
+	}
+	TableInSpareRegion = (ftID * info->NumParityReps + TableID) % info->TablesPerSpareRegion;
+
+	/* SpareTable must have been installed (via rf_SetSpareTable or the
+	 * daemon) before any remapped access arrives here */
+	*outCol = info->SpareTable[TableInSpareRegion][BlockID].spareDisk;
+	RF_ASSERT(*outCol != -1);
+
+	spareTableStartSU = (SpareRegion == info->NumCompleteSRs) ?
+	    info->DiskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU :
+	    (SpareRegion + 1) * info->SpareRegionDepthInSUs - info->SpareSpaceDepthPerRegionInSUs;
+	*outSU = spareTableStartSU + info->SpareTable[TableInSpareRegion][BlockID].spareBlockOffsetInSUs;
+	/* diagnostic only: an out-of-range offset is reported but not
+	 * otherwise handled */
+	if (*outSU >= layoutPtr->stripeUnitsPerDisk) {
+		printf("rf_remap_to_spare_space: invalid remapped disk SU offset %ld\n", (long) *outSU);
+	}
+}
+
+#endif /* (RF_INCLUDE_PARITY_DECLUSTERING > 0) || (RF_INCLUDE_PARITY_DECLUSTERING_PQ > 0) */
+
+
+/*
+ * Request a spare table for the failed disk (frow/fcol) from the
+ * userland daemon: fills in a request describing the layout geometry
+ * and blocks in rf_GetSpareTableFromDaemon() until it is satisfied.
+ *
+ * NOTE(review): the result of RF_Malloc is used unchecked, and "req"
+ * is not freed here -- presumably ownership passes to the daemon path;
+ * confirm against rf_kintf.c.
+ */
+int
+rf_InstallSpareTable(
+    RF_Raid_t * raidPtr,
+    RF_RowCol_t frow,
+    RF_RowCol_t fcol)
+{
+	RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+	RF_SparetWait_t *req;
+	int retcode;
+
+	RF_Malloc(req, sizeof(*req), (RF_SparetWait_t *));
+	req->C = raidPtr->numCol;
+	req->G = raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol;
+	req->fcol = fcol;
+	req->SUsPerPU = raidPtr->Layout.SUsPerPU;
+	req->TablesPerSpareRegion = info->TablesPerSpareRegion;
+	req->BlocksPerTable = info->BlocksPerTable;
+	req->TableDepthInPUs = info->TableDepthInPUs;
+	req->SpareSpaceDepthPerRegionInSUs = info->SpareSpaceDepthPerRegionInSUs;
+
+	retcode = rf_GetSpareTableFromDaemon(req);
+	RF_ASSERT(!retcode);	/* XXX -- fix this to recover gracefully --
+				 * XXX */
+	return (retcode);
+}
+/*
+ * Invoked via ioctl to install a spare table in the kernel.
+ */
+int
+rf_SetSpareTable(raidPtr, data)
+ RF_Raid_t *raidPtr;
+ void *data;
+{
+ RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+ RF_SpareTableEntry_t **ptrs;
+ int i, retcode;
+
+ /* what we need to copyin is a 2-d array, so first copyin the user
+ * pointers to the rows in the table */
+ RF_Malloc(ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *), (RF_SpareTableEntry_t **));
+ retcode = copyin((caddr_t) data, (caddr_t) ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *));
+
+ if (retcode)
+ return (retcode);
+
+ /* now allocate kernel space for the row pointers */
+ RF_Malloc(info->SpareTable, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *), (RF_SpareTableEntry_t **));
+
+ /* now allocate kernel space for each row in the table, and copy it in
+ * from user space */
+ for (i = 0; i < info->TablesPerSpareRegion; i++) {
+ RF_Malloc(info->SpareTable[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t), (RF_SpareTableEntry_t *));
+ retcode = copyin(ptrs[i], info->SpareTable[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t));
+ if (retcode) {
+ info->SpareTable = NULL; /* blow off the memory
+ * we've allocated */
+ return (retcode);
+ }
+ }
+
+ /* free up the temporary array we used */
+ RF_Free(ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *));
+
+ return (0);
+}
+
+/*
+ * Return the number of spare reconstruction units per disk, as computed
+ * at configure time (TotSparePUsPerDisk).
+ */
+RF_ReconUnitCount_t
+rf_GetNumSpareRUsDeclustered(raidPtr)
+	RF_Raid_t *raidPtr;
+{
+	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+
+	return (((RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo)->TotSparePUsPerDisk);
+}
+
+/*
+ * Release the in-kernel spare table installed by rf_SetSpareTable():
+ * free each row, then the row-pointer vector, and clear the pointer so
+ * a later shutdown/free is a no-op.
+ */
+void
+rf_FreeSpareTable(raidPtr)
+	RF_Raid_t *raidPtr;
+{
+	long i;
+	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+	RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
+	RF_SpareTableEntry_t **table = info->SpareTable;
+
+	for (i = 0; i < info->TablesPerSpareRegion; i++) {
+		RF_Free(table[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t));
+	}
+	RF_Free(table, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *));
+	info->SpareTable = (RF_SpareTableEntry_t **) NULL;
+}
diff --git a/sys/dev/raidframe/rf_decluster.h b/sys/dev/raidframe/rf_decluster.h
new file mode 100644
index 0000000..a630298
--- /dev/null
+++ b/sys/dev/raidframe/rf_decluster.h
@@ -0,0 +1,141 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_decluster.h,v 1.3 1999/02/05 00:06:09 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*----------------------------------------------------------------------
+ *
+ * decluster.h -- header file for declustered layout code
+ *
+ * Adapted from raidSim version July 1994
+ * Created 10-21-92 (MCH)
+ *
+ *--------------------------------------------------------------------*/
+
+#ifndef _RF__RF_DECLUSTER_H_
+#define _RF__RF_DECLUSTER_H_
+
+#include <dev/raidframe/rf_types.h>
+
+/*
+ * These structures define the tables used to locate the spare unit
+ * associated with a particular data or parity unit, and to perform
+ * the associated inverse mapping.
+ */
+/*
+ * One entry of the distributed-sparing remap table: records where a
+ * data/parity unit has been relocated after a disk failure.
+ */
+struct RF_SpareTableEntry_s {
+    u_int spareDisk;        /* disk to which this block is spared */
+    u_int spareBlockOffsetInSUs;    /* offset into spare table for that
+                     * disk */
+};
+/* max length of the sparemap file name kept in the config info */
+#define RF_SPAREMAP_NAME_LEN 128
+
+/* this is the layout-specific info structure for the declustered layout.
+ * One instance hangs off RF_RaidLayout_t.layoutSpecificInfo and is filled
+ * in by the declustered configuration routines; the mapping functions
+ * read it on every address translation.
+ */
+struct RF_DeclusteredConfigInfo_s {
+    RF_StripeCount_t groupSize; /* no. of stripe units per parity
+                     * stripe */
+    RF_RowCol_t **LayoutTable;  /* the block design table */
+    RF_RowCol_t **OffsetTable;  /* the sector offset table */
+    RF_RowCol_t **BlockTable;   /* the block membership table */
+    RF_StripeCount_t SUsPerFullTable;   /* stripe units per full table */
+    RF_StripeCount_t SUsPerTable;   /* stripe units per table */
+    RF_StripeCount_t PUsPerBlock;   /* parity units per block */
+    RF_StripeCount_t SUsPerBlock;   /* stripe units per block */
+    RF_StripeCount_t BlocksPerTable;    /* block design tuples per
+                         * table */
+    RF_StripeCount_t NumParityReps; /* tables per full table */
+    RF_StripeCount_t TableDepthInPUs;   /* PUs on one disk in 1 table */
+    RF_StripeCount_t FullTableDepthInPUs;   /* PUs on one disk in 1
+                         * fulltable */
+    RF_StripeCount_t FullTableLimitSUID;    /* SU where partial fulltables
+                         * start */
+    RF_StripeCount_t ExtraTablesPerDisk;    /* # of tables in last
+                         * fulltable */
+    RF_SectorNum_t DiskOffsetOfLastFullTableInSUs;  /* disk offs of partial
+                             * ft, if any */
+    RF_StripeCount_t numCompleteFullTablesPerDisk;  /* ft identifier of
+                             * partial ft, if any */
+    u_int Lambda;           /* the pair count in the block design */
+
+    /* these are used only in the distributed-sparing case */
+    RF_StripeCount_t FullTablesPerSpareRegion;  /* # of ft's comprising
+                             * 1 spare region */
+    RF_StripeCount_t TablesPerSpareRegion;  /* # of tables */
+    RF_SectorCount_t SpareSpaceDepthPerRegionInSUs; /* spare
+                             * space/disk/region */
+    RF_SectorCount_t SpareRegionDepthInSUs; /* # of units/disk/region */
+    RF_SectorNum_t DiskOffsetOfLastSpareSpaceChunkInSUs;    /* locates sp space
+                                 * after partial ft */
+    RF_StripeCount_t TotSparePUsPerDisk;    /* total number of spare PUs
+                         * per disk */
+    RF_StripeCount_t NumCompleteSRs;    /* # of complete spare regions
+                         * per disk */
+    RF_SpareTableEntry_t **SpareTable;  /* remap table for spare space */
+    char sparemap_fname[RF_SPAREMAP_NAME_LEN];  /* where to find
+                             * sparemap. not used in
+                             * kernel */
+};
+
+int
+rf_ConfigureDeclustered(RF_ShutdownList_t ** listp, RF_Raid_t * raidPtr,
+ RF_Config_t * cfgPtr);
+int
+rf_ConfigureDeclusteredDS(RF_ShutdownList_t ** listp, RF_Raid_t * raidPtr,
+ RF_Config_t * cfgPtr);
+
+void
+rf_MapSectorDeclustered(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
+void
+rf_MapParityDeclustered(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
+void
+rf_IdentifyStripeDeclustered(RF_Raid_t * raidPtr, RF_RaidAddr_t addr,
+ RF_RowCol_t ** diskids, RF_RowCol_t * outRow);
+void
+rf_MapSIDToPSIDDeclustered(RF_RaidLayout_t * layoutPtr,
+ RF_StripeNum_t stripeID, RF_StripeNum_t * psID,
+ RF_ReconUnitNum_t * which_ru);
+int rf_InstallSpareTable(RF_Raid_t * raidPtr, RF_RowCol_t frow, RF_RowCol_t fcol);
+void rf_FreeSpareTable(RF_Raid_t * raidPtr);
+
+RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitDeclustered(RF_Raid_t * raidPtr);
+int rf_GetDefaultNumFloatingReconBuffersDeclustered(RF_Raid_t * raidPtr);
+
+void
+rf_decluster_adjust_params(RF_RaidLayout_t * layoutPtr,
+ RF_StripeNum_t * SUID, RF_StripeCount_t * sus_per_fulltable,
+ RF_StripeCount_t * fulltable_depth, RF_StripeNum_t * base_suid);
+void
+rf_remap_to_spare_space(
+ RF_RaidLayout_t * layoutPtr,
+ RF_DeclusteredConfigInfo_t * info, RF_RowCol_t row, RF_StripeNum_t FullTableID,
+ RF_StripeNum_t TableID, RF_SectorNum_t BlockID, RF_StripeNum_t base_suid,
+ RF_StripeNum_t SpareRegion, RF_RowCol_t * outCol, RF_StripeNum_t * outSU);
+int rf_SetSpareTable(RF_Raid_t * raidPtr, void *data);
+RF_ReconUnitCount_t rf_GetNumSpareRUsDeclustered(RF_Raid_t * raidPtr);
+
+#endif /* !_RF__RF_DECLUSTER_H_ */
diff --git a/sys/dev/raidframe/rf_declusterPQ.c b/sys/dev/raidframe/rf_declusterPQ.c
new file mode 100644
index 0000000..77a03b8
--- /dev/null
+++ b/sys/dev/raidframe/rf_declusterPQ.c
@@ -0,0 +1,491 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_declusterPQ.c,v 1.5 2001/01/26 14:06:17 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Authors: Daniel Stodolsky, Mark Holland, Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*--------------------------------------------------
+ * rf_declusterPQ.c
+ *
+ * mapping code for declustered P & Q or declustered EvenOdd
+ * much code borrowed from rf_decluster.c
+ *
+ *--------------------------------------------------*/
+
+#include <dev/raidframe/rf_archs.h>
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_configure.h>
+#include <dev/raidframe/rf_decluster.h>
+#include <dev/raidframe/rf_declusterPQ.h>
+#include <dev/raidframe/rf_debugMem.h>
+#include <dev/raidframe/rf_utils.h>
+#include <dev/raidframe/rf_alloclist.h>
+#include <dev/raidframe/rf_general.h>
+
+#if (RF_INCLUDE_PARITY_DECLUSTERING_PQ > 0) || (RF_INCLUDE_EVENODD > 0)
+/* configuration code */
+
+/*
+ * rf_ConfigureDeclusteredPQ -- configure a declustered P+Q (or
+ * declustered EvenOdd) layout.
+ *
+ * listp   - shutdown list (standard configuration-routine signature)
+ * raidPtr - array being configured; layout geometry fields are filled in
+ * cfgPtr  - user config; layoutSpecific carries the sparemap name,
+ *           the block design parameters, the noRotate flag, and the
+ *           b x k layout table itself
+ *
+ * Returns 0 on success, EINVAL for a bad/oversized block design,
+ * ENOMEM on allocation failure.
+ */
+int
+rf_ConfigureDeclusteredPQ(
+    RF_ShutdownList_t ** listp,
+    RF_Raid_t * raidPtr,
+    RF_Config_t * cfgPtr)
+{
+    RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+    int b, v, k, r, lambda; /* block design params */
+    int i, j, l;
+    int *first_avail_slot;
+    int complete_FT_count, SUID;
+    RF_DeclusteredConfigInfo_t *info;
+    int numCompleteFullTablesPerDisk;
+    int PUsPerDisk, spareRegionDepthInPUs, numCompleteSpareRegionsPerDisk = 0,
+        extraPUsPerDisk;
+    int totSparePUsPerDisk;
+    int diskOffsetOfLastFullTableInSUs, SpareSpaceInSUs;
+    char *cfgBuf = (char *) (cfgPtr->layoutSpecific);
+
+    /* skip the sparemap file name; the block design parameters follow */
+    cfgBuf += RF_SPAREMAP_NAME_LEN;
+
+    b = *((int *) cfgBuf);  /* # of tuples in the block design */
+    cfgBuf += sizeof(int);
+    v = *((int *) cfgBuf);  /* # of elements (disks) */
+    cfgBuf += sizeof(int);
+    k = *((int *) cfgBuf);  /* tuple size (stripe width) */
+    cfgBuf += sizeof(int);
+    r = *((int *) cfgBuf);  /* replication count */
+    cfgBuf += sizeof(int);
+    lambda = *((int *) cfgBuf); /* pair count */
+    cfgBuf += sizeof(int);
+    raidPtr->noRotate = *((int *) cfgBuf);
+    cfgBuf += sizeof(int);
+
+    /* P+Q needs k-2 > 0 data units per block, so k == 2 is rejected.
+     * NOTE(review): the message text says "minimum value 2" but the test
+     * rejects k == 2, making the effective minimum 3 -- confirm wording. */
+    if (k <= 2) {
+        printf("RAIDFRAME: k=%d, minimum value 2\n", k);
+        return (EINVAL);
+    }
+    /* 1. create layout specific structure */
+    RF_MallocAndAdd(info, sizeof(RF_DeclusteredConfigInfo_t), (RF_DeclusteredConfigInfo_t *), raidPtr->cleanupList);
+    if (info == NULL)
+        return (ENOMEM);
+    layoutPtr->layoutSpecificInfo = (void *) info;
+
+    /* the sparemaps are generated assuming that parity is rotated, so we
+     * issue a warning if both distributed sparing and no-rotate are on at
+     * the same time */
+    if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) && raidPtr->noRotate) {
+        RF_ERRORMSG("Warning: distributed sparing specified without parity rotation.\n");
+    }
+    if (raidPtr->numCol != v) {
+        RF_ERRORMSG2("RAID: config error: table element count (%d) not equal to no. of cols (%d)\n", v, raidPtr->numCol);
+        return (EINVAL);
+    }
+    /* 3. set up the values used in devRaidMap */
+    info->BlocksPerTable = b;
+    info->NumParityReps = info->groupSize = k;
+    info->PUsPerBlock = k - 2;  /* PQ: two of the k units are P and Q */
+    info->SUsPerTable = b * info->PUsPerBlock * layoutPtr->SUsPerPU;    /* b blks, k-2 data SUs each */
+    info->SUsPerFullTable = k * info->SUsPerTable;  /* rot k times */
+    info->SUsPerBlock = info->PUsPerBlock * layoutPtr->SUsPerPU;
+    info->TableDepthInPUs = (b * k) / v;
+    info->FullTableDepthInPUs = info->TableDepthInPUs * k;  /* k repetitions */
+
+    /* used only in distributed sparing case */
+    info->FullTablesPerSpareRegion = (v - 1) / rf_gcd(r, v - 1);    /* (v-1)/gcd fulltables */
+    info->TablesPerSpareRegion = k * info->FullTablesPerSpareRegion;
+    info->SpareSpaceDepthPerRegionInSUs = (r * info->TablesPerSpareRegion / (v - 1)) * layoutPtr->SUsPerPU;
+
+    /* check to make sure the block design is sufficiently small */
+    if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
+        if (info->FullTableDepthInPUs * layoutPtr->SUsPerPU + info->SpareSpaceDepthPerRegionInSUs > layoutPtr->stripeUnitsPerDisk) {
+            RF_ERRORMSG3("RAID: config error: Full Table depth (%d) + Spare Space (%d) larger than disk size (%d) (BD too big)\n",
+                (int) info->FullTableDepthInPUs,
+                (int) info->SpareSpaceDepthPerRegionInSUs,
+                (int) layoutPtr->stripeUnitsPerDisk);
+            return (EINVAL);
+        }
+    } else {
+        if (info->TableDepthInPUs * layoutPtr->SUsPerPU > layoutPtr->stripeUnitsPerDisk) {
+            RF_ERRORMSG2("RAID: config error: Table depth (%d) larger than disk size (%d) (BD too big)\n",
+                (int) (info->TableDepthInPUs * layoutPtr->SUsPerPU),
+                (int) layoutPtr->stripeUnitsPerDisk);
+            return (EINVAL);
+        }
+    }
+
+
+    /* compute the size of each disk, and the number of tables in the last
+     * fulltable (which need not be complete) */
+    if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
+
+        PUsPerDisk = layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU;
+        spareRegionDepthInPUs = (info->TablesPerSpareRegion * info->TableDepthInPUs +
+            (info->TablesPerSpareRegion * info->TableDepthInPUs) / (v - 1));
+        info->SpareRegionDepthInSUs = spareRegionDepthInPUs * layoutPtr->SUsPerPU;
+
+        numCompleteSpareRegionsPerDisk = PUsPerDisk / spareRegionDepthInPUs;
+        info->NumCompleteSRs = numCompleteSpareRegionsPerDisk;
+        extraPUsPerDisk = PUsPerDisk % spareRegionDepthInPUs;
+
+        /* assume conservatively that we need the full amount of spare
+         * space in one region in order to provide spares for the
+         * partial spare region at the end of the array. We set "i"
+         * to the number of tables in the partial spare region. This
+         * may actually include some fulltables. */
+        extraPUsPerDisk -= (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU);
+        if (extraPUsPerDisk <= 0)
+            i = 0;
+        else
+            i = extraPUsPerDisk / info->TableDepthInPUs;
+
+        complete_FT_count = raidPtr->numRow * (numCompleteSpareRegionsPerDisk * (info->TablesPerSpareRegion / k) + i / k);
+        info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable;
+        info->ExtraTablesPerDisk = i % k;
+
+        /* note that in the last spare region, the spare space is
+         * complete even though data/parity space is not */
+        totSparePUsPerDisk = (numCompleteSpareRegionsPerDisk + 1) * (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU);
+        info->TotSparePUsPerDisk = totSparePUsPerDisk;
+
+        layoutPtr->stripeUnitsPerDisk =
+            ((complete_FT_count / raidPtr->numRow) * info->FullTableDepthInPUs +    /* data & parity space */
+            info->ExtraTablesPerDisk * info->TableDepthInPUs +
+            totSparePUsPerDisk  /* spare space */
+            ) * layoutPtr->SUsPerPU;
+        layoutPtr->dataStripeUnitsPerDisk =
+            (complete_FT_count * info->FullTableDepthInPUs + info->ExtraTablesPerDisk * info->TableDepthInPUs)
+            * layoutPtr->SUsPerPU * (k - 1) / k;
+
+    } else {
+        /* non-dist spare case: force each disk to contain an
+         * integral number of tables */
+        layoutPtr->stripeUnitsPerDisk /= (info->TableDepthInPUs * layoutPtr->SUsPerPU);
+        layoutPtr->stripeUnitsPerDisk *= (info->TableDepthInPUs * layoutPtr->SUsPerPU);
+
+        /* compute the number of tables in the last fulltable, which
+         * need not be complete */
+        complete_FT_count =
+            ((layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU) / info->FullTableDepthInPUs) * raidPtr->numRow;
+
+        info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable;
+        info->ExtraTablesPerDisk =
+            ((layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU) / info->TableDepthInPUs) % k;
+    }
+
+    raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit;
+
+    /* find the disk offset of the stripe unit where the last fulltable
+     * starts */
+    numCompleteFullTablesPerDisk = complete_FT_count / raidPtr->numRow;
+    diskOffsetOfLastFullTableInSUs = numCompleteFullTablesPerDisk * info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
+    if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
+        SpareSpaceInSUs = numCompleteSpareRegionsPerDisk * info->SpareSpaceDepthPerRegionInSUs;
+        diskOffsetOfLastFullTableInSUs += SpareSpaceInSUs;
+        info->DiskOffsetOfLastSpareSpaceChunkInSUs =
+            diskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU;
+    }
+    info->DiskOffsetOfLastFullTableInSUs = diskOffsetOfLastFullTableInSUs;
+    info->numCompleteFullTablesPerDisk = numCompleteFullTablesPerDisk;
+
+    /* 4. create and initialize the lookup tables */
+    info->LayoutTable = rf_make_2d_array(b, k, raidPtr->cleanupList);
+    if (info->LayoutTable == NULL)
+        return (ENOMEM);
+    info->OffsetTable = rf_make_2d_array(b, k, raidPtr->cleanupList);
+    if (info->OffsetTable == NULL)
+        return (ENOMEM);
+    info->BlockTable = rf_make_2d_array(info->TableDepthInPUs * layoutPtr->SUsPerPU, raidPtr->numCol, raidPtr->cleanupList);
+    if (info->BlockTable == NULL)
+        return (ENOMEM);
+
+    first_avail_slot = (int *) rf_make_1d_array(v, NULL);
+    if (first_avail_slot == NULL)
+        return (ENOMEM);
+
+    /* the layout table proper follows the scalar parameters in the
+     * user-supplied config buffer, one byte per entry */
+    for (i = 0; i < b; i++)
+        for (j = 0; j < k; j++)
+            info->LayoutTable[i][j] = *cfgBuf++;
+
+    /* initialize offset table */
+    for (i = 0; i < b; i++)
+        for (j = 0; j < k; j++) {
+            info->OffsetTable[i][j] = first_avail_slot[info->LayoutTable[i][j]];
+            first_avail_slot[info->LayoutTable[i][j]]++;
+        }
+
+    /* initialize block table */
+    for (SUID = l = 0; l < layoutPtr->SUsPerPU; l++) {
+        for (i = 0; i < b; i++) {
+            for (j = 0; j < k; j++) {
+                info->BlockTable[(info->OffsetTable[i][j] * layoutPtr->SUsPerPU) + l]
+                    [info->LayoutTable[i][j]] = SUID;
+            }
+            SUID++;
+        }
+    }
+
+    rf_free_1d_array(first_avail_slot, v);
+
+    /* 5. set up the remaining redundant-but-useful parameters */
+
+    raidPtr->totalSectors = (k * complete_FT_count + raidPtr->numRow * info->ExtraTablesPerDisk) *
+        info->SUsPerTable * layoutPtr->sectorsPerStripeUnit;
+    layoutPtr->numStripe = (raidPtr->totalSectors / layoutPtr->sectorsPerStripeUnit) / (k - 2);
+
+    /* strange evaluation order below to try and minimize overflow
+     * problems */
+
+    layoutPtr->dataSectorsPerStripe = (k - 2) * layoutPtr->sectorsPerStripeUnit;
+    layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
+    layoutPtr->numDataCol = k - 2;  /* two columns of each stripe are P and Q */
+    layoutPtr->numParityCol = 2;
+
+    return (0);
+}
+
+/*
+ * Default number of floating reconstruction buffers for the P+Q layout:
+ * the larger of the plain declustered default and three buffers per
+ * column.
+ */
+int
+rf_GetDefaultNumFloatingReconBuffersPQ(RF_Raid_t * raidPtr)
+{
+    int def_decl;
+
+    def_decl = rf_GetDefaultNumFloatingReconBuffersDeclustered(raidPtr);
+    return (RF_MAX(3 * raidPtr->numCol, def_decl));
+}
+
+/*
+ * Map a RAID address to the physical location of its data unit in a
+ * declustered P+Q layout.
+ *
+ * raidPtr    - the array
+ * raidSector - sector in RAID address space
+ * row, col   - out: physical row and column of the disk holding the unit
+ * diskSector - out: sector offset on that disk
+ * remap      - if nonzero, remap the unit into distributed spare space
+ *
+ * Decomposes the stripe-unit ID into fulltable / table / block / offset
+ * coordinates, then uses the layout and offset tables to locate the unit.
+ */
+void
+rf_MapSectorDeclusteredPQ(
+    RF_Raid_t * raidPtr,
+    RF_RaidAddr_t raidSector,
+    RF_RowCol_t * row,
+    RF_RowCol_t * col,
+    RF_SectorNum_t * diskSector,
+    int remap)
+{
+    RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+    RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
+    RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
+    RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
+    RF_StripeNum_t BlockID, BlockOffset, RepIndex;
+    RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
+    RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
+    RF_StripeNum_t base_suid = 0, outSU, SpareRegion = 0, SpareSpace = 0;
+
+    rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);
+
+    FullTableID = SUID / sus_per_fulltable; /* fulltable ID within array
+                         * (across rows) */
+    *row = FullTableID % raidPtr->numRow;
+    FullTableID /= raidPtr->numRow; /* convert to fulltable ID on this
+                     * disk */
+    if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
+        SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
+        SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
+    }
+    FullTableOffset = SUID % sus_per_fulltable;
+    TableID = FullTableOffset / info->SUsPerTable;
+    TableOffset = FullTableOffset - TableID * info->SUsPerTable;
+    BlockID = TableOffset / info->PUsPerBlock;
+    BlockOffset = TableOffset - BlockID * info->PUsPerBlock;
+    BlockID %= info->BlocksPerTable;
+    RF_ASSERT(BlockOffset < info->groupSize - 2);
+    /*
+       TableIDs go from 0 .. GroupSize-1 inclusive.
+       PUsPerBlock is k-2.
+       We want the tableIDs to rotate from the
+       right, so use GroupSize
+     */
+    RepIndex = info->groupSize - 1 - TableID;
+    RF_ASSERT(RepIndex >= 0);
+    /* NOTE(review): when noRotate is set this branch is skipped and *col
+     * is never assigned here -- presumably noRotate is unsupported or
+     * handled by the caller for PQ; confirm against rf_decluster.c */
+    if (!raidPtr->noRotate) {
+        if (TableID == 0)
+            BlockOffset++;  /* P on last drive, Q on first */
+        else
+            BlockOffset += ((BlockOffset >= RepIndex) ? 2 : 0); /* skip over PQ */
+        RF_ASSERT(BlockOffset < info->groupSize);
+        *col = info->LayoutTable[BlockID][BlockOffset];
+    }
+    /* remap to distributed spare space if indicated */
+    if (remap) {
+        rf_remap_to_spare_space(layoutPtr, info, *row, FullTableID, TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col, &outSU);
+    } else {
+
+        outSU = base_suid;
+        outSU += FullTableID * fulltable_depth; /* offs to strt of FT */
+        outSU += SpareSpace;    /* skip rsvd spare space */
+        outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU; /* offs to strt of tble */
+        outSU += info->OffsetTable[BlockID][BlockOffset] * layoutPtr->SUsPerPU; /* offs to the PU */
+    }
+    outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock);  /* offs to the SU within
+                                     * a PU */
+
+    /* convert SUs to sectors, and, if not aligned to SU boundary, add in
+     * offset to sector */
+    *diskSector = outSU * layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit);
+}
+
+
+/*
+ * Map a RAID address to the physical location of the P (parity) unit of
+ * its stripe in a declustered P+Q layout.
+ *
+ * Same decomposition as rf_MapSectorDeclusteredPQ, but the target column
+ * is taken from position RepIndex of the block design tuple rather than
+ * from the data offset.  Remapping parity into spare space is not
+ * implemented (RF_PANIC on remap).
+ */
+void
+rf_MapParityDeclusteredPQ(
+    RF_Raid_t * raidPtr,
+    RF_RaidAddr_t raidSector,
+    RF_RowCol_t * row,
+    RF_RowCol_t * col,
+    RF_SectorNum_t * diskSector,
+    int remap)
+{
+    RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+    RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
+    RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
+    RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
+    RF_StripeNum_t BlockID, BlockOffset, RepIndex;
+    RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
+    RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
+    RF_StripeNum_t base_suid = 0, outSU, SpareRegion, SpareSpace = 0;
+
+    rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);
+
+    /* compute row & (possibly) spare space exactly as before */
+    FullTableID = SUID / sus_per_fulltable;
+    *row = FullTableID % raidPtr->numRow;
+    FullTableID /= raidPtr->numRow; /* convert to fulltable ID on this
+                     * disk */
+    if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
+        SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
+        SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
+    }
+    /* compute BlockID and RepIndex exactly as before */
+    FullTableOffset = SUID % sus_per_fulltable;
+    TableID = FullTableOffset / info->SUsPerTable;
+    TableOffset = FullTableOffset - TableID * info->SUsPerTable;
+    BlockID = TableOffset / info->PUsPerBlock;
+    BlockOffset = TableOffset - BlockID * info->PUsPerBlock;
+    BlockID %= info->BlocksPerTable;
+
+    /* the parity block is in the position indicated by RepIndex */
+    RepIndex = (raidPtr->noRotate) ? info->PUsPerBlock : info->groupSize - 1 - TableID;
+    *col = info->LayoutTable[BlockID][RepIndex];
+
+    if (remap)
+        RF_PANIC();
+
+    /* compute sector as before, except use RepIndex instead of
+     * BlockOffset */
+    outSU = base_suid;
+    outSU += FullTableID * fulltable_depth;
+    outSU += SpareSpace;    /* skip rsvd spare space */
+    outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU;
+    outSU += info->OffsetTable[BlockID][RepIndex] * layoutPtr->SUsPerPU;
+    outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock);
+
+    *diskSector = outSU * layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit);
+}
+
+/*
+ * Map a RAID address to the physical location of the Q unit of its
+ * stripe in a declustered P+Q layout.
+ *
+ * Identical to rf_MapParityDeclusteredPQ except that the Q unit sits one
+ * position past P in the block design tuple (wrapping to position 0).
+ * Remapping Q into spare space is not implemented (RF_PANIC on remap).
+ */
+void
+rf_MapQDeclusteredPQ(
+    RF_Raid_t * raidPtr,
+    RF_RaidAddr_t raidSector,
+    RF_RowCol_t * row,
+    RF_RowCol_t * col,
+    RF_SectorNum_t * diskSector,
+    int remap)
+{
+    RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+    RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
+    RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
+    RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
+    RF_StripeNum_t BlockID, BlockOffset, RepIndex, RepIndexQ;
+    RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
+    RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
+    RF_StripeNum_t base_suid = 0, outSU, SpareRegion, SpareSpace = 0;
+
+    rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);
+
+    /* compute row & (possibly) spare space exactly as before */
+    FullTableID = SUID / sus_per_fulltable;
+    *row = FullTableID % raidPtr->numRow;
+    FullTableID /= raidPtr->numRow; /* convert to fulltable ID on this
+                     * disk */
+    if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
+        SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
+        SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
+    }
+    /* compute BlockID and RepIndex exactly as before */
+    FullTableOffset = SUID % sus_per_fulltable;
+    TableID = FullTableOffset / info->SUsPerTable;
+    TableOffset = FullTableOffset - TableID * info->SUsPerTable;
+    BlockID = TableOffset / info->PUsPerBlock;
+    BlockOffset = TableOffset - BlockID * info->PUsPerBlock;
+    BlockID %= info->BlocksPerTable;
+
+    /* the q block is one position past P (RepIndex), wrapping to 0 */
+    RepIndex = (raidPtr->noRotate) ? info->PUsPerBlock : info->groupSize - 1 - TableID;
+    RepIndexQ = ((RepIndex == (info->groupSize - 1)) ? 0 : RepIndex + 1);
+    *col = info->LayoutTable[BlockID][RepIndexQ];
+
+    if (remap)
+        RF_PANIC();
+
+    /* compute sector as before, except use RepIndexQ instead of
+     * BlockOffset */
+    outSU = base_suid;
+    outSU += FullTableID * fulltable_depth;
+    outSU += SpareSpace;    /* skip rsvd spare space */
+    outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU;
+    outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock);
+
+    outSU += info->OffsetTable[BlockID][RepIndexQ] * layoutPtr->SUsPerPU;
+    *diskSector = outSU * layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit);
+}
+/* returns an array of ints identifying the disks that comprise the stripe containing the indicated address.
+ * the caller must _never_ attempt to modify this array.
+ */
+/*
+ * Identify the disks comprising the stripe containing `addr`.
+ *
+ * diskids - out: pointer into the layout table row for this stripe;
+ *           the caller must treat it as read-only (it aliases the
+ *           shared block design table, not a copy)
+ * outRow  - out: the physical row the stripe lives on
+ */
+void
+rf_IdentifyStripeDeclusteredPQ(
+    RF_Raid_t * raidPtr,
+    RF_RaidAddr_t addr,
+    RF_RowCol_t ** diskids,
+    RF_RowCol_t * outRow)
+{
+    RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+    RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
+    RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
+    RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
+    RF_StripeNum_t base_suid = 0;
+    RF_StripeNum_t SUID = rf_RaidAddressToStripeUnitID(layoutPtr, addr);
+    RF_StripeNum_t stripeID, FullTableID;
+    int tableOffset;
+
+    rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);
+    FullTableID = SUID / sus_per_fulltable; /* fulltable ID within array
+                         * (across rows) */
+    *outRow = FullTableID % raidPtr->numRow;
+    stripeID = rf_StripeUnitIDToStripeID(layoutPtr, SUID);  /* find stripe offset
+                                 * into array */
+    tableOffset = (stripeID % info->BlocksPerTable);    /* find offset into
+                             * block design table */
+    *diskids = info->LayoutTable[tableOffset];
+}
+#endif /* (RF_INCLUDE_PARITY_DECLUSTERING_PQ > 0) || (RF_INCLUDE_EVENODD > 0) */
diff --git a/sys/dev/raidframe/rf_declusterPQ.h b/sys/dev/raidframe/rf_declusterPQ.h
new file mode 100644
index 0000000..6edef0b
--- /dev/null
+++ b/sys/dev/raidframe/rf_declusterPQ.h
@@ -0,0 +1,52 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_declusterPQ.h,v 1.3 1999/02/05 00:06:09 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Daniel Stodolsky, Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#ifndef _RF__RF_DECLUSTERPQ_H_
+#define _RF__RF_DECLUSTERPQ_H_
+
+#include <dev/raidframe/rf_types.h>
+
+int
+rf_ConfigureDeclusteredPQ(RF_ShutdownList_t ** listp, RF_Raid_t * raidPtr,
+ RF_Config_t * cfgPtr);
+int rf_GetDefaultNumFloatingReconBuffersPQ(RF_Raid_t * raidPtr);
+void
+rf_MapSectorDeclusteredPQ(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
+void
+rf_MapParityDeclusteredPQ(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
+void
+rf_MapQDeclusteredPQ(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
+void
+rf_IdentifyStripeDeclusteredPQ(RF_Raid_t * raidPtr, RF_RaidAddr_t addr,
+ RF_RowCol_t ** diskids, RF_RowCol_t * outRow);
+
+#endif /* !_RF__RF_DECLUSTERPQ_H_ */
diff --git a/sys/dev/raidframe/rf_desc.h b/sys/dev/raidframe/rf_desc.h
new file mode 100644
index 0000000..8a6951b
--- /dev/null
+++ b/sys/dev/raidframe/rf_desc.h
@@ -0,0 +1,113 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_desc.h,v 1.5 2000/01/09 00:00:18 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#ifndef _RF__RF_DESC_H_
+#define _RF__RF_DESC_H_
+
+#include <dev/raidframe/rf_archs.h>
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_etimer.h>
+#include <dev/raidframe/rf_dag.h>
+
+/*
+ * State of one in-progress reconstruction of a failed disk.
+ */
+struct RF_RaidReconDesc_s {
+    RF_Raid_t *raidPtr;     /* raid device descriptor */
+    RF_RowCol_t row;        /* row of failed disk */
+    RF_RowCol_t col;        /* col of failed disk */
+    int state;          /* how far along the reconstruction operation
+                     * has gotten */
+    RF_RaidDisk_t *spareDiskPtr;    /* describes target disk for recon
+                     * (not used in dist sparing) */
+    int numDisksDone;       /* the number of surviving disks that have
+                     * completed their work */
+    RF_RowCol_t srow;       /* row ID of the spare disk (not used in dist
+                     * sparing) */
+    RF_RowCol_t scol;       /* col ID of the spare disk (not used in dist
+                     * sparing) */
+    /*
+     * Prevent recon from hogging CPU
+     */
+    RF_Etimer_t recon_exec_timer;
+    RF_uint64 reconExecTimerRunning;
+    RF_uint64 reconExecTicks;
+    RF_uint64 maxReconExecTicks;
+
+#if RF_RECON_STATS > 0
+    RF_uint64 hsStallCount; /* head sep stall count */
+    RF_uint64 numReconExecDelays;
+    RF_uint64 numReconEventWaits;
+#endif              /* RF_RECON_STATS > 0 */
+    RF_RaidReconDesc_t *next;   /* singly-linked list of recon descriptors */
+};
+
+/*
+ * Descriptor for one user-level RAID access (read or write) as it moves
+ * through the access state machine.
+ */
+struct RF_RaidAccessDesc_s {
+    RF_Raid_t *raidPtr;     /* raid device descriptor */
+    RF_IoType_t type;       /* read or write */
+    RF_RaidAddr_t raidAddress;  /* starting address in raid address
+                     * space */
+    RF_SectorCount_t numBlocks; /* number of blocks (sectors) to
+                     * transfer */
+    RF_StripeCount_t numStripes;    /* number of stripes involved in
+                     * access */
+    caddr_t bufPtr;         /* pointer to data buffer */
+    RF_RaidAccessFlags_t flags; /* flags controlling operation */
+    int state;          /* index into states telling how far along the
+                     * RAID operation has gotten */
+    RF_AccessState_t *states;   /* array of states to be run */
+    int status;         /* pass/fail status of the last operation */
+    RF_DagList_t *dagArray;     /* array of dag lists, one list per stripe */
+    RF_AccessStripeMapHeader_t *asmap;  /* the asm for this I/O */
+    void *bp;           /* buf pointer for this RAID acc. ignored
+                     * outside the kernel */
+    RF_DagHeader_t **paramDAG;  /* allows the DAG to be returned to
+                     * the caller after I/O completion */
+    RF_AccessStripeMapHeader_t **paramASM;  /* allows the ASM to be
+                         * returned to the caller
+                         * after I/O completion */
+    RF_AccTraceEntry_t tracerec;    /* perf monitoring information for a
+                     * user access (not for dag stats) */
+    void (*callbackFunc) (RF_CBParam_t);    /* callback function for this
+                         * I/O */
+    void *callbackArg;      /* arg to give to callback func */
+
+    RF_AllocListElem_t *cleanupList;    /* memory to be freed at the
+                         * end of the access */
+
+    RF_RaidAccessDesc_t *next;  /* list linkage */
+    RF_RaidAccessDesc_t *head;  /* head of the list this desc is on */
+
+    int numPending;
+
+    /* no trailing semicolons: the RF_DECLARE_* macros supply their own
+     * punctuation by design */
+    RF_DECLARE_MUTEX(mutex) /* these are used to implement
+                 * blocking I/O */
+    RF_DECLARE_COND(cond)
+    int async_flag;
+
+    RF_Etimer_t timer;      /* used for timing this access */
+};
+#endif /* !_RF__RF_DESC_H_ */
diff --git a/sys/dev/raidframe/rf_diskqueue.c b/sys/dev/raidframe/rf_diskqueue.c
new file mode 100644
index 0000000..3359ae5
--- /dev/null
+++ b/sys/dev/raidframe/rf_diskqueue.c
@@ -0,0 +1,591 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_diskqueue.c,v 1.13 2000/03/04 04:22:34 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/****************************************************************************
+ *
+ * rf_diskqueue.c -- higher-level disk queue code
+ *
+ * the routines here are a generic wrapper around the actual queueing
+ * routines. The code here implements thread scheduling, synchronization,
+ * and locking ops (see below) on top of the lower-level queueing code.
+ *
+ * to support atomic RMW, we implement "locking operations". When a
+ * locking op is dispatched to the lower levels of the driver, the
+ * queue is locked, and no further I/Os are dispatched until the queue
+ * receives & completes a corresponding "unlocking operation". This
+ * code relies on the higher layers to guarantee that a locking op
+ * will always be eventually followed by an unlocking op. The model
+ * is that the higher layers are structured so locking and unlocking
+ * ops occur in pairs, i.e. an unlocking op cannot be generated until
+ * after a locking op reports completion. There is no good way to
+ * check to see that an unlocking op "corresponds" to the op that
+ * currently has the queue locked, so we make no such attempt. Since
+ * by definition there can be only one locking op outstanding on a
+ * disk, this should not be a problem.
+ *
+ * In the kernel, we allow multiple I/Os to be concurrently dispatched
+ * to the disk driver. In order to support locking ops in this
+ * environment, when we decide to do a locking op, we stop dispatching
+ * new I/Os and wait until all dispatched I/Os have completed before
+ * dispatching the locking op.
+ *
+ * Unfortunately, the code is different in the 3 different operating
+ * states (user level, kernel, simulator). In the kernel, I/O is
+ * non-blocking, and we have no disk threads to dispatch for us.
+ * Therefore, we have to dispatch new I/Os to the scsi driver at the
+ * time of enqueue, and also at the time of completion. At user
+ * level, I/O is blocking, and so only the disk threads may dispatch
+ * I/Os. Thus at user level, all we can do at enqueue time is enqueue
+ * and wake up the disk thread to do the dispatch.
+ *
+ ****************************************************************************/
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_threadstuff.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_diskqueue.h>
+#include <dev/raidframe/rf_alloclist.h>
+#include <dev/raidframe/rf_acctrace.h>
+#include <dev/raidframe/rf_etimer.h>
+#include <dev/raidframe/rf_configure.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_freelist.h>
+#include <dev/raidframe/rf_debugprint.h>
+#include <dev/raidframe/rf_shutdown.h>
+#include <dev/raidframe/rf_cvscan.h>
+#include <dev/raidframe/rf_sstf.h>
+#include <dev/raidframe/rf_fifo.h>
+#include <dev/raidframe/rf_kintf.h>
+
+static int init_dqd(RF_DiskQueueData_t *);
+static void clean_dqd(RF_DiskQueueData_t *);
+static void rf_ShutdownDiskQueueSystem(void *);
+
+#define Dprintf1(s,a) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
+#define Dprintf2(s,a,b) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
+#define Dprintf3(s,a,b,c) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
+
+/*****************************************************************************
+ *
+ * the disk queue switch defines all the functions used in the
+ * different queueing disciplines queue ID, init routine, enqueue
+ * routine, dequeue routine
+ *
+ ****************************************************************************/
+
/*
 * The disk queue switch: one entry per queueing discipline.  The
 * discipline is selected by name at configuration time (see
 * rf_ConfigureDiskQueues), with diskqueuesw[0] (fifo) as the fallback
 * for unrecognized names.
 */
static RF_DiskQueueSW_t diskqueuesw[] = {
	{"fifo",		/* FIFO */
		rf_FifoCreate,
		rf_FifoEnqueue,
		rf_FifoDequeue,
		rf_FifoPeek,
		rf_FifoPromote},

	{"cvscan",		/* cvscan */
		rf_CvscanCreate,
		rf_CvscanEnqueue,
		rf_CvscanDequeue,
		rf_CvscanPeek,
		rf_CvscanPromote},

	{"sstf",		/* shortest seek time first */
		rf_SstfCreate,
		rf_SstfEnqueue,
		rf_SstfDequeue,
		rf_SstfPeek,
		rf_SstfPromote},

	{"scan",		/* SCAN (two-way elevator) */
		rf_ScanCreate,
		rf_SstfEnqueue,	/* NOTE(review): scan and cscan reuse the
				 * sstf Enqueue/Promote routines --
				 * presumably the sorted insertion code is
				 * shared; confirm against rf_sstf.c */
		rf_ScanDequeue,
		rf_ScanPeek,
		rf_SstfPromote},

	{"cscan",		/* CSCAN (one-way elevator) */
		rf_CscanCreate,
		rf_SstfEnqueue,
		rf_CscanDequeue,
		rf_CscanPeek,
		rf_SstfPromote},

};
+#define NUM_DISK_QUEUE_TYPES (sizeof(diskqueuesw)/sizeof(RF_DiskQueueSW_t))
+
+static RF_FreeList_t *rf_dqd_freelist;
+
+#define RF_MAX_FREE_DQD 256
+#define RF_DQD_INC 16
+#define RF_DQD_INITIAL 64
+
+#if defined(__FreeBSD__) && __FreeBSD_version > 500005
+#include <sys/bio.h>
+#endif
+
+#include <sys/buf.h>
+
+static int
+init_dqd(dqd)
+ RF_DiskQueueData_t *dqd;
+{
+
+ dqd->bp = (RF_Buf_t) malloc(sizeof(RF_Buf_t), M_RAIDFRAME, M_NOWAIT);
+ if (dqd->bp == NULL) {
+ return (ENOMEM);
+ }
+ memset(dqd->bp, 0, sizeof(RF_Buf_t)); /* if you don't do it, nobody
+ * else will.. */
+ return (0);
+}
+
+static void
+clean_dqd(dqd)
+ RF_DiskQueueData_t *dqd;
+{
+ free(dqd->bp, M_RAIDFRAME);
+}
/* configures a single disk queue */

/*
 * Initialize the queue for the component disk at row r, column c of
 * raidPtr: create the discipline-specific queue header via p->Create,
 * record the device and depth limits, and create the mutex and
 * condition variable that serialize access to the queue.  Returns 0 on
 * success or a nonzero error code if either synchronization object
 * cannot be created.
 */
int
rf_ConfigureDiskQueue(
    RF_Raid_t * raidPtr,
    RF_DiskQueue_t * diskqueue,
    RF_RowCol_t r,	/* row & col -- debug only.  BZZT not any
			 * more... */
    RF_RowCol_t c,
    RF_DiskQueueSW_t * p,
    RF_SectorCount_t sectPerDisk,
    dev_t dev,
    int maxOutstanding,
    RF_ShutdownList_t ** listp,
    RF_AllocListElem_t * clList)
{
	int rc;

	diskqueue->row = r;
	diskqueue->col = c;
	diskqueue->qPtr = p;
	/* discipline-specific queue header (fifo, cvscan, sstf, ...) */
	diskqueue->qHdr = (p->Create) (sectPerDisk, clList, listp);
	diskqueue->dev = dev;
	diskqueue->numOutstanding = 0;
	diskqueue->queueLength = 0;
	diskqueue->maxOutstanding = maxOutstanding;
	diskqueue->curPriority = RF_IO_NORMAL_PRIORITY;
	diskqueue->nextLockingOp = NULL;
	diskqueue->unlockingOp = NULL;
	diskqueue->numWaiting = 0;
	diskqueue->flags = 0;
	diskqueue->raidPtr = raidPtr;
	diskqueue->rf_cinfo = &raidPtr->raid_cinfo[r][c];
	/* "managed" mutex/cond take listp -- presumably they are torn down
	 * automatically at shutdown; confirm in rf_threadstuff.c */
	rc = rf_create_managed_mutex(listp, &diskqueue->mutex);
	if (rc) {
		RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
		    __LINE__, rc);
		return (rc);
	}
	rc = rf_create_managed_cond(listp, &diskqueue->cond);
	if (rc) {
		RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", __FILE__,
		    __LINE__, rc);
		return (rc);
	}
	return (0);
}
+
/*
 * Shutdown hook: destroy the global freelist of disk queue entries,
 * releasing each entry's buffer header via clean_dqd().
 */
static void
rf_ShutdownDiskQueueSystem(ignored)
	void *ignored;
{
	RF_FREELIST_DESTROY_CLEAN(rf_dqd_freelist, next, (RF_DiskQueueData_t *), clean_dqd);
}
+
/*
 * One-time initialization of the disk queue subsystem: create the
 * global RF_DiskQueueData_t freelist, register rf_ShutdownDiskQueueSystem
 * on the shutdown list, and prime the freelist with RF_DQD_INITIAL
 * entries (each initialized via init_dqd).  Returns 0 on success,
 * ENOMEM if the freelist cannot be created, or the rf_ShutdownCreate
 * error code.
 */
int
rf_ConfigureDiskQueueSystem(listp)
	RF_ShutdownList_t **listp;
{
	int rc;

	RF_FREELIST_CREATE(rf_dqd_freelist, RF_MAX_FREE_DQD,
	    RF_DQD_INC, sizeof(RF_DiskQueueData_t));
	if (rf_dqd_freelist == NULL)
		return (ENOMEM);
	rc = rf_ShutdownCreate(listp, rf_ShutdownDiskQueueSystem, NULL);
	if (rc) {
		RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
		    __FILE__, __LINE__, rc);
		/* undo the freelist creation before failing */
		rf_ShutdownDiskQueueSystem(NULL);
		return (rc);
	}
	RF_FREELIST_PRIME_INIT(rf_dqd_freelist, RF_DQD_INITIAL, next,
	    (RF_DiskQueueData_t *), init_dqd);
	return (0);
}
+
/*
 * Configure the full set of disk queues for an array: one queue per
 * component disk, plus (appended to row 0 only) one queue per spare.
 * The queueing discipline is selected by matching
 * cfgPtr->diskQueueType against the diskqueuesw[] table; an unknown
 * name falls back to diskqueuesw[0] with a warning.  All allocations
 * are charged to raidPtr->cleanupList.  Returns 0, ENOMEM, or an
 * rf_ConfigureDiskQueue error code.
 */
int
rf_ConfigureDiskQueues(
    RF_ShutdownList_t ** listp,
    RF_Raid_t * raidPtr,
    RF_Config_t * cfgPtr)
{
	RF_DiskQueue_t **diskQueues, *spareQueues;
	RF_DiskQueueSW_t *p;
	RF_RowCol_t r, c;
	int rc, i;

	raidPtr->maxQueueDepth = cfgPtr->maxOutstandingDiskReqs;

	/* look the requested discipline up by name */
	for (p = NULL, i = 0; i < NUM_DISK_QUEUE_TYPES; i++) {
		if (!strcmp(diskqueuesw[i].queueType, cfgPtr->diskQueueType)) {
			p = &diskqueuesw[i];
			break;
		}
	}
	if (p == NULL) {
		RF_ERRORMSG2("Unknown queue type \"%s\".  Using %s\n", cfgPtr->diskQueueType, diskqueuesw[0].queueType);
		p = &diskqueuesw[0];
	}
	raidPtr->qType = p;
	RF_CallocAndAdd(diskQueues, raidPtr->numRow, sizeof(RF_DiskQueue_t *), (RF_DiskQueue_t **), raidPtr->cleanupList);
	if (diskQueues == NULL) {
		return (ENOMEM);
	}
	raidPtr->Queues = diskQueues;
	for (r = 0; r < raidPtr->numRow; r++) {
		/* row 0 gets RF_MAXSPARE extra slots to hold the spare
		 * queues configured below */
		RF_CallocAndAdd(diskQueues[r], raidPtr->numCol +
		    ((r == 0) ? RF_MAXSPARE : 0),
		    sizeof(RF_DiskQueue_t), (RF_DiskQueue_t *),
		    raidPtr->cleanupList);
		if (diskQueues[r] == NULL)
			return (ENOMEM);
		for (c = 0; c < raidPtr->numCol; c++) {
			rc = rf_ConfigureDiskQueue(raidPtr, &diskQueues[r][c],
			    r, c, p,
			    raidPtr->sectorsPerDisk,
			    raidPtr->Disks[r][c].dev,
			    cfgPtr->maxOutstandingDiskReqs,
			    listp, raidPtr->cleanupList);
			if (rc)
				return (rc);
		}
	}

	/* spare queues live after the data columns of row 0; note that r
	 * is reused here as the spare index, not a row number */
	spareQueues = &raidPtr->Queues[0][raidPtr->numCol];
	for (r = 0; r < raidPtr->numSpare; r++) {
		rc = rf_ConfigureDiskQueue(raidPtr, &spareQueues[r],
		    0, raidPtr->numCol + r, p,
		    raidPtr->sectorsPerDisk,
		    raidPtr->Disks[0][raidPtr->numCol + r].dev,
		    cfgPtr->maxOutstandingDiskReqs, listp,
		    raidPtr->cleanupList);
		if (rc)
			return (rc);
	}
	return (0);
}
+/* Enqueue a disk I/O
+ *
+ * Unfortunately, we have to do things differently in the different
+ * environments (simulator, user-level, kernel).
+ * At user level, all I/O is blocking, so we have 1 or more threads/disk
+ * and the thread that enqueues is different from the thread that dequeues.
+ * In the kernel, I/O is non-blocking and so we'd like to have multiple
+ * I/Os outstanding on the physical disks when possible.
+ *
+ * when any request arrives at a queue, we have two choices:
+ * dispatch it to the lower levels
+ * queue it up
+ *
+ * kernel rules for when to do what:
+ * locking request: queue empty => dispatch and lock queue,
+ * else queue it
+ * unlocking req : always dispatch it
+ * normal req : queue empty => dispatch it & set priority
+ * queue not full & priority is ok => dispatch it
+ * else queue it
+ *
+ * user-level rules:
+ * always enqueue. In the special case of an unlocking op, enqueue
+ * in a special way that will cause the unlocking op to be the next
+ * thing dequeued.
+ *
+ * simulator rules:
+ * Do the same as at user level, with the sleeps and wakeups suppressed.
+ */
/*
 * Enqueue a disk I/O request (kernel version).
 *
 * Decides, under the queue mutex, whether the request can be handed to
 * rf_DispatchKernelIO immediately or must wait in the
 * discipline-specific queue.  See the block comment at the top of this
 * file for the locking/unlocking/normal dispatch rules.
 */
void
rf_DiskIOEnqueue(queue, req, pri)
	RF_DiskQueue_t *queue;
	RF_DiskQueueData_t *req;
	int pri;
{
	RF_ETIMER_START(req->qtime);	/* perf mon: time spent in queue */
	/* only NOP requests may legitimately have a zero sector count */
	RF_ASSERT(req->type == RF_IO_TYPE_NOP || req->numSector);
	req->priority = pri;

	if (rf_queueDebug && (req->numSector == 0)) {
		printf("Warning: Enqueueing zero-sector access\n");
	}
	/*
	 * kernel
	 */
	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
	/* locking request: dispatch (and lock the queue) only if the queue
	 * is completely idle; otherwise it waits in the queue */
	if (RF_LOCKING_REQ(req)) {
		if (RF_QUEUE_EMPTY(queue)) {
			Dprintf3("Dispatching pri %d locking op to r %d c %d (queue empty)\n", pri, queue->row, queue->col);
			RF_LOCK_QUEUE(queue);
			rf_DispatchKernelIO(queue, req);
		} else {
			queue->queueLength++;	/* increment count of number
						 * of requests waiting in this
						 * queue */
			Dprintf3("Enqueueing pri %d locking op to r %d c %d (queue not empty)\n", pri, queue->row, queue->col);
			req->queue = (void *) queue;
			(queue->qPtr->Enqueue) (queue->qHdr, req, pri);
		}
	}
	/* unlocking request: always dispatched immediately */
	else
		if (RF_UNLOCKING_REQ(req)) {	/* we'll do the actual unlock
						 * when this I/O completes */
			Dprintf3("Dispatching pri %d unlocking op to r %d c %d\n", pri, queue->row, queue->col);
			RF_ASSERT(RF_QUEUE_LOCKED(queue));
			rf_DispatchKernelIO(queue, req);
		}
	/* normal request: dispatch if the queue is idle, or has room and
	 * the request's priority is at least curPriority (see
	 * RF_OK_TO_DISPATCH); otherwise queue it */
	else
		if (RF_OK_TO_DISPATCH(queue, req)) {
			Dprintf3("Dispatching pri %d regular op to r %d c %d (ok to dispatch)\n", pri, queue->row, queue->col);
			rf_DispatchKernelIO(queue, req);
		} else {
			queue->queueLength++;	/* increment count of
						 * number of requests
						 * waiting in this queue */
			Dprintf3("Enqueueing pri %d regular op to r %d c %d (not ok to dispatch)\n", pri, queue->row, queue->col);
			req->queue = (void *) queue;
			(queue->qPtr->Enqueue) (queue->qHdr, req, pri);
		}
	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
}
+
+
/* get the next set of I/Os started, kernel version only */

/*
 * I/O completion handler.  Performs any deferred queue unlock (for a
 * completed unlocking op or a failed locking op), decrements the
 * outstanding count, then keeps dequeueing and dispatching requests
 * until the queue fills, locks, or drains.  Note that "req" is reused
 * below as a cursor over the requests being dispatched, not just the
 * request that completed.
 */
void
rf_DiskIOComplete(queue, req, status)
	RF_DiskQueue_t *queue;
	RF_DiskQueueData_t *req;
	int status;
{
	int done = 0;

	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOComplete");

	/* unlock the queue: (1) after an unlocking req completes (2) after a
	 * locking req fails */
	if (RF_UNLOCKING_REQ(req) || (RF_LOCKING_REQ(req) && status)) {
		Dprintf2("DiskIOComplete: unlocking queue at r %d c %d\n", queue->row, queue->col);
		RF_ASSERT(RF_QUEUE_LOCKED(queue) && (queue->unlockingOp == NULL));
		RF_UNLOCK_QUEUE(queue);
	}
	queue->numOutstanding--;
	RF_ASSERT(queue->numOutstanding >= 0);

	/* dispatch requests to the disk until we find one that we can't. */
	/* no reason to continue once we've filled up the queue */
	/* no reason to even start if the queue is locked */

	while (!done && !RF_QUEUE_FULL(queue) && !RF_QUEUE_LOCKED(queue)) {
		/* a postponed locking op takes precedence over anything in
		 * the discipline queue */
		if (queue->nextLockingOp) {
			req = queue->nextLockingOp;
			queue->nextLockingOp = NULL;
			Dprintf3("DiskIOComplete: a pri %d locking req was pending at r %d c %d\n", req->priority, queue->row, queue->col);
		} else {
			req = (queue->qPtr->Dequeue) (queue->qHdr);
			if (req != NULL) {
				Dprintf3("DiskIOComplete: extracting pri %d req from queue at r %d c %d\n", req->priority, queue->row, queue->col);
			} else {
				Dprintf1("DiskIOComplete: no more requests to extract.\n", "");
			}
		}
		if (req) {
			queue->queueLength--;	/* decrement count of number
						 * of requests waiting in this
						 * queue */
			RF_ASSERT(queue->queueLength >= 0);
		}
		if (!req)
			done = 1;
		else
			if (RF_LOCKING_REQ(req)) {
				if (RF_QUEUE_EMPTY(queue)) {	/* dispatch it */
					Dprintf3("DiskIOComplete: dispatching pri %d locking req to r %d c %d (queue empty)\n", req->priority, queue->row, queue->col);
					RF_LOCK_QUEUE(queue);
					rf_DispatchKernelIO(queue, req);
					done = 1;
				} else {	/* put it aside to wait for
						 * the queue to drain */
					Dprintf3("DiskIOComplete: postponing pri %d locking req to r %d c %d\n", req->priority, queue->row, queue->col);
					RF_ASSERT(queue->nextLockingOp == NULL);
					queue->nextLockingOp = req;
					done = 1;
				}
			} else
				if (RF_UNLOCKING_REQ(req)) {	/* should not happen:
								 * unlocking ops should
								 * not get queued */
					RF_ASSERT(RF_QUEUE_LOCKED(queue));	/* support it anyway for
										 * the future */
					Dprintf3("DiskIOComplete: dispatching pri %d unl req to r %d c %d (SHOULD NOT SEE THIS)\n", req->priority, queue->row, queue->col);
					rf_DispatchKernelIO(queue, req);
					done = 1;
				} else
					if (RF_OK_TO_DISPATCH(queue, req)) {
						Dprintf3("DiskIOComplete: dispatching pri %d regular req to r %d c %d (ok to dispatch)\n", req->priority, queue->row, queue->col);
						rf_DispatchKernelIO(queue, req);
					} else {	/* we can't dispatch it,
							 * so just re-enqueue
							 * it.  */
						/* potential trouble here if
						 * disk queues batch reqs */
						Dprintf3("DiskIOComplete: re-enqueueing pri %d regular req to r %d c %d\n", req->priority, queue->row, queue->col);
						queue->queueLength++;
						(queue->qPtr->Enqueue) (queue->qHdr, req, req->priority);
						done = 1;
					}
	}

	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOComplete");
}
+/* promotes accesses tagged with the given parityStripeID from low priority
+ * to normal priority. This promotion is optional, meaning that a queue
+ * need not implement it. If there is no promotion routine associated with
+ * a queue, this routine does nothing and returns -1.
+ */
+int
+rf_DiskIOPromote(queue, parityStripeID, which_ru)
+ RF_DiskQueue_t *queue;
+ RF_StripeNum_t parityStripeID;
+ RF_ReconUnitNum_t which_ru;
+{
+ int retval;
+
+ if (!queue->qPtr->Promote)
+ return (-1);
+ RF_LOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
+ retval = (queue->qPtr->Promote) (queue->qHdr, parityStripeID, which_ru);
+ RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
+ return (retval);
+}
+
+RF_DiskQueueData_t *
+rf_CreateDiskQueueData(
+ RF_IoType_t typ,
+ RF_SectorNum_t ssect,
+ RF_SectorCount_t nsect,
+ caddr_t buf,
+ RF_StripeNum_t parityStripeID,
+ RF_ReconUnitNum_t which_ru,
+ int (*wakeF) (void *, int),
+ void *arg,
+ RF_DiskQueueData_t * next,
+ RF_AccTraceEntry_t * tracerec,
+ void *raidPtr,
+ RF_DiskQueueDataFlags_t flags,
+ void *kb_proc)
+{
+ RF_DiskQueueData_t *p;
+
+ RF_FREELIST_GET_INIT(rf_dqd_freelist, p, next, (RF_DiskQueueData_t *), init_dqd);
+
+ p->sectorOffset = ssect + rf_protectedSectors;
+ p->numSector = nsect;
+ p->type = typ;
+ p->buf = buf;
+ p->parityStripeID = parityStripeID;
+ p->which_ru = which_ru;
+ p->CompleteFunc = wakeF;
+ p->argument = arg;
+ p->next = next;
+ p->tracerec = tracerec;
+ p->priority = RF_IO_NORMAL_PRIORITY;
+ p->AuxFunc = NULL;
+ p->buf2 = NULL;
+ p->raidPtr = raidPtr;
+ p->flags = flags;
+ p->b_proc = kb_proc;
+ return (p);
+}
+
+RF_DiskQueueData_t *
+rf_CreateDiskQueueDataFull(
+ RF_IoType_t typ,
+ RF_SectorNum_t ssect,
+ RF_SectorCount_t nsect,
+ caddr_t buf,
+ RF_StripeNum_t parityStripeID,
+ RF_ReconUnitNum_t which_ru,
+ int (*wakeF) (void *, int),
+ void *arg,
+ RF_DiskQueueData_t * next,
+ RF_AccTraceEntry_t * tracerec,
+ int priority,
+ int (*AuxFunc) (void *,...),
+ caddr_t buf2,
+ void *raidPtr,
+ RF_DiskQueueDataFlags_t flags,
+ void *kb_proc)
+{
+ RF_DiskQueueData_t *p;
+
+ RF_FREELIST_GET_INIT(rf_dqd_freelist, p, next, (RF_DiskQueueData_t *), init_dqd);
+
+ p->sectorOffset = ssect + rf_protectedSectors;
+ p->numSector = nsect;
+ p->type = typ;
+ p->buf = buf;
+ p->parityStripeID = parityStripeID;
+ p->which_ru = which_ru;
+ p->CompleteFunc = wakeF;
+ p->argument = arg;
+ p->next = next;
+ p->tracerec = tracerec;
+ p->priority = priority;
+ p->AuxFunc = AuxFunc;
+ p->buf2 = buf2;
+ p->raidPtr = raidPtr;
+ p->flags = flags;
+ p->b_proc = kb_proc;
+ return (p);
+}
+
+void
+rf_FreeDiskQueueData(p)
+ RF_DiskQueueData_t *p;
+{
+ RF_FREELIST_FREE_CLEAN(rf_dqd_freelist, p, next, clean_dqd);
+}
diff --git a/sys/dev/raidframe/rf_diskqueue.h b/sys/dev/raidframe/rf_diskqueue.h
new file mode 100644
index 0000000..7b162b0
--- /dev/null
+++ b/sys/dev/raidframe/rf_diskqueue.h
@@ -0,0 +1,208 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_diskqueue.h,v 1.5 2000/02/13 04:53:57 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************************************************
+ *
+ * rf_diskqueue.h -- header file for disk queues
+ *
+ * see comments in rf_diskqueue.c
+ *
+ ****************************************************************************************/
+
+
+#ifndef _RF__RF_DISKQUEUE_H_
+#define _RF__RF_DISKQUEUE_H_
+
+#include <dev/raidframe/rf_threadstuff.h>
+#include <dev/raidframe/rf_acctrace.h>
+#include <dev/raidframe/rf_alloclist.h>
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_etimer.h>
+
+#include <dev/raidframe/rf_bsd.h>
+
+#define RF_IO_NORMAL_PRIORITY 1
+#define RF_IO_LOW_PRIORITY 0
+
/* the data held by a disk queue entry -- one low-level disk I/O request */
struct RF_DiskQueueData_s {
	RF_SectorNum_t sectorOffset;	/* sector offset into the disk */
	RF_SectorCount_t numSector;	/* number of sectors to read/write */
	RF_IoType_t type;	/* read/write/nop */
	caddr_t buf;		/* buffer pointer */
	RF_StripeNum_t parityStripeID;	/* the RAID parity stripe ID this
					 * access is for */
	RF_ReconUnitNum_t which_ru;	/* which RU within this parity stripe */
	int priority;		/* the priority of this request */
	int (*CompleteFunc) (void *, int);	/* function to be called upon
						 * completion */
	int (*AuxFunc) (void *,...);	/* function called upon
					 * completion of the first I/O
					 * of a Read_Op_Write pair */
	void *argument;		/* argument to be passed to CompleteFunc */
	RF_Raid_t *raidPtr;	/* needed for simulation */
	RF_AccTraceEntry_t *tracerec;	/* perf mon only */
	RF_Etimer_t qtime;	/* perf mon only - time request is in queue */
	long entryTime;		/* NOTE(review): not referenced in this
				 * file; presumably the arrival timestamp --
				 * confirm against the dispatch code */
	RF_DiskQueueData_t *next;
	RF_DiskQueueData_t *prev;
	caddr_t buf2;		/* for read-op-write */
	dev_t dev;		/* the device number for in-kernel version */
	RF_DiskQueue_t *queue;	/* the disk queue to which this req is
				 * targeted */
	RF_DiskQueueDataFlags_t flags;	/* flags controlling operation */

	struct proc *b_proc;	/* the b_proc from the original bp passed into
				 * the driver for this I/O */
	/* XXX Should this be changed to the opaque
	 * RF_Thread_t ? */
	RF_Buf_t bp;		/* a bp to use to get this I/O done;
				 * allocated by init_dqd() */
};
+#define RF_LOCK_DISK_QUEUE 0x01
+#define RF_UNLOCK_DISK_QUEUE 0x02
+
/* note: "Create" returns type-specific queue header pointer cast to (void *) */
/* function switch for a queueing discipline; one static instance per
 * policy lives in the diskqueuesw[] table in rf_diskqueue.c */
struct RF_DiskQueueSW_s {
	RF_DiskQueueType_t queueType;	/* name used to select this policy */
	void *(*Create) (RF_SectorCount_t, RF_AllocListElem_t *, RF_ShutdownList_t **);	/* creation routine --
											 * one call per queue in
											 * system */
	void (*Enqueue) (void *, RF_DiskQueueData_t *, int);	/* enqueue routine */
	RF_DiskQueueData_t *(*Dequeue) (void *);	/* dequeue routine */
	RF_DiskQueueData_t *(*Peek) (void *);	/* peek at head of queue */

	/* the rest are optional: they improve performance, but the driver
	 * will deal with it if they don't exist */
	int (*Promote) (void *, RF_StripeNum_t, RF_ReconUnitNum_t);	/* promotes priority of
									 * tagged accesses */
};
+
/* the queue state for a single component (or spare) disk */
struct RF_DiskQueue_s {
	RF_DiskQueueSW_t *qPtr;	/* access point to queue functions */
	void *qHdr;		/* queue header, of whatever type */
	RF_DECLARE_MUTEX(mutex)	/* mutex locking data structures */
	RF_DECLARE_COND(cond)	/* condition variable for
				 * synchronization */
	long numOutstanding;	/* number of I/Os currently outstanding on
				 * disk */
	long maxOutstanding;	/* max # of I/Os that can be outstanding on a
				 * disk (in-kernel only) */
	int curPriority;	/* the priority of accs all that are currently
				 * outstanding */
	long queueLength;	/* number of requests in queue */
	RF_DiskQueueData_t *nextLockingOp;	/* a locking op that has
						 * arrived at the head of the
						 * queue & is waiting for
						 * drainage */
	RF_DiskQueueData_t *unlockingOp;	/* used at user level to
						 * communicate unlocking op
						 * b/w user (or dag exec) &
						 * disk threads */
	int numWaiting;		/* number of threads waiting on this variable.
				 * user-level only */
	RF_DiskQueueFlags_t flags;	/* terminate, locked */
	RF_Raid_t *raidPtr;	/* associated array */
	dev_t dev;		/* device number for kernel version */
	RF_SectorNum_t last_deq_sector;	/* last sector number dequeued or
					 * dispatched */
	int row, col;		/* debug only */
	struct raidcinfo *rf_cinfo;	/* disks component info.. */
};
#define RF_DQ_LOCKED	0x02	/* no new accs allowed until queue is
				 * explicitly unlocked */

/* macros setting & returning information about queues and requests */
#define RF_QUEUE_LOCKED(_q)	((_q)->flags & RF_DQ_LOCKED)
/* "empty" means completely idle: nothing outstanding, no locking op
 * waiting to drain, and not locked */
#define RF_QUEUE_EMPTY(_q)	(((_q)->numOutstanding == 0) && ((_q)->nextLockingOp == NULL) && !RF_QUEUE_LOCKED(_q))
#define RF_QUEUE_FULL(_q)	((_q)->numOutstanding == (_q)->maxOutstanding)

#define RF_LOCK_QUEUE(_q)	(_q)->flags |= RF_DQ_LOCKED
#define RF_UNLOCK_QUEUE(_q)	(_q)->flags &= ~RF_DQ_LOCKED

/* the _wh_ ("where") tag is accepted for debugging but not used here */
#define RF_LOCK_QUEUE_MUTEX(_q_,_wh_)	RF_LOCK_MUTEX((_q_)->mutex)
#define RF_UNLOCK_QUEUE_MUTEX(_q_,_wh_)	RF_UNLOCK_MUTEX((_q_)->mutex)

#define RF_LOCKING_REQ(_r)	((_r)->flags & RF_LOCK_DISK_QUEUE)
#define RF_UNLOCKING_REQ(_r)	((_r)->flags & RF_UNLOCK_DISK_QUEUE)

/* whether it is ok to dispatch a regular request: the queue is idle, or
 * has room and the request's priority is at least the priority of the
 * accesses currently outstanding */
#define RF_OK_TO_DISPATCH(_q_,_r_) \
	(RF_QUEUE_EMPTY(_q_) || \
	 (!RF_QUEUE_FULL(_q_) && ((_r_)->priority >= (_q_)->curPriority)))
+
+int rf_ConfigureDiskQueueSystem(RF_ShutdownList_t ** listp);
+
+void rf_TerminateDiskQueues(RF_Raid_t * raidPtr);
+
+int
+rf_ConfigureDiskQueues(RF_ShutdownList_t ** listp, RF_Raid_t * raidPtr,
+ RF_Config_t * cfgPtr);
+
+void rf_DiskIOEnqueue(RF_DiskQueue_t * queue, RF_DiskQueueData_t * req, int pri);
+
+
+void rf_DiskIOComplete(RF_DiskQueue_t * queue, RF_DiskQueueData_t * req, int status);
+
+int
+rf_DiskIOPromote(RF_DiskQueue_t * queue, RF_StripeNum_t parityStripeID,
+ RF_ReconUnitNum_t which_ru);
+
+RF_DiskQueueData_t *
+rf_CreateDiskQueueData(RF_IoType_t typ, RF_SectorNum_t ssect,
+ RF_SectorCount_t nsect, caddr_t buf,
+ RF_StripeNum_t parityStripeID,
+ RF_ReconUnitNum_t which_ru,
+ int (*wakeF) (void *, int),
+ void *arg, RF_DiskQueueData_t * next,
+ RF_AccTraceEntry_t * tracerec,
+ void *raidPtr, RF_DiskQueueDataFlags_t flags,
+ void *kb_proc);
+
+RF_DiskQueueData_t *
+rf_CreateDiskQueueDataFull(RF_IoType_t typ, RF_SectorNum_t ssect,
+ RF_SectorCount_t nsect, caddr_t buf,
+ RF_StripeNum_t parityStripeID,
+ RF_ReconUnitNum_t which_ru,
+ int (*wakeF) (void *, int),
+ void *arg, RF_DiskQueueData_t * next,
+ RF_AccTraceEntry_t * tracerec,
+ int priority, int (*AuxFunc) (void *,...),
+ caddr_t buf2, void *raidPtr,
+ RF_DiskQueueDataFlags_t flags, void *kb_proc);
+
+void
+rf_FreeDiskQueueData(RF_DiskQueueData_t * p);
+
+int
+rf_ConfigureDiskQueue(RF_Raid_t *, RF_DiskQueue_t *, RF_RowCol_t,
+ RF_RowCol_t, RF_DiskQueueSW_t *,
+ RF_SectorCount_t, dev_t, int,
+ RF_ShutdownList_t **,
+ RF_AllocListElem_t *);
+#endif /* !_RF__RF_DISKQUEUE_H_ */
diff --git a/sys/dev/raidframe/rf_disks.c b/sys/dev/raidframe/rf_disks.c
new file mode 100644
index 0000000..dd0ea15
--- /dev/null
+++ b/sys/dev/raidframe/rf_disks.c
@@ -0,0 +1,1138 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_disks.c,v 1.34 2000/12/05 01:35:56 oster Exp $ */
+/*-
+ * Copyright (c) 1999 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Greg Oster
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the NetBSD
+ * Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/***************************************************************
+ * rf_disks.c -- code to perform operations on the actual disks
+ ***************************************************************/
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_alloclist.h>
+#include <dev/raidframe/rf_utils.h>
+#include <dev/raidframe/rf_configure.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_options.h>
+#include <dev/raidframe/rf_kintf.h>
+#include <dev/raidframe/rf_bsd.h>
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#if defined(__NetBSD__)
+#include <sys/ioctl.h>
+#elif defined(__FreeBSD__)
+#include <sys/ioccom.h>
+#include <sys/filio.h>
+#endif
+#include <sys/fcntl.h>
+#include <sys/vnode.h>
+
+/* forward declarations for file-local helpers */
+static int rf_AllocDiskStructures(RF_Raid_t *, RF_Config_t *);
+static void rf_print_label_status( RF_Raid_t *, int, int, char *,
+				  RF_ComponentLabel_t *);
+static int rf_check_label_vitals( RF_Raid_t *, int, int, char *,
+				 RF_ComponentLabel_t *, int, int );
+
+/* debug printfs, compiled in but gated at run time by rf_diskDebug */
+#define DPRINTF6(a,b,c,d,e,f) if (rf_diskDebug) printf(a,b,c,d,e,f)
+#define DPRINTF7(a,b,c,d,e,f,g) if (rf_diskDebug) printf(a,b,c,d,e,f,g)
+
+/**************************************************************************
+ *
+ * initialize the disks comprising the array
+ *
+ * We want the spare disks to have regular row,col numbers so that we can
+ * easily substitute a spare for a failed disk. But, the driver code assumes
+ * throughout that the array contains numRow by numCol _non-spare_ disks, so
+ * it's not clear how to fit in the spares. This is an unfortunate holdover
+ * from raidSim. The quick and dirty fix is to make row zero bigger than the
+ * rest, and put all the spares in it. This probably needs to get changed
+ * eventually.
+ *
+ **************************************************************************/
+
+/*
+ * Initialize the RF_RaidDisk_t structure for every component named in
+ * the config, read each live component's label, verify that all live
+ * components share a single power-of-2 block size, and record the
+ * resulting geometry (sector size/count) in *raidPtr.
+ *
+ * Returns 0 on success or an errno; on any failure the component
+ * vnodes are released via rf_UnconfigureVnodes().
+ */
+int
+rf_ConfigureDisks( listp, raidPtr, cfgPtr )
+	RF_ShutdownList_t **listp;
+	RF_Raid_t *raidPtr;
+	RF_Config_t *cfgPtr;
+{
+	RF_RaidDisk_t **disks;
+	/* smallest component size seen so far; the array is sized to it */
+	RF_SectorCount_t min_numblks = (RF_SectorCount_t) 0x7FFFFFFFFFFFLL;
+	RF_RowCol_t r, c;
+	int bs, ret;
+	unsigned i, count, foundone = 0, numFailuresThisRow;
+	int force;
+
+	force = cfgPtr->force;
+
+	ret = rf_AllocDiskStructures(raidPtr, cfgPtr);
+	if (ret)
+		goto fail;
+
+	disks = raidPtr->Disks;
+
+	for (r = 0; r < raidPtr->numRow; r++) {
+		numFailuresThisRow = 0;
+		for (c = 0; c < raidPtr->numCol; c++) {
+			ret = rf_ConfigureDisk(raidPtr,
+			    &cfgPtr->devnames[r][c][0],
+			    &disks[r][c], r, c);
+
+			if (ret)
+				goto fail;
+
+			/* only a live component has a readable label */
+			if (disks[r][c].status == rf_ds_optimal) {
+				raidread_component_label(
+				    raidPtr->raid_cinfo[r][c].ci_dev,
+				    raidPtr->raid_cinfo[r][c].ci_vp,
+				    &raidPtr->raid_cinfo[r][c].ci_label);
+			}
+
+			if (disks[r][c].status != rf_ds_optimal) {
+				numFailuresThisRow++;
+			} else {
+				if (disks[r][c].numBlocks < min_numblks)
+					min_numblks = disks[r][c].numBlocks;
+				DPRINTF7("Disk at row %d col %d: dev %s numBlocks %ld blockSize %d (%ld MB)\n",
+				    r, c, disks[r][c].devname,
+				    (long int) disks[r][c].numBlocks,
+				    disks[r][c].blockSize,
+				    (long int) disks[r][c].numBlocks *
+				    disks[r][c].blockSize / 1024 / 1024);
+			}
+		}
+		/* XXX fix for n-fault tolerant */
+		/* XXX this should probably check to see how many failures
+		   we can handle for this configuration! */
+		if (numFailuresThisRow > 0)
+			raidPtr->status[r] = rf_rs_degraded;
+	}
+
+	/* all disks must be the same size & have the same block size, bs must
+	 * be a power of 2 */
+	bs = 0;
+	/* take the block size from the first live component found */
+	for (foundone = r = 0; !foundone && r < raidPtr->numRow; r++) {
+		for (c = 0; !foundone && c < raidPtr->numCol; c++) {
+			if (disks[r][c].status == rf_ds_optimal) {
+				bs = disks[r][c].blockSize;
+				foundone = 1;
+			}
+		}
+	}
+	if (!foundone) {
+		RF_ERRORMSG("RAIDFRAME: Did not find any live disks in the array.\n");
+		ret = EINVAL;
+		goto fail;
+	}
+	/* popcount of bs: exactly one bit set <=> bs is a power of 2 */
+	for (count = 0, i = 1; i; i <<= 1)
+		if (bs & i)
+			count++;
+	if (count != 1) {
+		RF_ERRORMSG1("Error: block size on disks (%d) must be a power of 2\n", bs);
+		ret = EINVAL;
+		goto fail;
+	}
+
+	if (rf_CheckLabels( raidPtr, cfgPtr )) {
+		printf("raid%d: There were fatal errors\n", raidPtr->raidid);
+		/* `force' lets the administrator override label problems */
+		if (force != 0) {
+			printf("raid%d: Fatal errors being ignored.\n",
+			    raidPtr->raidid);
+		} else {
+			ret = EINVAL;
+			goto fail;
+		}
+	}
+
+	for (r = 0; r < raidPtr->numRow; r++) {
+		for (c = 0; c < raidPtr->numCol; c++) {
+			if (disks[r][c].status == rf_ds_optimal) {
+				if (disks[r][c].blockSize != bs) {
+					RF_ERRORMSG2("Error: block size of disk at r %d c %d different from disk at r 0 c 0\n", r, c);
+					ret = EINVAL;
+					goto fail;
+				}
+				/* larger components are truncated to the
+				   smallest component size */
+				if (disks[r][c].numBlocks != min_numblks) {
+					RF_ERRORMSG3("WARNING: truncating disk at r %d c %d to %d blocks\n",
+					    r, c, (int) min_numblks);
+					disks[r][c].numBlocks = min_numblks;
+				}
+			}
+		}
+	}
+
+	raidPtr->sectorsPerDisk = min_numblks;
+	raidPtr->logBytesPerSector = ffs(bs) - 1;
+	raidPtr->bytesPerSector = bs;
+	raidPtr->sectorMask = bs - 1;
+	return (0);
+
+fail:
+
+	rf_UnconfigureVnodes( raidPtr );
+
+	return (ret);
+}
+
+
+/****************************************************************************
+ * set up the data structures describing the spare disks in the array
+ * recall from the above comment that the spare disk descriptors are stored
+ * in row zero, which is specially expanded to hold them.
+ ****************************************************************************/
+/*
+ * Configure the spare disks named in the config.  Spares live at the
+ * end of row zero (see the comment above rf_ConfigureDisks); the space
+ * for them was already allocated by rf_AllocDiskStructures().  Each
+ * spare must match the array's block size and be at least as large as
+ * sectorsPerDisk.  Returns 0 on success or an errno; on failure the
+ * component vnodes are released.
+ */
+int
+rf_ConfigureSpareDisks( listp, raidPtr, cfgPtr )
+	RF_ShutdownList_t ** listp;
+	RF_Raid_t * raidPtr;
+	RF_Config_t * cfgPtr;
+{
+	int i, ret;
+	unsigned int bs;
+	RF_RaidDisk_t *disks;
+	int num_spares_done;
+
+	num_spares_done = 0;
+
+	/* The space for the spares should have already been allocated by
+	 * ConfigureDisks() */
+
+	/* spares start right after the regular columns of row 0 */
+	disks = &raidPtr->Disks[0][raidPtr->numCol];
+	for (i = 0; i < raidPtr->numSpare; i++) {
+		ret = rf_ConfigureDisk(raidPtr, &cfgPtr->spare_names[i][0],
+		    &disks[i], 0, raidPtr->numCol + i);
+		if (ret)
+			goto fail;
+		if (disks[i].status != rf_ds_optimal) {
+			RF_ERRORMSG1("Warning: spare disk %s failed TUR\n",
+			    &cfgPtr->spare_names[i][0]);
+		} else {
+			disks[i].status = rf_ds_spare;	/* change status to
+							 * spare */
+			DPRINTF6("Spare Disk %d: dev %s numBlocks %ld blockSize %d (%ld MB)\n", i,
+			    disks[i].devname,
+			    (long int) disks[i].numBlocks, disks[i].blockSize,
+			    (long int) disks[i].numBlocks *
+			    disks[i].blockSize / 1024 / 1024);
+		}
+		num_spares_done++;
+	}
+
+	/* check sizes and block sizes on spare disks */
+	bs = 1 << raidPtr->logBytesPerSector;
+	for (i = 0; i < raidPtr->numSpare; i++) {
+		if (disks[i].blockSize != bs) {
+			RF_ERRORMSG3("Block size of %d on spare disk %s is not the same as on other disks (%d)\n", disks[i].blockSize, disks[i].devname, bs);
+			ret = EINVAL;
+			goto fail;
+		}
+		if (disks[i].numBlocks < raidPtr->sectorsPerDisk) {
+			RF_ERRORMSG3("Spare disk %s (%d blocks) is too small to serve as a spare (need %ld blocks)\n",
+			    disks[i].devname, disks[i].blockSize,
+			    (long int) raidPtr->sectorsPerDisk);
+			ret = EINVAL;
+			goto fail;
+		} else
+			/* oversize spares are simply truncated */
+			if (disks[i].numBlocks > raidPtr->sectorsPerDisk) {
+				RF_ERRORMSG2("Warning: truncating spare disk %s to %ld blocks\n", disks[i].devname, (long int) raidPtr->sectorsPerDisk);
+
+				disks[i].numBlocks = raidPtr->sectorsPerDisk;
+			}
+	}
+
+	return (0);
+
+fail:
+
+	/* Release the hold on the main components. We've failed to allocate
+	 * a spare, and since we're failing, we need to free things..
+
+	XXX failing to allocate a spare is *not* that big of a deal...
+	We *can* survive without it, if need be, esp. if we get hot
+	adding working.
+
+	If we don't fail out here, then we need a way to remove this spare...
+	that should be easier to do here than if we are "live"...
+
+	*/
+
+	rf_UnconfigureVnodes( raidPtr );
+
+	return (ret);
+}
+
+/*
+ * Allocate the two-dimensional Disks and raid_cinfo arrays for the
+ * array described by cfgPtr.  Row zero is over-allocated by RF_MAXSPARE
+ * disk slots (and numSpare cinfo slots) so spares can be hot-added
+ * later.  All memory is hung off raidPtr->cleanupList, so it is freed
+ * with the array.  Returns 0 or ENOMEM.
+ */
+static int
+rf_AllocDiskStructures(raidPtr, cfgPtr)
+	RF_Raid_t *raidPtr;
+	RF_Config_t *cfgPtr;
+{
+	RF_RaidDisk_t **disks;
+	int ret;
+	int r;
+
+	RF_CallocAndAdd(disks, raidPtr->numRow, sizeof(RF_RaidDisk_t *),
+	    (RF_RaidDisk_t **), raidPtr->cleanupList);
+	if (disks == NULL) {
+		ret = ENOMEM;
+		goto fail;
+	}
+	raidPtr->Disks = disks;
+	/* get space for the device-specific stuff... */
+	RF_CallocAndAdd(raidPtr->raid_cinfo, raidPtr->numRow,
+	    sizeof(struct raidcinfo *), (struct raidcinfo **),
+	    raidPtr->cleanupList);
+	if (raidPtr->raid_cinfo == NULL) {
+		ret = ENOMEM;
+		goto fail;
+	}
+
+	for (r = 0; r < raidPtr->numRow; r++) {
+		/* We allocate RF_MAXSPARE on the first row so that we
+		   have room to do hot-swapping of spares */
+		RF_CallocAndAdd(disks[r], raidPtr->numCol
+		    + ((r == 0) ? RF_MAXSPARE : 0),
+		    sizeof(RF_RaidDisk_t), (RF_RaidDisk_t *),
+		    raidPtr->cleanupList);
+		if (disks[r] == NULL) {
+			ret = ENOMEM;
+			goto fail;
+		}
+		/* get more space for device specific stuff.. */
+		RF_CallocAndAdd(raidPtr->raid_cinfo[r],
+		    raidPtr->numCol + ((r == 0) ? raidPtr->numSpare : 0),
+		    sizeof(struct raidcinfo), (struct raidcinfo *),
+		    raidPtr->cleanupList);
+		if (raidPtr->raid_cinfo[r] == NULL) {
+			ret = ENOMEM;
+			goto fail;
+		}
+	}
+	return(0);
+fail:
+	rf_UnconfigureVnodes( raidPtr );
+
+	return(ret);
+}
+
+
+/* configure a single disk during auto-configuration at boot */
+/*
+ * Build the in-core component structures for an auto-configured set at
+ * boot, matching each (row,col) slot against the component labels
+ * collected in `auto_config'.  Components whose label cannot be found,
+ * or whose mod_counter is stale, are marked failed.  Autoconfig entries
+ * that go unused have their vnodes closed and released.  Returns 0 on
+ * success or an errno; on failure the component vnodes are released.
+ */
+int
+rf_AutoConfigureDisks(raidPtr, cfgPtr, auto_config)
+	RF_Raid_t *raidPtr;
+	RF_Config_t *cfgPtr;
+	RF_AutoConfig_t *auto_config;
+{
+	RF_RaidDisk_t **disks;
+	RF_RaidDisk_t *diskPtr;
+	RF_RowCol_t r, c;
+	RF_SectorCount_t min_numblks = (RF_SectorCount_t) 0x7FFFFFFFFFFFLL;
+	int bs, ret;
+	int numFailuresThisRow;
+	int force;
+	RF_AutoConfig_t *ac;
+	int parity_good;
+	int mod_counter;
+	int mod_counter_found;
+
+	rf_printf(0, "Starting autoconfiguration of RAID set...\n");
+	force = cfgPtr->force;
+
+	ret = rf_AllocDiskStructures(raidPtr, cfgPtr);
+	if (ret)
+		goto fail;
+
+	disks = raidPtr->Disks;
+
+	/* assume the parity will be fine.. */
+	parity_good = RF_RAID_CLEAN;
+
+	/* Find the highest mod_counter present; any label below it is
+	   stale.  Also clear each entry's general-purpose flag. */
+	mod_counter_found = 0;
+	mod_counter = 0;
+	ac = auto_config;
+	while(ac!=NULL) {
+		if (mod_counter_found==0) {
+			mod_counter = ac->clabel->mod_counter;
+			mod_counter_found = 1;
+		} else {
+			if (ac->clabel->mod_counter > mod_counter) {
+				mod_counter = ac->clabel->mod_counter;
+			}
+		}
+		ac->flag = 0; /* clear the general purpose flag */
+		ac = ac->next;
+	}
+
+	bs = 0;
+	for (r = 0; r < raidPtr->numRow; r++) {
+		numFailuresThisRow = 0;
+		for (c = 0; c < raidPtr->numCol; c++) {
+			diskPtr = &disks[r][c];
+
+			/* find this row/col in the autoconfig */
+			rf_printf(1, "Looking for %d,%d in autoconfig\n",r,c);
+			ac = auto_config;
+			while(ac!=NULL) {
+				if (ac->clabel==NULL) {
+					/* big-time bad news.  Set ret
+					   explicitly: at this point it
+					   still holds 0 from the
+					   successful allocation above,
+					   and falling into `fail' with
+					   ret == 0 would report success
+					   to the caller. */
+					ret = EINVAL;
+					goto fail;
+				}
+				if ((ac->clabel->row == r) &&
+				    (ac->clabel->column == c) &&
+				    (ac->clabel->mod_counter == mod_counter)) {
+					/* it's this one... */
+					/* flag it as 'used', so we don't
+					   free it later. */
+					ac->flag = 1;
+					rf_printf(1, "Found: %s at %d,%d\n",
+					    ac->devname, r, c);
+					break;
+				}
+				ac=ac->next;
+			}
+
+			if (ac==NULL) {
+				/* we didn't find an exact match with a
+				   correct mod_counter above... can we
+				   find one with an incorrect mod_counter
+				   to use instead?  (this one, if we find
+				   it, will be marked as failed once the
+				   set configures)
+				*/
+
+				ac = auto_config;
+				while(ac!=NULL) {
+					if (ac->clabel==NULL) {
+						/* big-time bad news.
+						   Same as above: make
+						   sure we return an
+						   error, not 0. */
+						ret = EINVAL;
+						goto fail;
+					}
+					if ((ac->clabel->row == r) &&
+					    (ac->clabel->column == c)) {
+						/* it's this one...
+						   flag it as 'used', so we
+						   don't free it later. */
+						ac->flag = 1;
+						rf_printf(1, "Found(low mod_counter): %s at %d,%d\n",
+						    ac->devname,r,c);
+
+						break;
+					}
+					ac=ac->next;
+				}
+			}
+
+
+
+			if (ac!=NULL) {
+				/* Found it.  Configure it.. */
+				diskPtr->blockSize = ac->clabel->blockSize;
+				diskPtr->numBlocks = ac->clabel->numBlocks;
+				/* Note: rf_protectedSectors is already
+				   factored into numBlocks here */
+				raidPtr->raid_cinfo[r][c].ci_vp = ac->vp;
+				raidPtr->raid_cinfo[r][c].ci_dev = ac->dev;
+
+				memcpy(&raidPtr->raid_cinfo[r][c].ci_label,
+				    ac->clabel, sizeof(*ac->clabel));
+				sprintf(diskPtr->devname, "/dev/%s",
+				    ac->devname);
+
+				/* note the fact that this component was
+				   autoconfigured.  You'll need this info
+				   later.  Trust me :) */
+				diskPtr->auto_configured = 1;
+				diskPtr->dev = ac->dev;
+
+				/*
+				 * we allow the user to specify that
+				 * only a fraction of the disks should
+				 * be used this is just for debug: it
+				 * speeds up the parity scan
+				 */
+
+				diskPtr->numBlocks = diskPtr->numBlocks *
+				    rf_sizePercentage / 100;
+
+				/* XXX these will get set multiple times,
+				   but since we're autoconfiguring, they'd
+				   better be always the same each time!
+				   If not, this is the least of your worries */
+
+				bs = diskPtr->blockSize;
+				min_numblks = diskPtr->numBlocks;
+
+				/* this gets done multiple times, but that's
+				   fine -- the serial number will be the same
+				   for all components, guaranteed */
+				raidPtr->serial_number =
+				    ac->clabel->serial_number;
+				/* check the last time the label
+				   was modified */
+				if (ac->clabel->mod_counter !=
+				    mod_counter) {
+					/* Even though we've filled in all
+					   of the above, we don't trust
+					   this component since its
+					   modification counter is not
+					   in sync with the rest, and we really
+					   consider it to be failed. */
+					disks[r][c].status = rf_ds_failed;
+					numFailuresThisRow++;
+				} else {
+					if (ac->clabel->clean !=
+					    RF_RAID_CLEAN) {
+						parity_good = RF_RAID_DIRTY;
+					}
+				}
+			} else {
+				/* Didn't find it at all!!
+				   Component must really be dead */
+				disks[r][c].status = rf_ds_failed;
+				sprintf(disks[r][c].devname,"component%d",
+				    r * raidPtr->numCol + c);
+				numFailuresThisRow++;
+			}
+		}
+		/* XXX fix for n-fault tolerant */
+		/* XXX this should probably check to see how many failures
+		   we can handle for this configuration! */
+		if (numFailuresThisRow > 0)
+			raidPtr->status[r] = rf_rs_degraded;
+	}
+
+	/* close the device for the ones that didn't get used */
+
+	ac = auto_config;
+	while(ac!=NULL) {
+		if (ac->flag == 0) {
+#if defined(__NetBSD__)
+			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
+#elif defined(__FreeBSD__)
+			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY,
+			    raidPtr->engine_thread);
+#endif
+			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED, 0);
+			vput(ac->vp);
+			ac->vp = NULL;
+			rf_printf(1, "Released %s from auto-config set.\n",
+			    ac->devname);
+		}
+		ac = ac->next;
+	}
+
+	raidPtr->mod_counter = mod_counter;
+
+	/* note the state of the parity, if any */
+	raidPtr->parity_good = parity_good;
+	raidPtr->sectorsPerDisk = min_numblks;
+	raidPtr->logBytesPerSector = ffs(bs) - 1;
+	raidPtr->bytesPerSector = bs;
+	raidPtr->sectorMask = bs - 1;
+	return (0);
+
+fail:
+
+	rf_UnconfigureVnodes( raidPtr );
+
+	return (ret);
+
+}
+
+/* configure a single disk in the array */
+int
+rf_ConfigureDisk(raidPtr, buf, diskPtr, row, col)
+ RF_Raid_t *raidPtr;
+ char *buf;
+ RF_RaidDisk_t *diskPtr;
+ RF_RowCol_t row;
+ RF_RowCol_t col;
+{
+ char *p;
+ int retcode;
+
+ int error;
+
+ retcode = 0;
+ p = rf_find_non_white(buf);
+ if (p[strlen(p) - 1] == '\n') {
+ /* strip off the newline */
+ p[strlen(p) - 1] = '\0';
+ }
+ (void) strcpy(diskPtr->devname, p);
+
+ /* Let's start by claiming the component is fine and well... */
+ diskPtr->status = rf_ds_optimal;
+
+ raidPtr->raid_cinfo[row][col].ci_vp = NULL;
+ raidPtr->raid_cinfo[row][col].ci_dev = NULL;
+
+ error = raid_getcomponentsize(raidPtr, row, col);
+ if (error) {
+ printf("raidlookup on device: %s failed!\n", diskPtr->devname);
+ if (error == ENXIO) {
+ /* the component isn't there... must be dead :-( */
+ diskPtr->status = rf_ds_failed;
+ return (error);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Print a human-readable summary of one component's label (position,
+ * geometry, serial number, mod counter, clean flag) to the console.
+ * Diagnostic output only; no state is modified.
+ */
+static void
+rf_print_label_status( raidPtr, row, column, dev_name, ci_label )
+	RF_Raid_t *raidPtr;
+	int row;
+	int column;
+	char *dev_name;
+	RF_ComponentLabel_t *ci_label;
+{
+
+	printf("raid%d: Component %s being configured at row: %d col: %d\n",
+	    raidPtr->raidid, dev_name, row, column );
+	printf("         Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
+	    ci_label->row, ci_label->column,
+	    ci_label->num_rows, ci_label->num_columns);
+	printf("         Version: %d Serial Number: %d Mod Counter: %d\n",
+	    ci_label->version, ci_label->serial_number,
+	    ci_label->mod_counter);
+	printf("         Clean: %s Status: %d\n",
+	    ci_label->clean ? "Yes" : "No", ci_label->status );
+}
+
+/*
+ * Compare one component label against the expected array geometry and
+ * identity.  Returns non-zero iff a fatal inconsistency was found.
+ * A mod_counter mismatch is reported but deliberately NOT fatal here:
+ * rf_CheckLabels() reconciles mod counters itself before calling us.
+ * Likewise a dirty component is only a warning.
+ */
+static int rf_check_label_vitals( raidPtr, row, column, dev_name, ci_label,
+				 serial_number, mod_counter )
+	RF_Raid_t *raidPtr;
+	int row;
+	int column;
+	char *dev_name;
+	RF_ComponentLabel_t *ci_label;
+	int serial_number;
+	int mod_counter;
+{
+	int fatal_error = 0;
+
+	if (serial_number != ci_label->serial_number) {
+		printf("%s has a different serial number: %d %d\n",
+		    dev_name, serial_number, ci_label->serial_number);
+		fatal_error = 1;
+	}
+	if (mod_counter != ci_label->mod_counter) {
+		/* fixed typo: "modfication" -> "modification" */
+		printf("%s has a different modification count: %d %d\n",
+		    dev_name, mod_counter, ci_label->mod_counter);
+	}
+
+	if (row != ci_label->row) {
+		printf("Row out of alignment for: %s\n", dev_name);
+		fatal_error = 1;
+	}
+	if (column != ci_label->column) {
+		printf("Column out of alignment for: %s\n", dev_name);
+		fatal_error = 1;
+	}
+	if (raidPtr->numRow != ci_label->num_rows) {
+		printf("Number of rows do not match for: %s\n", dev_name);
+		fatal_error = 1;
+	}
+	if (raidPtr->numCol != ci_label->num_columns) {
+		printf("Number of columns do not match for: %s\n", dev_name);
+		fatal_error = 1;
+	}
+	if (ci_label->clean == 0) {
+		/* it's not clean, but that's not fatal */
+		printf("%s is not clean!\n", dev_name);
+	}
+	return(fatal_error);
+}
+
+
+/*
+
+ rf_CheckLabels() - check all the component labels for consistency.
+ Return an error if there is anything major amiss.
+
+ */
+
+/*
+ * Check all component labels for mutual consistency.  Returns non-zero
+ * if anything fatal was found.  Side effects: records the winning
+ * serial_number/mod_counter and the parity state in *raidPtr, and (when
+ * not forcing) marks a single dissenting component as failed.
+ */
+int
+rf_CheckLabels( raidPtr, cfgPtr )
+	RF_Raid_t *raidPtr;
+	RF_Config_t *cfgPtr;
+{
+	int r,c;
+	char *dev_name;
+	RF_ComponentLabel_t *ci_label;
+	int serial_number = 0;
+	int mod_number = 0;
+	int fatal_error = 0;
+	int mod_values[4];
+	int mod_count[4];
+	int ser_values[4];
+	int ser_count[4];
+	int num_ser;
+	int num_mod;
+	int i;
+	int found;
+	int hosed_row;		/* position of the single bad component, if any */
+	int hosed_column;
+	int too_fatal;
+	int parity_good;
+	int force;
+
+	hosed_row = -1;
+	hosed_column = -1;
+	too_fatal = 0;
+	force = cfgPtr->force;
+
+	/*
+	   We're going to try to be a little intelligent here.  If one
+	   component's label is bogus, and we can identify that it's the
+	   *only* one that's gone, we'll mark it as "failed" and allow
+	   the configuration to proceed.  This will be the *only* case
+	   that we'll proceed if there would be (otherwise) fatal errors.
+
+	   Basically we simply keep a count of how many components had
+	   what serial number.  If all but one agree, we simply mark
+	   the disagreeing component as being failed, and allow
+	   things to come up "normally".
+
+	   We do this first for serial numbers, and then for "mod_counter".
+
+	 */
+
+	/* tally the distinct serial numbers and mod counters seen;
+	   more than two of either is hopeless */
+	num_ser = 0;
+	num_mod = 0;
+	for (r = 0; r < raidPtr->numRow && !fatal_error ; r++) {
+		for (c = 0; c < raidPtr->numCol; c++) {
+			ci_label = &raidPtr->raid_cinfo[r][c].ci_label;
+			found=0;
+			for(i=0;i<num_ser;i++) {
+				if (ser_values[i] == ci_label->serial_number) {
+					ser_count[i]++;
+					found=1;
+					break;
+				}
+			}
+			if (!found) {
+				ser_values[num_ser] = ci_label->serial_number;
+				ser_count[num_ser] = 1;
+				num_ser++;
+				if (num_ser>2) {
+					fatal_error = 1;
+					break;
+				}
+			}
+			found=0;
+			for(i=0;i<num_mod;i++) {
+				if (mod_values[i] == ci_label->mod_counter) {
+					mod_count[i]++;
+					found=1;
+					break;
+				}
+			}
+			if (!found) {
+				mod_values[num_mod] = ci_label->mod_counter;
+				mod_count[num_mod] = 1;
+				num_mod++;
+				if (num_mod>2) {
+					fatal_error = 1;
+					break;
+				}
+			}
+		}
+	}
+	rf_printf(1, "raid%d: Summary of serial numbers:\n", raidPtr->raidid);
+	for(i=0;i<num_ser;i++) {
+		rf_printf(1, "%d %d\n", ser_values[i], ser_count[i]);
+	}
+	rf_printf(1, "raid%d: Summary of mod counters:\n", raidPtr->raidid);
+	for(i=0;i<num_mod;i++) {
+		rf_printf(1, "%d %d\n", mod_values[i], mod_count[i]);
+	}
+	serial_number = ser_values[0];
+	if (num_ser == 2) {
+		/* exactly two serial numbers: tolerable only if one of
+		   them belongs to a single (maverick) component */
+		if ((ser_count[0] == 1) || (ser_count[1] == 1)) {
+			/* Locate the maverick component */
+			if (ser_count[1] > ser_count[0]) {
+				serial_number = ser_values[1];
+			}
+			for (r = 0; r < raidPtr->numRow; r++) {
+				for (c = 0; c < raidPtr->numCol; c++) {
+					ci_label = &raidPtr->raid_cinfo[r][c].ci_label;
+					if (serial_number !=
+					    ci_label->serial_number) {
+						hosed_row = r;
+						hosed_column = c;
+						break;
+					}
+				}
+			}
+			printf("Hosed component: %s\n",
+			    &cfgPtr->devnames[hosed_row][hosed_column][0]);
+			if (!force) {
+				/* we'll fail this component, as if there are
+				   other major errors, we aren't forcing things
+				   and we'll abort the config anyways */
+				raidPtr->Disks[hosed_row][hosed_column].status
+				    = rf_ds_failed;
+				raidPtr->numFailures++;
+				raidPtr->status[hosed_row] = rf_rs_degraded;
+			}
+		} else {
+			too_fatal = 1;
+		}
+		if (cfgPtr->parityConfig == '0') {
+			/* We've identified two different serial numbers.
+			   RAID 0 can't cope with that, so we'll punt */
+			too_fatal = 1;
+		}
+
+	}
+
+	/* record the serial number for later.  If we bail later, setting
+	   this doesn't matter, otherwise we've got the best guess at the
+	   correct serial number */
+	raidPtr->serial_number = serial_number;
+
+	mod_number = mod_values[0];
+	if (num_mod == 2) {
+		if ((mod_count[0] == 1) || (mod_count[1] == 1)) {
+			/* Locate the maverick component */
+			if (mod_count[1] > mod_count[0]) {
+				mod_number = mod_values[1];
+			} else if (mod_count[1] < mod_count[0]) {
+				mod_number = mod_values[0];
+			} else {
+				/* counts of different modification values
+				   are the same.   Assume greater value is
+				   the correct one, all other things
+				   considered */
+				if (mod_values[0] > mod_values[1]) {
+					mod_number = mod_values[0];
+				} else {
+					mod_number = mod_values[1];
+				}
+
+			}
+			for (r = 0; r < raidPtr->numRow && !too_fatal ; r++) {
+				for (c = 0; c < raidPtr->numCol; c++) {
+					ci_label = &raidPtr->raid_cinfo[r][c].ci_label;
+					if (mod_number !=
+					    ci_label->mod_counter) {
+						if ( ( hosed_row == r ) &&
+						    ( hosed_column == c )) {
+							/* same one.  Can
+							   deal with it.  */
+						} else {
+							/* a second, distinct
+							   bad component is
+							   beyond repair */
+							hosed_row = r;
+							hosed_column = c;
+							if (num_ser != 1) {
+								too_fatal = 1;
+								break;
+							}
+						}
+					}
+				}
+			}
+			printf("Hosed component: %s\n",
+			    &cfgPtr->devnames[hosed_row][hosed_column][0]);
+			if (!force) {
+				/* we'll fail this component, as if there are
+				   other major errors, we aren't forcing things
+				   and we'll abort the config anyways */
+				if (raidPtr->Disks[hosed_row][hosed_column].status != rf_ds_failed) {
+					raidPtr->Disks[hosed_row][hosed_column].status
+					    = rf_ds_failed;
+					raidPtr->numFailures++;
+					raidPtr->status[hosed_row] = rf_rs_degraded;
+				}
+			}
+		} else {
+			too_fatal = 1;
+		}
+		if (cfgPtr->parityConfig == '0') {
+			/* We've identified two different mod counters.
+			   RAID 0 can't cope with that, so we'll punt */
+			too_fatal = 1;
+		}
+	}
+
+	raidPtr->mod_counter = mod_number;
+
+	if (too_fatal) {
+		/* we've had both a serial number mismatch, and a mod_counter
+		   mismatch -- and they involved two different components!!
+		   Bail -- make things fail so that the user must force
+		   the issue... */
+		hosed_row = -1;
+		hosed_column = -1;
+	}
+
+	if (num_ser > 2) {
+		printf("raid%d: Too many different serial numbers!\n",
+		    raidPtr->raidid);
+	}
+
+	if (num_mod > 2) {
+		printf("raid%d: Too many different mod counters!\n",
+		    raidPtr->raidid);
+	}
+
+	/* we start by assuming the parity will be good, and flee from
+	   that notion at the slightest sign of trouble */
+
+	parity_good = RF_RAID_CLEAN;
+	for (r = 0; r < raidPtr->numRow; r++) {
+		for (c = 0; c < raidPtr->numCol; c++) {
+			dev_name = &cfgPtr->devnames[r][c][0];
+			ci_label = &raidPtr->raid_cinfo[r][c].ci_label;
+
+			if ((r == hosed_row) && (c == hosed_column)) {
+				printf("raid%d: Ignoring %s\n",
+				    raidPtr->raidid, dev_name);
+			} else {
+				rf_print_label_status( raidPtr, r, c,
+				    dev_name, ci_label );
+				if (rf_check_label_vitals( raidPtr, r, c,
+				    dev_name, ci_label,
+				    serial_number,
+				    mod_number )) {
+					fatal_error = 1;
+				}
+				if (ci_label->clean != RF_RAID_CLEAN) {
+					parity_good = RF_RAID_DIRTY;
+				}
+			}
+		}
+	}
+	if (fatal_error) {
+		parity_good = RF_RAID_DIRTY;
+	}
+
+	/* we note the state of the parity */
+	raidPtr->parity_good = parity_good;
+
+	return(fatal_error);
+}
+
+/*
+ * Hot-add a spare disk to a running array.  The spare goes into the
+ * next free slot at the end of row zero (pre-allocated up to
+ * RF_MAXSPARE by rf_AllocDiskStructures) and gets its own disk queue.
+ * The spare must match the array block size and be at least
+ * sectorsPerDisk sectors.  Returns 0 on success or an errno.
+ */
+int
+rf_add_hot_spare(raidPtr, sparePtr)
+	RF_Raid_t *raidPtr;
+	RF_SingleComponent_t *sparePtr;
+{
+	RF_RaidDisk_t *disks;
+	RF_DiskQueue_t *spareQueues;
+	int ret;
+	unsigned int bs;
+	int spare_number;
+
+#if 0
+	printf("Just in rf_add_hot_spare: %d\n",raidPtr->numSpare);
+	printf("Num col: %d\n",raidPtr->numCol);
+#endif
+	if (raidPtr->numSpare >= RF_MAXSPARE) {
+		RF_ERRORMSG1("Too many spares: %d\n", raidPtr->numSpare);
+		return(EINVAL);
+	}
+
+	RF_LOCK_MUTEX(raidPtr->mutex);
+
+	/* the beginning of the spares... */
+	disks = &raidPtr->Disks[0][raidPtr->numCol];
+
+	spare_number = raidPtr->numSpare;
+
+	ret = rf_ConfigureDisk(raidPtr, sparePtr->component_name,
+	    &disks[spare_number], 0,
+	    raidPtr->numCol + spare_number);
+
+	if (ret)
+		goto fail;
+	if (disks[spare_number].status != rf_ds_optimal) {
+		RF_ERRORMSG1("Warning: spare disk %s failed TUR\n",
+		    sparePtr->component_name);
+		ret=EINVAL;
+		goto fail;
+	} else {
+		disks[spare_number].status = rf_ds_spare;
+		DPRINTF6("Spare Disk %d: dev %s numBlocks %ld blockSize %d (%ld MB)\n", spare_number,
+		    disks[spare_number].devname,
+		    (long int) disks[spare_number].numBlocks,
+		    disks[spare_number].blockSize,
+		    (long int) disks[spare_number].numBlocks *
+		    disks[spare_number].blockSize / 1024 / 1024);
+	}
+
+
+	/* check sizes and block sizes on the spare disk */
+	bs = 1 << raidPtr->logBytesPerSector;
+	if (disks[spare_number].blockSize != bs) {
+		RF_ERRORMSG3("Block size of %d on spare disk %s is not the same as on other disks (%d)\n", disks[spare_number].blockSize, disks[spare_number].devname, bs);
+		ret = EINVAL;
+		goto fail;
+	}
+	if (disks[spare_number].numBlocks < raidPtr->sectorsPerDisk) {
+		RF_ERRORMSG3("Spare disk %s (%d blocks) is too small to serve as a spare (need %ld blocks)\n",
+		    disks[spare_number].devname,
+		    disks[spare_number].blockSize,
+		    (long int) raidPtr->sectorsPerDisk);
+		ret = EINVAL;
+		goto fail;
+	} else {
+		/* oversize spares are truncated to the array disk size */
+		if (disks[spare_number].numBlocks >
+		    raidPtr->sectorsPerDisk) {
+			RF_ERRORMSG2("Warning: truncating spare disk %s to %ld blocks\n", disks[spare_number].devname,
+			    (long int) raidPtr->sectorsPerDisk);
+
+			disks[spare_number].numBlocks = raidPtr->sectorsPerDisk;
+		}
+	}
+
+	spareQueues = &raidPtr->Queues[0][raidPtr->numCol];
+	ret = rf_ConfigureDiskQueue( raidPtr, &spareQueues[spare_number],
+	    0, raidPtr->numCol + spare_number,
+	    raidPtr->qType,
+	    raidPtr->sectorsPerDisk,
+	    raidPtr->Disks[0][raidPtr->numCol +
+	    spare_number].dev,
+	    raidPtr->maxOutstanding,
+	    &raidPtr->shutdownList,
+	    raidPtr->cleanupList);
+	/* Don't commit the spare if its queue could not be set up;
+	   previously this error was silently dropped and the function
+	   returned success with numSpare incremented. */
+	if (ret)
+		goto fail;
+
+	raidPtr->numSpare++;
+	RF_UNLOCK_MUTEX(raidPtr->mutex);
+	return (0);
+
+fail:
+	RF_UNLOCK_MUTEX(raidPtr->mutex);
+	return(ret);
+}
+
+/*
+ * Remove a hot spare from the array.  Currently a stub: it validates
+ * that spares exist, then always returns EINVAL -- the real removal
+ * logic (the #if 0 sketch below) is not implemented yet.
+ */
+int
+rf_remove_hot_spare(raidPtr,sparePtr)
+	RF_Raid_t *raidPtr;
+	RF_SingleComponent_t *sparePtr;
+{
+	int spare_number;
+
+
+	if (raidPtr->numSpare==0) {
+		printf("No spares to remove!\n");
+		return(EINVAL);
+	}
+
+	spare_number = sparePtr->column;
+
+	return(EINVAL); /* XXX not implemented yet */
+#if 0
+	if (spare_number < 0 || spare_number > raidPtr->numSpare) {
+		return(EINVAL);
+	}
+
+	/* verify that this spare isn't in use... */
+
+
+
+
+	/* it's gone.. */
+
+	raidPtr->numSpare--;
+
+	return(0);
+#endif
+}
+
+
+/*
+ * Delete a component from the array.  Currently a stub: it validates
+ * the (row,column) coordinates and then returns EINVAL.  `disks' is
+ * computed but unused until the real implementation lands.
+ */
+int
+rf_delete_component(raidPtr,component)
+	RF_Raid_t *raidPtr;
+	RF_SingleComponent_t *component;
+{
+	RF_RaidDisk_t *disks;
+
+	if ((component->row < 0) ||
+	    (component->row >= raidPtr->numRow) ||
+	    (component->column < 0) ||
+	    (component->column >= raidPtr->numCol)) {
+		return(EINVAL);
+	}
+
+	disks = &raidPtr->Disks[component->row][component->column];
+
+	/* 1. This component must be marked as 'failed' */
+
+	return(EINVAL); /* Not implemented yet. */
+}
+
+/*
+ * Promote a hot spare into the set of regular components.  Currently
+ * a stub that always returns EINVAL.
+ */
+int
+rf_incorporate_hot_spare(raidPtr,component)
+	RF_Raid_t *raidPtr;
+	RF_SingleComponent_t *component;
+{
+
+	/* Issues here include how to 'move' this in if there is IO
+	   taking place (e.g. component queues and such) */
+
+	return(EINVAL); /* Not implemented yet. */
+}
diff --git a/sys/dev/raidframe/rf_disks.h b/sys/dev/raidframe/rf_disks.h
new file mode 100644
index 0000000..b57c4f8
--- /dev/null
+++ b/sys/dev/raidframe/rf_disks.h
@@ -0,0 +1,108 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_disks.h,v 1.8 2000/03/27 03:25:17 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * rf_disks.h -- header file for code related to physical disks
+ */
+
+#ifndef _RF__RF_DISKS_H_
+#define _RF__RF_DISKS_H_
+
+#include <sys/types.h>
+
+#include <dev/raidframe/rf_archs.h>
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_bsd.h>
+
+/*
+ * A physical disk can be in one of several states:
+ * IF YOU ADD A STATE, CHECK TO SEE IF YOU NEED TO MODIFY RF_DEAD_DISK() BELOW.
+ */
+enum RF_DiskStatus_e {
+ rf_ds_optimal, /* no problems */
+	rf_ds_failed,	/* disk has failed; not yet reconstructed */
+	rf_ds_reconstructing,	/* reconstruction to a spare is ongoing;
+				 * dead disk not yet replaced */
+ rf_ds_dist_spared, /* reconstruction complete to distributed
+ * spare space, dead disk not yet replaced */
+	rf_ds_spared,	/* reconstruction complete to a dedicated
+			 * spare, dead disk not yet replaced */
+ rf_ds_spare, /* an available spare disk */
+ rf_ds_used_spare /* a spare which has been used, and hence is
+ * not available */
+};
+typedef enum RF_DiskStatus_e RF_DiskStatus_t;
+
+struct RF_RaidDisk_s {
+ char devname[56]; /* name of device file */
+ RF_DiskStatus_t status; /* whether it is up or down */
+ RF_RowCol_t spareRow; /* if in status "spared", this identifies the
+ * spare disk */
+ RF_RowCol_t spareCol; /* if in status "spared", this identifies the
+ * spare disk */
+ RF_SectorCount_t numBlocks; /* number of blocks, obtained via READ
+ * CAPACITY */
+ int blockSize;
+ RF_SectorCount_t partitionSize; /* The *actual* and *full* size of
+ the partition, from the disklabel */
+ int auto_configured;/* 1 if this component was autoconfigured.
+ 0 otherwise. */
+ dev_t dev;
+};
+/*
+ * An RF_DiskOp_t ptr is really a pointer to a UAGT_CCB, but I want
+ * to isolate the cam layer from all other layers, so I typecast to/from
+ * RF_DiskOp_t * (i.e. void *) at the interfaces.
+ */
+typedef void RF_DiskOp_t;
+
+/* if a disk is in any of these states, it is inaccessible */
+#define RF_DEAD_DISK(_dstat_) (((_dstat_) == rf_ds_spared) || \
+ ((_dstat_) == rf_ds_reconstructing) || ((_dstat_) == rf_ds_failed) || \
+ ((_dstat_) == rf_ds_dist_spared))
+
+#ifdef _KERNEL
+#include <dev/raidframe/rf_bsd.h>
+
+int rf_ConfigureDisks(RF_ShutdownList_t ** listp, RF_Raid_t * raidPtr,
+ RF_Config_t * cfgPtr);
+int rf_ConfigureSpareDisks(RF_ShutdownList_t ** listp, RF_Raid_t * raidPtr,
+ RF_Config_t * cfgPtr);
+int rf_ConfigureDisk(RF_Raid_t * raidPtr, char *buf, RF_RaidDisk_t * diskPtr,
+ RF_RowCol_t row, RF_RowCol_t col);
+int rf_AutoConfigureDisks(RF_Raid_t *raidPtr, RF_Config_t *cfgPtr,
+ RF_AutoConfig_t *auto_config);
+int rf_CheckLabels( RF_Raid_t *, RF_Config_t *);
+int rf_add_hot_spare(RF_Raid_t *raidPtr, RF_SingleComponent_t *sparePtr);
+int rf_remove_hot_spare(RF_Raid_t *raidPtr, RF_SingleComponent_t *sparePtr);
+int rf_delete_component(RF_Raid_t *raidPtr, RF_SingleComponent_t *component);
+int rf_incorporate_hot_spare(RF_Raid_t *raidPtr,
+ RF_SingleComponent_t *component);
+#endif /* _KERNEL */
+#endif /* !_RF__RF_DISKS_H_ */
diff --git a/sys/dev/raidframe/rf_driver.c b/sys/dev/raidframe/rf_driver.c
new file mode 100644
index 0000000..3f3fe1a
--- /dev/null
+++ b/sys/dev/raidframe/rf_driver.c
@@ -0,0 +1,1048 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_driver.c,v 1.39 2000/12/15 02:12:58 oster Exp $ */
+/*-
+ * Copyright (c) 1999 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Greg Oster
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the NetBSD
+ * Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, Khalil Amiri, Claudson Bornstein, William V. Courtright II,
+ * Robby Findler, Daniel Stodolsky, Rachad Youssef, Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/******************************************************************************
+ *
+ * rf_driver.c -- main setup, teardown, and access routines for the RAID driver
+ *
+ * all routines are prefixed with rf_ (raidframe), to avoid conflicts.
+ *
+ ******************************************************************************/
+
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#if defined(__NetBSD__)
+#include <sys/ioctl.h>
+#elif defined(__FreeBSD__)
+#include <sys/ioccom.h>
+#include <sys/filio.h>
+#endif
+#include <sys/fcntl.h>
+#include <sys/vnode.h>
+
+
+#include <dev/raidframe/rf_archs.h>
+#include <dev/raidframe/rf_threadstuff.h>
+
+#include <sys/errno.h>
+
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_aselect.h>
+#include <dev/raidframe/rf_diskqueue.h>
+#include <dev/raidframe/rf_parityscan.h>
+#include <dev/raidframe/rf_alloclist.h>
+#include <dev/raidframe/rf_dagutils.h>
+#include <dev/raidframe/rf_utils.h>
+#include <dev/raidframe/rf_etimer.h>
+#include <dev/raidframe/rf_acctrace.h>
+#include <dev/raidframe/rf_configure.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_desc.h>
+#include <dev/raidframe/rf_states.h>
+#include <dev/raidframe/rf_freelist.h>
+#include <dev/raidframe/rf_decluster.h>
+#include <dev/raidframe/rf_map.h>
+#include <dev/raidframe/rf_revent.h>
+#include <dev/raidframe/rf_callback.h>
+#include <dev/raidframe/rf_engine.h>
+#include <dev/raidframe/rf_memchunk.h>
+#include <dev/raidframe/rf_mcpair.h>
+#include <dev/raidframe/rf_nwayxor.h>
+#include <dev/raidframe/rf_debugprint.h>
+#include <dev/raidframe/rf_copyback.h>
+#include <dev/raidframe/rf_driver.h>
+#include <dev/raidframe/rf_options.h>
+#include <dev/raidframe/rf_shutdown.h>
+#include <dev/raidframe/rf_kintf.h>
+
+#if defined(__FreeBSD__) && __FreeBSD_version > 500005
+#include <sys/bio.h>
+#endif
+
+#include <sys/buf.h>
+
+/* rad == RF_RaidAccessDesc_t */
+static RF_FreeList_t *rf_rad_freelist;
+#define RF_MAX_FREE_RAD 128
+#define RF_RAD_INC 16
+#define RF_RAD_INITIAL 32
+
+/* debug variables */
+char rf_panicbuf[2048]; /* a buffer to hold an error msg when we panic */
+
+/* main configuration routines */
+static int raidframe_booted = 0;
+
+static void rf_ConfigureDebug(RF_Config_t * cfgPtr);
+static void set_debug_option(char *name, long val);
+static void rf_UnconfigureArray(void);
+static int init_rad(RF_RaidAccessDesc_t *);
+static void clean_rad(RF_RaidAccessDesc_t *);
+static void rf_ShutdownRDFreeList(void *);
+static int rf_ConfigureRDFreeList(RF_ShutdownList_t **);
+
+RF_DECLARE_MUTEX(rf_printf_mutex) /* debug only: avoids interleaved
+ * printfs by different stripes */
+
+#define SIGNAL_QUIESCENT_COND(_raid_) wakeup(&((_raid_)->accesses_suspended))
+#define WAIT_FOR_QUIESCENCE(_raid_) \
+ RF_LTSLEEP(&((_raid_)->accesses_suspended), PRIBIO, \
+ "raidframe quiesce", 0, &((_raid_)->access_suspend_mutex))
+
+#if defined(__FreeBSD__) && __FreeBSD_version > 500005
+#define IO_BUF_ERR(bp, err) { \
+ bp->bio_flags |= BIO_ERROR; \
+ bp->bio_resid = bp->bio_bcount; \
+ bp->bio_error = err; \
+ biodone(bp); \
+};
+#else
+#define IO_BUF_ERR(bp, err) { \
+ bp->b_flags |= B_ERROR; \
+ bp->b_resid = bp->b_bcount; \
+ bp->b_error = err; \
+ biodone(bp); \
+}
+#endif
+
+static int configureCount = 0; /* number of active configurations */
+static int configInProgress = 0; /* configuration is in progress and code
+ * needs to be serialized. */
+static int isconfigged = 0; /* is basic raidframe (non per-array)
+ * stuff configged */
+RF_DECLARE_STATIC_MUTEX(configureMutex) /* used to lock the configuration
+ * stuff */
+static RF_ShutdownList_t *globalShutdown; /* non array-specific
+ * stuff */
+
+/* called at system boot time */
+int
+rf_BootRaidframe()
+{
+ int rc;
+
+ if (raidframe_booted)
+ return (EBUSY);
+ raidframe_booted = 1;
+
+ rc = rf_mutex_init(&configureMutex, __FUNCTION__);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ RF_PANIC();
+ }
+ configureCount = 0;
+ isconfigged = 0;
+ globalShutdown = NULL;
+ return (0);
+}
+/*
+ * This function is really just for debugging user-level stuff: it
+ * frees up all memory, other RAIDframe resources which might otherwise
+ * be kept around. This is used with systems like "sentinel" to detect
+ * memory leaks.
+ */
+int
+rf_UnbootRaidframe()
+{
+ int rc;
+
+ RF_LOCK_MUTEX(configureMutex);
+ if (configureCount) {
+ RF_UNLOCK_MUTEX(configureMutex);
+ return (EBUSY);
+ }
+ raidframe_booted = 0;
+ RF_UNLOCK_MUTEX(configureMutex);
+ rc = rf_mutex_destroy(&configureMutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to destroy mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ RF_PANIC();
+ }
+ return (0);
+}
+/*
+ * Called whenever an array is shutdown
+ */
+static void
+rf_UnconfigureArray()
+{
+ int rc;
+
+ RF_LOCK_MUTEX(configureMutex);
+ if (--configureCount == 0) { /* if no active configurations, shut
+ * everything down */
+ isconfigged = 0;
+
+ rc = rf_ShutdownList(&globalShutdown);
+ if (rc) {
+ RF_ERRORMSG1("RAIDFRAME: unable to do global shutdown, rc=%d\n", rc);
+ }
+
+ /*
+ * We must wait until now, because the AllocList module
+ * uses the DebugMem module.
+ */
+ if (rf_memDebug)
+ rf_print_unfreed();
+ }
+ RF_UNLOCK_MUTEX(configureMutex);
+}
+
+/*
+ * Called to shut down an array.
+ */
+int
+rf_Shutdown(raidPtr)
+ RF_Raid_t *raidPtr;
+{
+
+ if (!raidPtr->valid) {
+ RF_ERRORMSG("Attempt to shut down unconfigured RAIDframe driver. Aborting shutdown\n");
+ return (EINVAL);
+ }
+ /*
+ * wait for outstanding IOs to land
+ * As described in rf_raid.h, we use the rad_freelist lock
+ * to protect the per-array info about outstanding descs
+ * since we need to do freelist locking anyway, and this
+ * cuts down on the amount of serialization we've got going
+ * on.
+ */
+ RF_FREELIST_DO_LOCK(rf_rad_freelist);
+ if (raidPtr->waitShutdown) {
+ RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
+ return (EBUSY);
+ }
+ raidPtr->waitShutdown = 1;
+ while (raidPtr->nAccOutstanding) {
+ RF_WAIT_COND(raidPtr->outstandingCond, RF_FREELIST_MUTEX_OF(rf_rad_freelist));
+ }
+ RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
+
+ /* Wait for any parity re-writes to stop... */
+ while (raidPtr->parity_rewrite_in_progress) {
+ printf("Waiting for parity re-write to exit...\n");
+ tsleep(&raidPtr->parity_rewrite_in_progress, PRIBIO,
+ "rfprwshutdown", 0);
+ }
+
+ raidPtr->valid = 0;
+
+ rf_update_component_labels(raidPtr, RF_FINAL_COMPONENT_UPDATE);
+
+ rf_UnconfigureVnodes(raidPtr);
+
+ rf_ShutdownList(&raidPtr->shutdownList);
+
+ rf_UnconfigureArray();
+
+ return (0);
+}
+
+
+#define DO_INIT_CONFIGURE(f) { \
+ rc = f (&globalShutdown); \
+ if (rc) { \
+ RF_ERRORMSG2("RAIDFRAME: failed %s with %d\n", RF_STRING(f), rc); \
+ rf_ShutdownList(&globalShutdown); \
+ RF_LOCK_MUTEX(configureMutex); \
+ configInProgress = 0; \
+ configureCount--; \
+ RF_UNLOCK_MUTEX(configureMutex); \
+ return(rc); \
+ } \
+}
+
+#define DO_RAID_FAIL() { \
+ rf_UnconfigureVnodes(raidPtr); \
+ rf_ShutdownList(&raidPtr->shutdownList); \
+ rf_UnconfigureArray(); \
+}
+
+#define DO_RAID_INIT_CONFIGURE(f) { \
+ rc = f (&raidPtr->shutdownList, raidPtr, cfgPtr); \
+ if (rc) { \
+ RF_ERRORMSG2("RAIDFRAME: failed %s with %d\n", RF_STRING(f), rc); \
+ DO_RAID_FAIL(); \
+ return(rc); \
+ } \
+}
+
+#define DO_RAID_MUTEX(_m_) { \
+ rc = rf_create_managed_mutex(&raidPtr->shutdownList, (_m_)); \
+ if (rc) { \
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", \
+ __FILE__, __LINE__, rc); \
+ DO_RAID_FAIL(); \
+ return(rc); \
+ } \
+}
+
+#define DO_RAID_COND(_c_) { \
+ rc = rf_create_managed_cond(&raidPtr->shutdownList, (_c_)); \
+ if (rc) { \
+ RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", \
+ __FILE__, __LINE__, rc); \
+ DO_RAID_FAIL(); \
+ return(rc); \
+ } \
+}
+
+int
+rf_Configure(raidPtr, cfgPtr, ac)
+ RF_Raid_t *raidPtr;
+ RF_Config_t *cfgPtr;
+ RF_AutoConfig_t *ac;
+{
+ RF_RowCol_t row, col;
+ int i, rc;
+
+ /* XXX This check can probably be removed now, since
+ RAIDFRAME_CONFIGURE now checks to make sure that the
+ RAID set is not already valid
+ */
+ if (raidPtr->valid) {
+ RF_ERRORMSG("RAIDframe configuration not shut down. Aborting configure.\n");
+ return (EINVAL);
+ }
+ RF_LOCK_MUTEX(configureMutex);
+ if (configInProgress == 1) {
+ RF_UNLOCK_MUTEX(configureMutex);
+ return (EBUSY);
+ }
+ configureCount++;
+ if (isconfigged == 0) {
+ configInProgress = 1;
+ RF_UNLOCK_MUTEX(configureMutex);
+ rc = rf_create_managed_mutex(&globalShutdown, &rf_printf_mutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ rf_ShutdownList(&globalShutdown);
+ return (rc);
+ }
+ /* initialize globals */
+ printf("RAIDFRAME: protectedSectors is %ld\n",
+ rf_protectedSectors);
+
+ rf_clear_debug_print_buffer();
+
+ DO_INIT_CONFIGURE(rf_ConfigureAllocList);
+
+ /*
+ * Yes, this does make debugging general to the whole
+ * system instead of being array specific. Bummer, drag.
+ */
+ rf_ConfigureDebug(cfgPtr);
+ DO_INIT_CONFIGURE(rf_ConfigureDebugMem);
+ DO_INIT_CONFIGURE(rf_ConfigureAccessTrace);
+ DO_INIT_CONFIGURE(rf_ConfigureMapModule);
+ DO_INIT_CONFIGURE(rf_ConfigureReconEvent);
+ DO_INIT_CONFIGURE(rf_ConfigureCallback);
+ DO_INIT_CONFIGURE(rf_ConfigureMemChunk);
+ DO_INIT_CONFIGURE(rf_ConfigureRDFreeList);
+ DO_INIT_CONFIGURE(rf_ConfigureNWayXor);
+ DO_INIT_CONFIGURE(rf_ConfigureStripeLockFreeList);
+ DO_INIT_CONFIGURE(rf_ConfigureMCPair);
+ DO_INIT_CONFIGURE(rf_ConfigureDAGs);
+ DO_INIT_CONFIGURE(rf_ConfigureDAGFuncs);
+ DO_INIT_CONFIGURE(rf_ConfigureDebugPrint);
+ DO_INIT_CONFIGURE(rf_ConfigureReconstruction);
+ DO_INIT_CONFIGURE(rf_ConfigureCopyback);
+ DO_INIT_CONFIGURE(rf_ConfigureDiskQueueSystem);
+
+ RF_LOCK_MUTEX(configureMutex);
+ isconfigged = 1;
+ configInProgress = 0;
+ }
+ RF_UNLOCK_MUTEX(configureMutex);
+
+ DO_RAID_MUTEX(&raidPtr->mutex);
+ /* set up the cleanup list. Do this after ConfigureDebug so that
+ * value of memDebug will be set */
+
+ rf_MakeAllocList(raidPtr->cleanupList);
+ if (raidPtr->cleanupList == NULL) {
+ DO_RAID_FAIL();
+ return (ENOMEM);
+ }
+ rc = rf_ShutdownCreate(&raidPtr->shutdownList,
+ (void (*) (void *)) rf_FreeAllocList,
+ raidPtr->cleanupList);
+ if (rc) {
+ RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
+ __FILE__, __LINE__, rc);
+ DO_RAID_FAIL();
+ return (rc);
+ }
+ raidPtr->numRow = cfgPtr->numRow;
+ raidPtr->numCol = cfgPtr->numCol;
+ raidPtr->numSpare = cfgPtr->numSpare;
+
+ /* XXX we don't even pretend to support more than one row in the
+ * kernel... */
+ if (raidPtr->numRow != 1) {
+ RF_ERRORMSG("Only one row supported in kernel.\n");
+ DO_RAID_FAIL();
+ return (EINVAL);
+ }
+ RF_CallocAndAdd(raidPtr->status, raidPtr->numRow, sizeof(RF_RowStatus_t),
+ (RF_RowStatus_t *), raidPtr->cleanupList);
+ if (raidPtr->status == NULL) {
+ DO_RAID_FAIL();
+ return (ENOMEM);
+ }
+ RF_CallocAndAdd(raidPtr->reconControl, raidPtr->numRow,
+ sizeof(RF_ReconCtrl_t *), (RF_ReconCtrl_t **), raidPtr->cleanupList);
+ if (raidPtr->reconControl == NULL) {
+ DO_RAID_FAIL();
+ return (ENOMEM);
+ }
+ for (i = 0; i < raidPtr->numRow; i++) {
+ raidPtr->status[i] = rf_rs_optimal;
+ raidPtr->reconControl[i] = NULL;
+ }
+
+ DO_RAID_INIT_CONFIGURE(rf_ConfigureEngine);
+ DO_RAID_INIT_CONFIGURE(rf_ConfigureStripeLocks);
+
+ DO_RAID_COND(&raidPtr->outstandingCond);
+
+ raidPtr->nAccOutstanding = 0;
+ raidPtr->waitShutdown = 0;
+
+ DO_RAID_MUTEX(&raidPtr->access_suspend_mutex);
+ DO_RAID_COND(&raidPtr->quiescent_cond);
+
+ DO_RAID_COND(&raidPtr->waitForReconCond);
+
+ DO_RAID_MUTEX(&raidPtr->recon_done_proc_mutex);
+
+ if (ac!=NULL) {
+ /* We have an AutoConfig structure.. Don't do the
+ normal disk configuration... call the auto config
+ stuff */
+ rf_AutoConfigureDisks(raidPtr, cfgPtr, ac);
+ } else {
+ DO_RAID_INIT_CONFIGURE(rf_ConfigureDisks);
+ DO_RAID_INIT_CONFIGURE(rf_ConfigureSpareDisks);
+ }
+ /* do this after ConfigureDisks & ConfigureSpareDisks to be sure dev
+ * no. is set */
+ DO_RAID_INIT_CONFIGURE(rf_ConfigureDiskQueues);
+
+ DO_RAID_INIT_CONFIGURE(rf_ConfigureLayout);
+
+ DO_RAID_INIT_CONFIGURE(rf_ConfigurePSStatus);
+
+ for (row = 0; row < raidPtr->numRow; row++) {
+ for (col = 0; col < raidPtr->numCol; col++) {
+ /*
+ * XXX better distribution
+ */
+ raidPtr->hist_diskreq[row][col] = 0;
+ }
+ }
+
+ raidPtr->numNewFailures = 0;
+ raidPtr->copyback_in_progress = 0;
+ raidPtr->parity_rewrite_in_progress = 0;
+ raidPtr->recon_in_progress = 0;
+ raidPtr->maxOutstanding = cfgPtr->maxOutstandingDiskReqs;
+
+ /* autoconfigure and root_partition will actually get filled in
+ after the config is done */
+ raidPtr->autoconfigure = 0;
+ raidPtr->root_partition = 0;
+ raidPtr->last_unit = raidPtr->raidid;
+ raidPtr->config_order = 0;
+
+ if (rf_keepAccTotals) {
+ raidPtr->keep_acc_totals = 1;
+ }
+ rf_StartUserStats(raidPtr);
+
+ raidPtr->valid = 1;
+ return (0);
+}
+
+static int
+init_rad(desc)
+ RF_RaidAccessDesc_t *desc;
+{
+ int rc;
+
+ rc = rf_mutex_init(&desc->mutex, __FUNCTION__);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ return (rc);
+ }
+ rc = rf_cond_init(&desc->cond);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ rf_mutex_destroy(&desc->mutex);
+ return (rc);
+ }
+ return (0);
+}
+
+static void
+clean_rad(desc)
+ RF_RaidAccessDesc_t *desc;
+{
+ rf_mutex_destroy(&desc->mutex);
+ rf_cond_destroy(&desc->cond);
+}
+
+static void
+rf_ShutdownRDFreeList(ignored)
+ void *ignored;
+{
+ RF_FREELIST_DESTROY_CLEAN(rf_rad_freelist, next, (RF_RaidAccessDesc_t *), clean_rad);
+}
+
+static int
+rf_ConfigureRDFreeList(listp)
+ RF_ShutdownList_t **listp;
+{
+ int rc;
+
+ RF_FREELIST_CREATE(rf_rad_freelist, RF_MAX_FREE_RAD,
+ RF_RAD_INC, sizeof(RF_RaidAccessDesc_t));
+ if (rf_rad_freelist == NULL) {
+ return (ENOMEM);
+ }
+ rc = rf_ShutdownCreate(listp, rf_ShutdownRDFreeList, NULL);
+ if (rc) {
+ RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ rf_ShutdownRDFreeList(NULL);
+ return (rc);
+ }
+ RF_FREELIST_PRIME_INIT(rf_rad_freelist, RF_RAD_INITIAL, next,
+ (RF_RaidAccessDesc_t *), init_rad);
+ return (0);
+}
+
+RF_RaidAccessDesc_t *
+rf_AllocRaidAccDesc(
+ RF_Raid_t * raidPtr,
+ RF_IoType_t type,
+ RF_RaidAddr_t raidAddress,
+ RF_SectorCount_t numBlocks,
+ caddr_t bufPtr,
+ void *bp,
+ RF_DagHeader_t ** paramDAG,
+ RF_AccessStripeMapHeader_t ** paramASM,
+ RF_RaidAccessFlags_t flags,
+ void (*cbF) (RF_Buf_t),
+ void *cbA,
+ RF_AccessState_t * states)
+{
+ RF_RaidAccessDesc_t *desc;
+
+ RF_FREELIST_GET_INIT_NOUNLOCK(rf_rad_freelist, desc, next, (RF_RaidAccessDesc_t *), init_rad);
+ if (raidPtr->waitShutdown) {
+ /*
+ * Actually, we're shutting the array down. Free the desc
+ * and return NULL.
+ */
+ RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
+ RF_FREELIST_FREE_CLEAN(rf_rad_freelist, desc, next, clean_rad);
+ return (NULL);
+ }
+ raidPtr->nAccOutstanding++;
+ RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
+
+ desc->raidPtr = (void *) raidPtr;
+ desc->type = type;
+ desc->raidAddress = raidAddress;
+ desc->numBlocks = numBlocks;
+ desc->bufPtr = bufPtr;
+ desc->bp = bp;
+ desc->paramDAG = paramDAG;
+ desc->paramASM = paramASM;
+ desc->flags = flags;
+ desc->states = states;
+ desc->state = 0;
+
+ desc->status = 0;
+ bzero((char *) &desc->tracerec, sizeof(RF_AccTraceEntry_t));
+ desc->callbackFunc = (void (*) (RF_CBParam_t)) cbF; /* XXX */
+ desc->callbackArg = cbA;
+ desc->next = NULL;
+ desc->head = desc;
+ desc->numPending = 0;
+ desc->cleanupList = NULL;
+ rf_MakeAllocList(desc->cleanupList);
+ return (desc);
+}
+
+void
+rf_FreeRaidAccDesc(RF_RaidAccessDesc_t * desc)
+{
+ RF_Raid_t *raidPtr = desc->raidPtr;
+
+ RF_ASSERT(desc);
+
+ rf_FreeAllocList(desc->cleanupList);
+ RF_FREELIST_FREE_CLEAN_NOUNLOCK(rf_rad_freelist, desc, next, clean_rad);
+ raidPtr->nAccOutstanding--;
+ if (raidPtr->waitShutdown) {
+ RF_SIGNAL_COND(raidPtr->outstandingCond);
+ }
+ RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
+}
+/*********************************************************************
+ * Main routine for performing an access.
+ * Accesses are retried until a DAG can not be selected. This occurs
+ * when either the DAG library is incomplete or there are too many
+ * failures in a parity group.
+ ********************************************************************/
+int
+rf_DoAccess(
+ RF_Raid_t * raidPtr,
+ RF_IoType_t type,
+ int async_flag,
+ RF_RaidAddr_t raidAddress,
+ RF_SectorCount_t numBlocks,
+ caddr_t bufPtr,
+ void *bp_in,
+ RF_DagHeader_t ** paramDAG,
+ RF_AccessStripeMapHeader_t ** paramASM,
+ RF_RaidAccessFlags_t flags,
+ RF_RaidAccessDesc_t ** paramDesc,
+ void (*cbF) (RF_Buf_t),
+ void *cbA)
+/*
+type should be read or write
+async_flag should be RF_TRUE or RF_FALSE
+bp_in is a buf pointer. void * to facilitate ignoring it outside the kernel
+*/
+{
+ RF_RaidAccessDesc_t *desc;
+ caddr_t lbufPtr = bufPtr;
+ RF_Buf_t bp = (RF_Buf_t) bp_in;
+
+ raidAddress += rf_raidSectorOffset;
+
+ if (!raidPtr->valid) {
+ RF_ERRORMSG("RAIDframe driver not successfully configured. Rejecting access.\n");
+ IO_BUF_ERR(bp, EINVAL);
+ return (EINVAL);
+ }
+
+ if (rf_accessDebug) {
+
+ printf("logBytes is: %d %d %d\n", raidPtr->raidid,
+ raidPtr->logBytesPerSector,
+ (int) rf_RaidAddressToByte(raidPtr, numBlocks));
+ printf("raid%d: %s raidAddr %d (stripeid %d-%d) numBlocks %d (%d bytes) buf 0x%lx\n", raidPtr->raidid,
+ (type == RF_IO_TYPE_READ) ? "READ" : "WRITE", (int) raidAddress,
+ (int) rf_RaidAddressToStripeID(&raidPtr->Layout, raidAddress),
+ (int) rf_RaidAddressToStripeID(&raidPtr->Layout, raidAddress + numBlocks - 1),
+ (int) numBlocks,
+ (int) rf_RaidAddressToByte(raidPtr, numBlocks),
+ (long) bufPtr);
+ }
+ if (raidAddress + numBlocks > raidPtr->totalSectors) {
+
+ printf("DoAccess: raid addr %lu too large to access %lu sectors. Max legal addr is %lu\n",
+ (u_long) raidAddress, (u_long) numBlocks, (u_long) raidPtr->totalSectors);
+
+ IO_BUF_ERR(bp, ENOSPC);
+ return (ENOSPC);
+ }
+ desc = rf_AllocRaidAccDesc(raidPtr, type, raidAddress,
+ numBlocks, lbufPtr, bp, paramDAG, paramASM,
+ flags, cbF, cbA, raidPtr->Layout.map->states);
+
+ if (desc == NULL) {
+ return (ENOMEM);
+ }
+ RF_ETIMER_START(desc->tracerec.tot_timer);
+
+ desc->async_flag = async_flag;
+
+ rf_ContinueRaidAccess(desc);
+
+ return (0);
+}
+/* force the array into reconfigured mode without doing reconstruction */
+int
+rf_SetReconfiguredMode(raidPtr, row, col)
+ RF_Raid_t *raidPtr;
+ int row;
+ int col;
+{
+ if (!(raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
+ printf("Can't set reconfigured mode in dedicated-spare array\n");
+ RF_PANIC();
+ }
+ RF_LOCK_MUTEX(raidPtr->mutex);
+ raidPtr->numFailures++;
+ raidPtr->Disks[row][col].status = rf_ds_dist_spared;
+ raidPtr->status[row] = rf_rs_reconfigured;
+ rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);
+ /* install spare table only if declustering + distributed sparing
+ * architecture. */
+ if (raidPtr->Layout.map->flags & RF_BD_DECLUSTERED)
+ rf_InstallSpareTable(raidPtr, row, col);
+ RF_UNLOCK_MUTEX(raidPtr->mutex);
+ return (0);
+}
+
+extern int fail_row, fail_col, fail_time;
+extern int delayed_recon;
+
+int
+rf_FailDisk(
+ RF_Raid_t * raidPtr,
+ int frow,
+ int fcol,
+ int initRecon)
+{
+ printf("raid%d: Failing disk r%d c%d\n", raidPtr->raidid, frow, fcol);
+ RF_LOCK_MUTEX(raidPtr->mutex);
+ raidPtr->numFailures++;
+ raidPtr->Disks[frow][fcol].status = rf_ds_failed;
+ raidPtr->status[frow] = rf_rs_degraded;
+ rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);
+ RF_UNLOCK_MUTEX(raidPtr->mutex);
+ if (initRecon)
+ rf_ReconstructFailedDisk(raidPtr, frow, fcol);
+ return (0);
+}
+/* releases a thread that is waiting for the array to become quiesced.
+ * access_suspend_mutex should be locked upon calling this
+ */
+void
+rf_SignalQuiescenceLock(raidPtr, reconDesc)
+ RF_Raid_t *raidPtr;
+ RF_RaidReconDesc_t *reconDesc;
+{
+ if (rf_quiesceDebug) {
+ printf("raid%d: Signalling quiescence lock\n",
+ raidPtr->raidid);
+ }
+ raidPtr->access_suspend_release = 1;
+
+ if (raidPtr->waiting_for_quiescence) {
+ SIGNAL_QUIESCENT_COND(raidPtr);
+ }
+}
+/* suspends all new requests to the array. No effect on accesses that are in flight. */
+int
+rf_SuspendNewRequestsAndWait(raidPtr)
+ RF_Raid_t *raidPtr;
+{
+ if (rf_quiesceDebug)
+ printf("Suspending new reqs\n");
+
+ RF_LOCK_MUTEX(raidPtr->access_suspend_mutex);
+ raidPtr->accesses_suspended++;
+ raidPtr->waiting_for_quiescence = (raidPtr->accs_in_flight == 0) ? 0 : 1;
+
+ if (raidPtr->waiting_for_quiescence) {
+ raidPtr->access_suspend_release = 0;
+ while (!raidPtr->access_suspend_release) {
+ printf("Suspending: Waiting for Quiescence\n");
+ WAIT_FOR_QUIESCENCE(raidPtr);
+ raidPtr->waiting_for_quiescence = 0;
+ }
+ }
+ printf("Quiescence reached..\n");
+
+ RF_UNLOCK_MUTEX(raidPtr->access_suspend_mutex);
+ return (raidPtr->waiting_for_quiescence);
+}
+/* wake up everyone waiting for quiescence to be released */
+void
+rf_ResumeNewRequests(raidPtr)
+ RF_Raid_t *raidPtr;
+{
+ RF_CallbackDesc_t *t, *cb;
+
+ if (rf_quiesceDebug)
+ printf("Resuming new reqs\n");
+
+ RF_LOCK_MUTEX(raidPtr->access_suspend_mutex);
+ raidPtr->accesses_suspended--;
+ if (raidPtr->accesses_suspended == 0)
+ cb = raidPtr->quiesce_wait_list;
+ else
+ cb = NULL;
+ raidPtr->quiesce_wait_list = NULL;
+ RF_UNLOCK_MUTEX(raidPtr->access_suspend_mutex);
+
+ while (cb) {
+ t = cb;
+ cb = cb->next;
+ (t->callbackFunc) (t->callbackArg);
+ rf_FreeCallbackDesc(t);
+ }
+}
+/*****************************************************************************************
+ *
+ * debug routines
+ *
+ ****************************************************************************************/
+
+static void
+set_debug_option(name, val)
+ char *name;
+ long val;
+{
+ RF_DebugName_t *p;
+
+ for (p = rf_debugNames; p->name; p++) {
+ if (!strcmp(p->name, name)) {
+ *(p->ptr) = val;
+ printf("[Set debug variable %s to %ld]\n", name, val);
+ return;
+ }
+ }
+ RF_ERRORMSG1("Unknown debug string \"%s\"\n", name);
+}
+
+
+/* would like to use sscanf here, but apparently not available in kernel */
+/*ARGSUSED*/
+static void
+rf_ConfigureDebug(cfgPtr)
+ RF_Config_t *cfgPtr;
+{
+ char *val_p, *name_p, *white_p;
+ long val;
+ int i;
+
+ rf_ResetDebugOptions();
+ for (i = 0; cfgPtr->debugVars[i][0] && i < RF_MAXDBGV; i++) {
+ name_p = rf_find_non_white(&cfgPtr->debugVars[i][0]);
+ white_p = rf_find_white(name_p); /* skip to start of 2nd
+ * word */
+ val_p = rf_find_non_white(white_p);
+ if (*val_p == '0' && *(val_p + 1) == 'x')
+ val = rf_htoi(val_p + 2);
+ else
+ val = rf_atoi(val_p);
+ *white_p = '\0';
+ set_debug_option(name_p, val);
+ }
+}
+/* performance monitoring stuff */
+
+#define TIMEVAL_TO_US(t) (((long) t.tv_sec) * 1000000L + (long) t.tv_usec)
+
+#if !defined(_KERNEL) && !defined(SIMULATE)
+
+/*
+ * Throughput stats currently only used in user-level RAIDframe
+ */
+
+static int
+rf_InitThroughputStats(
+ RF_ShutdownList_t ** listp,
+ RF_Raid_t * raidPtr,
+ RF_Config_t * cfgPtr)
+{
+ int rc;
+
+ /* these used by user-level raidframe only */
+ rc = rf_create_managed_mutex(listp, &raidPtr->throughputstats.mutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ return (rc);
+ }
+ raidPtr->throughputstats.sum_io_us = 0;
+ raidPtr->throughputstats.num_ios = 0;
+ raidPtr->throughputstats.num_out_ios = 0;
+ return (0);
+}
+
+void
+rf_StartThroughputStats(RF_Raid_t * raidPtr)
+{
+ RF_LOCK_MUTEX(raidPtr->throughputstats.mutex);
+ raidPtr->throughputstats.num_ios++;
+ raidPtr->throughputstats.num_out_ios++;
+ if (raidPtr->throughputstats.num_out_ios == 1)
+ RF_GETTIME(raidPtr->throughputstats.start);
+ RF_UNLOCK_MUTEX(raidPtr->throughputstats.mutex);
+}
+
+static void
+rf_StopThroughputStats(RF_Raid_t * raidPtr)
+{
+ struct timeval diff;
+
+ RF_LOCK_MUTEX(raidPtr->throughputstats.mutex);
+ raidPtr->throughputstats.num_out_ios--;
+ if (raidPtr->throughputstats.num_out_ios == 0) {
+ RF_GETTIME(raidPtr->throughputstats.stop);
+ RF_TIMEVAL_DIFF(&raidPtr->throughputstats.start, &raidPtr->throughputstats.stop, &diff);
+ raidPtr->throughputstats.sum_io_us += TIMEVAL_TO_US(diff);
+ }
+ RF_UNLOCK_MUTEX(raidPtr->throughputstats.mutex);
+}
+
+static void
+rf_PrintThroughputStats(RF_Raid_t * raidPtr)
+{
+ RF_ASSERT(raidPtr->throughputstats.num_out_ios == 0);
+ if (raidPtr->throughputstats.sum_io_us != 0) {
+ printf("[Througphut: %8.2f IOs/second]\n", raidPtr->throughputstats.num_ios
+ / (raidPtr->throughputstats.sum_io_us / 1000000.0));
+ }
+}
+#endif /* !KERNEL && !SIMULATE */
+
+void
+rf_StartUserStats(RF_Raid_t * raidPtr)
+{
+ RF_GETTIME(raidPtr->userstats.start);
+ raidPtr->userstats.sum_io_us = 0;
+ raidPtr->userstats.num_ios = 0;
+ raidPtr->userstats.num_sect_moved = 0;
+}
+
+void
+rf_StopUserStats(RF_Raid_t * raidPtr)
+{
+ RF_GETTIME(raidPtr->userstats.stop);
+}
+
+void
+rf_UpdateUserStats(raidPtr, rt, numsect)
+ RF_Raid_t *raidPtr;
+ int rt; /* resp time in us */
+ int numsect; /* number of sectors for this access */
+{
+ raidPtr->userstats.sum_io_us += rt;
+ raidPtr->userstats.num_ios++;
+ raidPtr->userstats.num_sect_moved += numsect;
+}
+
+void
+rf_PrintUserStats(RF_Raid_t * raidPtr)
+{
+ long elapsed_us, mbs, mbs_frac;
+ struct timeval diff;
+
+ RF_TIMEVAL_DIFF(&raidPtr->userstats.start, &raidPtr->userstats.stop, &diff);
+ elapsed_us = TIMEVAL_TO_US(diff);
+
+	/* 2000 sectors per megabyte, 1000000 microseconds per second */
+ if (elapsed_us)
+ mbs = (raidPtr->userstats.num_sect_moved / 2000) / (elapsed_us / 1000000);
+ else
+ mbs = 0;
+
+ /* this computes only the first digit of the fractional mb/s moved */
+ if (elapsed_us) {
+ mbs_frac = ((raidPtr->userstats.num_sect_moved / 200) / (elapsed_us / 1000000))
+ - (mbs * 10);
+ } else {
+ mbs_frac = 0;
+ }
+
+ printf("Number of I/Os: %ld\n", raidPtr->userstats.num_ios);
+ printf("Elapsed time (us): %ld\n", elapsed_us);
+ printf("User I/Os per second: %ld\n", RF_DB0_CHECK(raidPtr->userstats.num_ios, (elapsed_us / 1000000)));
+ printf("Average user response time: %ld us\n", RF_DB0_CHECK(raidPtr->userstats.sum_io_us, raidPtr->userstats.num_ios));
+ printf("Total sectors moved: %ld\n", raidPtr->userstats.num_sect_moved);
+ printf("Average access size (sect): %ld\n", RF_DB0_CHECK(raidPtr->userstats.num_sect_moved, raidPtr->userstats.num_ios));
+ printf("Achieved data rate: %ld.%ld MB/sec\n", mbs, mbs_frac);
+}
+
+
+void
+rf_print_panic_message(line,file)
+ int line;
+ char *file;
+{
+ sprintf(rf_panicbuf,"raidframe error at line %d file %s",
+ line, file);
+}
+
+void
+rf_print_assert_panic_message(line,file,condition)
+ int line;
+ char *file;
+ char *condition;
+{
+ sprintf(rf_panicbuf,
+ "raidframe error at line %d file %s (failed asserting %s)\n",
+ line, file, condition);
+}
diff --git a/sys/dev/raidframe/rf_driver.h b/sys/dev/raidframe/rf_driver.h
new file mode 100644
index 0000000..8b156c5
--- /dev/null
+++ b/sys/dev/raidframe/rf_driver.h
@@ -0,0 +1,79 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_driver.h,v 1.4 2000/02/13 04:53:57 oster Exp $ */
+/*
+ * rf_driver.h
+ */
+/*
+ * Copyright (c) 1996 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#ifndef _RF__RF_DRIVER_H_
+#define _RF__RF_DRIVER_H_
+
+#include <dev/raidframe/rf_threadstuff.h>
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_bsd.h>
+
+/*
+ * Use #ifdef rather than "#if _KERNEL": the latter is a preprocessor
+ * syntax error when _KERNEL is defined to an empty token, and silently
+ * evaluates to 0 when it is undefined.
+ */
+#ifdef _KERNEL
+RF_DECLARE_EXTERN_MUTEX(rf_printf_mutex)
+int rf_BootRaidframe(void);
+int rf_UnbootRaidframe(void);
+int rf_Shutdown(RF_Raid_t * raidPtr);
+int rf_Configure(RF_Raid_t * raidPtr, RF_Config_t * cfgPtr,
+	RF_AutoConfig_t *ac);
+RF_RaidAccessDesc_t *rf_AllocRaidAccDesc(RF_Raid_t * raidPtr, RF_IoType_t type,
+	RF_RaidAddr_t raidAddress,
+	RF_SectorCount_t numBlocks,
+	caddr_t bufPtr,
+	void *bp, RF_DagHeader_t ** paramDAG,
+	RF_AccessStripeMapHeader_t ** paramASM,
+	RF_RaidAccessFlags_t flags,
+	void (*cbF) (RF_Buf_t),
+	void *cbA,
+	RF_AccessState_t * states);
+void rf_FreeRaidAccDesc(RF_RaidAccessDesc_t * desc);
+int rf_DoAccess(RF_Raid_t * raidPtr, RF_IoType_t type, int async_flag,
+	RF_RaidAddr_t raidAddress, RF_SectorCount_t numBlocks,
+	caddr_t bufPtr, void *bp_in, RF_DagHeader_t ** paramDAG,
+	RF_AccessStripeMapHeader_t ** paramASM,
+	RF_RaidAccessFlags_t flags,
+	RF_RaidAccessDesc_t ** paramDesc,
+	void (*cbF) (RF_Buf_t), void *cbA);
+int rf_SetReconfiguredMode(RF_Raid_t * raidPtr, RF_RowCol_t row,
+	RF_RowCol_t col);
+int rf_FailDisk(RF_Raid_t * raidPtr, RF_RowCol_t frow, RF_RowCol_t fcol,
+	int initRecon);
+void rf_SignalQuiescenceLock(RF_Raid_t * raidPtr,
+	RF_RaidReconDesc_t * reconDesc);
+int rf_SuspendNewRequestsAndWait(RF_Raid_t * raidPtr);
+void rf_ResumeNewRequests(RF_Raid_t * raidPtr);
+void rf_StartThroughputStats(RF_Raid_t * raidPtr);
+void rf_StartUserStats(RF_Raid_t * raidPtr);
+void rf_StopUserStats(RF_Raid_t * raidPtr);
+void rf_UpdateUserStats(RF_Raid_t * raidPtr, int rt, int numsect);
+void rf_PrintUserStats(RF_Raid_t * raidPtr);
+#endif /* _KERNEL */
+#endif /* !_RF__RF_DRIVER_H_ */
diff --git a/sys/dev/raidframe/rf_engine.c b/sys/dev/raidframe/rf_engine.c
new file mode 100644
index 0000000..ddd5612
--- /dev/null
+++ b/sys/dev/raidframe/rf_engine.c
@@ -0,0 +1,810 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_engine.c,v 1.10 2000/08/20 16:51:03 thorpej Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: William V. Courtright II, Mark Holland, Rachad Youssef
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/****************************************************************************
+ * *
+ * engine.c -- code for DAG execution engine *
+ * *
+ * Modified to work as follows (holland): *
+ * A user-thread calls into DispatchDAG, which fires off the nodes that *
+ * are direct successors to the header node. DispatchDAG then returns, *
+ * and the rest of the I/O continues asynchronously. As each node *
+ * completes, the node execution function calls FinishNode(). FinishNode *
+ * scans the list of successors to the node and increments the antecedent *
+ * counts. Each node that becomes enabled is placed on a central node *
+ * queue. A dedicated dag-execution thread grabs nodes off of this *
+ * queue and fires them. *
+ * *
+ * NULL nodes are never fired. *
+ * *
+ * Terminator nodes are never fired, but rather cause the callback *
+ * associated with the DAG to be invoked. *
+ * *
+ * If a node fails, the dag either rolls forward to the completion or *
+ * rolls back, undoing previously-completed nodes and fails atomically. *
+ * The direction of recovery is determined by the location of the failed *
+ * node in the graph. If the failure occurred before the commit node in *
+ * the graph, backward recovery is used. Otherwise, forward recovery is *
+ * used. *
+ * *
+ ****************************************************************************/
+
+#include <dev/raidframe/rf_threadstuff.h>
+
+#include <sys/errno.h>
+
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_engine.h>
+#include <dev/raidframe/rf_etimer.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_dagutils.h>
+#include <dev/raidframe/rf_shutdown.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_kintf.h>
+
+static void DAGExecutionThread(RF_ThreadArg_t arg);
+
+#define DO_INIT(_l_,_r_) { \
+ int _rc; \
+ _rc = rf_create_managed_mutex(_l_,&(_r_)->node_queue_mutex); \
+ if (_rc) { \
+ return(_rc); \
+ } \
+ _rc = rf_create_managed_cond(_l_,&(_r_)->node_queue_cond); \
+ if (_rc) { \
+ return(_rc); \
+ } \
+}
+
+/* synchronization primitives for this file. DO_WAIT should be enclosed in a while loop. */
+
+/*
+ * XXX Is this spl-ing really necessary?
+ */
+#define DO_LOCK(_r_) \
+do { \
+ ks = splbio(); \
+ RF_LOCK_MUTEX((_r_)->node_queue_mutex); \
+} while (0)
+
+#define DO_UNLOCK(_r_) \
+do { \
+ RF_UNLOCK_MUTEX((_r_)->node_queue_mutex); \
+ splx(ks); \
+} while (0)
+
+#define DO_WAIT(_r_) \
+ RF_WAIT_COND((_r_)->node_queue, (_r_)->node_queue_mutex)
+
+#define DO_SIGNAL(_r_) \
+ RF_BROADCAST_COND((_r_)->node_queue) /* XXX RF_SIGNAL_COND? */
+
+static void rf_ShutdownEngine(void *);
+
+/*
+ * Shutdown-list hook: request that the array's DAG execution thread
+ * exit.  Setting the flag alone is not enough -- the thread may be
+ * asleep on the node-queue condition, so wake it afterwards.
+ */
+static void
+rf_ShutdownEngine(void *arg)
+{
+	RF_Raid_t *raidPtr = (RF_Raid_t *) arg;
+
+	raidPtr->shutdown_engine = 1;
+	DO_SIGNAL(raidPtr);
+}
+
+/*
+ * Set up the DAG engine for one array: create the node-queue mutex and
+ * condition variable, initialize the engine thread group, and start the
+ * per-array DAG execution thread.  Returns 0 on success, an errno on
+ * failure.
+ *
+ * Note: DO_INIT() expands to code that returns from this function
+ * directly if mutex/cond creation fails.
+ */
+int
+rf_ConfigureEngine(
+	RF_ShutdownList_t ** listp,
+	RF_Raid_t * raidPtr,
+	RF_Config_t * cfgPtr)
+{
+	int rc;
+
+	/* creates node_queue_mutex and node_queue_cond; may return early */
+	DO_INIT(listp, raidPtr);
+
+	raidPtr->node_queue = NULL;
+	raidPtr->dags_in_flight = 0;
+
+	rc = rf_init_managed_threadgroup(listp, &raidPtr->engine_tg);
+	if (rc)
+		return (rc);
+
+	/* we create the execution thread only once per system boot. no need
+	 * to check return code b/c the kernel panics if it can't create the
+	 * thread. */
+	if (rf_engineDebug) {
+		printf("raid%d: Creating engine thread\n", raidPtr->raidid);
+	}
+	if (RF_CREATE_THREAD(raidPtr->engine_thread, DAGExecutionThread, raidPtr,"raid")) {
+		RF_ERRORMSG("RAIDFRAME: Unable to create engine thread\n");
+		return (ENOMEM);
+	}
+	if (rf_engineDebug) {
+		printf("raid%d: Created engine thread\n", raidPtr->raidid);
+	}
+	RF_THREADGROUP_STARTED(&raidPtr->engine_tg);
+	/* XXX something is missing here... */
+#ifdef debug
+	printf("Skipping the WAIT_START!!\n");
+#endif
+#if 1
+	/* NOTE(review): this printf is not gated on rf_engineDebug -- looks
+	 * like leftover debugging; confirm before removing. */
+	printf("Waiting for DAG engine to start\n");
+	RF_THREADGROUP_WAIT_START(&raidPtr->engine_tg);
+#endif
+	/* engine thread is now running and waiting for work */
+	if (rf_engineDebug) {
+		printf("raid%d: Engine thread running and waiting for events\n", raidPtr->raidid);
+	}
+	rc = rf_ShutdownCreate(listp, rf_ShutdownEngine, raidPtr);
+	if (rc) {
+		RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__,
+		    __LINE__, rc);
+		/*
+		 * BUG FIX: the original passed NULL here, which
+		 * rf_ShutdownEngine() immediately dereferences.  Pass the
+		 * array so the engine thread we just started is told to exit.
+		 */
+		rf_ShutdownEngine(raidPtr);
+	}
+	return (rc);
+}
+
+/*
+ * Return RF_TRUE if forward execution is complete for this node and,
+ * recursively, for all of its succedents; RF_FALSE otherwise.
+ */
+static int
+BranchDone(RF_DagNode_t * node)
+{
+	int i;
+
+	switch (node->status) {
+	case rf_wait:
+		/* should never be called in this state */
+		RF_PANIC();
+		break;
+	case rf_fired:
+		/* node is currently executing, so we're not done */
+		return (RF_FALSE);
+	case rf_good:
+		/* recursively check each succedent branch */
+		for (i = 0; i < node->numSuccedents; i++)
+			if (!BranchDone(node->succedents[i]))
+				return (RF_FALSE);
+		/* node and all succedent branches aren't in fired state */
+		return (RF_TRUE);
+	case rf_bad:
+		/* succedents can't fire */
+		return (RF_TRUE);
+	case rf_recover:
+		/* should never be called in this state */
+		RF_PANIC();
+		break;
+	case rf_undone:
+	case rf_panic:
+		/* XXX need to fix this case */
+		/* for now, assume that we're done */
+		return (RF_TRUE);
+	default:
+		/* illegal node status */
+		RF_PANIC();
+		break;
+	}
+	/*
+	 * NOTREACHED -- the RF_PANIC() paths above don't return, but the
+	 * original fell off the end of a value-returning function here,
+	 * which is undefined behavior if RF_PANIC() isn't marked noreturn.
+	 */
+	return (RF_FALSE);
+}
+
+/*
+ * Decide whether a node is eligible to fire, given the direction of
+ * DAG execution recorded in its header.
+ */
+static int
+NodeReady(RF_DagNode_t * node)
+{
+	int ready = RF_FALSE;
+
+	switch (node->dagHdr->status) {
+	case rf_enable:
+	case rf_rollForward:
+		/* forward: a waiting node whose antecedents all completed */
+		if ((node->status == rf_wait) &&
+		    (node->numAntecedents == node->numAntDone))
+			ready = RF_TRUE;
+		break;
+	case rf_rollBackward:
+		RF_ASSERT(node->numSuccDone <= node->numSuccedents);
+		RF_ASSERT(node->numSuccFired <= node->numSuccedents);
+		RF_ASSERT(node->numSuccFired <= node->numSuccDone);
+		/* backward: a completed node whose succedents all finished */
+		if ((node->status == rf_good) &&
+		    (node->numSuccDone == node->numSuccedents))
+			ready = RF_TRUE;
+		break;
+	default:
+		printf("Execution engine found illegal DAG status in NodeReady\n");
+		RF_PANIC();
+		break;
+	}
+
+	return (ready);
+}
+
+
+
+/* user context and dag-exec-thread context:
+ * Fire a node. The node's status field determines which function, do or undo,
+ * to be fired.
+ * This routine assumes that the node's status field has already been set to
+ * "fired" or "recover" to indicate the direction of execution.
+ */
+static void
+FireNode(RF_DagNode_t * node)
+{
+	switch (node->status) {
+	case rf_fired:
+		/* fire the do function of a node */
+		if (rf_engineDebug) {
+			printf("raid%d: Firing node 0x%lx (%s)\n",
+			    node->dagHdr->raidPtr->raidid,
+			    (unsigned long) node, node->name);
+		}
+		if (node->flags & RF_DAGNODE_FLAG_YIELD) {
+/* NOTE(review): "&&" binds tighter than "||" here, so plain __NetBSD__
+ * (even outside _KERNEL) selects the empty branch -- confirm intent. */
+#if defined(__NetBSD__) || defined(__FreeBSD__) && defined(_KERNEL)
+			/* thread_block(); */
+			/* printf("Need to block the thread here...\n"); */
+			/* XXX thread_block is actually mentioned in
+			 * /usr/include/vm/vm_extern.h */
+#else
+			thread_block();
+#endif
+		}
+		(*(node->doFunc)) (node);
+		break;
+	case rf_recover:
+		/* fire the undo function of a node */
+		if (rf_engineDebug) {
+			printf("raid%d: Firing (undo) node 0x%lx (%s)\n",
+			    node->dagHdr->raidPtr->raidid,
+			    (unsigned long) node, node->name);
+		}
+		/*
+		 * BUG FIX: the original left this "if" unbraced.  In kernel
+		 * builds the preprocessor reduces its body to comments, so
+		 * the undoFunc call below became the body of the "if" and
+		 * was executed only when RF_DAGNODE_FLAG_YIELD was set.
+		 * Bracing the (possibly empty) body makes the undo function
+		 * fire unconditionally, matching the rf_fired case.
+		 */
+		if (node->flags & RF_DAGNODE_FLAG_YIELD) {
+#if defined(__NetBSD__) || defined(__FreeBSD__) && defined(_KERNEL)
+			/* thread_block(); */
+			/* printf("Need to block the thread here...\n"); */
+			/* XXX thread_block is actually mentioned in
+			 * /usr/include/vm/vm_extern.h */
+#else
+			thread_block();
+#endif
+		}
+		(*(node->undoFunc)) (node);
+		break;
+	default:
+		RF_PANIC();
+		break;
+	}
+}
+
+
+
+/* user context:
+ * Attempt to fire each node in a linear array.
+ * The entire list is fired atomically.
+ */
+static void
+FireNodeArray(
+	int numNodes,
+	RF_DagNode_t ** nodeList)
+{
+	RF_DagNode_t *np;
+	RF_DagStatus_t hdrstat;
+	int n, k;
+
+	/* pass 1: mark every node that is ready to be fired */
+	for (n = 0; n < numNodes; n++) {
+		np = nodeList[n];
+		hdrstat = np->dagHdr->status;
+		RF_ASSERT((np->status == rf_wait) || (np->status == rf_good));
+		if (!NodeReady(np))
+			continue;
+		if ((hdrstat == rf_enable) || (hdrstat == rf_rollForward)) {
+			/* forward execution: fire a waiting node */
+			RF_ASSERT(np->status == rf_wait);
+			if (np->commitNode)
+				np->dagHdr->numCommits++;
+			np->status = rf_fired;
+			for (k = 0; k < np->numAntecedents; k++)
+				np->antecedents[k]->numSuccFired++;
+		} else {
+			/* rollback: undo a previously-completed node */
+			RF_ASSERT(hdrstat == rf_rollBackward);
+			RF_ASSERT(np->status == rf_good);
+			/* only one commit node per graph */
+			RF_ASSERT(np->commitNode == RF_FALSE);
+			np->status = rf_recover;
+		}
+	}
+	/* pass 2: fire the nodes marked above */
+	for (n = 0; n < numNodes; n++) {
+		np = nodeList[n];
+		if ((np->status == rf_fired) || (np->status == rf_recover))
+			FireNode(np);
+	}
+}
+
+
+/* user context:
+ * Attempt to fire each node in a linked list.
+ * The entire list is fired atomically.
+ */
+static void
+FireNodeList(RF_DagNode_t * nodeList)
+{
+	RF_DagNode_t *np, *nxt;
+	RF_DagStatus_t hdrstat;
+	int k;
+
+	if (nodeList == NULL)
+		return;
+
+	/* pass 1: mark every node that is ready to be fired */
+	for (np = nodeList; np; np = nxt) {
+		nxt = np->next;
+		hdrstat = np->dagHdr->status;
+		RF_ASSERT((np->status == rf_wait) || (np->status == rf_good));
+		if (!NodeReady(np))
+			continue;
+		if ((hdrstat == rf_enable) || (hdrstat == rf_rollForward)) {
+			/* forward execution: fire a waiting node */
+			RF_ASSERT(np->status == rf_wait);
+			if (np->commitNode)
+				np->dagHdr->numCommits++;
+			np->status = rf_fired;
+			for (k = 0; k < np->numAntecedents; k++)
+				np->antecedents[k]->numSuccFired++;
+		} else {
+			/* rollback: undo a previously-completed node */
+			RF_ASSERT(hdrstat == rf_rollBackward);
+			RF_ASSERT(np->status == rf_good);
+			/* only one commit node per graph */
+			RF_ASSERT(np->commitNode == RF_FALSE);
+			np->status = rf_recover;
+		}
+	}
+	/* pass 2: fire the nodes marked above */
+	for (np = nodeList; np; np = nxt) {
+		nxt = np->next;
+		if ((np->status == rf_fired) || (np->status == rf_recover))
+			FireNode(np);
+	}
+}
+/* interrupt context:
+ * for each succedent
+ *	propagate required results from node to succedent
+ *	increment succedent's numAntDone
+ *	place newly-enabled nodes on node queue for firing
+ *
+ * To save context switches, we don't place NIL nodes on the node queue,
+ * but rather just process them as if they had fired. Note that NIL nodes
+ * that are the direct successors of the header will actually get fired by
+ * DispatchDAG, which is fine because no context switches are involved.
+ *
+ * Important: when running at user level, this can be called by any
+ * disk thread, and so the increment and check of the antecedent count
+ * must be locked. I used the node queue mutex and locked down the
+ * entire function, but this is certainly overkill.
+ */
+static void
+PropagateResults(
+	RF_DagNode_t * node,
+	int context)
+{
+	RF_DagNode_t *s, *a;
+	RF_Raid_t *raidPtr;
+	int i, ks;			/* ks is the spl cookie referenced
+					 * implicitly by DO_LOCK/DO_UNLOCK */
+	RF_DagNode_t *finishlist = NULL;	/* a list of NIL nodes to be
+						 * finished */
+	RF_DagNode_t *skiplist = NULL;	/* list of nodes with failed truedata
+					 * antecedents */
+	RF_DagNode_t *firelist = NULL;	/* a list of nodes to be fired */
+	RF_DagNode_t *q = NULL, *qh = NULL, *next;
+	int j, skipNode;
+
+	raidPtr = node->dagHdr->raidPtr;
+
+	/* node_queue_mutex guards both the per-node counters and the
+	 * central node queue */
+	DO_LOCK(raidPtr);
+
+	/* debug - validate fire counts */
+	for (i = 0; i < node->numAntecedents; i++) {
+		a = *(node->antecedents + i);
+		RF_ASSERT(a->numSuccFired >= a->numSuccDone);
+		RF_ASSERT(a->numSuccFired <= a->numSuccedents);
+		a->numSuccDone++;
+	}
+
+	switch (node->dagHdr->status) {
+	case rf_enable:
+	case rf_rollForward:
+		/* forward execution: each newly-enabled succedent goes to
+		 * one of four places -- finishlist (NIL nodes), skiplist
+		 * (failed truedata antecedent), firelist (fire from this
+		 * context after unlocking) or the local q/qh queue (hand
+		 * off to the dag-exec thread). */
+		for (i = 0; i < node->numSuccedents; i++) {
+			s = *(node->succedents + i);
+			RF_ASSERT(s->status == rf_wait);
+			(s->numAntDone)++;
+			if (s->numAntDone == s->numAntecedents) {
+				/* look for NIL nodes */
+				if (s->doFunc == rf_NullNodeFunc) {
+					/* don't fire NIL nodes, just process
+					 * them */
+					s->next = finishlist;
+					finishlist = s;
+				} else {
+					/* look to see if the node is to be
+					 * skipped */
+					skipNode = RF_FALSE;
+					for (j = 0; j < s->numAntecedents; j++)
+						if ((s->antType[j] == rf_trueData) && (s->antecedents[j]->status == rf_bad))
+							skipNode = RF_TRUE;
+					if (skipNode) {
+						/* this node has one or more
+						 * failed true data
+						 * dependencies, so skip it */
+						s->next = skiplist;
+						skiplist = s;
+					} else
+						/* add s to list of nodes (q)
+						 * to execute */
+						if (context != RF_INTR_CONTEXT) {
+							/* we only have to
+							 * enqueue if we're at
+							 * intr context */
+							s->next = firelist;	/* put node on a list to
+									 * be fired after we
+									 * unlock */
+							firelist = s;
+						} else {	/* enqueue the node for
+								 * the dag exec thread
+								 * to fire */
+							RF_ASSERT(NodeReady(s));
+							if (q) {
+								/* the tail's ->next is
+								 * left stale here; it is
+								 * repaired when the list
+								 * is spliced onto
+								 * node_queue below */
+								q->next = s;
+								q = s;
+							} else {
+								qh = q = s;
+								qh->next = NULL;
+							}
+						}
+				}
+			}
+		}
+
+		if (q) {
+			/* xfer our local list of nodes to the node queue */
+			q->next = raidPtr->node_queue;
+			raidPtr->node_queue = qh;
+			DO_SIGNAL(raidPtr);
+		}
+		DO_UNLOCK(raidPtr);
+
+		/* skipped nodes: mark skipped, credit their antecedents'
+		 * fire counts, and finish them (propagates recursively) */
+		for (; skiplist; skiplist = next) {
+			next = skiplist->next;
+			skiplist->status = rf_skipped;
+			for (i = 0; i < skiplist->numAntecedents; i++) {
+				skiplist->antecedents[i]->numSuccFired++;
+			}
+			if (skiplist->commitNode) {
+				skiplist->dagHdr->numCommits++;
+			}
+			rf_FinishNode(skiplist, context);
+		}
+		for (; finishlist; finishlist = next) {
+			/* NIL nodes: no need to fire them */
+			next = finishlist->next;
+			finishlist->status = rf_good;
+			for (i = 0; i < finishlist->numAntecedents; i++) {
+				finishlist->antecedents[i]->numSuccFired++;
+			}
+			if (finishlist->commitNode)
+				finishlist->dagHdr->numCommits++;
+			/*
+			 * Okay, here we're calling rf_FinishNode() on nodes that
+			 * have the null function as their work proc. Such a node
+			 * could be the terminal node in a DAG. If so, it will
+			 * cause the DAG to complete, which will in turn free
+			 * memory used by the DAG, which includes the node in
+			 * question. Thus, we must avoid referencing the node
+			 * at all after calling rf_FinishNode() on it.
+			 */
+			rf_FinishNode(finishlist, context);	/* recursive call */
+		}
+		/* fire all nodes in firelist */
+		FireNodeList(firelist);
+		break;
+
+	case rf_rollBackward:
+		/* backward execution: walk the antecedents instead of the
+		 * succedents, scheduling undo functions.  Note that, unlike
+		 * the forward case, numSuccFired is not credited for NIL
+		 * nodes here. */
+		for (i = 0; i < node->numAntecedents; i++) {
+			a = *(node->antecedents + i);
+			RF_ASSERT(a->status == rf_good);
+			RF_ASSERT(a->numSuccDone <= a->numSuccedents);
+			RF_ASSERT(a->numSuccDone <= a->numSuccFired);
+
+			if (a->numSuccDone == a->numSuccFired) {
+				if (a->undoFunc == rf_NullNodeFunc) {
+					/* don't fire NIL nodes, just process
+					 * them */
+					a->next = finishlist;
+					finishlist = a;
+				} else {
+					if (context != RF_INTR_CONTEXT) {
+						/* we only have to enqueue if
+						 * we're at intr context */
+						a->next = firelist;	/* put node on a list to
+								 * be fired after we
+								 * unlock */
+						firelist = a;
+					} else {	/* enqueue the node for
+							 * the dag exec thread
+							 * to fire */
+						RF_ASSERT(NodeReady(a));
+						if (q) {
+							q->next = a;
+							q = a;
+						} else {
+							qh = q = a;
+							qh->next = NULL;
+						}
+					}
+				}
+			}
+		}
+		if (q) {
+			/* xfer our local list of nodes to the node queue */
+			q->next = raidPtr->node_queue;
+			raidPtr->node_queue = qh;
+			DO_SIGNAL(raidPtr);
+		}
+		DO_UNLOCK(raidPtr);
+		for (; finishlist; finishlist = next) {	/* NIL nodes: no need to
+							 * fire them */
+			next = finishlist->next;
+			finishlist->status = rf_good;
+			/*
+			 * Okay, here we're calling rf_FinishNode() on nodes that
+			 * have the null function as their work proc. Such a node
+			 * could be the first node in a DAG. If so, it will
+			 * cause the DAG to complete, which will in turn free
+			 * memory used by the DAG, which includes the node in
+			 * question. Thus, we must avoid referencing the node
+			 * at all after calling rf_FinishNode() on it.
+			 */
+			rf_FinishNode(finishlist, context);	/* recursive call */
+		}
+		/* fire all nodes in firelist */
+		FireNodeList(firelist);
+
+		break;
+	default:
+		printf("Engine found illegal DAG status in PropagateResults()\n");
+		RF_PANIC();
+		break;
+	}
+}
+
+
+
+/*
+ * Process a fired node which has completed: on failure, decide whether
+ * the DAG rolls forward (commit barrier already crossed, or no commit
+ * nodes) or rolls backward, then propagate results to its successors
+ * (antecedents when rolling backward).
+ */
+static void
+ProcessNode(
+	RF_DagNode_t * node,
+	int context)
+{
+	RF_Raid_t *raidPtr;
+
+	raidPtr = node->dagHdr->raidPtr;
+
+	switch (node->status) {
+	case rf_good:
+		/* normal case, don't need to do anything */
+		break;
+	case rf_bad:
+		if ((node->dagHdr->numCommits > 0) || (node->dagHdr->numCommitNodes == 0)) {
+			/* crossed commit barrier */
+			node->dagHdr->status = rf_rollForward;
+			/* XXX "|| 1" makes this unconditional -- leftover
+			 * debugging, kept to preserve behavior */
+			if (rf_engineDebug || 1) {
+				printf("raid%d: node (%s) returned fail, rolling forward\n", raidPtr->raidid, node->name);
+			}
+		} else {
+			/* never reached commit barrier */
+			node->dagHdr->status = rf_rollBackward;
+			if (rf_engineDebug || 1) {
+				printf("raid%d: node (%s) returned fail, rolling backward\n", raidPtr->raidid, node->name);
+			}
+		}
+		break;
+	case rf_undone:
+		/* normal rollBackward case, don't need to do anything */
+		break;
+	case rf_panic:
+		/* an undo node failed!!! */
+		/* BUG FIX: original string ended with "/n" instead of "\n" */
+		printf("UNDO of a node failed!!!\n");
+		break;
+	default:
+		printf("node finished execution with an illegal status!!!\n");
+		RF_PANIC();
+		break;
+	}
+
+	/* enqueue node's succedents (antecedents if rollBackward) for
+	 * execution */
+	PropagateResults(node, context);
+}
+
+
+
+/* user context or dag-exec-thread context:
+ * First step in post-processing a newly-completed node: mark the node
+ * complete in its DAG header and fire off any successors that have
+ * become enabled.  The return value is a constant; as the original
+ * author noted, callers do not use it.
+ */
+int
+rf_FinishNode(
+	RF_DagNode_t * node,
+	int context)
+{
+	node->dagHdr->numNodesCompleted++;
+	ProcessNode(node, context);
+
+	return (RF_FALSE);
+}
+
+
+/* user context:
+ * Submit a DAG for execution; returns non-zero if the caller must wait
+ * for completion, in which case cbFunc(cbArg) is invoked when the DAG
+ * completes.  Currently always returns 1.  If the DAG causes no I/O,
+ * the callback may run before this routine returns; state 5 of
+ * ContinueRaidAccess handles that case.
+ *
+ * Only the direct successors of the header node are fired here; the
+ * DAG execution thread does the rest of the dag processing.
+ */
+int
+rf_DispatchDAG(
+	RF_DagHeader_t * dag,
+	void (*cbFunc) (void *),
+	void *cbArg)
+{
+	RF_Raid_t *raidPtr = dag->raidPtr;
+
+	if (dag->tracerec) {
+		RF_ETIMER_START(dag->tracerec->timer);
+	}
+	if (rf_engineDebug || rf_validateDAGDebug) {
+		if (rf_ValidateDAG(dag))
+			RF_PANIC();
+	}
+	if (rf_engineDebug) {
+		printf("raid%d: Entering DispatchDAG\n", raidPtr->raidid);
+	}
+	/* debug only: blow off proper locking */
+	raidPtr->dags_in_flight++;
+
+	dag->cbFunc = cbFunc;
+	dag->cbArg = cbArg;
+	dag->numNodesCompleted = 0;
+	dag->status = rf_enable;
+
+	FireNodeArray(dag->numSuccedents, dag->succedents);
+	return (1);
+}
+/* dedicated kernel thread:
+ * the thread that handles all DAG node firing.
+ * To minimize locking and unlocking, we grab a copy of the entire node queue
+ * and then set the node queue to NULL before doing any firing of nodes. This
+ * way we only have to release the lock once. Of course, it's probably rare
+ * that there's more than one node in the queue at any one time, but it
+ * sometimes happens.
+ *
+ * In the kernel, this thread runs at spl0 and is not swappable. I copied
+ * these characteristics from the aio_completion_thread.
+ */
+
+static void
+DAGExecutionThread(RF_ThreadArg_t arg)
+{
+	RF_DagNode_t *nd, *local_nq, *term_nq, *fire_nq;
+	RF_Raid_t *raidPtr;
+	int ks;			/* spl cookie used implicitly by
+				 * DO_LOCK/DO_UNLOCK */
+
+	raidPtr = (RF_Raid_t *) arg;
+
+	if (rf_engineDebug) {
+		printf("raid%d: Engine thread is running\n", raidPtr->raidid);
+	}
+	/* NOTE(review): holds Giant for the life of the thread -- presumably
+	 * required by the driver model this was ported into; confirm before
+	 * changing. */
+	mtx_lock(&Giant);
+
+	RF_THREADGROUP_RUNNING(&raidPtr->engine_tg);
+
+	DO_LOCK(raidPtr);
+	while (!raidPtr->shutdown_engine) {
+
+		while (raidPtr->node_queue != NULL) {
+			/* snapshot the whole queue and drop the lock while
+			 * we sort and fire the snapshot */
+			local_nq = raidPtr->node_queue;
+			fire_nq = NULL;
+			term_nq = NULL;
+			raidPtr->node_queue = NULL;
+			DO_UNLOCK(raidPtr);
+
+			/* first, strip out the terminal nodes */
+			while (local_nq) {
+				nd = local_nq;
+				local_nq = local_nq->next;
+				switch (nd->dagHdr->status) {
+				case rf_enable:
+				case rf_rollForward:
+					if (nd->numSuccedents == 0) {
+						/* end of the dag, add to
+						 * callback list */
+						nd->next = term_nq;
+						term_nq = nd;
+					} else {
+						/* not the end, add to the
+						 * fire queue */
+						nd->next = fire_nq;
+						fire_nq = nd;
+					}
+					break;
+				case rf_rollBackward:
+					if (nd->numAntecedents == 0) {
+						/* end of the dag, add to the
+						 * callback list */
+						nd->next = term_nq;
+						term_nq = nd;
+					} else {
+						/* not the end, add to the
+						 * fire queue */
+						nd->next = fire_nq;
+						fire_nq = nd;
+					}
+					break;
+				default:
+					RF_PANIC();
+					break;
+				}
+			}
+
+			/* execute callback of dags which have reached the
+			 * terminal node */
+			while (term_nq) {
+				nd = term_nq;
+				term_nq = term_nq->next;
+				nd->next = NULL;
+				/* may free the DAG, including nd */
+				(nd->dagHdr->cbFunc) (nd->dagHdr->cbArg);
+				raidPtr->dags_in_flight--;	/* debug only */
+			}
+
+			/* fire remaining nodes */
+			FireNodeList(fire_nq);
+
+			DO_LOCK(raidPtr);
+		}
+		/* sleep until more work arrives or shutdown is requested */
+		while (!raidPtr->shutdown_engine && raidPtr->node_queue == NULL)
+			DO_WAIT(raidPtr);
+	}
+	DO_UNLOCK(raidPtr);
+
+	RF_THREADGROUP_DONE(&raidPtr->engine_tg);
+
+	RF_THREAD_EXIT(0);
+}
diff --git a/sys/dev/raidframe/rf_engine.h b/sys/dev/raidframe/rf_engine.h
new file mode 100644
index 0000000..c758c05
--- /dev/null
+++ b/sys/dev/raidframe/rf_engine.h
@@ -0,0 +1,48 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_engine.h,v 1.3 1999/02/05 00:06:11 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: William V. Courtright II, Mark Holland, Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/**********************************************************
+ * *
+ * engine.h -- header file for execution engine functions *
+ * *
+ **********************************************************/
+
+#ifndef _RF__RF_ENGINE_H_
+#define _RF__RF_ENGINE_H_
+
+/* set up engine state and start the per-array DAG execution thread */
+int
+rf_ConfigureEngine(RF_ShutdownList_t ** listp,
+	RF_Raid_t * raidPtr, RF_Config_t * cfgPtr);
+
+int rf_FinishNode(RF_DagNode_t * node, int context);	/* return finished node
+							 * to engine */
+
+int rf_DispatchDAG(RF_DagHeader_t * dag, void (*cbFunc) (void *), void *cbArg);	/* execute dag */
+
+#endif /* !_RF__RF_ENGINE_H_ */
diff --git a/sys/dev/raidframe/rf_etimer.h b/sys/dev/raidframe/rf_etimer.h
new file mode 100644
index 0000000..e66e01b
--- /dev/null
+++ b/sys/dev/raidframe/rf_etimer.h
@@ -0,0 +1,95 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_etimer.h,v 1.4 1999/08/13 03:26:55 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#ifndef _RF__RF_TIMER_H_
+#define _RF__RF_TIMER_H_
+
+#include <dev/raidframe/rf_options.h>
+#include <dev/raidframe/rf_utils.h>
+
+#include <sys/time.h>
+
+struct RF_Etimer_s {
+ struct timeval st;
+ struct timeval et;
+ struct timeval diff;
+};
+
+#if defined(_KERNEL)
+#include <sys/kernel.h>
+
+#if defined(__NetBSD__)
+#define RF_ETIMER_START(_t_) \
+ { \
+ int s; \
+ bzero(&(_t_), sizeof (_t_)); \
+ s = splclock(); \
+ (_t_).st = mono_time; \
+ splx(s); \
+ }
+#elif defined(__FreeBSD__)
+#define RF_ETIMER_START(_t_) \
+ { \
+ int s; \
+ bzero(&(_t_), sizeof (_t_)); \
+ s = splclock(); \
+ getmicrouptime(&(_t_).st); \
+ splx(s); \
+ }
+#endif
+
+#if defined(__NetBSD__)
+#define RF_ETIMER_STOP(_t_) \
+ { \
+ int s; \
+ s = splclock(); \
+ (_t_).et = mono_time; \
+ splx(s); \
+ }
+#elif defined(__FreeBSD__)
+#define RF_ETIMER_STOP(_t_) \
+ { \
+ int s; \
+ s = splclock(); \
+ getmicrouptime(&(_t_).et); \
+ splx(s); \
+ }
+#endif
+
+#define RF_ETIMER_EVAL(_t_) \
+ { \
+ RF_TIMEVAL_DIFF(&(_t_).st, &(_t_).et, &(_t_).diff) \
+ }
+
+#define RF_ETIMER_VAL_US(_t_) (RF_TIMEVAL_TO_US((_t_).diff))
+#define RF_ETIMER_VAL_MS(_t_) (RF_TIMEVAL_TO_US((_t_).diff)/1000)
+
+#endif /* _KERNEL */
+
+#endif /* !_RF__RF_TIMER_H_ */
diff --git a/sys/dev/raidframe/rf_evenodd.c b/sys/dev/raidframe/rf_evenodd.c
new file mode 100644
index 0000000..47ce2cf
--- /dev/null
+++ b/sys/dev/raidframe/rf_evenodd.c
@@ -0,0 +1,557 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_evenodd.c,v 1.4 2000/01/07 03:40:59 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Chang-Ming Wu
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************************************************
+ *
+ * rf_evenodd.c -- implements EVENODD array architecture
+ *
+ ****************************************************************************************/
+
+#include <dev/raidframe/rf_archs.h>
+
+#if RF_INCLUDE_EVENODD > 0
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_dagffrd.h>
+#include <dev/raidframe/rf_dagffwr.h>
+#include <dev/raidframe/rf_dagdegrd.h>
+#include <dev/raidframe/rf_dagdegwr.h>
+#include <dev/raidframe/rf_dagutils.h>
+#include <dev/raidframe/rf_dagfuncs.h>
+#include <dev/raidframe/rf_etimer.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_evenodd.h>
+#include <dev/raidframe/rf_configure.h>
+#include <dev/raidframe/rf_parityscan.h>
+#include <dev/raidframe/rf_utils.h>
+#include <dev/raidframe/rf_map.h>
+#include <dev/raidframe/rf_pq.h>
+#include <dev/raidframe/rf_mcpair.h>
+#include <dev/raidframe/rf_evenodd.h>
+#include <dev/raidframe/rf_evenodd_dagfuncs.h>
+#include <dev/raidframe/rf_evenodd_dags.h>
+#include <dev/raidframe/rf_engine.h>
+#include <dev/raidframe/rf_kintf.h>
+
+/* Layout-private state for the EvenOdd architecture. */
+typedef struct RF_EvenOddConfigInfo_s {
+	RF_RowCol_t **stripeIdentifier;	/* filled in at config time & used by
+					 * IdentifyStripe */
+} RF_EvenOddConfigInfo_t;
+
+/*
+ * Configure the EvenOdd layout for this array: build the per-stripe
+ * disk-ordering table and fill in the remaining layout geometry
+ * (numDataCol = numCol - 2 because both P and E are stored).
+ * Returns 0 on success, or EINVAL when the column count is
+ * incompatible with the compiled-in RF_EO_MATRIX_DIM.
+ */
+int
+rf_ConfigureEvenOdd(listp, raidPtr, cfgPtr)
+	RF_ShutdownList_t **listp;
+	RF_Raid_t *raidPtr;
+	RF_Config_t *cfgPtr;
+{
+	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+	RF_EvenOddConfigInfo_t *info;
+	RF_RowCol_t i, j, startdisk;
+
+	RF_MallocAndAdd(info, sizeof(RF_EvenOddConfigInfo_t), (RF_EvenOddConfigInfo_t *), raidPtr->cleanupList);
+	layoutPtr->layoutSpecificInfo = (void *) info;
+
+	RF_ASSERT(raidPtr->numRow == 1);
+
+	/* Each stripe's disk ordering is the previous stripe's rotated by
+	 * two, so data, P, and E all rotate across the columns. */
+	info->stripeIdentifier = rf_make_2d_array(raidPtr->numCol, raidPtr->numCol, raidPtr->cleanupList);
+	startdisk = 0;
+	for (i = 0; i < raidPtr->numCol; i++) {
+		for (j = 0; j < raidPtr->numCol; j++) {
+			info->stripeIdentifier[i][j] = (startdisk + j) % raidPtr->numCol;
+		}
+		if ((startdisk -= 2) < 0)
+			startdisk += raidPtr->numCol;
+	}
+
+	/* fill in the remaining layout parameters */
+	layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk;
+	layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
+	layoutPtr->numDataCol = raidPtr->numCol - 2;	/* ORIG:
+							 * layoutPtr->numDataCol
+							 * = raidPtr->numCol-1; */
+#if RF_EO_MATRIX_DIM > 17
+	if (raidPtr->numCol <= 17) {
+		printf("Number of stripe units in a parity stripe is smaller than 17. Please\n");
+		printf("define the macro RF_EO_MATRIX_DIM in file rf_evenodd_dagfuncs.h to \n");
+		printf("be 17 to increase performance. \n");
+		return (EINVAL);
+	}
+#elif RF_EO_MATRIX_DIM == 17
+	if (raidPtr->numCol > 17) {
+		printf("Number of stripe units in a parity stripe is bigger than 17. Please\n");
+		printf("define the macro RF_EO_MATRIX_DIM in file rf_evenodd_dagfuncs.h to \n");
+		printf("be 257 for encoding and decoding functions to work. \n");
+		return (EINVAL);
+	}
+#endif
+	layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
+	layoutPtr->numParityCol = 2;
+	layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk;
+	raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit;
+
+	raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
+
+	return (0);
+}
+
+/* Default number of floating reconstruction buffers for EvenOdd arrays. */
+int
+rf_GetDefaultNumFloatingReconBuffersEvenOdd(RF_Raid_t * raidPtr)
+{
+	return (20);
+}
+
+/* Default head-separation limit for reconstruction on EvenOdd arrays. */
+RF_HeadSepLimit_t
+rf_GetDefaultHeadSepLimitEvenOdd(RF_Raid_t * raidPtr)
+{
+	return (10);
+}
+
+/*
+ * Return (via *diskids) the disk ordering of the stripe containing
+ * raid address "addr", looked up in the table precomputed by
+ * rf_ConfigureEvenOdd().  The array has a single row, so *outRow is
+ * always 0.
+ */
+void
+rf_IdentifyStripeEvenOdd(
+    RF_Raid_t * raidPtr,
+    RF_RaidAddr_t addr,
+    RF_RowCol_t ** diskids,
+    RF_RowCol_t * outRow)
+{
+	RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout, addr);
+	RF_EvenOddConfigInfo_t *info = (RF_EvenOddConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+
+	*outRow = 0;
+	*diskids = info->stripeIdentifier[stripeID % raidPtr->numCol];
+}
+/* The layout of stripe units on the disks is: c0 c1 c2 c3 c4
+
+   0 1 2 E P
+   5 E P 3 4
+   P 6 7 8 E
+   10 11 E P 9
+   E P 12 13 14
+   ....
+
+   We use MapSectorRAID5 to map the data because that routine can be shown to map data
+   stripe units exactly as in the layout above, even though we now store two units of
+   redundant information per stripe.  For E and P, however, we use rf_MapEEvenOdd and
+   rf_MapParityEvenOdd, which differ from the RAID-5 methods.
+*/
+
+
+/*
+ * Map a raid address to the row/column/disk-sector of its parity (P)
+ * unit.  P occupies the stripe unit two past the last data unit of the
+ * stripe (mod numCol); cf. the layout diagram above.
+ */
+void
+rf_MapParityEvenOdd(
+    RF_Raid_t * raidPtr,
+    RF_RaidAddr_t raidSector,
+    RF_RowCol_t * row,
+    RF_RowCol_t * col,
+    RF_SectorNum_t * diskSector,
+    int remap)
+{
+	RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
+	RF_StripeNum_t endSUIDofthisStrip = (SUID / raidPtr->Layout.numDataCol + 1) * raidPtr->Layout.numDataCol - 1;
+
+	*row = 0;
+	*col = (endSUIDofthisStrip + 2) % raidPtr->numCol;
+	*diskSector = (SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit +
+	    (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+}
+
+/*
+ * Map a raid address to the row/column/disk-sector of its second
+ * redundancy (E) unit.  E occupies the stripe unit immediately after
+ * the last data unit of the stripe (mod numCol); cf. the layout
+ * diagram above.
+ */
+void
+rf_MapEEvenOdd(
+    RF_Raid_t * raidPtr,
+    RF_RaidAddr_t raidSector,
+    RF_RowCol_t * row,
+    RF_RowCol_t * col,
+    RF_SectorNum_t * diskSector,
+    int remap)
+{
+	RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
+	RF_StripeNum_t endSUIDofthisStrip = (SUID / raidPtr->Layout.numDataCol + 1) * raidPtr->Layout.numDataCol - 1;
+
+	*row = 0;
+	*col = (endSUIDofthisStrip + 1) % raidPtr->numCol;
+	*diskSector = (SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit +
+	    (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+}
+
+/*
+ * Select the DAG-creation function for an access to this stripe, based
+ * on the I/O type and on how many data units (ndfail) and redundancy
+ * units (npfail: P plus Q/E) in the stripe have failed.  *createFunc is
+ * set to NULL when the access has no direct DAG support.
+ */
+void
+rf_EODagSelect(
+    RF_Raid_t * raidPtr,
+    RF_IoType_t type,
+    RF_AccessStripeMap_t * asmap,
+    RF_VoidFuncPtr * createFunc)
+{
+	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+	unsigned ndfail = asmap->numDataFailed;
+	unsigned npfail = asmap->numParityFailed + asmap->numQFailed;
+	unsigned ntfail = npfail + ndfail;
+
+	RF_ASSERT(RF_IO_IS_R_OR_W(type));
+	if (ntfail > 2) {
+		RF_ERRORMSG("more than two disks failed in a single group! Aborting I/O operation.\n");
+		/* *infoFunc = */ *createFunc = NULL;
+		return;
+	}
+	/* ok, we can do this I/O */
+	if (type == RF_IO_TYPE_READ) {
+		switch (ndfail) {
+		case 0:
+			/* fault free read */
+			*createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG;	/* same as raid 5 */
+			break;
+		case 1:
+			/* lost a single data unit */
+			/* two cases: (1) parity is not lost. do a normal raid
+			 * 5 reconstruct read. (2) parity is lost. do a
+			 * reconstruct read using "e". */
+			if (ntfail == 2) {	/* also lost redundancy */
+				if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY)
+					*createFunc = (RF_VoidFuncPtr) rf_EO_110_CreateReadDAG;
+				else
+					*createFunc = (RF_VoidFuncPtr) rf_EO_101_CreateReadDAG;
+			} else {
+				/* P and E are ok. But is there a failure in
+				 * some unaccessed data unit? */
+				if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
+					*createFunc = (RF_VoidFuncPtr) rf_EO_200_CreateReadDAG;
+				else
+					*createFunc = (RF_VoidFuncPtr) rf_EO_100_CreateReadDAG;
+			}
+			break;
+		case 2:
+			/* *createFunc = rf_EO_200_CreateReadDAG; */
+			*createFunc = NULL;
+			break;
+		}
+		return;
+	}
+	/* a write */
+	switch (ntfail) {
+	case 0:		/* fault free */
+		if (rf_suppressLocksAndLargeWrites ||
+		    (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) ||
+			(asmap->parityInfo->next != NULL) || (asmap->qInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) {
+
+			*createFunc = (RF_VoidFuncPtr) rf_EOCreateSmallWriteDAG;
+		} else {
+			*createFunc = (RF_VoidFuncPtr) rf_EOCreateLargeWriteDAG;
+		}
+		break;
+
+	case 1:		/* single disk fault */
+		if (npfail == 1) {
+			RF_ASSERT((asmap->failedPDAs[0]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q));
+			if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q) {	/* q died, treat like
+										 * normal mode raid5
+										 * write. */
+				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
+				    || (asmap->parityInfo->next != NULL) || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
+					*createFunc = (RF_VoidFuncPtr) rf_EO_001_CreateSmallWriteDAG;
+				else
+					*createFunc = (RF_VoidFuncPtr) rf_EO_001_CreateLargeWriteDAG;
+			} else {/* parity died, small write only updating Q */
+				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
+				    || (asmap->qInfo->next != NULL) || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
+					*createFunc = (RF_VoidFuncPtr) rf_EO_010_CreateSmallWriteDAG;
+				else
+					*createFunc = (RF_VoidFuncPtr) rf_EO_010_CreateLargeWriteDAG;
+			}
+		} else {	/* data missing. Do a P reconstruct write if
+				 * only a single data unit is lost in the
+				 * stripe, otherwise a reconstruct write which
+				 * employing both P and E units. */
+			if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2) {
+				if (asmap->numStripeUnitsAccessed == 1)
+					*createFunc = (RF_VoidFuncPtr) rf_EO_200_CreateWriteDAG;
+				else
+					*createFunc = NULL;	/* No direct support for
+								 * this case now, like
+								 * that in Raid-5 */
+			} else {
+				if (asmap->numStripeUnitsAccessed != 1 && asmap->failedPDAs[0]->numSector != layoutPtr->sectorsPerStripeUnit)
+					*createFunc = NULL;	/* No direct support for
+								 * this case now, like
+								 * that in Raid-5 */
+				else
+					*createFunc = (RF_VoidFuncPtr) rf_EO_100_CreateWriteDAG;
+			}
+		}
+		break;
+
+	case 2:		/* two disk faults */
+		switch (npfail) {
+		case 2:	/* both p and q dead */
+			*createFunc = (RF_VoidFuncPtr) rf_EO_011_CreateWriteDAG;
+			break;
+		case 1:	/* either p or q and dead data */
+			RF_ASSERT(asmap->failedPDAs[0]->type == RF_PDA_TYPE_DATA);
+			RF_ASSERT((asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q));
+			if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q) {
+				if (asmap->numStripeUnitsAccessed != 1 && asmap->failedPDAs[0]->numSector != layoutPtr->sectorsPerStripeUnit)
+					*createFunc = NULL;	/* In both PQ and
+								 * EvenOdd, no direct
+								 * support for this case
+								 * now, like that in
+								 * Raid-5 */
+				else
+					*createFunc = (RF_VoidFuncPtr) rf_EO_101_CreateWriteDAG;
+			} else {
+				if (asmap->numStripeUnitsAccessed != 1 && asmap->failedPDAs[0]->numSector != layoutPtr->sectorsPerStripeUnit)
+					*createFunc = NULL;	/* No direct support for
+								 * this case, like that
+								 * in Raid-5 */
+				else
+					*createFunc = (RF_VoidFuncPtr) rf_EO_110_CreateWriteDAG;
+			}
+			break;
+		case 0:	/* double data loss */
+			/* if(asmap->failedPDAs[0]->numSector +
+			 * asmap->failedPDAs[1]->numSector == 2 *
+			 * layoutPtr->sectorsPerStripeUnit ) createFunc =
+			 * rf_EOCreateLargeWriteDAG; else */
+			*createFunc = NULL;	/* currently, in Evenodd, No
+						 * support for simultaneous
+						 * access of both failed SUs */
+			break;
+		}
+		break;
+
+	default:		/* more than 2 disk faults */
+		*createFunc = NULL;
+		RF_PANIC();
+	}
+	return;
+}
+
+
+/*
+ * Verify (and optionally correct) the P and E redundancy units
+ * covering the stripe that contains raidAddr.
+ *
+ * The whole stripe (data + P + E) is read with a simple DAG, the
+ * expected P and E are recomputed from the data columns, and the
+ * results are compared against what was read from disk.  When
+ * correct_it is set, a bad P and/or E unit is rewritten with the
+ * recomputed value.
+ *
+ * Returns RF_PARITY_OKAY, RF_PARITY_CORRECTED, RF_PARITY_BAD,
+ * RF_PARITY_COULD_NOT_CORRECT, or RF_PARITY_COULD_NOT_VERIFY.
+ */
+int
+rf_VerifyParityEvenOdd(raidPtr, raidAddr, parityPDA, correct_it, flags)
+	RF_Raid_t *raidPtr;
+	RF_RaidAddr_t raidAddr;
+	RF_PhysDiskAddr_t *parityPDA;
+	int correct_it;
+	RF_RaidAccessFlags_t flags;
+{
+	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+	RF_RaidAddr_t startAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, raidAddr);
+	RF_SectorCount_t numsector = parityPDA->numSector;
+	int numbytes = rf_RaidAddressToByte(raidPtr, numsector);
+	int bytesPerStripe = numbytes * layoutPtr->numDataCol;
+	RF_DagHeader_t *rd_dag_h, *wr_dag_h;	/* read, write dag */
+	RF_DagNode_t *blockNode, *unblockNode, *wrBlock, *wrUnblock;
+	RF_AccessStripeMapHeader_t *asm_h;
+	RF_AccessStripeMap_t *asmap;
+	RF_AllocListElem_t *alloclist;
+	RF_PhysDiskAddr_t *pda;
+	char *pbuf, *buf, *end_p, *p;
+	char *redundantbuf2;
+	int redundantTwoErr = 0, redundantOneErr = 0;
+	int parity_cant_correct = RF_FALSE, red2_cant_correct = RF_FALSE,
+	    parity_corrected = RF_FALSE, red2_corrected = RF_FALSE;
+	int i, retcode;
+	RF_ReconUnitNum_t which_ru;
+	RF_StripeNum_t psID = rf_RaidAddressToParityStripeID(layoutPtr, raidAddr, &which_ru);
+	int stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol;
+	RF_AccTraceEntry_t tracerec;
+	RF_MCPair_t *mcpair;
+
+	retcode = RF_PARITY_OKAY;
+
+	/* buf holds the full stripe (data, then P, then E); pbuf and
+	 * redundantbuf2 accumulate the recomputed P and E respectively. */
+	mcpair = rf_AllocMCPair();
+	rf_MakeAllocList(alloclist);
+	RF_MallocAndAdd(buf, numbytes * (layoutPtr->numDataCol + layoutPtr->numParityCol), (char *), alloclist);
+	RF_CallocAndAdd(pbuf, 1, numbytes, (char *), alloclist);	/* use calloc to make
+									 * sure buffer is zeroed */
+	end_p = buf + bytesPerStripe;
+	RF_CallocAndAdd(redundantbuf2, 1, numbytes, (char *), alloclist);	/* use calloc to make
+										 * sure buffer is zeroed */
+
+	rd_dag_h = rf_MakeSimpleDAG(raidPtr, stripeWidth, numbytes, buf, rf_DiskReadFunc, rf_DiskReadUndoFunc,
+	    "Rod", alloclist, flags, RF_IO_NORMAL_PRIORITY);
+	blockNode = rd_dag_h->succedents[0];
+	unblockNode = blockNode->succedents[0]->succedents[0];
+
+	/* map the stripe and fill in the PDAs in the dag */
+	asm_h = rf_MapAccess(raidPtr, startAddr, layoutPtr->dataSectorsPerStripe, buf, RF_DONT_REMAP);
+	asmap = asm_h->stripeMap;
+
+	for (pda = asmap->physInfo, i = 0; i < layoutPtr->numDataCol; i++, pda = pda->next) {
+		RF_ASSERT(pda);
+		rf_RangeRestrictPDA(raidPtr, parityPDA, pda, 0, 1);
+		RF_ASSERT(pda->numSector != 0);
+		if (rf_TryToRedirectPDA(raidPtr, pda, 0))
+			goto out;	/* no way to verify parity if disk is
+					 * dead. return w/ good status */
+		blockNode->succedents[i]->params[0].p = pda;
+		blockNode->succedents[i]->params[2].v = psID;
+		blockNode->succedents[i]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+	}
+
+	RF_ASSERT(!asmap->parityInfo->next);
+	rf_RangeRestrictPDA(raidPtr, parityPDA, asmap->parityInfo, 0, 1);
+	RF_ASSERT(asmap->parityInfo->numSector != 0);
+	if (rf_TryToRedirectPDA(raidPtr, asmap->parityInfo, 1))
+		goto out;
+	blockNode->succedents[layoutPtr->numDataCol]->params[0].p = asmap->parityInfo;
+
+	RF_ASSERT(!asmap->qInfo->next);
+	rf_RangeRestrictPDA(raidPtr, parityPDA, asmap->qInfo, 0, 1);
+	RF_ASSERT(asmap->qInfo->numSector != 0);
+	if (rf_TryToRedirectPDA(raidPtr, asmap->qInfo, 1))
+		goto out;
+	/* if disk is dead, b/c no reconstruction is implemented right now,
+	 * the function "rf_TryToRedirectPDA" always return one, which cause
+	 * go to out and return w/ good status */
+	blockNode->succedents[layoutPtr->numDataCol + 1]->params[0].p = asmap->qInfo;
+
+	/* fire off the DAG */
+	bzero((char *) &tracerec, sizeof(tracerec));
+	rd_dag_h->tracerec = &tracerec;
+
+	if (rf_verifyParityDebug) {
+		printf("Parity verify read dag:\n");
+		rf_PrintDAGList(rd_dag_h);
+	}
+	RF_LOCK_MUTEX(mcpair->mutex);
+	mcpair->flag = 0;
+	rf_DispatchDAG(rd_dag_h, (void (*) (void *)) rf_MCPairWakeupFunc,
+	    (void *) mcpair);
+	while (!mcpair->flag)
+		RF_WAIT_COND(mcpair->cond, mcpair->mutex);
+	RF_UNLOCK_MUTEX(mcpair->mutex);
+	if (rd_dag_h->status != rf_enable) {
+		RF_ERRORMSG("Unable to verify parity: can't read the stripe\n");
+		retcode = RF_PARITY_COULD_NOT_VERIFY;
+		goto out;
+	}
+	/* recompute E (into redundantbuf2) and P (into pbuf) from the data
+	 * columns just read */
+	for (p = buf, i = 0; p < end_p; p += numbytes, i++) {
+		rf_e_encToBuf(raidPtr, i, p, RF_EO_MATRIX_DIM - 2, redundantbuf2, numsector);
+		/* the corresponding columns in the EvenOdd encoding matrix for
+		 * these p pointers which point to the databuffer in a full
+		 * stripe are sequentially from 0 to layoutPtr->numDataCol-1 */
+		rf_bxor(p, pbuf, numbytes, NULL);
+	}
+	RF_ASSERT(i == layoutPtr->numDataCol);
+
+	/* compare recomputed P with the P read from disk.  FIX: the flag
+	 * assignment and break belong inside the mismatch test; previously
+	 * they ran unconditionally on the first iteration, flagging a
+	 * parity error on every stripe. */
+	for (i = 0; i < numbytes; i++) {
+		if (pbuf[i] != buf[bytesPerStripe + i]) {
+			if (!correct_it) {
+				RF_ERRORMSG3("Parity verify error: byte %d of parity is 0x%x should be 0x%x\n",
+				    i, (u_char) buf[bytesPerStripe + i], (u_char) pbuf[i]);
+			}
+			redundantOneErr = 1;
+			break;
+		}
+	}
+
+	/* compare recomputed E with the E read from disk */
+	for (i = 0; i < numbytes; i++) {
+		if (redundantbuf2[i] != buf[bytesPerStripe + numbytes + i]) {
+			if (!correct_it) {
+				RF_ERRORMSG3("Parity verify error: byte %d of second redundant information is 0x%x should be 0x%x\n",
+				    i, (u_char) buf[bytesPerStripe + numbytes + i], (u_char) redundantbuf2[i]);
+			}
+			redundantTwoErr = 1;
+			break;
+		}
+	}
+	if (redundantOneErr || redundantTwoErr)
+		retcode = RF_PARITY_BAD;
+
+	/* correct the first redundant disk, ie parity if it is error */
+	if (redundantOneErr && correct_it) {
+		wr_dag_h = rf_MakeSimpleDAG(raidPtr, 1, numbytes, pbuf, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
+		    "Wnp", alloclist, flags, RF_IO_NORMAL_PRIORITY);
+		wrBlock = wr_dag_h->succedents[0];
+		wrUnblock = wrBlock->succedents[0]->succedents[0];
+		wrBlock->succedents[0]->params[0].p = asmap->parityInfo;
+		wrBlock->succedents[0]->params[2].v = psID;
+		wrBlock->succedents[0]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+		bzero((char *) &tracerec, sizeof(tracerec));
+		wr_dag_h->tracerec = &tracerec;
+		if (rf_verifyParityDebug) {
+			printf("Parity verify write dag:\n");
+			rf_PrintDAGList(wr_dag_h);
+		}
+		RF_LOCK_MUTEX(mcpair->mutex);
+		mcpair->flag = 0;
+		rf_DispatchDAG(wr_dag_h, (void (*) (void *)) rf_MCPairWakeupFunc,
+		    (void *) mcpair);
+		while (!mcpair->flag)
+			RF_WAIT_COND(mcpair->cond, mcpair->mutex);
+		RF_UNLOCK_MUTEX(mcpair->mutex);
+		if (wr_dag_h->status != rf_enable) {
+			RF_ERRORMSG("Unable to correct parity in VerifyParity: can't write the stripe\n");
+			parity_cant_correct = RF_TRUE;
+		} else {
+			parity_corrected = RF_TRUE;
+		}
+		rf_FreeDAG(wr_dag_h);
+	}
+	/* correct the second redundant disk (E) if it is in error */
+	if (redundantTwoErr && correct_it) {
+		wr_dag_h = rf_MakeSimpleDAG(raidPtr, 1, numbytes, redundantbuf2, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
+		    "Wnred2", alloclist, flags, RF_IO_NORMAL_PRIORITY);
+		wrBlock = wr_dag_h->succedents[0];
+		wrUnblock = wrBlock->succedents[0]->succedents[0];
+		wrBlock->succedents[0]->params[0].p = asmap->qInfo;
+		wrBlock->succedents[0]->params[2].v = psID;
+		wrBlock->succedents[0]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+		bzero((char *) &tracerec, sizeof(tracerec));
+		wr_dag_h->tracerec = &tracerec;
+		if (rf_verifyParityDebug) {
+			printf("Dag of write new second redundant information in parity verify :\n");
+			rf_PrintDAGList(wr_dag_h);
+		}
+		RF_LOCK_MUTEX(mcpair->mutex);
+		mcpair->flag = 0;
+		rf_DispatchDAG(wr_dag_h, (void (*) (void *)) rf_MCPairWakeupFunc,
+		    (void *) mcpair);
+		while (!mcpair->flag)
+			RF_WAIT_COND(mcpair->cond, mcpair->mutex);
+		RF_UNLOCK_MUTEX(mcpair->mutex);
+		if (wr_dag_h->status != rf_enable) {
+			RF_ERRORMSG("Unable to correct second redundant information in VerifyParity: can't write the stripe\n");
+			red2_cant_correct = RF_TRUE;
+		} else {
+			red2_corrected = RF_TRUE;
+		}
+		rf_FreeDAG(wr_dag_h);
+	}
+	if ((redundantOneErr && parity_cant_correct) ||
+	    (redundantTwoErr && red2_cant_correct))
+		retcode = RF_PARITY_COULD_NOT_CORRECT;
+	/* FIX: this used to be "retcode = RF_PARITY_BAD" (assignment, not
+	 * comparison), which unconditionally clobbered retcode; it also
+	 * required BOTH units to have been corrected even when only one was
+	 * bad.  Report CORRECTED when every unit found bad was rewritten. */
+	if ((retcode == RF_PARITY_BAD) &&
+	    (!redundantOneErr || parity_corrected) &&
+	    (!redundantTwoErr || red2_corrected))
+		retcode = RF_PARITY_CORRECTED;
+
+
+out:
+	rf_FreeAccessStripeMap(asm_h);
+	rf_FreeAllocList(alloclist);
+	rf_FreeDAG(rd_dag_h);
+	rf_FreeMCPair(mcpair);
+	return (retcode);
+}
+#endif /* RF_INCLUDE_EVENODD > 0 */
diff --git a/sys/dev/raidframe/rf_evenodd.h b/sys/dev/raidframe/rf_evenodd.h
new file mode 100644
index 0000000..4babdec
--- /dev/null
+++ b/sys/dev/raidframe/rf_evenodd.h
@@ -0,0 +1,55 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_evenodd.h,v 1.2 1999/02/05 00:06:11 oster Exp $ */
+/*
+ * Copyright (c) 1995, 1996 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Chang-Ming Wu
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#ifndef _RF__RF_EVENODD_H_
+#define _RF__RF_EVENODD_H_
+
+/* extern declarations of the EvenOdd layout entry points: configuration,
+ * stripe identification, P/E mapping, DAG selection, and parity
+ * verification (implemented in rf_evenodd.c). */
+int
+rf_ConfigureEvenOdd(RF_ShutdownList_t ** shutdownListp, RF_Raid_t * raidPtr,
+    RF_Config_t * cfgPtr);
+int rf_GetDefaultNumFloatingReconBuffersEvenOdd(RF_Raid_t * raidPtr);
+RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitEvenOdd(RF_Raid_t * raidPtr);
+void
+rf_IdentifyStripeEvenOdd(RF_Raid_t * raidPtr, RF_RaidAddr_t addr,
+    RF_RowCol_t ** diskids, RF_RowCol_t * outrow);
+void
+rf_MapParityEvenOdd(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
+    RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
+void
+rf_MapEEvenOdd(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
+    RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
+void
+rf_EODagSelect(RF_Raid_t * raidPtr, RF_IoType_t type,
+    RF_AccessStripeMap_t * asmap, RF_VoidFuncPtr * createFunc);
+int
+rf_VerifyParityEvenOdd(RF_Raid_t * raidPtr, RF_RaidAddr_t raidAddr,
+    RF_PhysDiskAddr_t * parityPDA, int correct_it, RF_RaidAccessFlags_t flags);
+
+#endif /* !_RF__RF_EVENODD_H_ */
diff --git a/sys/dev/raidframe/rf_evenodd_dagfuncs.c b/sys/dev/raidframe/rf_evenodd_dagfuncs.c
new file mode 100644
index 0000000..2dbf81d
--- /dev/null
+++ b/sys/dev/raidframe/rf_evenodd_dagfuncs.c
@@ -0,0 +1,975 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_evenodd_dagfuncs.c,v 1.7 2001/01/26 03:50:53 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: ChangMing Wu
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * Code for RAID-EVENODD architecture.
+ */
+
+#include <dev/raidframe/rf_archs.h>
+
+#if RF_INCLUDE_EVENODD > 0
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_dagffrd.h>
+#include <dev/raidframe/rf_dagffwr.h>
+#include <dev/raidframe/rf_dagdegrd.h>
+#include <dev/raidframe/rf_dagdegwr.h>
+#include <dev/raidframe/rf_dagutils.h>
+#include <dev/raidframe/rf_dagfuncs.h>
+#include <dev/raidframe/rf_etimer.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_configure.h>
+#include <dev/raidframe/rf_parityscan.h>
+#include <dev/raidframe/rf_evenodd.h>
+#include <dev/raidframe/rf_evenodd_dagfuncs.h>
+
+/* These redundant functions are for small write */
+RF_RedFuncs_t rf_EOSmallWritePFuncs = {rf_RegularXorFunc, "Regular Old-New P", rf_SimpleXorFunc, "Simple Old-New P"};
+/* NOTE(review): the second description string below says "Regular Old-New E"
+ * but names rf_SimpleONEFunc; presumably it was meant to read "Simple Old-New
+ * E" (cf. rf_EOSmallWritePFuncs above).  The strings appear to be descriptive
+ * only, so the text is left untouched -- confirm before changing. */
+RF_RedFuncs_t rf_EOSmallWriteEFuncs = {rf_RegularONEFunc, "Regular Old-New E", rf_SimpleONEFunc, "Regular Old-New E"};
+/* These redundant functions are for degraded read */
+RF_RedFuncs_t rf_eoPRecoveryFuncs = {rf_RecoveryXorFunc, "Recovery Xr", rf_RecoveryXorFunc, "Recovery Xr"};
+RF_RedFuncs_t rf_eoERecoveryFuncs = {rf_RecoveryEFunc, "Recovery E Func", rf_RecoveryEFunc, "Recovery E Func"};
+/**********************************************************************************************
+ * the following encoding node function is used in EO_000_CreateLargeWriteDAG
+ **********************************************************************************************/
+/*
+ * Compute both redundancy units for a fault-free large write: encode E
+ * into results[1], then compute P via the regular xor function (which
+ * also performs the node wakeup).  Always returns 0.
+ */
+int
+rf_RegularPEFunc(node)
+	RF_DagNode_t *node;
+{
+	rf_RegularESubroutine(node, node->results[1]);
+	rf_RegularXorFunc(node);/* does the wakeup here! */
+#if 1
+	return (0);		/* XXX This was missing... GO */
+#endif
+}
+
+
+/************************************************************************************************
+ * For EO_001_CreateSmallWriteDAG there are two node functions: (i) RegularONEFunc() and
+ * (ii) SimpleONEFunc().  The former is used when the write accesses at least a full
+ * stripe unit's worth of sectors.  The latter is used when the write accesses two stripe
+ * units but with fewer total sectors than one SU.  In that case the parity and 'E'
+ * accesses appear as disconnected areas within their stripe units, and the parity write
+ * and 'E' write are each divided into two distinct writes (four in total).  These simple
+ * old-new and regular old-new writes proceed as in RAID-5.
+ ************************************************************************************************/
+
+/* Algorithm:
+   1. Store the difference of the old data and the new data in the Rod buffer.
+   2. Encode this buffer into the buffer that already holds the old 'E' information;
+      the result can be shown to be the new 'E' information.
+   3. Xor the Wnd buffer into the difference buffer to recover the original old data.
+   An alternative is to allocate a temporary buffer for the difference of old and new
+   data and encode that into the old 'E' buffer to form the new 'E', but that approach
+   runs at the same speed and needs more memory.
+*/
+/*
+ * Regular old-new 'E' update for a small write, following the
+ * three-step algorithm above: xor the new data (Wnd) into the old data
+ * (Rod) to form the difference, encode the difference into the old-E
+ * buffer to produce the new E, then xor the new data back out to
+ * restore the old data for the parity node.  Elapsed time is charged
+ * to the trace record's q_us.  Always returns 0; completion is
+ * signalled through rf_GenericWakeupFunc().
+ */
+int
+rf_RegularONEFunc(node)
+	RF_DagNode_t *node;
+{
+	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
+	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
+	int EpdaIndex = (node->numParams - 1) / 2 - 1;	/* the parameter of node
+							 * where you can find
+							 * e-pda */
+	int i, k, retcode = 0;
+	int suoffset, length;
+	RF_RowCol_t scol;
+	char *srcbuf, *destbuf;
+	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+	RF_Etimer_t timer;
+	RF_PhysDiskAddr_t *pda, *EPDA = (RF_PhysDiskAddr_t *) node->params[EpdaIndex].p;
+	int ESUOffset = rf_StripeUnitOffset(layoutPtr, EPDA->startSector);	/* generally zero */
+
+	RF_ASSERT(EPDA->type == RF_PDA_TYPE_Q);
+	RF_ASSERT(ESUOffset == 0);
+
+	RF_ETIMER_START(timer);
+
+	/* Xor the Wnd buffer into Rod buffer, the difference of old data and
+	 * new data is stored in Rod buffer */
+	for (k = 0; k < EpdaIndex; k += 2) {
+		length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[k].p)->numSector);
+		retcode = rf_bxor(node->params[k + EpdaIndex + 3].p, node->params[k + 1].p, length, node->dagHdr->bp);
+	}
+	/* Start to encoding the buffer storing the difference of old data and
+	 * new data into 'E' buffer */
+	for (i = 0; i < EpdaIndex; i += 2)
+		if (node->params[i + 1].p != node->results[0]) {	/* results[0] is buf ptr
+									 * of E */
+			pda = (RF_PhysDiskAddr_t *) node->params[i].p;
+			srcbuf = (char *) node->params[i + 1].p;
+			scol = rf_EUCol(layoutPtr, pda->raidAddress);
+			suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
+			destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset);
+			rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
+		}
+	/* Recover the original old data to be used by parity encoding
+	 * function in XorNode */
+	for (k = 0; k < EpdaIndex; k += 2) {
+		length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[k].p)->numSector);
+		retcode = rf_bxor(node->params[k + EpdaIndex + 3].p, node->params[k + 1].p, length, node->dagHdr->bp);
+	}
+	RF_ETIMER_STOP(timer);
+	RF_ETIMER_EVAL(timer);
+	tracerec->q_us += RF_ETIMER_VAL_US(timer);
+	rf_GenericWakeupFunc(node, 0);
+#if 1
+	return (0);		/* XXX this was missing.. GO */
+#endif
+}
+
+/*
+ * Simple old-new 'E' update for a small write whose parity/E accesses
+ * are disconnected within their stripe units (see the block comment
+ * above).  Performs the same difference/encode/restore sequence as
+ * rf_RegularONEFunc() but for a single data region, and calls the
+ * generic wakeup itself since this node does no I/O.
+ */
+int
+rf_SimpleONEFunc(node)
+	RF_DagNode_t *node;
+{
+	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
+	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
+	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
+	int retcode = 0;
+	char *srcbuf, *destbuf;
+	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+	int length;
+	RF_RowCol_t scol;
+	RF_Etimer_t timer;
+
+	RF_ASSERT(((RF_PhysDiskAddr_t *) node->params[2].p)->type == RF_PDA_TYPE_Q);
+	if (node->dagHdr->status == rf_enable) {
+		RF_ETIMER_START(timer);
+		length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[4].p)->numSector);	/* this is a pda of
+														 * writeDataNodes */
+		/* bxor to buffer of readDataNodes */
+		retcode = rf_bxor(node->params[5].p, node->params[1].p, length, node->dagHdr->bp);
+		/* find out the corresponding column in encoding matrix for
+		 * write column to be encoded into redundant disk 'E' */
+		scol = rf_EUCol(layoutPtr, pda->raidAddress);
+		srcbuf = node->params[1].p;
+		destbuf = node->params[3].p;
+		/* Start encoding process */
+		rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
+		rf_bxor(node->params[5].p, node->params[1].p, length, node->dagHdr->bp);
+		RF_ETIMER_STOP(timer);
+		RF_ETIMER_EVAL(timer);
+		tracerec->q_us += RF_ETIMER_VAL_US(timer);
+
+	}
+	return (rf_GenericWakeupFunc(node, retcode));	/* call wake func
+							 * explicitly since no
+							 * I/O in this node */
+}
+
+
+/****** called by rf_RegularPEFunc(node) and rf_RegularEFunc(node) in f.f. large write ********/
+/*
+ * Encode every data buffer in the node's parameter list into the 'E'
+ * buffer "ebuf", one EvenOdd-matrix column per stripe unit; elapsed
+ * time is charged to the trace record's xor_us.
+ */
+void
+rf_RegularESubroutine(node, ebuf)
+	RF_DagNode_t *node;
+	char *ebuf;
+{
+	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
+	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
+	RF_PhysDiskAddr_t *pda;
+	int i, suoffset;
+	RF_RowCol_t scol;
+	char *srcbuf, *destbuf;
+	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+	RF_Etimer_t timer;
+
+	RF_ETIMER_START(timer);
+	for (i = 0; i < node->numParams - 2; i += 2) {
+		RF_ASSERT(node->params[i + 1].p != ebuf);
+		pda = (RF_PhysDiskAddr_t *) node->params[i].p;
+		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
+		scol = rf_EUCol(layoutPtr, pda->raidAddress);
+		srcbuf = (char *) node->params[i + 1].p;
+		destbuf = ebuf + rf_RaidAddressToByte(raidPtr, suoffset);
+		rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
+	}
+	RF_ETIMER_STOP(timer);
+	RF_ETIMER_EVAL(timer);
+	tracerec->xor_us += RF_ETIMER_VAL_US(timer);
+}
+
+
+/*******************************************************************************************
+ * Used in EO_001_CreateLargeWriteDAG
+ ******************************************************************************************/
+/* Compute the 'E' unit into results[0] for a large write, then wake up.
+ * Always returns 0. */
+int
+rf_RegularEFunc(node)
+	RF_DagNode_t *node;
+{
+	rf_RegularESubroutine(node, node->results[0]);
+	rf_GenericWakeupFunc(node, 0);
+#if 1
+	return (0);		/* XXX this was missing?.. GO */
+#endif
+}
+/*******************************************************************************************
+ * This degraded function allows only two cases:
+ * 1. when the write accesses the full failed stripe unit, the access can span more
+ *    than one stripe unit.
+ * 2. when the write accesses only part of the failed SU, we assume accesses of more
+ *    than one stripe unit are not allowed, so that the write can be dealt with like
+ *    a large write.
+ * The following function is based on these assumptions. So except in the second case,
+ * it looks the same as a large write encoding function. But this is not exactly the
+ * normal way of doing a degraded write, since RAIDframe has to break accesses
+ * other than the above two cases into smaller accesses. We may have to change
+ * DegrESubroutine in the future.
+ *******************************************************************************************/
+/*
+ * Degraded-write counterpart of rf_RegularESubroutine(): encode each
+ * (pda, buffer) parameter pair of "node" into "ebuf", which covers only the
+ * failed stripe unit.  params[numParams - 2] is the failed PDA and
+ * params[numParams - 1] is the raidPtr, so destination offsets are taken
+ * relative to the failed SU's own stripe-unit offset.  Elapsed time is
+ * charged to tracerec->q_us.
+ */
+void
+rf_DegrESubroutine(node, ebuf)
+	RF_DagNode_t *node;
+	char *ebuf;
+{
+	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
+	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
+	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
+	RF_PhysDiskAddr_t *pda;
+	int i, suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
+	RF_RowCol_t scol;
+	char *srcbuf, *destbuf;
+	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+	RF_Etimer_t timer;
+
+	RF_ETIMER_START(timer);
+	for (i = 0; i < node->numParams - 2; i += 2) {
+		/* source buffers must not alias the destination 'E' buffer */
+		RF_ASSERT(node->params[i + 1].p != ebuf);
+		pda = (RF_PhysDiskAddr_t *) node->params[i].p;
+		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
+		scol = rf_EUCol(layoutPtr, pda->raidAddress);
+		srcbuf = (char *) node->params[i + 1].p;
+		/* ebuf covers only the failed SU, hence the relative offset */
+		destbuf = ebuf + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
+		rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
+	}
+
+	RF_ETIMER_STOP(timer);
+	RF_ETIMER_EVAL(timer);
+	tracerec->q_us += RF_ETIMER_VAL_US(timer);
+}
+
+
+/**************************************************************************************
+ * This function is used in the case where one data disk has failed and both
+ * redundant disks are alive.  It is used in EO_100_CreateWriteDAG.  Note: if
+ * another disk in the stripe has failed but is not accessed this time, then
+ * rf_EOWriteDoubleRecoveryFunc() should be used instead.
+ **************************************************************************************/
+int
+rf_Degraded_100_EOFunc(node)
+	RF_DagNode_t *node;
+{
+	/* encode into the 'E' result buffer, then run the parity-XOR
+	 * routine, which performs the node wakeup itself */
+	rf_DegrESubroutine(node, node->results[1]);
+	rf_RecoveryXorFunc(node);
+	return (0);
+}
+/**************************************************************************************
+ * This function is to encode one sector in one of the data disks to the E disk.
+ * However, in evenodd this function can also be used as decoding function to recover
+ * data from dead disk in the case of parity failure and a single data failure.
+ **************************************************************************************/
+void
+rf_e_EncOneSect(
+    RF_RowCol_t srcLogicCol,
+    char *srcSecbuf,
+    RF_RowCol_t destLogicCol,
+    char *destSecbuf,
+    int bytesPerSector)
+{
+	int S_index;		/* index of the EU in the src col which needs
+				 * to be Xored into all EUs in a dest sector */
+	int numRowInEncMatix = (RF_EO_MATRIX_DIM) - 1;
+	RF_RowCol_t j, indexInDest,	/* row index of an encoding unit in
+					 * the destination column of the
+					 * encoding matrix */
+	        indexInSrc;	/* row index of an encoding unit in the source
+				 * column used for recovery */
+	int bytesPerEU = bytesPerSector / numRowInEncMatix;
+
+	/* The word size used for the XOR inner loops is chosen by the matrix
+	 * dimension: "short" for dimensions above 17, "long" for exactly 17. */
+#if RF_EO_MATRIX_DIM > 17
+	int shortsPerEU = bytesPerEU / sizeof(short);
+	short *destShortBuf, *srcShortBuf1, *srcShortBuf2;
+	short temp1;
+#elif RF_EO_MATRIX_DIM == 17
+	int longsPerEU = bytesPerEU / sizeof(long);
+	long *destLongBuf, *srcLongBuf1, *srcLongBuf2;
+	long temp1;
+#endif
+
+#if RF_EO_MATRIX_DIM > 17
+	RF_ASSERT(sizeof(short) == 2 || sizeof(short) == 1);
+	RF_ASSERT(bytesPerEU % sizeof(short) == 0);
+#elif RF_EO_MATRIX_DIM == 17
+	RF_ASSERT(sizeof(long) == 8 || sizeof(long) == 4);
+	RF_ASSERT(bytesPerEU % sizeof(long) == 0);
+#endif
+
+	/* EU of the source column that lies on the diagonal folded into every
+	 * destination EU */
+	S_index = rf_EO_Mod((RF_EO_MATRIX_DIM - 1 + destLogicCol - srcLogicCol), RF_EO_MATRIX_DIM);
+#if RF_EO_MATRIX_DIM > 17
+	srcShortBuf1 = (short *) (srcSecbuf + S_index * bytesPerEU);
+#elif RF_EO_MATRIX_DIM == 17
+	srcLongBuf1 = (long *) (srcSecbuf + S_index * bytesPerEU);
+#endif
+
+	for (indexInDest = 0; indexInDest < numRowInEncMatix; indexInDest++) {
+		indexInSrc = rf_EO_Mod((indexInDest + destLogicCol - srcLogicCol), RF_EO_MATRIX_DIM);
+
+#if RF_EO_MATRIX_DIM > 17
+		destShortBuf = (short *) (destSecbuf + indexInDest * bytesPerEU);
+		srcShortBuf2 = (short *) (srcSecbuf + indexInSrc * bytesPerEU);
+		for (j = 0; j < shortsPerEU; j++) {
+			temp1 = destShortBuf[j] ^ srcShortBuf1[j];
+			/* note: S_index won't be at the end row for any src
+			 * col! */
+			if (indexInSrc != RF_EO_MATRIX_DIM - 1)
+				destShortBuf[j] = (srcShortBuf2[j]) ^ temp1;
+			/* if indexInSrc is at the end row, ie.
+			 * RF_EO_MATRIX_DIM -1, then all elements are zero! */
+			else
+				destShortBuf[j] = temp1;
+		}
+
+#elif RF_EO_MATRIX_DIM == 17
+		destLongBuf = (long *) (destSecbuf + indexInDest * bytesPerEU);
+		srcLongBuf2 = (long *) (srcSecbuf + indexInSrc * bytesPerEU);
+		for (j = 0; j < longsPerEU; j++) {
+			temp1 = destLongBuf[j] ^ srcLongBuf1[j];
+			if (indexInSrc != RF_EO_MATRIX_DIM - 1)
+				destLongBuf[j] = (srcLongBuf2[j]) ^ temp1;
+			else
+				destLongBuf[j] = temp1;
+		}
+#endif
+	}
+}
+
+/*
+ * Encode "numSector" sectors from "srcbuf" (logical column "srcLogicCol")
+ * into "destbuf" (logical column "destLogicCol"), one sector at a time.
+ */
+void
+rf_e_encToBuf(
+    RF_Raid_t * raidPtr,
+    RF_RowCol_t srcLogicCol,
+    char *srcbuf,
+    RF_RowCol_t destLogicCol,
+    char *destbuf,
+    int numSector)
+{
+	int sect;
+	int secSize = rf_RaidAddressToByte(raidPtr, 1);
+
+	for (sect = 0; sect < numSector; sect++, srcbuf += secSize, destbuf += secSize)
+		rf_e_EncOneSect(srcLogicCol, srcbuf, destLogicCol, destbuf, secSize);
+}
+/**************************************************************************************
+ * When the parity disk and one data disk have died, use the second redundant
+ * information, 'E', to recover the data on the dead disk. This function is used
+ * in the recovery node of EO_110_CreateReadDAG.
+ **************************************************************************************/
+int
+rf_RecoveryEFunc(node)
+	RF_DagNode_t *node;
+{
+	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
+	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
+	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
+	RF_RowCol_t scol,	/* source logical column */
+	        fcol = rf_EUCol(layoutPtr, failedPDA->raidAddress);	/* logical column of
+									 * failed SU */
+	int i;
+	RF_PhysDiskAddr_t *pda;
+	int suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
+	char *srcbuf, *destbuf;
+	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+	RF_Etimer_t timer;
+
+	/* the result buffer accumulates encodings, so it must start zeroed */
+	bzero((char *) node->results[0], rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
+	if (node->dagHdr->status == rf_enable) {
+		RF_ETIMER_START(timer);
+		for (i = 0; i < node->numParams - 2; i += 2)
+			if (node->params[i + 1].p != node->results[0]) {
+				pda = (RF_PhysDiskAddr_t *) node->params[i].p;
+				if (i == node->numParams - 4)
+					scol = RF_EO_MATRIX_DIM - 2;	/* the column of the
+									 * redundant 'E' */
+				else
+					scol = rf_EUCol(layoutPtr, pda->raidAddress);
+				srcbuf = (char *) node->params[i + 1].p;
+				suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
+				destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
+				rf_e_encToBuf(raidPtr, scol, srcbuf, fcol, destbuf, pda->numSector);
+			}
+		RF_ETIMER_STOP(timer);
+		RF_ETIMER_EVAL(timer);
+		tracerec->xor_us += RF_ETIMER_VAL_US(timer);
+	}
+	return (rf_GenericWakeupFunc(node, 0));	/* node execute successfully */
+}
+/**************************************************************************************
+ * This function is used in the case where one data disk and the parity have
+ * failed (in EO_110_CreateWriteDAG).
+ **************************************************************************************/
+int
+rf_EO_DegradedWriteEFunc(RF_DagNode_t * node)
+{
+	/* encode the write data into the 'E' result buffer, then wake the
+	 * node's successors explicitly -- this node performs no I/O */
+	rf_DegrESubroutine(node, node->results[0]);
+	rf_GenericWakeupFunc(node, 0);
+	return (0);
+}
+
+
+
+/**************************************************************************************
+ * THE FUNCTION IS FOR DOUBLE DEGRADED READ AND WRITE CASES
+ **************************************************************************************/
+
+/*
+ * EvenOdd double-erasure decoding (see Blaum et al.): recover, for one
+ * sector, the two failed columns fcol[0] and fcol[1] from the surviving
+ * data columns rrdbuf[], the parity column pbuf and the 'E' column ebuf.
+ * dest[0]/dest[1] receive the recovered data and must arrive zero-filled
+ * (asserted below).
+ */
+void
+rf_doubleEOdecode(
+    RF_Raid_t * raidPtr,
+    char **rrdbuf,
+    char **dest,
+    RF_RowCol_t * fcol,
+    char *pbuf,
+    char *ebuf)
+{
+	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
+	int i, j, k, f1, f2, row;
+	int rrdrow, erow, count = 0;
+	int bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
+	int numRowInEncMatix = (RF_EO_MATRIX_DIM) - 1;
+#if 0
+	int pcol = (RF_EO_MATRIX_DIM) - 1;
+#endif
+	int ecol = (RF_EO_MATRIX_DIM) - 2;
+	int bytesPerEU = bytesPerSector / numRowInEncMatix;
+	int numDataCol = layoutPtr->numDataCol;
+#if RF_EO_MATRIX_DIM > 17
+	int shortsPerEU = bytesPerEU / sizeof(short);
+	short *rrdbuf_current, *pbuf_current, *ebuf_current;
+	short *dest_smaller, *dest_smaller_current, *dest_larger, *dest_larger_current;
+	short *temp;
+	short *P;
+
+	RF_ASSERT(bytesPerEU % sizeof(short) == 0);
+	RF_Malloc(P, bytesPerEU, (short *));
+	RF_Malloc(temp, bytesPerEU, (short *));
+#elif RF_EO_MATRIX_DIM == 17
+	int longsPerEU = bytesPerEU / sizeof(long);
+	long *rrdbuf_current, *pbuf_current, *ebuf_current;
+	long *dest_smaller, *dest_smaller_current, *dest_larger, *dest_larger_current;
+	long *temp;
+	long *P;
+
+	RF_ASSERT(bytesPerEU % sizeof(long) == 0);
+	RF_Malloc(P, bytesPerEU, (long *));
+	RF_Malloc(temp, bytesPerEU, (long *));
+#endif
+	RF_ASSERT(*((long *) dest[0]) == 0);
+	RF_ASSERT(*((long *) dest[1]) == 0);
+	bzero((char *) P, bytesPerEU);
+	bzero((char *) temp, bytesPerEU);
+	RF_ASSERT(*P == 0);
+	/* calculate the 'P' parameter, which, not parity, is the Xor of all
+	 * elements in the last two columns, ie. the 'E' and 'parity' columns;
+	 * see the Ref. paper by Blaum, et al 1993 */
+	/* BUG FIX: the element-count loop bound must match the element type
+	 * of the active branch; the original unconditionally used longsPerEU,
+	 * which is not even declared when RF_EO_MATRIX_DIM > 17 (only
+	 * shortsPerEU exists there), breaking the build for that config.
+	 * Behavior for RF_EO_MATRIX_DIM == 17 is unchanged. */
+	for (i = 0; i < numRowInEncMatix; i++) {
+#if RF_EO_MATRIX_DIM > 17
+		for (k = 0; k < shortsPerEU; k++) {
+			ebuf_current = ((short *) ebuf) + i * shortsPerEU + k;
+			pbuf_current = ((short *) pbuf) + i * shortsPerEU + k;
+			P[k] ^= *ebuf_current;
+			P[k] ^= *pbuf_current;
+		}
+#elif RF_EO_MATRIX_DIM == 17
+		for (k = 0; k < longsPerEU; k++) {
+			ebuf_current = ((long *) ebuf) + i * longsPerEU + k;
+			pbuf_current = ((long *) pbuf) + i * longsPerEU + k;
+			P[k] ^= *ebuf_current;
+			P[k] ^= *pbuf_current;
+		}
+#endif
+	}
+	RF_ASSERT(fcol[0] != fcol[1]);
+	if (fcol[0] < fcol[1]) {
+#if RF_EO_MATRIX_DIM > 17
+		dest_smaller = (short *) (dest[0]);
+		dest_larger = (short *) (dest[1]);
+#elif RF_EO_MATRIX_DIM == 17
+		dest_smaller = (long *) (dest[0]);
+		dest_larger = (long *) (dest[1]);
+#endif
+		f1 = fcol[0];
+		f2 = fcol[1];
+	} else {
+#if RF_EO_MATRIX_DIM > 17
+		dest_smaller = (short *) (dest[1]);
+		dest_larger = (short *) (dest[0]);
+#elif RF_EO_MATRIX_DIM == 17
+		dest_smaller = (long *) (dest[1]);
+		dest_larger = (long *) (dest[0]);
+#endif
+		f1 = fcol[1];
+		f2 = fcol[0];
+	}
+	row = (RF_EO_MATRIX_DIM) - 1;
+	while ((row = rf_EO_Mod((row + f1 - f2), RF_EO_MATRIX_DIM)) != ((RF_EO_MATRIX_DIM) - 1)) {
+#if RF_EO_MATRIX_DIM > 17
+		dest_larger_current = dest_larger + row * shortsPerEU;
+		dest_smaller_current = dest_smaller + row * shortsPerEU;
+#elif RF_EO_MATRIX_DIM == 17
+		dest_larger_current = dest_larger + row * longsPerEU;
+		dest_smaller_current = dest_smaller + row * longsPerEU;
+#endif
+		/** Do the diagonal recovery. Initially, temp[k] = (failed 1),
+		 which is the failed data in the column which has the smaller col index. **/
+		/* step 1: ^(SUM of nonfailed in-diagonal A(rrdrow,0..m-3)) */
+		for (j = 0; j < numDataCol; j++) {
+			if (j == f1 || j == f2)
+				continue;
+			rrdrow = rf_EO_Mod((row + f2 - j), RF_EO_MATRIX_DIM);
+			if (rrdrow != (RF_EO_MATRIX_DIM) - 1) {
+#if RF_EO_MATRIX_DIM > 17
+				rrdbuf_current = (short *) (rrdbuf[j]) + rrdrow * shortsPerEU;
+				for (k = 0; k < shortsPerEU; k++)
+					temp[k] ^= *(rrdbuf_current + k);
+#elif RF_EO_MATRIX_DIM == 17
+				rrdbuf_current = (long *) (rrdbuf[j]) + rrdrow * longsPerEU;
+				for (k = 0; k < longsPerEU; k++)
+					temp[k] ^= *(rrdbuf_current + k);
+#endif
+			}
+		}
+		/* step 2: ^E(erow,m-2). If erow is at the bottom row, don't
+		 * Xor it in: E(erow,m-2) = (principal diagonal) ^ (failed
+		 * 1) ^ (failed 2) ^ ( SUM of nonfailed in-diagonal
+		 * A(rrdrow,0..m-3) ). After this step, temp[k] = (principal
+		 * diagonal) ^ (failed 2) */
+
+		erow = rf_EO_Mod((row + f2 - ecol), (RF_EO_MATRIX_DIM));
+		if (erow != (RF_EO_MATRIX_DIM) - 1) {
+#if RF_EO_MATRIX_DIM > 17
+			ebuf_current = (short *) ebuf + shortsPerEU * erow;
+			for (k = 0; k < shortsPerEU; k++)
+				temp[k] ^= *(ebuf_current + k);
+#elif RF_EO_MATRIX_DIM == 17
+			ebuf_current = (long *) ebuf + longsPerEU * erow;
+			for (k = 0; k < longsPerEU; k++)
+				temp[k] ^= *(ebuf_current + k);
+#endif
+		}
+		/* step 3: ^P to obtain the failed data (failed 2). P can be
+		 * proved to be actually the (principal diagonal). After this
+		 * step, temp[k] = (failed 2), the failed data to be recovered */
+#if RF_EO_MATRIX_DIM > 17
+		for (k = 0; k < shortsPerEU; k++)
+			temp[k] ^= P[k];
+		/* Put the data to the destination buffer */
+		for (k = 0; k < shortsPerEU; k++)
+			dest_larger_current[k] = temp[k];
+#elif RF_EO_MATRIX_DIM == 17
+		for (k = 0; k < longsPerEU; k++)
+			temp[k] ^= P[k];
+		/* Put the data to the destination buffer */
+		for (k = 0; k < longsPerEU; k++)
+			dest_larger_current[k] = temp[k];
+#endif
+
+		/** THE FOLLOWING DO THE HORIZONTAL XOR **/
+		/* step 1: ^(SUM of A(row,0..m-3)), ie. all nonfailed data
+		 * columns */
+		for (j = 0; j < numDataCol; j++) {
+			if (j == f1 || j == f2)
+				continue;
+#if RF_EO_MATRIX_DIM > 17
+			rrdbuf_current = (short *) (rrdbuf[j]) + row * shortsPerEU;
+			for (k = 0; k < shortsPerEU; k++)
+				temp[k] ^= *(rrdbuf_current + k);
+#elif RF_EO_MATRIX_DIM == 17
+			rrdbuf_current = (long *) (rrdbuf[j]) + row * longsPerEU;
+			for (k = 0; k < longsPerEU; k++)
+				temp[k] ^= *(rrdbuf_current + k);
+#endif
+		}
+		/* step 2: ^A(row,m-1) */
+		/* step 3: Put the data to the destination buffer */
+#if RF_EO_MATRIX_DIM > 17
+		pbuf_current = (short *) pbuf + shortsPerEU * row;
+		for (k = 0; k < shortsPerEU; k++)
+			temp[k] ^= *(pbuf_current + k);
+		for (k = 0; k < shortsPerEU; k++)
+			dest_smaller_current[k] = temp[k];
+#elif RF_EO_MATRIX_DIM == 17
+		pbuf_current = (long *) pbuf + longsPerEU * row;
+		for (k = 0; k < longsPerEU; k++)
+			temp[k] ^= *(pbuf_current + k);
+		for (k = 0; k < longsPerEU; k++)
+			dest_smaller_current[k] = temp[k];
+#endif
+		count++;
+	}
+	/* Check that every Encoding Unit in the data buffer has been decoded;
+	 * according to EvenOdd theory, if "RF_EO_MATRIX_DIM" is a prime
+	 * number, this algorithm will have covered the whole buffer */
+	RF_ASSERT(count == numRowInEncMatix);
+	RF_Free((char *) P, bytesPerEU);
+	RF_Free((char *) temp, bytesPerEU);
+}
+
+
+/***************************************************************************************
+* This function is called by the double-degraded read DAG,
+* EO_200_CreateReadDAG.
+*
+***************************************************************************************/
+/*
+ * Double-degraded read recovery: for every sector covered by the failed
+ * stripe units, gather the surviving data/parity/'E' buffers from the node
+ * parameters and run rf_doubleEOdecode() to reconstruct the two failed
+ * columns.  node->results holds either one PDA (one failed SU accessed) or
+ * two; temporary sector buffers are allocated for any failed column not in
+ * the shadow of an accessed SU.  Wakes the node's successors before return.
+ */
+int
+rf_EvenOddDoubleRecoveryFunc(node)
+	RF_DagNode_t *node;
+{
+	int ndataParam = 0;
+	int np = node->numParams;
+	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
+	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
+	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
+	int i, prm, sector, nresults = node->numResults;
+	RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
+	unsigned sosAddr;
+	int two = 0, mallc_one = 0, mallc_two = 0;	/* flags to indicate if
+							 * memory is allocated */
+	int bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
+	RF_PhysDiskAddr_t *ppda, *ppda2, *epda, *epda2, *pda, *pda0, *pda1,
+	        npda;
+	RF_RowCol_t fcol[2], fsuoff[2], fsuend[2], numDataCol = layoutPtr->numDataCol;
+	char **buf, *ebuf, *pbuf, *dest[2];
+	long *suoff = NULL, *suend = NULL, *prmToCol = NULL, psuoff, esuoff;
+	RF_SectorNum_t startSector, endSector;
+	RF_Etimer_t timer;
+	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+
+	RF_ETIMER_START(timer);
+
+	/* Find out the number of parameters which are pdas for data
+	 * information */
+	/* BUG FIX: the loop previously ran "i <= np", which could read one
+	 * element past the end of node->params (valid indices are 0..np-1);
+	 * "i < np" stays in bounds and is otherwise equivalent, since the
+	 * trailing non-data PDAs terminate the scan before the end. */
+	for (i = 0; i < np; i++)
+		if (((RF_PhysDiskAddr_t *) node->params[i].p)->type != RF_PDA_TYPE_DATA) {
+			ndataParam = i;
+			break;
+		}
+	RF_Malloc(buf, numDataCol * sizeof(char *), (char **));
+	if (ndataParam != 0) {
+		RF_Malloc(suoff, ndataParam * sizeof(long), (long *));
+		RF_Malloc(suend, ndataParam * sizeof(long), (long *));
+		RF_Malloc(prmToCol, ndataParam * sizeof(long), (long *));
+	}
+	if (asmap->failedPDAs[1] &&
+	    (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) {
+		RF_ASSERT(0);	/* currently, no support for this situation */
+		ppda = node->params[np - 6].p;
+		ppda2 = node->params[np - 5].p;
+		RF_ASSERT(ppda2->type == RF_PDA_TYPE_PARITY);
+		epda = node->params[np - 4].p;
+		epda2 = node->params[np - 3].p;
+		RF_ASSERT(epda2->type == RF_PDA_TYPE_Q);
+		two = 1;
+	} else {
+		ppda = node->params[np - 4].p;
+		epda = node->params[np - 3].p;
+		psuoff = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
+		esuoff = rf_StripeUnitOffset(layoutPtr, epda->startSector);
+		RF_ASSERT(psuoff == esuoff);
+	}
+	/*
+	   the following has three goals:
+	   1. determine the startSector to begin decoding and endSector to end decoding.
+	   2. determine the column numbers of the two failed disks.
+	   3. determine the offset and end offset of the access within each failed stripe unit.
+	 */
+	if (nresults == 1) {
+		/* find the startSector to begin decoding */
+		pda = node->results[0];
+		bzero(pda->bufPtr, bytesPerSector * pda->numSector);
+		fsuoff[0] = rf_StripeUnitOffset(layoutPtr, pda->startSector);
+		fsuend[0] = fsuoff[0] + pda->numSector;
+		startSector = fsuoff[0];
+		endSector = fsuend[0];
+
+		/* find out the column of failed disk being accessed */
+		fcol[0] = rf_EUCol(layoutPtr, pda->raidAddress);
+
+		/* find out the other failed column not accessed */
+		sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
+		for (i = 0; i < numDataCol; i++) {
+			npda.raidAddress = sosAddr + (i * secPerSU);
+			(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
+			/* skip over dead disks */
+			if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
+				if (i != fcol[0])
+					break;
+		}
+		RF_ASSERT(i < numDataCol);
+		fcol[1] = i;
+	} else {
+		RF_ASSERT(nresults == 2);
+		pda0 = node->results[0];
+		bzero(pda0->bufPtr, bytesPerSector * pda0->numSector);
+		pda1 = node->results[1];
+		bzero(pda1->bufPtr, bytesPerSector * pda1->numSector);
+		/* determine the failed column numbers of the two failed
+		 * disks. */
+		fcol[0] = rf_EUCol(layoutPtr, pda0->raidAddress);
+		fcol[1] = rf_EUCol(layoutPtr, pda1->raidAddress);
+		/* determine the offset and end offset of the access within
+		 * each failed stripe unit. */
+		fsuoff[0] = rf_StripeUnitOffset(layoutPtr, pda0->startSector);
+		fsuend[0] = fsuoff[0] + pda0->numSector;
+		fsuoff[1] = rf_StripeUnitOffset(layoutPtr, pda1->startSector);
+		fsuend[1] = fsuoff[1] + pda1->numSector;
+		/* determine the startSector to begin decoding */
+		startSector = RF_MIN(pda0->startSector, pda1->startSector);
+		/* determine the endSector to end decoding */
+		endSector = RF_MAX(fsuend[0], fsuend[1]);
+	}
+	/*
+	   assign the beginning sector and the end sector for each parameter;
+	   find out the corresponding column # for each parameter
+	 */
+	for (prm = 0; prm < ndataParam; prm++) {
+		pda = node->params[prm].p;
+		suoff[prm] = rf_StripeUnitOffset(layoutPtr, pda->startSector);
+		suend[prm] = suoff[prm] + pda->numSector;
+		prmToCol[prm] = rf_EUCol(layoutPtr, pda->raidAddress);
+	}
+	/* 'sector' is the sector for the current decoding algorithm. For each
+	 * sector in the failed SU, find out the corresponding parameters that
+	 * cover the current sector and that are needed for decoding of this
+	 * sector in failed SU. 2. Find out if sector is in the shadow of any
+	 * accessed failed SU. If not, malloc a temporary space of a sector in
+	 * size. */
+	for (sector = startSector; sector < endSector; sector++) {
+		if (nresults == 2)
+			if (!(fsuoff[0] <= sector && sector < fsuend[0]) && !(fsuoff[1] <= sector && sector < fsuend[1]))
+				continue;
+		for (prm = 0; prm < ndataParam; prm++)
+			if (suoff[prm] <= sector && sector < suend[prm])
+				buf[(prmToCol[prm])] = ((RF_PhysDiskAddr_t *) node->params[prm].p)->bufPtr +
+				    rf_RaidAddressToByte(raidPtr, sector - suoff[prm]);
+		/* find out if sector is in the shadow of any accessed failed
+		 * SU. If yes, assign dest[0], dest[1] to point at suitable
+		 * position of the buffer corresponding to failed SUs. if no,
+		 * malloc a temporary space of a sector in size for
+		 * destination of decoding. */
+		RF_ASSERT(nresults == 1 || nresults == 2);
+		if (nresults == 1) {
+			dest[0] = ((RF_PhysDiskAddr_t *) node->results[0])->bufPtr + rf_RaidAddressToByte(raidPtr, sector - fsuoff[0]);
+			/* Always malloc temp buffer to dest[1] */
+			RF_Malloc(dest[1], bytesPerSector, (char *));
+			bzero(dest[1], bytesPerSector);
+			mallc_two = 1;
+		} else {
+			if (fsuoff[0] <= sector && sector < fsuend[0])
+				dest[0] = ((RF_PhysDiskAddr_t *) node->results[0])->bufPtr + rf_RaidAddressToByte(raidPtr, sector - fsuoff[0]);
+			else {
+				RF_Malloc(dest[0], bytesPerSector, (char *));
+				bzero(dest[0], bytesPerSector);
+				mallc_one = 1;
+			}
+			if (fsuoff[1] <= sector && sector < fsuend[1])
+				dest[1] = ((RF_PhysDiskAddr_t *) node->results[1])->bufPtr + rf_RaidAddressToByte(raidPtr, sector - fsuoff[1]);
+			else {
+				RF_Malloc(dest[1], bytesPerSector, (char *));
+				bzero(dest[1], bytesPerSector);
+				mallc_two = 1;
+			}
+			RF_ASSERT(mallc_one == 0 || mallc_two == 0);
+		}
+		pbuf = ppda->bufPtr + rf_RaidAddressToByte(raidPtr, sector - psuoff);
+		ebuf = epda->bufPtr + rf_RaidAddressToByte(raidPtr, sector - esuoff);
+		/*
+		 * After finish finding all needed sectors, call doubleEOdecode function for decoding
+		 * one sector to destination.
+		 */
+		rf_doubleEOdecode(raidPtr, buf, dest, fcol, pbuf, ebuf);
+		/* free all allocated memory, and mark flag to indicate no
+		 * memory is being allocated */
+		if (mallc_one == 1)
+			RF_Free(dest[0], bytesPerSector);
+		if (mallc_two == 1)
+			RF_Free(dest[1], bytesPerSector);
+		mallc_one = mallc_two = 0;
+	}
+	RF_Free(buf, numDataCol * sizeof(char *));
+	if (ndataParam != 0) {
+		RF_Free(suoff, ndataParam * sizeof(long));
+		RF_Free(suend, ndataParam * sizeof(long));
+		RF_Free(prmToCol, ndataParam * sizeof(long));
+	}
+	RF_ETIMER_STOP(timer);
+	RF_ETIMER_EVAL(timer);
+	if (tracerec) {
+		tracerec->q_us += RF_ETIMER_VAL_US(timer);
+	}
+	rf_GenericWakeupFunc(node, 0);
+	return (0);
+}
+
+
+/* Currently, only access to one of the two failed SUs is allowed in this function.
+ * Also, asmap->numStripeUnitsAccessed is limited to one; RAIDframe will break a large
+ * access into many accesses of a single stripe unit.
+ */
+
+/*
+ * Double-degraded write: first reconstruct the old contents of the failed
+ * stripe unit via EvenOdd double decoding (rf_doubleEOdecode), then fold the
+ * new write data into that recovered data and recompute the 'P' and 'E'
+ * redundancy, small-write style.  node->results[0]/[1] are the parity and
+ * 'E' PDAs (not plain buffers).  Only asmap->numDataFailed == 1 with a
+ * single accessed stripe unit is supported (asserted below).  Wakes the
+ * node's successors before return.
+ */
+int
+rf_EOWriteDoubleRecoveryFunc(node)
+	RF_DagNode_t *node;
+{
+	int np = node->numParams;
+	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
+	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
+	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
+	RF_SectorNum_t sector;
+	RF_RowCol_t col, scol;
+	int prm, i, j;
+	RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
+	unsigned sosAddr;
+	unsigned bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
+	RF_int64 numbytes;
+	RF_SectorNum_t startSector, endSector;
+	RF_PhysDiskAddr_t *ppda, *epda, *pda, *fpda, npda;
+	RF_RowCol_t fcol[2], numDataCol = layoutPtr->numDataCol;
+	char **buf;		/* buf[0], buf[1], buf[2], ...etc. point to
+				 * buffer storing data read from col0, col1,
+				 * col2 */
+	char *ebuf, *pbuf, *dest[2], *olddata[2];
+	RF_Etimer_t timer;
+	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+
+	RF_ASSERT(asmap->numDataFailed == 1);	/* currently only support this
+						 * case, the other failed SU
+						 * is not being accessed */
+	RF_ETIMER_START(timer);
+	RF_Malloc(buf, numDataCol * sizeof(char *), (char **));
+
+	ppda = node->results[0];/* Instead of being buffers, node->results[0]
+				 * and [1] are Ppda and Epda */
+	epda = node->results[1];
+	fpda = asmap->failedPDAs[0];
+
+	/* First, recovery the failed old SU using EvenOdd double decoding */
+	/* determine the startSector and endSector for decoding */
+	startSector = rf_StripeUnitOffset(layoutPtr, fpda->startSector);
+	endSector = startSector + fpda->numSector;
+	/* Assign buf[col] pointers to point to each non-failed column and
+	 * initialize the pbuf and ebuf to point at the beginning of each
+	 * source buffers and destination buffers */
+	for (prm = 0; prm < numDataCol - 2; prm++) {
+		pda = (RF_PhysDiskAddr_t *) node->params[prm].p;
+		col = rf_EUCol(layoutPtr, pda->raidAddress);
+		buf[col] = pda->bufPtr;
+	}
+	/* pbuf and ebuf: they will change values as double recovery decoding
+	 * goes on */
+	pbuf = ppda->bufPtr;
+	ebuf = epda->bufPtr;
+	/* find out the logical column numbers in the encoding matrix of the
+	 * two failed columns */
+	fcol[0] = rf_EUCol(layoutPtr, fpda->raidAddress);
+
+	/* find out the other failed column not accessed this time */
+	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
+	for (i = 0; i < numDataCol; i++) {
+		npda.raidAddress = sosAddr + (i * secPerSU);
+		(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
+		/* skip over dead disks */
+		if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
+			if (i != fcol[0])
+				break;
+	}
+	RF_ASSERT(i < numDataCol);
+	fcol[1] = i;
+	/* assign temporary space to put recovered failed SU */
+	numbytes = fpda->numSector * bytesPerSector;
+	RF_Malloc(olddata[0], numbytes, (char *));
+	RF_Malloc(olddata[1], numbytes, (char *));
+	dest[0] = olddata[0];
+	dest[1] = olddata[1];
+	/* the decoder requires zero-filled destination buffers */
+	bzero(olddata[0], numbytes);
+	bzero(olddata[1], numbytes);
+	/* Begin the recovery decoding, initially buf[j], ebuf, pbuf, dest[j]
+	 * have already pointed at the beginning of each source buffers and
+	 * destination buffers */
+	for (sector = startSector, i = 0; sector < endSector; sector++, i++) {
+		rf_doubleEOdecode(raidPtr, buf, dest, fcol, pbuf, ebuf);
+		/* advance every per-column cursor by one sector */
+		for (j = 0; j < numDataCol; j++)
+			if ((j != fcol[0]) && (j != fcol[1]))
+				buf[j] += bytesPerSector;
+		dest[0] += bytesPerSector;
+		dest[1] += bytesPerSector;
+		ebuf += bytesPerSector;
+		pbuf += bytesPerSector;
+	}
+	/* after recovery, the buffer pointed by olddata[0] is the old failed
+	 * data. With new writing data and this old data, use small write to
+	 * calculate the new redundant informations */
+	/* node->params[ 0, ... PDAPerDisk * (numDataCol - 2)-1 ] are Pdas of
+	 * Rrd; params[ PDAPerDisk*(numDataCol - 2), ... PDAPerDisk*numDataCol
+	 * -1 ] are Pdas of Rp, ( Rp2 ), Re, ( Re2 ) ; params[
+	 * PDAPerDisk*numDataCol, ... PDAPerDisk*numDataCol
+	 * +asmap->numStripeUnitsAccessed -asmap->numDataFailed-1] are Pdas of
+	 * wudNodes; For current implementation, we assume the simplest case:
+	 * asmap->numStripeUnitsAccessed == 1 and asmap->numDataFailed == 1
+	 * ie. PDAPerDisk = 1 then node->params[numDataCol] must be the new
+	 * data to be writen to the failed disk. We first bxor the new data
+	 * into the old recovered data, then do the same things as small
+	 * write. */
+
+	rf_bxor(((RF_PhysDiskAddr_t *) node->params[numDataCol].p)->bufPtr, olddata[0], numbytes, node->dagHdr->bp);
+	/* do new 'E' calculation */
+	/* find out the corresponding column in encoding matrix for write
+	 * column to be encoded into redundant disk 'E' */
+	scol = rf_EUCol(layoutPtr, fpda->raidAddress);
+	/* olddata[0] now is source buffer pointer; epda->bufPtr is the dest
+	 * buffer pointer */
+	rf_e_encToBuf(raidPtr, scol, olddata[0], RF_EO_MATRIX_DIM - 2, epda->bufPtr, fpda->numSector);
+
+	/* do new 'P' calculation */
+	rf_bxor(olddata[0], ppda->bufPtr, numbytes, node->dagHdr->bp);
+	/* Free the allocated buffer */
+	RF_Free(olddata[0], numbytes);
+	RF_Free(olddata[1], numbytes);
+	RF_Free(buf, numDataCol * sizeof(char *));
+
+	RF_ETIMER_STOP(timer);
+	RF_ETIMER_EVAL(timer);
+	if (tracerec) {
+		tracerec->q_us += RF_ETIMER_VAL_US(timer);
+	}
+	rf_GenericWakeupFunc(node, 0);
+	return (0);
+}
+#endif /* RF_INCLUDE_EVENODD > 0 */
diff --git a/sys/dev/raidframe/rf_evenodd_dagfuncs.h b/sys/dev/raidframe/rf_evenodd_dagfuncs.h
new file mode 100644
index 0000000..cf5028b
--- /dev/null
+++ b/sys/dev/raidframe/rf_evenodd_dagfuncs.h
@@ -0,0 +1,79 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_evenodd_dagfuncs.h,v 1.2 1999/02/05 00:06:11 oster Exp $ */
+/*
+ * rf_evenodd_dagfuncs.h
+ */
+/*
+ * Copyright (c) 1996 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Chang-Ming Wu
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#ifndef _RF__RF_EVENODD_DAGFUNCS_H_
+#define _RF__RF_EVENODD_DAGFUNCS_H_
+
+extern RF_RedFuncs_t rf_EOSmallWriteEFuncs;
+extern RF_RedFuncs_t rf_EOSmallWritePFuncs;
+extern RF_RedFuncs_t rf_eoERecoveryFuncs;
+extern RF_RedFuncs_t rf_eoPRecoveryFuncs;
+
+int rf_RegularPEFunc(RF_DagNode_t * node);
+int rf_RegularONEFunc(RF_DagNode_t * node);
+int rf_SimpleONEFunc(RF_DagNode_t * node);
+void rf_RegularESubroutine(RF_DagNode_t * node, char *ebuf);
+int rf_RegularEFunc(RF_DagNode_t * node);
+void rf_DegrESubroutine(RF_DagNode_t * node, char *ebuf);
+int rf_Degraded_100_EOFunc(RF_DagNode_t * node);
+void
+rf_e_EncOneSect(RF_RowCol_t srcLogicCol, char *srcSecbuf,
+    RF_RowCol_t destLogicCol, char *destSecbuf, int bytesPerSector);
+void
+rf_e_encToBuf(RF_Raid_t * raidPtr, RF_RowCol_t srcLogicCol,
+    char *srcbuf, RF_RowCol_t destLogicCol, char *destbuf, int numSector);
+int rf_RecoveryEFunc(RF_DagNode_t * node);
+int rf_EO_DegradedWriteEFunc(RF_DagNode_t * node);
+void
+rf_doubleEOdecode(RF_Raid_t * raidPtr, char **rrdbuf, char **dest,
+    RF_RowCol_t * fcol, char *pbuf, char *ebuf);
+int rf_EvenOddDoubleRecoveryFunc(RF_DagNode_t * node);
+int rf_EOWriteDoubleRecoveryFunc(RF_DagNode_t * node);
+
+/* logical column, within the EvenOdd encoding matrix, of the stripe unit
+ * holding raid address _addr_ */
+#define rf_EUCol(_layoutPtr_, _addr_ ) \
+( (_addr_)%( (_layoutPtr_)->dataSectorsPerStripe ) )/((_layoutPtr_)->sectorsPerStripeUnit)
+
+/* modulo that maps a possibly-negative _int1_ into [0, _int2_) */
+#define rf_EO_Mod( _int1_, _int2_ ) \
+( ((_int1_) < 0)? (((_int1_)+(_int2_))%(_int2_)) : (_int1_)%(_int2_) )
+
+/* first encoding-unit boundary strictly after _offset_ */
+#define rf_OffsetOfNextEUBoundary(_offset_, sec_per_eu) ((_offset_)/(sec_per_eu) + 1)*(sec_per_eu)
+
+#define RF_EO_MATRIX_DIM 17
+
+/*
+ * RF_EO_MATRIX_DIM should be a prime number, and "bytesPerSector" should be
+ * divisible by (RF_EO_MATRIX_DIM - 1) to fully encode and utilize the space
+ * in a sector; this number could also be greater than 17.  The latter case
+ * does not apply to disk arrays with more than 17 columns in total.
+ */
+
+#endif				/* !_RF__RF_EVENODD_DAGFUNCS_H_ */
diff --git a/sys/dev/raidframe/rf_evenodd_dags.c b/sys/dev/raidframe/rf_evenodd_dags.c
new file mode 100644
index 0000000..e644504
--- /dev/null
+++ b/sys/dev/raidframe/rf_evenodd_dags.c
@@ -0,0 +1,189 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_evenodd_dags.c,v 1.2 1999/02/05 00:06:11 oster Exp $ */
+/*
+ * rf_evenodd_dags.c
+ */
+/*
+ * Copyright (c) 1996 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Chang-Ming Wu
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#include <dev/raidframe/rf_archs.h>
+
+#if RF_INCLUDE_EVENODD > 0
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_dagfuncs.h>
+#include <dev/raidframe/rf_dagutils.h>
+#include <dev/raidframe/rf_etimer.h>
+#include <dev/raidframe/rf_acctrace.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_evenodd_dags.h>
+#include <dev/raidframe/rf_evenodd.h>
+#include <dev/raidframe/rf_evenodd_dagfuncs.h>
+#include <dev/raidframe/rf_pq.h>
+#include <dev/raidframe/rf_dagdegrd.h>
+#include <dev/raidframe/rf_dagdegwr.h>
+#include <dev/raidframe/rf_dagffwr.h>
+
+
+/*
+ * Lost one data.
+ * Use P to reconstruct missing data.
+ */
+RF_CREATE_DAG_FUNC_DECL(rf_EO_100_CreateReadDAG)
+{
+	rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_eoPRecoveryFuncs);	/* rebuild the lost data from survivors + P */
+}
+/*
+ * Lost data + E.
+ * Use P to reconstruct missing data.
+ */
+RF_CREATE_DAG_FUNC_DECL(rf_EO_101_CreateReadDAG)
+{
+	rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_eoPRecoveryFuncs);	/* E is not needed for a read: same P-based recovery as the 100 case */
+}
+/*
+ * Lost data + P.
+ * Make E look like P, and use Eor for Xor, and we can
+ * use degraded read DAG.
+ */
+RF_CREATE_DAG_FUNC_DECL(rf_EO_110_CreateReadDAG)
+{
+	RF_PhysDiskAddr_t *temp;
+	/* swap P and E pointers to fake out the DegradedReadDAG code */
+	temp = asmap->parityInfo;
+	asmap->parityInfo = asmap->qInfo;	/* the P slot now refers to the E unit */
+	asmap->qInfo = temp;	/* note: the swap is never undone; asmap stays exchanged */
+	rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_eoERecoveryFuncs);	/* recover via E in place of the dead P */
+}
+/*
+ * Lost two data.
+ */
+RF_CREATE_DAG_FUNC_DECL(rf_EOCreateDoubleDegradedReadDAG)
+{
+	rf_EO_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList);	/* two data units dead: EvenOdd double-failure recovery */
+}
+/*
+ * Lost two data.
+ */
+RF_CREATE_DAG_FUNC_DECL(rf_EO_200_CreateReadDAG)
+{
+	rf_EOCreateDoubleDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList);	/* thin alias for the double-degraded read above */
+}
+RF_CREATE_DAG_FUNC_DECL(rf_EO_100_CreateWriteDAG)
+{
+	if (asmap->numStripeUnitsAccessed != 1 &&
+	    asmap->failedPDAs[0]->numSector != raidPtr->Layout.sectorsPerStripeUnit)	/* only single-SU accesses or whole-SU failures handled here */
+		RF_PANIC();
+	rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2, (int (*) (RF_DagNode_t *)) rf_Degraded_100_EOFunc, RF_TRUE);	/* nfaults==2: both P and E are regenerated */
+}
+/*
+ * E is dead. Small write.
+ */
+RF_CREATE_DAG_FUNC_DECL(rf_EO_001_CreateSmallWriteDAG)
+{
+	rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_EOSmallWritePFuncs, NULL);	/* E dead: update P only (second redundancy funcs == NULL) */
+}
+/*
+ * E is dead. Large write.
+ */
+RF_CREATE_DAG_FUNC_DECL(rf_EO_001_CreateLargeWriteDAG)
+{
+	rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 1, rf_RegularPFunc, RF_TRUE);	/* nfaults==1: compute P only */
+}
+/*
+ * P is dead. Small write.
+ * Swap E + P, use single-degraded stuff.
+ */
+RF_CREATE_DAG_FUNC_DECL(rf_EO_010_CreateSmallWriteDAG)
+{
+	RF_PhysDiskAddr_t *temp;
+	/* swap P and E pointers to fake out the DegradedReadDAG code */
+	temp = asmap->parityInfo;
+	asmap->parityInfo = asmap->qInfo;	/* P slot now carries the E unit */
+	asmap->qInfo = temp;	/* swap is left in place afterwards */
+	rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_EOSmallWriteEFuncs, NULL);	/* P dead: update E only */
+}
+/*
+ * P is dead. Large write.
+ * Swap E + P, use single-degraded stuff.
+ */
+RF_CREATE_DAG_FUNC_DECL(rf_EO_010_CreateLargeWriteDAG)
+{
+	RF_PhysDiskAddr_t *temp;
+	/* swap P and E pointers to fake out the code */
+	temp = asmap->parityInfo;
+	asmap->parityInfo = asmap->qInfo;
+	asmap->qInfo = temp;
+	rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 1, rf_RegularEFunc, RF_FALSE);	/* P dead: recompute E only */
+}
+RF_CREATE_DAG_FUNC_DECL(rf_EO_011_CreateWriteDAG)
+{
+	rf_CreateNonRedundantWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList,	/* P and E both dead: nothing left to maintain but the data */
+	    RF_IO_TYPE_WRITE);
+}
+RF_CREATE_DAG_FUNC_DECL(rf_EO_110_CreateWriteDAG)
+{
+	RF_PhysDiskAddr_t *temp;
+
+	if (asmap->numStripeUnitsAccessed != 1 &&
+	    asmap->failedPDAs[0]->numSector != raidPtr->Layout.sectorsPerStripeUnit) {	/* unsupported access shape */
+		RF_PANIC();
+	}
+	/* swap P and E to fake out parity code */
+	temp = asmap->parityInfo;
+	asmap->parityInfo = asmap->qInfo;
+	asmap->qInfo = temp;	/* swap persists after this call */
+	rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 1, (int (*) (RF_DagNode_t *)) rf_EO_DegradedWriteEFunc, RF_FALSE);	/* data + P dead: maintain E only */
+	/* is the regular E func the right one to call? */
+}
+RF_CREATE_DAG_FUNC_DECL(rf_EO_101_CreateWriteDAG)
+{
+	if (asmap->numStripeUnitsAccessed != 1 &&
+	    asmap->failedPDAs[0]->numSector != raidPtr->Layout.sectorsPerStripeUnit)	/* unsupported access shape */
+		RF_PANIC();
+	rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 1, rf_RecoveryXorFunc, RF_TRUE);	/* data + E dead: maintain P with a plain XOR */
+}
+RF_CREATE_DAG_FUNC_DECL(rf_EO_DoubleDegRead)
+{
+	rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,	/* generic double-degraded read driven by the EvenOdd recovery node */
+	    "Re", "EvenOddRecovery", rf_EvenOddDoubleRecoveryFunc);
+}
+RF_CREATE_DAG_FUNC_DECL(rf_EOCreateSmallWriteDAG)
+{
+	rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, &rf_EOSmallWriteEFuncs);	/* fault-free small write: update both P and E */
+}
+RF_CREATE_DAG_FUNC_DECL(rf_EOCreateLargeWriteDAG)
+{
+	rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2, rf_RegularPEFunc, RF_FALSE);	/* fault-free large write: regenerate P and E together */
+}
+RF_CREATE_DAG_FUNC_DECL(rf_EO_200_CreateWriteDAG)
+{
+	rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Re", "We", "EOWrDDRecovery", rf_EOWriteDoubleRecoveryFunc);	/* two data units dead */
+}
+#endif /* RF_INCLUDE_EVENODD > 0 */
diff --git a/sys/dev/raidframe/rf_evenodd_dags.h b/sys/dev/raidframe/rf_evenodd_dags.h
new file mode 100644
index 0000000..c4218a4
--- /dev/null
+++ b/sys/dev/raidframe/rf_evenodd_dags.h
@@ -0,0 +1,64 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_evenodd_dags.h,v 1.2 1999/02/05 00:06:11 oster Exp $ */
+/*
+ * rf_evenodd_dags.h
+ */
+/*
+ * Copyright (c) 1996 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Chang-Ming Wu
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#ifndef _RF__RF_EVENODD_DAGS_H_
+#define _RF__RF_EVENODD_DAGS_H_
+
+#include <dev/raidframe/rf_types.h>
+
+#if RF_UTILITY == 0
+#include <dev/raidframe/rf_dag.h>
+
+/* extern decl's of the failure mode EO functions.
+ * swiped from rf_pqdeg.h
+ */
+
+RF_CREATE_DAG_FUNC_DECL(rf_EO_100_CreateReadDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EO_101_CreateReadDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EO_110_CreateReadDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EO_200_CreateReadDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EOCreateDoubleDegradedReadDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EO_100_CreateWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EO_010_CreateSmallWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EO_001_CreateSmallWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EO_010_CreateLargeWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EO_001_CreateLargeWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EO_011_CreateWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EO_110_CreateWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EO_101_CreateWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EO_DoubleDegRead);
+RF_CREATE_DAG_FUNC_DECL(rf_EOCreateSmallWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EOCreateLargeWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EO_200_CreateWriteDAG);
+#endif /* RF_UTILITY == 0 */
+
+#endif /* !_RF__RF_EVENODD_DAGS_H_ */
diff --git a/sys/dev/raidframe/rf_fifo.c b/sys/dev/raidframe/rf_fifo.c
new file mode 100644
index 0000000..51ed714
--- /dev/null
+++ b/sys/dev/raidframe/rf_fifo.c
@@ -0,0 +1,236 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_fifo.c,v 1.5 2000/03/04 03:27:13 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/***************************************************
+ *
+ * rf_fifo.c -- prioritized fifo queue code.
+ * There are only two priority levels: hi and lo.
+ *
+ * Aug 4, 1994, adapted from raidSim version (MCH)
+ *
+ ***************************************************/
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_alloclist.h>
+#include <dev/raidframe/rf_stripelocks.h>
+#include <dev/raidframe/rf_layout.h>
+#include <dev/raidframe/rf_diskqueue.h>
+#include <dev/raidframe/rf_fifo.h>
+#include <dev/raidframe/rf_debugMem.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_options.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_types.h>
+
+/* just malloc a header, zero it (via calloc), and return it */
+/*ARGSUSED*/
+void *
+rf_FifoCreate(sectPerDisk, clList, listp)
+	RF_SectorCount_t sectPerDisk;	/* unused here (hence ARGSUSED above) */
+	RF_AllocListElem_t *clList;	/* allocation list the header is charged to */
+	RF_ShutdownList_t **listp;	/* unused here */
+{
+	RF_FifoHeader_t *q;
+
+	RF_CallocAndAdd(q, 1, sizeof(RF_FifoHeader_t), (RF_FifoHeader_t *), clList);	/* zeroed, alloclist-tracked allocation */
+	q->hq_count = q->lq_count = 0;	/* redundant after calloc, but explicit */
+	return ((void *) q);
+}
+
+void
+rf_FifoEnqueue(q_in, elem, priority)
+	void *q_in;	/* opaque handle returned by rf_FifoCreate */
+	RF_DiskQueueData_t *elem;	/* request to append; its next ptr is overwritten */
+	int priority;	/* RF_IO_NORMAL_PRIORITY or RF_IO_LOW_PRIORITY only */
+{
+	RF_FifoHeader_t *q = (RF_FifoHeader_t *) q_in;
+
+	RF_ASSERT(priority == RF_IO_NORMAL_PRIORITY || priority == RF_IO_LOW_PRIORITY);
+
+	elem->next = NULL;	/* element always becomes a list tail */
+	if (priority == RF_IO_NORMAL_PRIORITY) {
+		if (!q->hq_tail) {	/* hi-pri queue currently empty */
+			RF_ASSERT(q->hq_count == 0 && q->hq_head == NULL);
+			q->hq_head = q->hq_tail = elem;
+		} else {
+			RF_ASSERT(q->hq_count != 0 && q->hq_head != NULL);
+			q->hq_tail->next = elem;
+			q->hq_tail = elem;
+		}
+		q->hq_count++;
+	} else {
+		RF_ASSERT(elem->next == NULL);
+		if (rf_fifoDebug) {
+			printf("raid%d: fifo: ENQ lopri\n",
+			    elem->raidPtr->raidid);
+		}
+		if (!q->lq_tail) {	/* lo-pri queue currently empty */
+			RF_ASSERT(q->lq_count == 0 && q->lq_head == NULL);
+			q->lq_head = q->lq_tail = elem;
+		} else {
+			RF_ASSERT(q->lq_count != 0 && q->lq_head != NULL);
+			q->lq_tail->next = elem;
+			q->lq_tail = elem;
+		}
+		q->lq_count++;
+	}
+	if ((q->hq_count + q->lq_count) != elem->queue->queueLength) {	/* cross-check against the owning disk queue's count */
+		printf("Queue lengths differ!: %d %d %d\n",
+		    q->hq_count, q->lq_count, (int) elem->queue->queueLength);
+		printf("%d %d %d %d\n",
+		    (int) elem->queue->numOutstanding,
+		    (int) elem->queue->maxOutstanding,
+		    (int) elem->queue->row,
+		    (int) elem->queue->col);
+	}
+	RF_ASSERT((q->hq_count + q->lq_count) == elem->queue->queueLength);
+}
+
+RF_DiskQueueData_t *
+rf_FifoDequeue(q_in)
+	void *q_in;	/* opaque handle returned by rf_FifoCreate */
+{
+	RF_FifoHeader_t *q = (RF_FifoHeader_t *) q_in;
+	RF_DiskQueueData_t *nd;	/* element removed, or NULL if both queues empty */
+
+	RF_ASSERT(q);
+	if (q->hq_head) {	/* hi-pri requests are always served first */
+		RF_ASSERT(q->hq_count != 0 && q->hq_tail != NULL);
+		nd = q->hq_head;
+		q->hq_head = q->hq_head->next;
+		if (!q->hq_head)	/* removed the last hi-pri element */
+			q->hq_tail = NULL;
+		nd->next = NULL;	/* detach before handing back */
+		q->hq_count--;
+	} else
+		if (q->lq_head) {	/* fall back to the lo-pri queue */
+			RF_ASSERT(q->lq_count != 0 && q->lq_tail != NULL);
+			nd = q->lq_head;
+			q->lq_head = q->lq_head->next;
+			if (!q->lq_head)
+				q->lq_tail = NULL;
+			nd->next = NULL;
+			q->lq_count--;
+			if (rf_fifoDebug) {
+				printf("raid%d: fifo: DEQ lopri %lx\n",
+				    nd->raidPtr->raidid, (long) nd);
+			}
+		} else {	/* both queues empty: sanity-check and return NULL */
+			RF_ASSERT(q->hq_count == 0 && q->lq_count == 0 && q->hq_tail == NULL && q->lq_tail == NULL);
+			nd = NULL;
+		}
+	return (nd);
+}
+
+/* Return ptr to item at head of queue. Used to examine request
+ * info without actually dequeueing the request.
+ */
+RF_DiskQueueData_t *
+rf_FifoPeek(void *q_in)
+{
+	RF_DiskQueueData_t *headElement = NULL;	/* NULL when both queues are empty */
+	RF_FifoHeader_t *q = (RF_FifoHeader_t *) q_in;
+
+	RF_ASSERT(q);
+	if (q->hq_head)	/* hi-pri head wins, mirroring rf_FifoDequeue order */
+		headElement = q->hq_head;
+	else
+		if (q->lq_head)
+			headElement = q->lq_head;
+	return (headElement);
+}
+/* We sometimes need to promote a low priority access to a regular priority access.
+ * Currently, this is only used when the user wants to write a stripe which is currently
+ * under reconstruction.
+ * This routine will promote all accesses tagged with the indicated parityStripeID from
+ * the low priority queue to the end of the normal priority queue.
+ * We assume the queue is locked upon entry.
+ */
+int
+rf_FifoPromote(q_in, parityStripeID, which_ru)
+	void *q_in;	/* opaque handle returned by rf_FifoCreate */
+	RF_StripeNum_t parityStripeID;	/* promote requests tagged with this psid... */
+	RF_ReconUnitNum_t which_ru;	/* ...and this reconstruction unit */
+{
+	RF_FifoHeader_t *q = (RF_FifoHeader_t *) q_in;
+	RF_DiskQueueData_t *lp = q->lq_head, *pt = NULL;	/* lp = lo-pri queue
+							 * pointer, pt = trailer */
+	int retval = 0;	/* number of requests promoted */
+
+	while (lp) {
+
+		/* search for the indicated parity stripe in the low-pri queue */
+		if (lp->parityStripeID == parityStripeID && lp->which_ru == which_ru) {
+			/* printf("FifoPromote: promoting access for psid
+			 * %ld\n",parityStripeID); */
+			if (pt)
+				pt->next = lp->next;	/* delete an entry other
+							 * than the first */
+			else
+				q->lq_head = lp->next;	/* delete the head entry */
+
+			if (!q->lq_head)
+				q->lq_tail = NULL;	/* we deleted the only
+							 * entry */
+			else
+				if (lp == q->lq_tail)
+					q->lq_tail = pt;	/* we deleted the tail
+								 * entry */
+
+			lp->next = NULL;
+			q->lq_count--;	/* counts move with the element */
+
+			if (q->hq_tail) {	/* append to non-empty hi-pri queue */
+				q->hq_tail->next = lp;
+				q->hq_tail = lp;
+			}
+			/* append to hi-priority queue */
+			else {
+				q->hq_head = q->hq_tail = lp;
+			}
+			q->hq_count++;
+
+			/* UpdateShortestSeekFinishTimeForced(lp->requestPtr,
+			 * lp->diskState); *//* deal with this later, if ever */
+
+			lp = (pt) ? pt->next : q->lq_head;	/* reset low-pri pointer
+								 * and continue */
+			retval++;
+
+		} else {	/* no match: advance trailer and scan pointer */
+			pt = lp;
+			lp = lp->next;
+		}
+	}
+
+	/* sanity check. delete this if you ever put more than one entry in
+	 * the low-pri queue */
+	RF_ASSERT(retval == 0 || retval == 1);
+	return (retval);
+}
diff --git a/sys/dev/raidframe/rf_fifo.h b/sys/dev/raidframe/rf_fifo.h
new file mode 100644
index 0000000..9392f08
--- /dev/null
+++ b/sys/dev/raidframe/rf_fifo.h
@@ -0,0 +1,62 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_fifo.h,v 1.3 1999/02/05 00:06:11 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * rf_fifo.h -- prioritized FIFO queue code.
+ *
+ * 4-9-93 Created (MCH)
+ */
+
+
+#ifndef _RF__RF_FIFO_H_
+#define _RF__RF_FIFO_H_
+
+#include <dev/raidframe/rf_archs.h>
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_diskqueue.h>
+
+typedef struct RF_FifoHeader_s {
+	RF_DiskQueueData_t *hq_head, *hq_tail;	/* high priority requests (served first by rf_FifoDequeue) */
+	RF_DiskQueueData_t *lq_head, *lq_tail;	/* low priority requests */
+	int hq_count, lq_count;	/* debug only; cross-checked in rf_FifoEnqueue */
+} RF_FifoHeader_t;
+
+extern void *
+rf_FifoCreate(RF_SectorCount_t sectPerDisk,
+ RF_AllocListElem_t * clList, RF_ShutdownList_t ** listp);
+extern void
+rf_FifoEnqueue(void *q_in, RF_DiskQueueData_t * elem,
+ int priority);
+extern RF_DiskQueueData_t *rf_FifoDequeue(void *q_in);
+extern RF_DiskQueueData_t *rf_FifoPeek(void *q_in);
+extern int
+rf_FifoPromote(void *q_in, RF_StripeNum_t parityStripeID,
+ RF_ReconUnitNum_t which_ru);
+
+#endif /* !_RF__RF_FIFO_H_ */
diff --git a/sys/dev/raidframe/rf_freebsdkintf.c b/sys/dev/raidframe/rf_freebsdkintf.c
new file mode 100644
index 0000000..b7003b5
--- /dev/null
+++ b/sys/dev/raidframe/rf_freebsdkintf.c
@@ -0,0 +1,3294 @@
+/*-
+ * Copyright (c) 2002 Scott Long <scottl@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/* $NetBSD: rf_netbsdkintf.c,v 1.105 2001/04/05 02:48:51 oster Exp $ */
+/*-
+ * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Greg Oster; Jason R. Thorpe.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the NetBSD
+ * Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 1988 University of Utah.
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the Systems Programming Group of the University of Utah Computer
+ * Science Department.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: Utah $Hdr: cd.c 1.6 90/11/28$
+ *
+ * @(#)cd.c 8.2 (Berkeley) 11/16/93
+ */
+
+
+
+
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Authors: Mark Holland, Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/***********************************************************
+ *
+ * rf_kintf.c -- the kernel interface routines for RAIDframe
+ *
+ ***********************************************************/
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/param.h>
+#include <sys/queue.h>
+#include <sys/stat.h>
+#include <sys/ioccom.h>
+#include <sys/filio.h>
+#include <sys/fcntl.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/disk.h>
+#include <sys/diskslice.h>
+#include <sys/disklabel.h>
+#include <sys/conf.h>
+#include <sys/lock.h>
+#include <sys/reboot.h>
+#include <sys/module.h>
+#include <sys/devicestat.h>
+#include <vm/uma.h>
+
+#include "opt_raid.h"
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_raidframe.h>
+#include <dev/raidframe/rf_copyback.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_dagflags.h>
+#include <dev/raidframe/rf_desc.h>
+#include <dev/raidframe/rf_diskqueue.h>
+#include <dev/raidframe/rf_acctrace.h>
+#include <dev/raidframe/rf_etimer.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_debugMem.h>
+#include <dev/raidframe/rf_kintf.h>
+#include <dev/raidframe/rf_options.h>
+#include <dev/raidframe/rf_driver.h>
+#include <dev/raidframe/rf_parityscan.h>
+#include <dev/raidframe/rf_debugprint.h>
+#include <dev/raidframe/rf_threadstuff.h>
+#include <dev/raidframe/rf_configure.h>
+
+RF_DECLARE_STATIC_MUTEX(rf_sparet_wait_mutex)
+
+static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
+ * spare table */
+static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
+ * installation process */
+
+/* prototypes */
+static void KernelWakeupFunc(struct bio *);
+static void InitBP(struct bio *, struct vnode *, unsigned rw_flag,
+ dev_t dev, RF_SectorNum_t startSect,
+ RF_SectorCount_t numSect, caddr_t buf,
+ void (*cbFunc) (struct bio *), void *cbArg,
+ int logBytesPerSector, struct proc * b_proc);
+static dev_t raidinit(RF_Raid_t *);
+static void rf_search_label(dev_t, struct disklabel *,
+ RF_AutoConfig_t **) __unused;
+
+static int raid_modevent(module_t, int, void*);
+void raidattach(void);
+d_psize_t raidsize;
+d_open_t raidopen;
+d_close_t raidclose;
+d_ioctl_t raidioctl;
+d_write_t raidwrite;
+d_read_t raidread;
+d_strategy_t raidstrategy;
+#if 0
+d_dump_t raiddump;
+#endif
+
+d_open_t raidctlopen;
+d_close_t raidctlclose;
+d_ioctl_t raidctlioctl;
+
+static struct cdevsw raid_cdevsw = {	/* entry points for the per-array raid%d device */
+	raidopen,
+	raidclose,
+	raidread,
+	raidwrite,
+	raidioctl,
+	nopoll,
+	nommap,
+	raidstrategy,
+	"raid",
+	200,	/* major number -- presumably matches the sys/conf/majors entry; verify */
+	nodump,
+	nopsize,
+	D_DISK,	/* behaves as a disk device */
+};
+
+static struct cdevsw raidctl_cdevsw = {	/* entry points for the single /dev/raidctl control node */
+	raidctlopen,
+	raidctlclose,
+	noread,	/* control node: ioctl only, no data path */
+	nowrite,
+	raidctlioctl,
+	nopoll,
+	nommap,
+	nostrategy,
+	"raidctl",
+	201,	/* major number -- presumably matches the sys/conf/majors entry; verify */
+	nodump,
+	nopsize,
+	0,
+};
+
+static struct cdevsw raiddisk_cdevsw;
+
+/*
+ * Pilfered from ccd.c
+ */
+
+struct raidbuf {	/* per-component I/O wrapper, allocated from sc_cbufpool */
+	struct bio rf_buf;	/* new I/O buf. MUST BE FIRST!!! */
+	struct bio *rf_obp;	/* ptr. to original I/O buf */
+	int rf_flags;	/* misc. flags */
+	RF_DiskQueueData_t *req;/* the request that this was part of.. */
+};
+
+
+#define RAIDGETBUF(sc) uma_zalloc((sc)->sc_cbufpool, M_NOWAIT)
+#define RAIDPUTBUF(sc, cbp) uma_zfree((sc)->sc_cbufpool, cbp)
+
+#define RF_MAX_ARRAYS 32
+
+/* Raid control device */
+struct raidctl_softc {	/* state for the single raidctl control device */
+	dev_t sc_dev;	/* Device node */
+	int sc_flags;	/* flags */
+	int sc_numraid;	/* Number of configured raid devices */
+	dev_t sc_raiddevs[RF_MAX_ARRAYS];	/* device node per configured array */
+};
+struct raid_softc {	/* per-array state */
+	dev_t sc_dev;	/* Our device */
+	dev_t sc_parent_dev;	/* the raidctl node we hang off */
+	int sc_flags;	/* flags (RAIDF_* below) */
+	int sc_busycount;	/* How many times are we opened? */
+	size_t sc_size;	/* size of the raid device */
+	dev_t sc_parent;	/* Parent device */
+	struct disk sc_dkdev;	/* generic disk device info */
+	uma_zone_t sc_cbufpool;	/* component buffer pool (struct raidbuf) */
+	RF_Raid_t *raidPtr;	/* Raid information struct */
+	struct bio_queue_head bio_queue;	/* used for the device queue */
+	struct devstat device_stats;	/* devstat gathering */
+};
+/* sc_flags */
+#define RAIDF_OPEN 0x01 /* unit has been initialized */
+#define RAIDF_WLABEL 0x02 /* label area is writable */
+#define RAIDF_LABELLING 0x04 /* unit is currently being labelled */
+#define RAIDF_WANTED 0x40 /* someone is waiting to obtain a lock */
+#define RAIDF_LOCKED 0x80 /* unit is locked */
+
+/*
+ * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
+ * Be aware that large numbers can allow the driver to consume a lot of
+ * kernel memory, especially on writes, and in degraded mode reads.
+ *
+ * For example: with a stripe width of 64 blocks (32k) and 5 disks,
+ * a single 64K write will typically require 64K for the old data,
+ * 64K for the old parity, and 64K for the new parity, for a total
+ * of 192K (if the parity buffer is not re-used immediately).
+ * Even it if is used immedately, that's still 128K, which when multiplied
+ * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
+ *
+ * Now in degraded mode, for example, a 64K read on the above setup may
+ * require data reconstruction, which will require *all* of the 4 remaining
+ * disks to participate -- 4 * 32K/disk == 128K again.
+ */
+
+#ifndef RAIDOUTSTANDING
+#define RAIDOUTSTANDING 10
+#endif
+
+#define RAIDLABELDEV(dev) dkmodpart(dev, RAW_PART)
+#define DISKPART(dev) dkpart(dev)
+
+static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *, struct disk*);
+static int raidlock(struct raid_softc *);
+static void raidunlock(struct raid_softc *);
+
+static void rf_markalldirty(RF_Raid_t *);
+
+static dev_t raidctl_dev;
+
+void rf_ReconThread(struct rf_recon_req *);
+/* XXX what I want is: */
+/*void rf_ReconThread(RF_Raid_t *raidPtr); */
+void rf_RewriteParityThread(RF_Raid_t *raidPtr);
+void rf_CopybackThread(RF_Raid_t *raidPtr);
+void rf_ReconstructInPlaceThread(struct rf_recon_req *);
+void rf_buildroothack(void *, struct raidctl_softc *);
+
+RF_AutoConfig_t *rf_find_raid_components(void);
+RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
+static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
+static int rf_reasonable_label(RF_ComponentLabel_t *);
+void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
+int rf_set_autoconfig(RF_Raid_t *, int);
+int rf_set_rootpartition(RF_Raid_t *, int);
+void rf_release_all_vps(RF_ConfigSet_t *);
+void rf_cleanup_config_set(RF_ConfigSet_t *);
+int rf_have_enough_components(RF_ConfigSet_t *);
+int rf_auto_config_set(RF_ConfigSet_t *, int *, struct raidctl_softc *);
+static int raidgetunit(struct raidctl_softc *, int);
+static int raidshutdown(void);
+
+void
+raidattach(void)
+{
+	struct raidctl_softc *parent_sc = NULL;	/* softc for /dev/raidctl */
+	RF_AutoConfig_t *ac_list;	/* autoconfig list */
+	RF_ConfigSet_t *config_sets;
+	int autoconfig = 0;
+
+	/* This is where all the initialization stuff gets done. */
+
+	if(rf_mutex_init(&rf_sparet_wait_mutex, __FUNCTION__)) {
+		rf_printf(0, "RAIDframe: failed to initialize mutexes\n");
+		return;	/* cannot continue without the spare-table mutex */
+	}
+
+	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
+
+	if (rf_BootRaidframe() != 0) {
+		rf_printf(0, "Serious error booting RAIDframe!!\n");
+		return;
+	}
+
+	rf_printf(0, "Kernelized RAIDframe activated\n");
+	MALLOC(parent_sc, struct raidctl_softc *, sizeof(*parent_sc),
+	    M_RAIDFRAME, M_NOWAIT|M_ZERO);
+	if (parent_sc == NULL) {
+		RF_PANIC();	/* NOTE(review): RF_PANIC makes this return unreachable */
+		return;
+	}
+
+	parent_sc->sc_dev= make_dev(&raidctl_cdevsw, 0, 0, 0, 0644, "raidctl");	/* was hex 0x644; perms are octal */
+	parent_sc->sc_dev->si_drv1 = parent_sc;
+	raidctl_dev = parent_sc->sc_dev;
+
+#if RAID_AUTOCONFIG
+	autoconfig = 1;
+#endif
+
+	if (autoconfig) {
+		/* 1. locate all RAID components on the system */
+
+		rf_printf(0, "Searching for raid components...\n");
+		ac_list = rf_find_raid_components();
+		if (ac_list == NULL)
+			return;
+
+		/* 2. sort them into their respective sets */
+
+		config_sets = rf_create_auto_sets(ac_list);
+
+		/* 3. evaluate each set and configure the valid ones
+		   This gets done in rf_buildroothack() */
+
+		/* schedule the creation of the thread to do the
+		   "/ on RAID" stuff */
+
+		rf_buildroothack(config_sets, parent_sc);
+#if 0
+		kthread_create(rf_buildroothack,config_sets);
+
+#endif /* 0 */
+	}
+}
+
+/*
+ * rf_buildroothack -- walk the list of auto-detected configuration sets,
+ * configure every set that is complete and marked for autoconfiguration,
+ * and release the resources of any set that is not used.
+ *
+ * arg        -- RF_ConfigSet_t list produced by rf_create_auto_sets()
+ *               (passed as void * for kthread-entry compatibility).
+ * parent_sc  -- the raidctl control-device softc the new arrays hang off.
+ *
+ * Root-device selection is counted (num_root/rootID) but the code that
+ * would act on it is currently compiled out.
+ */
+void
+rf_buildroothack(arg, parent_sc)
+	void *arg;
+	struct raidctl_softc *parent_sc;
+{
+	RF_ConfigSet_t *config_sets = arg;
+	RF_ConfigSet_t *cset;
+	RF_ConfigSet_t *next_cset;
+	int retcode;
+	int raidID;
+	int rootID;
+	int num_root;
+
+	rootID = 0;
+	num_root = 0;
+	cset = config_sets;
+	while(cset != NULL ) {
+		next_cset = cset->next;
+		if (rf_have_enough_components(cset) &&
+		    cset->ac->clabel->autoconfigure==1) {
+			retcode = rf_auto_config_set(cset, &raidID, parent_sc);
+			if (!retcode) {
+				if (cset->rootable) {
+					rootID = raidID;
+					num_root++;
+				}
+			} else {
+				/* The autoconfig didn't work :( */
+				/* (was missing the space between %d and "for") */
+				rf_printf(1, "Autoconfig failed with code %d "
+				    "for raid%d\n", retcode, raidID);
+				rf_release_all_vps(cset);
+			}
+		} else {
+			/* we're not autoconfiguring this set...
+			   release the associated resources */
+			rf_release_all_vps(cset);
+		}
+		/* cleanup */
+		rf_cleanup_config_set(cset);
+		cset = next_cset;
+	}
+	if (boothowto & RB_ASKNAME) {
+		/* We don't auto-config... */
+	} else {
+		/* They didn't ask, and we found something bootable... */
+
+#if 0
+		if (num_root == 1) {
+			booted_device = &raidrootdev[rootID];
+		} else if (num_root > 1) {
+			/* we can't guess.. require the user to answer... */
+			boothowto |= RB_ASKNAME;
+		}
+#endif
+	}
+}
+
+/*
+ * Open the RAIDframe control device.  Only one opener is permitted at
+ * a time; a concurrent open fails with EBUSY.
+ */
+int
+raidctlopen(dev_t dev, int flags, int fmt, struct thread *td)
+{
+	struct raidctl_softc *parent_sc = dev->si_drv1;
+
+	if (parent_sc->sc_flags & RAIDF_OPEN)
+		return (EBUSY);
+	parent_sc->sc_flags |= RAIDF_OPEN;
+	return (0);
+}
+
+/*
+ * Close the RAIDframe control device: drop the single-opener flag so
+ * the next open can succeed.
+ */
+int
+raidctlclose(dev_t dev, int flags, int fmt, struct thread *td)
+{
+	struct raidctl_softc *parent_sc = dev->si_drv1;
+
+	parent_sc->sc_flags &= ~RAIDF_OPEN;
+	return (0);
+}
+
+/*
+ * raidctlioctl -- ioctl handler for the /dev/raidctl control device.
+ *
+ * RAIDFRAME_CONFIGURE copies an RF_Config_t (and its optional
+ * layout-specific blob) in from userland, allocates a unit number and
+ * an RF_Raid_t, and brings the array to life via rf_Configure() and
+ * raidinit().  RAIDFRAME_SHUTDOWN tears an idle array back down.
+ *
+ * Returns 0 on success or an errno value.
+ *
+ * Fixes vs. the original: a failed RF_Malloc(raidPtr) no longer falls
+ * through into bzero(NULL, ...) (kernel panic) nor double-frees k_cfg,
+ * and the raidgetunit()/raidinit() failure paths now set retcode
+ * instead of returning 0 (success) to the caller.
+ */
+int
+raidctlioctl(dev_t dev, u_long cmd, caddr_t data, int flags, struct thread *td)
+{
+	struct raidctl_softc *parent_sc;
+	struct raid_softc *sc;
+	RF_Config_t *u_cfg, *k_cfg;
+	RF_Raid_t *raidPtr;
+	u_char *specific_buf;
+	u_int unit;
+	int retcode = 0;
+
+	parent_sc = dev->si_drv1;
+
+	switch (cmd) {
+	/* configure the system */
+	case RAIDFRAME_CONFIGURE:
+
+		/* copy-in the configuration information */
+		/* data points to a pointer to the configuration structure */
+
+		u_cfg = *((RF_Config_t **) data);
+		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
+		if (k_cfg == NULL) {
+			return (ENOMEM);
+		}
+		retcode = copyin((caddr_t) u_cfg, (caddr_t) k_cfg,
+		    sizeof(RF_Config_t));
+		if (retcode) {
+			RF_Free(k_cfg, sizeof(RF_Config_t));
+			rf_printf(2, "raidctlioctl: retcode=%d copyin.1\n",
+			    retcode);
+			return (retcode);
+		}
+		/* allocate a buffer for the layout-specific data, and copy it
+		 * in */
+		if (k_cfg->layoutSpecificSize) {
+			if (k_cfg->layoutSpecificSize > 10000) {
+				/* sanity check */
+				RF_Free(k_cfg, sizeof(RF_Config_t));
+				return (EINVAL);
+			}
+			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
+			    (u_char *));
+			if (specific_buf == NULL) {
+				RF_Free(k_cfg, sizeof(RF_Config_t));
+				return (ENOMEM);
+			}
+			retcode = copyin(k_cfg->layoutSpecific,
+			    (caddr_t) specific_buf,
+			    k_cfg->layoutSpecificSize);
+			if (retcode) {
+				RF_Free(k_cfg, sizeof(RF_Config_t));
+				RF_Free(specific_buf,
+				    k_cfg->layoutSpecificSize);
+				rf_printf(2, "raidctlioctl: retcode=%d "
+				    "copyin.2\n", retcode);
+				return (retcode);
+			}
+		} else
+			specific_buf = NULL;
+		k_cfg->layoutSpecific = specific_buf;
+
+		/* should do some kind of sanity check on the configuration.
+		 * Store the sum of all the bytes in the last byte? */
+
+		/* configure the system */
+
+		RF_Malloc(raidPtr, sizeof(*raidPtr), (RF_Raid_t *));
+		if (raidPtr == NULL) {
+			/*
+			 * Bail out through the common cleanup path; do NOT
+			 * fall through to the bzero() below with a NULL
+			 * pointer, and do not free k_cfg here -- "out:"
+			 * frees it (the old code would have double-freed).
+			 */
+			rf_printf(0, "No memory for raid device\n");
+			retcode = ENOMEM;
+			goto out;
+		}
+		bzero((char *) raidPtr, sizeof(RF_Raid_t));
+
+		/* Request a unit number for this soon-to-be device. */
+		unit = raidgetunit(parent_sc, 0);
+		if (unit == -1) {
+			/* Every unit slot is already in use. */
+			rf_printf(0, "Cannot allocate raid unit\n");
+			RF_Free(raidPtr, sizeof(*raidPtr));
+			retcode = ENOMEM;
+			goto out;
+		}
+		raidPtr->raidid = unit;
+
+		if ((retcode = rf_Configure(raidPtr, k_cfg, NULL)) == 0) {
+
+			/* allow this many simultaneous IO's to
+			   this RAID device */
+			raidPtr->openings = RAIDOUTSTANDING;
+
+			parent_sc->sc_raiddevs[unit] = raidinit(raidPtr);
+			if (parent_sc->sc_raiddevs[unit] == NULL) {
+				rf_printf(0, "Could not create raid device\n");
+				RF_Free(raidPtr, sizeof(*raidPtr));
+				retcode = ENOMEM;
+				goto out;
+			}
+			parent_sc->sc_numraid++;
+			((struct raid_softc *)raidPtr->sc)->sc_parent_dev = dev;
+			/* Mark components dirty until a clean shutdown. */
+			rf_markalldirty(raidPtr);
+		} else {
+			parent_sc->sc_raiddevs[unit] = NULL;
+			RF_Free(raidPtr, sizeof(*raidPtr));
+		}
+
+out:
+		/* free the buffers. No return code here. */
+		if (k_cfg->layoutSpecificSize) {
+			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
+		}
+		RF_Free(k_cfg, sizeof(RF_Config_t));
+		break;
+
+	case RAIDFRAME_SHUTDOWN:
+
+		unit = *(u_int *)data;
+		if ((unit >= RF_MAX_ARRAYS) ||
+		    (parent_sc->sc_raiddevs[unit] == NULL))
+			return (EINVAL);
+
+		sc = parent_sc->sc_raiddevs[unit]->si_drv1;
+		if ((retcode = raidlock(sc)) != 0)
+			return (retcode);
+
+		/*
+		 * If somebody has a partition mounted, we shouldn't
+		 * shutdown.
+		 */
+
+		if ((sc->sc_flags & RAIDF_OPEN) != 0) {
+			raidunlock(sc);
+			return (EBUSY);
+		}
+
+		rf_printf(0, "Shutting down RAIDframe engine\n");
+		retcode = rf_Shutdown(sc->raidPtr);
+		RF_THREADGROUP_WAIT_STOP(&sc->raidPtr->engine_tg);
+
+		devstat_remove_entry(&sc->device_stats);
+
+		disk_destroy(parent_sc->sc_raiddevs[unit]);
+		raidunlock(sc);
+
+		/* XXX Need to be able to destroy the zone */
+		uma_zdestroy(sc->sc_cbufpool);
+
+		parent_sc->sc_numraid--;
+		parent_sc->sc_raiddevs[unit] = NULL;
+
+		/* sizeof taken via sc->raidPtr; the old code used the
+		 * uninitialized local raidPtr (harmless under sizeof,
+		 * but misleading). */
+		RF_Free(sc->raidPtr, sizeof(*sc->raidPtr));
+		RF_Free(sc, sizeof(*sc));
+
+		break;
+
+	default:
+		retcode = ENOIOCTL;
+	}
+
+	return (retcode);
+}
+
+#if 0 /* XXX DUMP!!!! */
+/*
+ * Crash-dump entry point.  Not implemented; the whole stub is
+ * compiled out.
+ */
+int
+raiddump(dev)
+	dev_t dev;
+{
+	/* Not implemented. */
+	return ENXIO;
+}
+#endif
+
+/* ARGSUSED */
+/*
+ * raidopen -- open entry point for a configured raid disk device.
+ *
+ * Serializes against unconfigure via raidlock(), regenerates the
+ * default disklabel, and on first open marks all components dirty so
+ * a crash before a clean close is detectable.  The busy count keeps
+ * the unit from being unconfigured while open.
+ */
+/* ARGSUSED */
+int
+raidopen(dev, flags, fmt, td)
+	dev_t dev;
+	int flags, fmt;
+	struct thread *td;
+{
+	struct raid_softc *sc;
+	struct disk *dp;
+	int error = 0;
+
+	sc = dev->si_drv1;
+
+	if ((error = raidlock(sc)) != 0)
+		return (error);
+	dp = &sc->sc_dkdev;
+
+	rf_printf(1, "Opening raid device %s\n", dev->si_name);
+
+	/* Generate overall disklabel */
+	raidgetdefaultlabel(sc->raidPtr, sc, dp);
+
+	if (sc->sc_busycount == 0) {
+		/* First one... mark things as dirty... Note that we *MUST*
+		   have done a configure before this.  I DO NOT WANT TO BE
+		   SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
+		   THAT THEY BELONG TOGETHER!!!!! */
+		/* XXX should check to see if we're only open for reading
+		   here... If so, we needn't do this, but then need some
+		   other way of keeping track of what's happened.. */
+
+		rf_markalldirty( sc->raidPtr );
+		sc->sc_flags |= RAIDF_OPEN;
+	}
+
+	/* Prevent this unit from being unconfigured while open. */
+	sc->sc_busycount++;
+
+	raidunlock(sc);
+
+	/* error is always 0 here; kept for symmetry with raidclose(). */
+	return (error);
+
+
+}
+/* ARGSUSED */
+int
+raidclose(dev, flags, fmt, td)
+ dev_t dev;
+ int flags, fmt;
+ struct thread *td;
+{
+ struct raid_softc *sc;
+ int error = 0;
+
+ sc = dev->si_drv1;
+
+ if ((error = raidlock(sc)) != 0)
+ return (error);
+
+ sc->sc_busycount--;
+ if (sc->sc_busycount == 0) {
+ sc->sc_flags &= ~RAIDF_OPEN;
+ rf_update_component_labels(sc->raidPtr,
+ RF_FINAL_COMPONENT_UPDATE);
+ }
+
+ raidunlock(sc);
+ return (0);
+
+}
+
+/*
+ * raidstrategy -- block I/O entry point.  Validates the request,
+ * queues it on the per-array bio queue, and kicks the RAIDframe
+ * engine via raidstart().
+ */
+void
+raidstrategy(bp)
+	struct bio *bp;
+{
+	struct raid_softc *sc = bp->bio_dev->si_drv1;
+	RF_Raid_t *raidPtr = sc->raidPtr;
+	int s;
+
+	/* Reject I/O aimed at an unconfigured or invalidated array. */
+	if (raidPtr == NULL || !raidPtr->valid) {
+		bp->bio_error = ENODEV;
+		bp->bio_flags |= BIO_ERROR;
+		bp->bio_resid = bp->bio_bcount;
+		biodone(bp);
+		return;
+	}
+
+	/* Zero-length transfers complete immediately. */
+	if (bp->bio_bcount == 0) {
+		rf_printf(2, "b_bcount is zero..\n");
+		biodone(bp);
+		return;
+	}
+
+	s = splbio();
+	bp->bio_resid = 0;
+
+	/* stuff it onto our queue. XXX locking? */
+	bioq_insert_tail(&sc->bio_queue, bp);
+
+	raidstart(raidPtr);
+	splx(s);
+}
+
+/*
+ * raidread -- character-device (raw) read entry point.
+ *
+ * Delegates to physio(), which chops the uio into bios and pushes
+ * them through raidstrategy().  (The original fetched the softc from
+ * si_drv1 into a local that was never used; removed.)
+ */
+int
+raidread(dev, uio, flags)
+	dev_t dev;
+	struct uio *uio;
+	int flags;
+{
+
+	return (physio(dev, uio, BIO_READ));
+}
+
+/*
+ * raidwrite -- character-device (raw) write entry point.
+ *
+ * Delegates to physio(), which chops the uio into bios and pushes
+ * them through raidstrategy().  (The original declared an unused
+ * softc local and a redundant return temporary; removed.)
+ */
+int
+raidwrite(dev, uio, flags)
+	dev_t dev;
+	struct uio *uio;
+	int flags;
+{
+
+	rf_printf(3, "raidwrite\n");
+	return (physio(dev, uio, BIO_WRITE));
+}
+
+/*
+ * raidioctl -- ioctl handler for a configured raid disk device.
+ *
+ * Implements the per-array management commands: component-label
+ * read/write/init, autoconfig and root flags, parity rewrite,
+ * hot-spare management, in-place rebuild, disk failure injection,
+ * copyback, and the various status/progress queries.  Long-running
+ * operations (parity rewrite, reconstruction, copyback) are handed
+ * off to kernel threads via RF_CREATE_THREAD so the ioctl returns
+ * promptly.  Returns 0 or an errno; unknown commands get ENOIOCTL.
+ */
+int
+raidioctl(dev, cmd, data, flag, td)
+	dev_t dev;
+	u_long cmd;
+	caddr_t data;
+	int flag;
+	struct thread *td;
+{
+	struct raid_softc *sc;
+	RF_Raid_t *raidPtr;
+	RF_RaidDisk_t *diskPtr;
+	RF_AccTotals_t *totals;
+	RF_DeviceConfig_t *d_cfg, **ucfgp;
+	struct rf_recon_req *rrcopy, *rr;
+	RF_ComponentLabel_t *clabel;
+	RF_ComponentLabel_t *ci_label;
+	RF_SingleComponent_t *sparePtr,*componentPtr;
+	RF_SingleComponent_t *hot_spare, *component;
+	RF_ProgressInfo_t progressInfo;
+	int retcode = 0;
+	int row, column;
+	int unit;
+	int i, j, d;
+
+	sc = dev->si_drv1;
+	raidPtr = sc->raidPtr;
+
+	rf_printf(2, "raidioctl: %s %ld\n", dev->si_name, cmd);
+
+	switch (cmd) {
+
+	case RAIDFRAME_GET_COMPONENT_LABEL:
+		/* need to read the component label for the disk indicated
+		   by row,column in clabel */
+
+		/* For practice, let's get it directly from disk, rather
+		   than from the in-core copy */
+		RF_Malloc( clabel, sizeof( RF_ComponentLabel_t ),
+			   (RF_ComponentLabel_t *));
+		if (clabel == NULL)
+			return (ENOMEM);
+
+		bzero((char *) clabel, sizeof(RF_ComponentLabel_t));
+
+		bcopy(data, clabel, sizeof(RF_ComponentLabel_t));
+
+		row = clabel->row;
+		column = clabel->column;
+
+		/* Spares live past numCol, hence the numSpare allowance. */
+		if ((row < 0) || (row >= raidPtr->numRow) ||
+		    (column < 0) || (column >= raidPtr->numCol +
+				     raidPtr->numSpare)) {
+			RF_Free( clabel, sizeof(RF_ComponentLabel_t));
+			return(EINVAL);
+		}
+
+		raidread_component_label(raidPtr->Disks[row][column].dev,
+				raidPtr->raid_cinfo[row][column].ci_vp,
+				clabel );
+
+		bcopy(clabel, data, sizeof(RF_ComponentLabel_t));
+		RF_Free( clabel, sizeof(RF_ComponentLabel_t));
+		return (retcode);
+
+	case RAIDFRAME_SET_COMPONENT_LABEL:
+		clabel = (RF_ComponentLabel_t *) data;
+
+		/* XXX check the label for valid stuff... */
+		/* Note that some things *should not* get modified --
+		   the user should be re-initing the labels instead of
+		   trying to patch things.
+		   */
+
+		rf_printf(1, "Got component label:\n");
+		rf_printf(1, "Version: %d\n",clabel->version);
+		rf_printf(1, "Serial Number: %d\n",clabel->serial_number);
+		rf_printf(1, "Mod counter: %d\n",clabel->mod_counter);
+		rf_printf(1, "Row: %d\n", clabel->row);
+		rf_printf(1, "Column: %d\n", clabel->column);
+		rf_printf(1, "Num Rows: %d\n", clabel->num_rows);
+		rf_printf(1, "Num Columns: %d\n", clabel->num_columns);
+		rf_printf(1, "Clean: %d\n", clabel->clean);
+		rf_printf(1, "Status: %d\n", clabel->status);
+
+		row = clabel->row;
+		column = clabel->column;
+
+		if ((row < 0) || (row >= raidPtr->numRow) ||
+		    (column < 0) || (column >= raidPtr->numCol)) {
+			return(EINVAL);
+		}
+
+		/* XXX this isn't allowed to do anything for now :-) */
+
+		/* XXX and before it is, we need to fill in the rest
+		   of the fields!?!?!?! */
+#if 0
+		raidwrite_component_label(
+                            raidPtr->Disks[row][column].dev,
+			    raidPtr->raid_cinfo[row][column].ci_vp,
+			    clabel );
+#endif
+		return (0);
+
+	case RAIDFRAME_INIT_LABELS:
+		MALLOC(ci_label, RF_ComponentLabel_t *,
+		    sizeof(RF_ComponentLabel_t), M_RAIDFRAME,
+		    M_WAITOK | M_ZERO);
+		clabel = (RF_ComponentLabel_t *) data;
+		/*
+		   we only want the serial number from
+		   the above.  We get all the rest of the information
+		   from the config that was used to create this RAID
+		   set.
+		   */
+
+		raidPtr->serial_number = clabel->serial_number;
+
+		raid_init_component_label(raidPtr, ci_label);
+		ci_label->serial_number = clabel->serial_number;
+
+		/* Write a fresh label to every live component. */
+		for(row=0;row<raidPtr->numRow;row++) {
+			ci_label->row = row;
+			for(column=0;column<raidPtr->numCol;column++) {
+				diskPtr = &raidPtr->Disks[row][column];
+				if (!RF_DEAD_DISK(diskPtr->status)) {
+					ci_label->partitionSize =
+					    diskPtr->partitionSize;
+					ci_label->column = column;
+					raidwrite_component_label(
+					  raidPtr->Disks[row][column].dev,
+					  raidPtr->raid_cinfo[row][column].ci_vp,
+					  ci_label );
+				}
+			}
+		}
+
+		FREE(ci_label, M_RAIDFRAME);
+		return (retcode);
+	case RAIDFRAME_SET_AUTOCONFIG:
+		d = rf_set_autoconfig(raidPtr, *(int *) data);
+		rf_printf(1, "New autoconfig value is: %d\n", d);
+		*(int *) data = d;
+		return (retcode);
+
+	case RAIDFRAME_SET_ROOT:
+		d = rf_set_rootpartition(raidPtr, *(int *) data);
+		rf_printf(1, "New rootpartition value is: %d\n", d);
+		*(int *) data = d;
+		return (retcode);
+
+	/* initialize all parity */
+	case RAIDFRAME_REWRITEPARITY:
+
+		if (raidPtr->Layout.map->faultsTolerated == 0) {
+			/* Parity for RAID 0 is trivially correct */
+			raidPtr->parity_good = RF_RAID_CLEAN;
+			return(0);
+		}
+
+		if (raidPtr->parity_rewrite_in_progress == 1) {
+			/* Re-write is already in progress! */
+			return(EINVAL);
+		}
+
+		/* Rewrite runs asynchronously in its own kernel thread. */
+		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
+					   rf_RewriteParityThread,
+					   raidPtr,"raid_parity");
+		return (retcode);
+
+
+	case RAIDFRAME_ADD_HOT_SPARE:
+		MALLOC(hot_spare, RF_SingleComponent_t *,
+		    sizeof(RF_SingleComponent_t), M_RAIDFRAME,
+		    M_WAITOK | M_ZERO);
+		sparePtr = (RF_SingleComponent_t *) data;
+		memcpy( hot_spare, sparePtr, sizeof(RF_SingleComponent_t));
+		retcode = rf_add_hot_spare(raidPtr, hot_spare);
+		FREE(hot_spare, M_RAIDFRAME);
+		return(retcode);
+
+	case RAIDFRAME_REMOVE_HOT_SPARE:
+		/* Not implemented; silently succeeds (retcode == 0). */
+		return(retcode);
+
+	case RAIDFRAME_DELETE_COMPONENT:
+		MALLOC(component, RF_SingleComponent_t *,
+		    sizeof(RF_SingleComponent_t), M_RAIDFRAME,
+		    M_WAITOK | M_ZERO);
+		componentPtr = (RF_SingleComponent_t *)data;
+		memcpy( component, componentPtr,
+			sizeof(RF_SingleComponent_t));
+		retcode = rf_delete_component(raidPtr, component);
+		FREE(component, M_RAIDFRAME);
+		return(retcode);
+
+	case RAIDFRAME_INCORPORATE_HOT_SPARE:
+		MALLOC(component, RF_SingleComponent_t *,
+		    sizeof(RF_SingleComponent_t), M_RAIDFRAME,
+		    M_WAITOK | M_ZERO);
+		componentPtr = (RF_SingleComponent_t *)data;
+		memcpy( component, componentPtr,
+			sizeof(RF_SingleComponent_t));
+		retcode = rf_incorporate_hot_spare(raidPtr, component);
+		FREE(component, M_RAIDFRAME);
+		return(retcode);
+
+	case RAIDFRAME_REBUILD_IN_PLACE:
+
+		MALLOC(component, RF_SingleComponent_t *,
+		    sizeof(RF_SingleComponent_t), M_RAIDFRAME,
+		    M_WAITOK | M_ZERO);
+		if (raidPtr->Layout.map->faultsTolerated == 0) {
+			/* Can't do this on a RAID 0!! */
+			FREE(component, M_RAIDFRAME);
+			return(EINVAL);
+		}
+
+		if (raidPtr->recon_in_progress == 1) {
+			/* a reconstruct is already in progress! */
+			FREE(component, M_RAIDFRAME);
+			return(EINVAL);
+		}
+
+		componentPtr = (RF_SingleComponent_t *) data;
+		memcpy( component, componentPtr,
+			sizeof(RF_SingleComponent_t));
+		row = component->row;
+		column = component->column;
+		unit = raidPtr->raidid;
+		rf_printf(0, "raid%d Rebuild: %d %d\n", unit, row, column);
+		if ((row < 0) || (row >= raidPtr->numRow) ||
+		    (column < 0) || (column >= raidPtr->numCol)) {
+			FREE(component, M_RAIDFRAME);
+			return(EINVAL);
+		}
+
+		/* rrcopy is owned (and freed) by the recon thread. */
+		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
+		if (rrcopy == NULL) {
+			FREE(component, M_RAIDFRAME);
+			return(ENOMEM);
+		}
+
+		rrcopy->raidPtr = (void *) raidPtr;
+		rrcopy->row = row;
+		rrcopy->col = column;
+
+		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
+					   rf_ReconstructInPlaceThread,
+					   rrcopy,"raid_reconip");
+		FREE(component, M_RAIDFRAME);
+		return(retcode);
+
+	case RAIDFRAME_GET_UNIT:
+
+		*(int *)data = raidPtr->raidid;
+		return (0);
+
+	case RAIDFRAME_GET_INFO:
+		if (!raidPtr->valid)
+			return (ENODEV);
+		ucfgp = (RF_DeviceConfig_t **) data;
+		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
+			  (RF_DeviceConfig_t *));
+		if (d_cfg == NULL)
+			return (ENOMEM);
+		bzero((char *) d_cfg, sizeof(RF_DeviceConfig_t));
+		d_cfg->rows = raidPtr->numRow;
+		d_cfg->cols = raidPtr->numCol;
+		d_cfg->ndevs = raidPtr->numRow * raidPtr->numCol;
+		if (d_cfg->ndevs >= RF_MAX_DISKS) {
+			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
+			return (ENOMEM);
+		}
+		d_cfg->nspares = raidPtr->numSpare;
+		if (d_cfg->nspares >= RF_MAX_DISKS) {
+			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
+			return (ENOMEM);
+		}
+		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
+		d = 0;
+		for (i = 0; i < d_cfg->rows; i++) {
+			for (j = 0; j < d_cfg->cols; j++) {
+				d_cfg->devs[d] = raidPtr->Disks[i][j];
+				d++;
+			}
+		}
+		/* Spares are stored in row 0, columns numCol.. onward. */
+		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
+			d_cfg->spares[i] = raidPtr->Disks[0][j];
+		}
+
+		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
+
+		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
+
+		return (retcode);
+
+	case RAIDFRAME_CHECK_PARITY:
+		*(int *) data = raidPtr->parity_good;
+		return (0);
+
+	case RAIDFRAME_RESET_ACCTOTALS:
+		bzero(&raidPtr->acc_totals, sizeof(raidPtr->acc_totals));
+		return (0);
+
+	case RAIDFRAME_GET_ACCTOTALS:
+		totals = (RF_AccTotals_t *) data;
+		*totals = raidPtr->acc_totals;
+		return (0);
+
+	case RAIDFRAME_KEEP_ACCTOTALS:
+		raidPtr->keep_acc_totals = *(int *)data;
+		return (0);
+
+	case RAIDFRAME_GET_SIZE:
+		*(int *) data = raidPtr->totalSectors;
+		return (0);
+
+	/* fail a disk & optionally start reconstruction */
+	case RAIDFRAME_FAIL_DISK:
+
+		if (raidPtr->Layout.map->faultsTolerated == 0) {
+			/* Can't do this on a RAID 0!! */
+			return(EINVAL);
+		}
+
+		rr = (struct rf_recon_req *) data;
+
+		if (rr->row < 0 || rr->row >= raidPtr->numRow
+		    || rr->col < 0 || rr->col >= raidPtr->numCol)
+			return (EINVAL);
+
+		rf_printf(0, "%s: Failing the disk: row: %d col: %d\n",
+		    dev->si_name, rr->row, rr->col);
+
+		/* make a copy of the recon request so that we don't rely on
+		 * the user's buffer */
+		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
+		if (rrcopy == NULL)
+			return(ENOMEM);
+		bcopy(rr, rrcopy, sizeof(*rr));
+		rrcopy->raidPtr = (void *) raidPtr;
+
+		/* NOTE(review): retcode from RF_CREATE_THREAD is discarded
+		 * here and 0 returned unconditionally -- confirm intent. */
+		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
+					   rf_ReconThread,
+					   rrcopy,"raid_recon");
+		return (0);
+
+	/* invoke a copyback operation after recon on whatever disk
+	 * needs it, if any */
+	case RAIDFRAME_COPYBACK:
+
+		if (raidPtr->Layout.map->faultsTolerated == 0) {
+			/* This makes no sense on a RAID 0!! */
+			return(EINVAL);
+		}
+
+		if (raidPtr->copyback_in_progress == 1) {
+			/* Copyback is already in progress! */
+			return(EINVAL);
+		}
+
+		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
+					   rf_CopybackThread,
+					   raidPtr,"raid_copyback");
+		return (retcode);
+
+	/* return the percentage completion of reconstruction */
+	case RAIDFRAME_CHECK_RECON_STATUS:
+		if (raidPtr->Layout.map->faultsTolerated == 0) {
+			/* This makes no sense on a RAID 0, so tell the
+			   user it's done. */
+			*(int *) data = 100;
+			return(0);
+		}
+		row = 0; /* XXX we only consider a single row... */
+		if (raidPtr->status[row] != rf_rs_reconstructing)
+			*(int *) data = 100;
+		else
+			*(int *) data = raidPtr->reconControl[row]->percentComplete;
+		return (0);
+	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
+		row = 0; /* XXX we only consider a single row... */
+		if (raidPtr->status[row] != rf_rs_reconstructing) {
+			progressInfo.remaining = 0;
+			progressInfo.completed = 100;
+			progressInfo.total = 100;
+		} else {
+			progressInfo.total =
+				raidPtr->reconControl[row]->numRUsTotal;
+			progressInfo.completed =
+				raidPtr->reconControl[row]->numRUsComplete;
+			progressInfo.remaining = progressInfo.total -
+				progressInfo.completed;
+		}
+		bcopy((caddr_t) &progressInfo, data, sizeof(RF_ProgressInfo_t));
+		return (retcode);
+
+	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
+		if (raidPtr->Layout.map->faultsTolerated == 0) {
+			/* This makes no sense on a RAID 0, so tell the
+			   user it's done. */
+			*(int *) data = 100;
+			return(0);
+		}
+		if (raidPtr->parity_rewrite_in_progress == 1) {
+			*(int *) data = 100 *
+				raidPtr->parity_rewrite_stripes_done /
+				raidPtr->Layout.numStripe;
+		} else {
+			*(int *) data = 100;
+		}
+		return (0);
+
+	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
+		if (raidPtr->parity_rewrite_in_progress == 1) {
+			progressInfo.total = raidPtr->Layout.numStripe;
+			progressInfo.completed =
+				raidPtr->parity_rewrite_stripes_done;
+			progressInfo.remaining = progressInfo.total -
+				progressInfo.completed;
+		} else {
+			progressInfo.remaining = 0;
+			progressInfo.completed = 100;
+			progressInfo.total = 100;
+		}
+		bcopy((caddr_t) &progressInfo, data, sizeof(RF_ProgressInfo_t));
+		return (retcode);
+
+	case RAIDFRAME_CHECK_COPYBACK_STATUS:
+		if (raidPtr->Layout.map->faultsTolerated == 0) {
+			/* This makes no sense on a RAID 0 */
+			*(int *) data = 100;
+			return(0);
+		}
+		if (raidPtr->copyback_in_progress == 1) {
+			*(int *) data = 100 * raidPtr->copyback_stripes_done /
+				raidPtr->Layout.numStripe;
+		} else {
+			*(int *) data = 100;
+		}
+		return (0);
+
+	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
+		if (raidPtr->copyback_in_progress == 1) {
+			progressInfo.total = raidPtr->Layout.numStripe;
+			progressInfo.completed =
+				raidPtr->copyback_stripes_done;
+			progressInfo.remaining = progressInfo.total -
+				progressInfo.completed;
+		} else {
+			progressInfo.remaining = 0;
+			progressInfo.completed = 100;
+			progressInfo.total = 100;
+		}
+		bcopy((caddr_t) &progressInfo, data, sizeof(RF_ProgressInfo_t));
+		return (retcode);
+
+	/* the sparetable daemon calls this to wait for the kernel to
+	 * need a spare table.  this ioctl does not return until a
+	 * spare table is needed.  XXX -- calling mpsleep here in the
+	 * ioctl code is almost certainly wrong and evil. -- XXX XXX
+	 * -- I should either compute the spare table in the kernel,
+	 * or have a different -- XXX XXX -- interface (a different
+	 * character device) for delivering the table     -- XXX */
+#if 0
+	case RAIDFRAME_SPARET_WAIT:
+		RF_LOCK_MUTEX(rf_sparet_wait_mutex);
+		while (!rf_sparet_wait_queue)
+			mpsleep(&rf_sparet_wait_queue, (PZERO + 1) | PCATCH, "sparet wait", 0, (void *) simple_lock_addr(rf_sparet_wait_mutex), MS_LOCK_SIMPLE);
+		waitreq = rf_sparet_wait_queue;
+		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
+		RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
+
+		/* structure assignment */
+		*((RF_SparetWait_t *) data) = *waitreq;
+
+		RF_Free(waitreq, sizeof(*waitreq));
+		return (0);
+
+	/* wakes up a process waiting on SPARET_WAIT and puts an error
+	 * code in it that will cause the dameon to exit */
+	case RAIDFRAME_ABORT_SPARET_WAIT:
+		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
+		waitreq->fcol = -1;
+		RF_LOCK_MUTEX(rf_sparet_wait_mutex);
+		waitreq->next = rf_sparet_wait_queue;
+		rf_sparet_wait_queue = waitreq;
+		RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
+		wakeup(&rf_sparet_wait_queue);
+		return (0);
+
+	/* used by the spare table daemon to deliver a spare table
+	 * into the kernel */
+	case RAIDFRAME_SEND_SPARET:
+
+		/* install the spare table */
+		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
+
+		/* respond to the requestor.  the return status of the spare
+		 * table installation is passed in the "fcol" field */
+		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
+		waitreq->fcol = retcode;
+		RF_LOCK_MUTEX(rf_sparet_wait_mutex);
+		waitreq->next = rf_sparet_resp_queue;
+		rf_sparet_resp_queue = waitreq;
+		wakeup(&rf_sparet_resp_queue);
+		RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
+
+		return (retcode);
+#endif
+
+	default:
+		retcode = ENOIOCTL;
+		break; /* fall through to the os-specific code below */
+
+	}
+
+	return (retcode);
+
+}
+
+
+/* raidinit -- complete the rest of the initialization for the
+   RAIDframe device: allocate the per-array softc, create the disk
+   device node, and register with devstat.
+
+   Returns the new disk dev_t on success, NULL on failure.  On failure
+   all partially-acquired resources are released (the original leaked
+   the softc and the cbuf uma zone when disk_create() failed). */
+
+
+static dev_t
+raidinit(raidPtr)
+	RF_Raid_t *raidPtr;
+{
+	struct raid_softc *sc;
+	dev_t diskdev;
+
+	RF_Malloc(sc, sizeof(struct raid_softc), (struct raid_softc *));
+	if (sc == NULL) {
+		rf_printf(1, "No memory for raid device\n");
+		return(NULL);
+	}
+
+	sc->raidPtr = raidPtr;
+
+	/* XXX Should check return code here */
+	bioq_init(&sc->bio_queue);
+	sc->sc_cbufpool = uma_zcreate("raidpl", sizeof(struct raidbuf), NULL,
+	    NULL, NULL, NULL, 0, 0);
+
+	/* XXX There may be a weird interaction here between this, and
+	 * protectedSectors, as used in RAIDframe.  */
+
+	sc->sc_size = raidPtr->totalSectors;
+
+	/* Create the disk device */
+	diskdev = disk_create(raidPtr->raidid, &sc->sc_dkdev, 0, &raid_cdevsw,
+			      &raiddisk_cdevsw);
+	if (diskdev == NODEV) {
+		rf_printf(1, "disk_create failed\n");
+		/* Undo what we acquired above; the caller sees NULL. */
+		uma_zdestroy(sc->sc_cbufpool);
+		RF_Free(sc, sizeof(struct raid_softc));
+		return (NULL);
+	}
+	sc->sc_dkdev.d_dev->si_drv1 = sc;
+	sc->sc_dev = diskdev;
+	raidPtr->sc = sc;
+
+	/* Register with devstat */
+	devstat_add_entry(&sc->device_stats, "raid", raidPtr->raidid, 0,
+	    DEVSTAT_NO_BLOCKSIZE | DEVSTAT_NO_ORDERED_TAGS,
+	    DEVSTAT_TYPE_IF_OTHER, DEVSTAT_PRIORITY_ARRAY);
+
+	return (diskdev);
+}
+
+/* wake up the daemon & tell it to get us a spare table
+ * XXX
+ * the entries in the queues should be tagged with the raidPtr
+ * so that in the extremely rare case that two recons happen at once,
+ * we know for which device were requesting a spare table
+ * XXX
+ *
+ * XXX This code is not currently used. GO
+ */
+/*
+ * rf_GetSpareTableFromDaemon -- hand a spare-table request to the
+ * userland daemon and sleep until the response arrives.
+ *
+ * req is queued on rf_sparet_wait_queue; the daemon's answer is taken
+ * off rf_sparet_resp_queue, and its fcol field carries the status we
+ * return.  The response request is freed here (it is NOT the same
+ * object we queued).
+ */
+int
+rf_GetSpareTableFromDaemon(req)
+	RF_SparetWait_t *req;
+{
+	int retcode;
+
+	RF_LOCK_MUTEX(rf_sparet_wait_mutex);
+	req->next = rf_sparet_wait_queue;
+	rf_sparet_wait_queue = req;
+	wakeup(&rf_sparet_wait_queue);
+
+	/* mpsleep unlocks the mutex */
+	/* NOTE(review): the comment above is stale -- this is tsleep(),
+	 * which does NOT drop rf_sparet_wait_mutex; it looks like we
+	 * sleep while holding the mutex.  Confirm against the RF_*
+	 * mutex implementation. */
+	while (!rf_sparet_resp_queue) {
+		tsleep(&rf_sparet_resp_queue, PRIBIO,
+		    "raidframe getsparetable", 0);
+	}
+	req = rf_sparet_resp_queue;
+	rf_sparet_resp_queue = req->next;
+	RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
+
+	retcode = req->fcol;
+	RF_Free(req, sizeof(*req));	/* this is not the same req as we
+					 * alloc'd */
+	return (retcode);
+}
+
+/* a wrapper around rf_DoAccess that extracts appropriate info from the
+ * bp & passes it down.
+ * any calls originating in the kernel must use non-blocking I/O
+ * do some extra sanity checking to return "appropriate" error values for
+ * certain conditions (to make some standard utilities work)
+ *
+ * Formerly known as: rf_DoAccessKernel
+ */
+/*
+ * raidstart -- drain the per-array bio queue into the RAIDframe engine.
+ *
+ * A wrapper around rf_DoAccess(): while the array still has "openings"
+ * (concurrent-I/O slots), pop the next bio, validate its extent against
+ * the array size and sector alignment, and submit it as a non-blocking
+ * RAIDframe access.  raidPtr->mutex guards openings/numNewFailures; it
+ * is dropped around queue manipulation and rf_DoAccess() and re-taken
+ * for each loop-condition check.
+ *
+ * Formerly known as: rf_DoAccessKernel
+ */
+void
+raidstart(raidPtr)
+	RF_Raid_t *raidPtr;
+{
+	RF_SectorCount_t num_blocks, pb, sum;
+	RF_RaidAddr_t raid_addr;
+	struct raid_softc *sc;
+	struct bio *bp;
+	daddr_t blocknum;
+	int unit, retcode, do_async;
+
+	unit = raidPtr->raidid;
+	sc = raidPtr->sc;
+
+	/* quick check to see if anything has died recently */
+	RF_LOCK_MUTEX(raidPtr->mutex);
+	if (raidPtr->numNewFailures > 0) {
+		raidPtr->numNewFailures--;
+		RF_UNLOCK_MUTEX(raidPtr->mutex);
+		rf_update_component_labels(raidPtr,
+					   RF_NORMAL_COMPONENT_UPDATE);
+	} else
+		RF_UNLOCK_MUTEX(raidPtr->mutex);
+
+	/* Check to see if we're at the limit... */
+	RF_LOCK_MUTEX(raidPtr->mutex);
+	while (raidPtr->openings > 0) {
+		RF_UNLOCK_MUTEX(raidPtr->mutex);
+
+		/* get the next item, if any, from the queue */
+		if ((bp = bioq_first(&sc->bio_queue)) == NULL) {
+			/* nothing more to do */
+			return;
+		}
+		bioq_remove(&sc->bio_queue, bp);
+
+		/* Ok, for the bp we have here, bp->b_blkno is relative to the
+		 * partition.. Need to make it absolute to the underlying
+		 * device.. */
+
+		blocknum = bp->bio_blkno;
+#if 0 /* XXX Is this needed? */
+		if (DISKPART(bp->bio_dev) != RAW_PART) {
+			struct partition *pp;
+			pp = &sc->sc_dkdev.d_label.d_partitions[DISKPART(
+			    bp->bio_dev)];
+			blocknum += pp->p_offset;
+		}
+#endif
+
+		rf_printf(3, "Blocks: %ld, %ld\n", (long)bp->bio_blkno, (long)blocknum);
+
+		rf_printf(3, "bp->bio_bcount = %d\n", (int) bp->bio_bcount);
+		rf_printf(3, "bp->bio_resid = %d\n", (int) bp->bio_resid);
+
+		/* *THIS* is where we adjust what block we're going to...
+		 * but DO NOT TOUCH bp->bio_blkno!!! */
+		raid_addr = blocknum;
+
+		/* pb: one extra block if the byte count isn't sector-aligned */
+		num_blocks = bp->bio_bcount >> raidPtr->logBytesPerSector;
+		pb = (bp->bio_bcount & raidPtr->sectorMask) ? 1 : 0;
+		sum = raid_addr + num_blocks + pb;
+		if (rf_debugKernelAccess) {
+			rf_printf(0, "raid_addr=0x%x sum=%d num_blocks=%d(+%d) "
+			    "(%d)\n", (int)raid_addr, (int)sum,
+			    (int)num_blocks, (int)pb,
+			    (int)bp->bio_resid);
+		}
+		/* End-of-device check; the "sum <" comparisons catch
+		 * arithmetic wraparound of the unsigned sector math. */
+		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
+		    || (sum < num_blocks) || (sum < pb)) {
+			bp->bio_error = ENOSPC;
+			bp->bio_flags |= BIO_ERROR;
+			bp->bio_resid = bp->bio_bcount;
+			biodone(bp);
+			RF_LOCK_MUTEX(raidPtr->mutex);
+			continue;
+		}
+		/*
+		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
+		 */
+
+		if (bp->bio_bcount & raidPtr->sectorMask) {
+			bp->bio_error = EINVAL;
+			bp->bio_flags |= BIO_ERROR;
+			bp->bio_resid = bp->bio_bcount;
+			biodone(bp);
+			RF_LOCK_MUTEX(raidPtr->mutex);
+			continue;
+
+		}
+		rf_printf(3, "Calling DoAccess..\n");
+
+
+		/* Consume one concurrency slot for this access. */
+		RF_LOCK_MUTEX(raidPtr->mutex);
+		raidPtr->openings--;
+		RF_UNLOCK_MUTEX(raidPtr->mutex);
+
+		/*
+		 * Everything is async.
+		 */
+		do_async = 1;
+
+		devstat_start_transaction(&sc->device_stats);
+
+		/* XXX we're still at splbio() here... do we *really*
+		   need to be? */
+
+		/* don't ever condition on bp->bio_cmd & BIO_WRITE.
+		 * always condition on BIO_READ instead */
+
+		retcode = rf_DoAccess(raidPtr, (bp->bio_cmd & BIO_READ) ?
+				      RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
+				      do_async, raid_addr, num_blocks,
+				      bp->bio_data, bp, NULL, NULL,
+				      RF_DAG_NONBLOCKING_IO, NULL, NULL, NULL);
+
+
+		RF_LOCK_MUTEX(raidPtr->mutex);
+	}
+	RF_UNLOCK_MUTEX(raidPtr->mutex);
+}
+
+
+
+
+/* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
+
+/*
+ * rf_DispatchKernelIO -- issue one RAIDframe disk-queue request to a
+ * component device.
+ *
+ * Wraps the request in a raidbuf (carrying the original bio and the
+ * RF_DiskQueueData_t for KernelWakeupFunc to recover on completion),
+ * updates the queue accounting, and fires the I/O via BIO_STRATEGY.
+ * RF_IO_TYPE_NOP requests just bump numOutstanding and complete
+ * immediately.  Disk queue should be locked upon entry.  Returns 0.
+ */
+int
+rf_DispatchKernelIO(queue, req)
+	RF_DiskQueue_t *queue;
+	RF_DiskQueueData_t *req;
+{
+	int op = (req->type == RF_IO_TYPE_READ) ? BIO_READ : BIO_WRITE;
+	struct bio *bp;
+	struct raidbuf *raidbp = NULL;
+	struct raid_softc *sc;
+
+	/* XXX along with the vnode, we also need the softc associated with
+	 * this device.. */
+
+	req->queue = queue;
+
+	sc = queue->raidPtr->sc;
+
+	rf_printf(3, "DispatchKernelIO %s\n", sc->sc_dev->si_name);
+
+	bp = req->bp;
+#if 1
+	/* XXX when there is a physical disk failure, someone is passing us a
+	 * buffer that contains old stuff!!  Attempt to deal with this problem
+	 * without taking a performance hit... (not sure where the real bug
+	 * is.  It's buried in RAIDframe somewhere) :-(  GO ) */
+
+	if (bp->bio_flags & BIO_ERROR) {
+		bp->bio_flags &= ~BIO_ERROR;
+	}
+	if (bp->bio_error != 0) {
+		bp->bio_error = 0;
+	}
+#endif
+	/* Per-array zone allocation; returned in KernelWakeupFunc. */
+	raidbp = RAIDGETBUF(sc);
+
+	raidbp->rf_flags = 0;	/* XXX not really used anywhere... */
+
+	/*
+	 * context for raidiodone
+	 */
+	raidbp->rf_obp = bp;
+	raidbp->req = req;
+
+#if 0 /* XXX */
+	LIST_INIT(&raidbp->rf_buf.b_dep);
+#endif
+
+	switch (req->type) {
+	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
+		/* XXX need to do something extra here.. */
+		/* I'm leaving this in, as I've never actually seen it used,
+		 * and I'd like folks to report it... GO */
+		rf_printf(2, "WAKEUP CALLED\n");
+		queue->numOutstanding++;
+
+		/* XXX need to glue the original buffer into this?  */
+
+		KernelWakeupFunc(&raidbp->rf_buf);
+		break;
+
+	case RF_IO_TYPE_READ:
+	case RF_IO_TYPE_WRITE:
+
+		if (req->tracerec) {
+			RF_ETIMER_START(req->tracerec->timer);
+		}
+		/* Build the component bio; KernelWakeupFunc is the
+		 * completion callback. */
+		InitBP(&raidbp->rf_buf, queue->rf_cinfo->ci_vp,
+		    op | bp->bio_cmd, queue->rf_cinfo->ci_dev,
+		    req->sectorOffset, req->numSector,
+		    req->buf, KernelWakeupFunc, (void *) req,
+		    queue->raidPtr->logBytesPerSector, req->b_proc);
+
+		if (rf_debugKernelAccess) {
+			rf_printf(0, "dispatch: bp->bio_blkno = %ld\n",
+			    (long) bp->bio_blkno);
+		}
+		queue->numOutstanding++;
+		queue->last_deq_sector = req->sectorOffset;
+		/* acc wouldn't have been let in if there were any pending
+		 * reqs at any other priority */
+		queue->curPriority = req->priority;
+
+		rf_printf(3, "Going for %c to %s row %d col %d\n",
+		    req->type, sc->sc_dev->si_name, queue->row, queue->col);
+		rf_printf(3, "sector %d count %d (%d bytes) %d\n",
+		    (int) req->sectorOffset, (int) req->numSector,
+		    (int) (req->numSector <<
+		    queue->raidPtr->logBytesPerSector),
+		    (int) queue->raidPtr->logBytesPerSector);
+#if 0 /* XXX */
+		if ((raidbp->rf_buf.bio_cmd & BIO_READ) == 0) {
+			raidbp->rf_buf.b_vp->v_numoutput++;
+		}
+#endif
+		BIO_STRATEGY(&raidbp->rf_buf, 0);
+
+		break;
+
+	default:
+		panic("bad req->type in rf_DispatchKernelIO");
+	}
+	rf_printf(3, "Exiting from DispatchKernelIO\n");
+	/* splx(s); */ /* want to test this */
+	return (0);
+}
+/* this is the callback function associated with a I/O invoked from
+   kernel code.
+ */
+static void
+KernelWakeupFunc(vbp)
+	struct bio *vbp;
+{
+	RF_DiskQueueData_t *req = NULL;
+	RF_DiskQueue_t *queue;
+	/* NOTE(review): this cast assumes struct bio is the first member of
+	 * struct raidbuf -- confirm against the raidbuf declaration. */
+	struct raidbuf *raidbp = (struct raidbuf *) vbp;
+	struct bio *bp;
+	struct raid_softc *sc;
+	int s;
+
+	s = splbio();
+	rf_printf(2, "recovering the request queue:\n");
+	req = raidbp->req;
+
+	/* bp is the original caller's bio; raidbp->rf_buf was our shadow. */
+	bp = raidbp->rf_obp;
+	queue = (RF_DiskQueue_t *) req->queue;
+	sc = queue->raidPtr->sc;
+
+	/* Propagate any error from the shadow buffer to the original bio. */
+	if (raidbp->rf_buf.bio_flags & BIO_ERROR) {
+		bp->bio_flags |= BIO_ERROR;
+		bp->bio_error = raidbp->rf_buf.bio_error ?
+		    raidbp->rf_buf.bio_error : EIO;
+	}
+
+	/* XXX methinks this could be wrong... */
+#if 1
+	bp->bio_resid = raidbp->rf_buf.bio_resid;
+#endif
+
+	/* Fold this I/O's timing into the trace record, if tracing. */
+	if (req->tracerec) {
+		RF_ETIMER_STOP(req->tracerec->timer);
+		RF_ETIMER_EVAL(req->tracerec->timer);
+		RF_LOCK_MUTEX(rf_tracing_mutex);
+		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
+		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
+		req->tracerec->num_phys_ios++;
+		RF_UNLOCK_MUTEX(rf_tracing_mutex);
+	}
+	bp->bio_bcount = raidbp->rf_buf.bio_bcount;	/* XXXX ? */
+
+	/* XXX Ok, let's get aggressive... If BIO_ERROR is set, let's go
+	 * ballistic, and mark the component as hosed... */
+
+	if (bp->bio_flags & BIO_ERROR) {
+		/* Mark the disk as dead */
+		/* but only mark it once... */
+		if (queue->raidPtr->Disks[queue->row][queue->col].status ==
+		    rf_ds_optimal) {
+			rf_printf(0, "%s: IO Error. Marking %s as "
+			    "failed.\n", sc->sc_dev->si_name, queue->raidPtr->
+			    Disks[queue->row][queue->col].devname);
+			queue->raidPtr->Disks[queue->row][queue->col].status =
+			    rf_ds_failed;
+			queue->raidPtr->status[queue->row] = rf_rs_degraded;
+			queue->raidPtr->numFailures++;
+			queue->raidPtr->numNewFailures++;
+		} else {	/* Disk is already dead... */
+			/* printf("Disk already marked as dead!\n"); */
+		}
+
+	}
+
+	/* Return the shadow buffer to the pool; only "bp" is used below. */
+	RAIDPUTBUF(sc, raidbp);
+
+	/* Report completion status back to the RAIDframe engine. */
+	rf_DiskIOComplete(queue, req, (bp->bio_flags & BIO_ERROR) ? 1 : 0);
+	(req->CompleteFunc)(req->argument, (bp->bio_flags & BIO_ERROR) ? 1 : 0);
+
+	splx(s);
+}
+
+
+
+/*
+ * initialize a buf structure for doing an I/O in the kernel.
+ * Fills in command, length, device, data pointer, starting sector and
+ * completion callback; panics on a zero-length transfer.
+ */
+static void
+InitBP(bp, b_vp, rw_flag, dev, startSect, numSect, buf, cbFunc, cbArg,
+       logBytesPerSector, b_proc)
+	struct bio *bp;
+	struct vnode *b_vp;
+	unsigned rw_flag;
+	dev_t dev;
+	RF_SectorNum_t startSect;
+	RF_SectorCount_t numSect;
+	caddr_t buf;
+	void (*cbFunc) (struct bio *);
+	void *cbArg;
+	int logBytesPerSector;
+	struct proc *b_proc;
+{
+	/* bp->b_flags = B_PHYS | rw_flag; */
+	bp->bio_cmd = rw_flag;	/* XXX need B_PHYS here too? */
+	/* Byte count: sector count scaled by the log2 sector size. */
+	bp->bio_bcount = numSect << logBytesPerSector;
+#if 0	/* XXX */
+	bp->bio_bufsize = bp->bio_bcount;
+#endif
+	bp->bio_error = 0;
+	bp->bio_dev = dev;
+	bp->bio_data = buf;
+	bp->bio_blkno = startSect;
+	bp->bio_resid = bp->bio_bcount;	/* XXX is this right!?!?!! */
+	if (bp->bio_bcount == 0) {
+		panic("bp->bio_bcount is zero in InitBP!!\n");
+	}
+	/* NOTE(review): b_vp, b_proc and cbArg are currently unused here;
+	 * the disabled assignments below are kept for reference. */
+/*
+	bp->b_proc = b_proc;
+	bp->b_vp = b_vp;
+*/
+	bp->bio_done = cbFunc;
+
+}
+
+static void
+raidgetdefaultlabel(raidPtr, sc, dp)
+	RF_Raid_t *raidPtr;
+	struct raid_softc *sc;
+	struct disk *dp;
+{
+
+	/* Synthesize disk geometry fields from the RAID set's geometry. */
+	rf_printf(1, "Building a default label...\n");
+	if (dp == NULL)
+		panic("raidgetdefaultlabel(): dp is NULL\n");
+
+	dp->d_sectorsize = raidPtr->bytesPerSector;
+	dp->d_mediasize = raidPtr->totalSectors * raidPtr->bytesPerSector;
+	dp->d_fwsectors = raidPtr->Layout.dataSectorsPerStripe;
+	dp->d_fwheads = 4 * raidPtr->numCol;
+}
+/*
+ * Lookup the provided name in the filesystem.  If the file exists,
+ * is a valid block device, and isn't being used by anyone else,
+ * set *vpp to the file's vnode.
+ * You'll find the original of this in ccd.c
+ *
+ * Returns 0 on success, ENOMEM on allocation failure, EBUSY if the
+ * vnode is already in use, ENOTBLK for non-character devices, or the
+ * error from vn_open()/VOP_GETATTR().
+ */
+int
+raidlookup(path, td, vpp)
+	char *path;
+	struct thread *td;
+	struct vnode **vpp;	/* result */
+{
+	struct nameidata *nd;
+	struct vnode *vp;
+	struct vattr *va;
+	struct proc *p;
+	int error = 0, flags;
+
+	MALLOC(nd, struct nameidata *, sizeof(struct nameidata), M_TEMP, M_NOWAIT | M_ZERO);
+	MALLOC(va, struct vattr *, sizeof(struct vattr), M_TEMP, M_NOWAIT | M_ZERO);
+	if ((nd == NULL) || (va == NULL)) {
+		printf("Out of memory?\n");
+		/* Don't leak whichever allocation did succeed. */
+		if (nd != NULL)
+			FREE(nd, M_TEMP);
+		if (va != NULL)
+			FREE(va, M_TEMP);
+		return (ENOMEM);
+	}
+
+	/* Sanity check the p_fd fields.  This is really just a hack */
+	p = td->td_proc;
+	if (!p->p_fd->fd_rdir || !p->p_fd->fd_cdir)
+		printf("Warning: p_fd fields not set\n");
+
+	if (!td->td_proc->p_fd->fd_rdir)
+		p->p_fd->fd_rdir = rootvnode;
+
+	if (!p->p_fd->fd_cdir)
+		p->p_fd->fd_cdir = rootvnode;
+
+	/* NOTE(review): NDINIT uses curthread while the rest of the function
+	 * uses the passed-in td -- confirm these are always the same. */
+	NDINIT(nd, LOOKUP, FOLLOW, UIO_SYSSPACE, path, curthread);
+	flags = FREAD | FWRITE;
+	if ((error = vn_open(nd, &flags, 0)) != 0) {
+		rf_printf(2, "RAIDframe: vn_open returned %d\n", error);
+		goto end1;
+	}
+	vp = nd->ni_vp;
+	if (vp->v_usecount > 1) {
+		rf_printf(1, "raidlookup() vp->v_usecount= %d\n", vp->v_usecount);
+		error = EBUSY;
+		goto end;
+	}
+	if ((error = VOP_GETATTR(vp, va, td->td_ucred, td)) != 0) {
+		rf_printf(1, "raidlookup() VOP_GETATTR returned %d", error);
+		goto end;
+	}
+	/* XXX: eventually we should handle VREG, too. */
+	if (va->va_type != VCHR) {
+		rf_printf(1, "Returning ENOTBLK\n");
+		error = ENOTBLK;
+	}
+	/* NOTE(review): *vpp is set (and the vnode stays open) even on the
+	 * ENOTBLK path -- confirm callers expect this. */
+	*vpp = vp;
+
+end:
+	VOP_UNLOCK(vp, 0, td);
+	NDFREE(nd, NDF_ONLY_PNBUF);
+end1:
+	FREE(nd, M_TEMP);
+	FREE(va, M_TEMP);
+	return (error);
+}
+/*
+ * Wait interruptibly for an exclusive lock.
+ *
+ * XXX
+ * Several drivers do this; it should be abstracted and made MP-safe.
+ * (Hmm... where have we seen this warning before :-> GO )
+ */
+static int
+raidlock(sc)
+	struct raid_softc *sc;
+{
+	int rc;
+
+	/* Sleep until the holder drops the lock; bail on interrupted sleep. */
+	while (sc->sc_flags & RAIDF_LOCKED) {
+		sc->sc_flags |= RAIDF_WANTED;
+		rc = tsleep(sc, PRIBIO | PCATCH, "raidlck", 0);
+		if (rc != 0)
+			return (rc);
+	}
+	sc->sc_flags |= RAIDF_LOCKED;
+	return (0);
+}
+/*
+ * Unlock and wake up any waiters.
+ */
+static void
+raidunlock(sc)
+	struct raid_softc *sc;
+{
+
+	sc->sc_flags &= ~RAIDF_LOCKED;
+	/* Only bother with wakeup() if somebody registered interest. */
+	if (sc->sc_flags & RAIDF_WANTED) {
+		sc->sc_flags &= ~RAIDF_WANTED;
+		wakeup(sc);
+	}
+}
+
+
+#define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
+#define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
+
+int
+raidmarkclean(dev_t dev, struct vnode *b_vp, int mod_counter)
+{
+	RF_ComponentLabel_t *label;
+
+	/*
+	 * Read-modify-write the on-disk component label with the clean
+	 * bit set and the supplied modification counter.
+	 */
+	MALLOC(label, RF_ComponentLabel_t *, sizeof(RF_ComponentLabel_t),
+	    M_RAIDFRAME, M_NOWAIT | M_ZERO);
+	if (label == NULL) {
+		printf("raidmarkclean: Out of memory?\n");
+		return (ENOMEM);
+	}
+
+	raidread_component_label(dev, b_vp, label);
+	label->mod_counter = mod_counter;
+	label->clean = RF_RAID_CLEAN;
+	raidwrite_component_label(dev, b_vp, label);
+	FREE(label, M_RAIDFRAME);
+	return (0);
+}
+
+
+/*
+ * Read-modify-write the on-disk component label with the dirty bit set
+ * and the supplied modification counter.  Returns 0 or ENOMEM.
+ */
+int
+raidmarkdirty(dev_t dev, struct vnode *b_vp, int mod_counter)
+{
+	RF_ComponentLabel_t *clabel;
+
+	MALLOC(clabel, RF_ComponentLabel_t *, sizeof(RF_ComponentLabel_t),
+	    M_RAIDFRAME, M_NOWAIT | M_ZERO);
+	if (clabel == NULL) {
+		/* Was "raidmarkclean" -- copy/paste error in the message. */
+		printf("raidmarkdirty: Out of memory?\n");
+		return (ENOMEM);
+	}
+
+	raidread_component_label(dev, b_vp, clabel);
+	clabel->mod_counter = mod_counter;
+	clabel->clean = RF_RAID_DIRTY;
+	raidwrite_component_label(dev, b_vp, clabel);
+	FREE(clabel, M_RAIDFRAME);
+	return(0);
+}
+
+/* ARGSUSED */
+/*
+ * Read the RAIDframe component label from the fixed offset on the
+ * given device into *clabel.  Returns 0 on success, EINVAL for a
+ * component with no vnode, or the buffer-I/O error.
+ */
+int
+raidread_component_label(dev, b_vp, clabel)
+	dev_t dev;
+	struct vnode *b_vp;
+	RF_ComponentLabel_t *clabel;
+{
+	struct buf *cbp;
+	int rc;
+
+	/* XXX should probably ensure that we don't try to do this if
+	   someone has changed rf_protected_sectors. */
+
+	/* A component without a valid vnode cannot carry a label. */
+	if (b_vp == NULL)
+		return (EINVAL);
+
+	/* Borrow an empty buffer sized for the label area and aim it at
+	 * the component-info offset. */
+	cbp = geteblk((int)RF_COMPONENT_INFO_SIZE);
+	cbp->b_dev = dev;
+	cbp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE;
+	cbp->b_bcount = RF_COMPONENT_INFO_SIZE;
+	cbp->b_iocmd = BIO_READ;
+	cbp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE;
+
+	DEV_STRATEGY(cbp, 0);
+	rc = bufwait(cbp);
+
+	if (rc == 0)
+		memcpy(clabel, cbp->b_data, sizeof(RF_ComponentLabel_t));
+
+	/* Toss the buffer rather than caching stale label data. */
+	cbp->b_flags |= B_INVAL | B_AGE;
+	brelse(cbp);
+	return (rc);
+}
+/* ARGSUSED */
+/*
+ * Write *clabel to the fixed component-label offset on the given
+ * device.  Returns 0 on success or the buffer-I/O error.
+ */
+int
+raidwrite_component_label(dev, b_vp, clabel)
+	dev_t dev;
+	struct vnode *b_vp;
+	RF_ComponentLabel_t *clabel;
+{
+	struct buf *bp;
+	int error;
+
+	/* get a block of the appropriate size... */
+	bp = geteblk((int)RF_COMPONENT_INFO_SIZE);
+	bp->b_dev = dev;
+
+	/* get our ducks in a row for the write */
+	bp->b_flags = 0;
+	bp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE;
+	bp->b_bcount = RF_COMPONENT_INFO_SIZE;
+	bp->b_iocmd = BIO_WRITE;
+	bp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE;
+
+	/* Zero-fill so bytes past the label structure are deterministic. */
+	memset(bp->b_data, 0, RF_COMPONENT_INFO_SIZE );
+	memcpy(bp->b_data, clabel, sizeof(RF_ComponentLabel_t));
+
+	DEV_STRATEGY(bp, 0);
+	error = bufwait(bp);
+
+	/*
+	 * Report failures while the buffer is still ours: the previous
+	 * code read bp->b_error *after* brelse(), i.e. after the buffer
+	 * had already been handed back to the buffer cache.
+	 */
+	if (error) {
+		rf_printf(0, "Failed to write RAID component info!\n");
+		rf_printf(0, "b_error= %d\n", bp->b_error);
+	}
+
+	bp->b_flags |= B_INVAL | B_AGE;
+	brelse(bp);
+
+	return (error);
+}
+
+/*
+ * Mark the component label of every non-failed component dirty, after
+ * bumping the array's modification counter.  Spared components are
+ * skipped; the spare-handling code at the bottom is disabled (#if 0).
+ * Silently returns on allocation failure (label state is then unchanged).
+ */
+void
+rf_markalldirty(raidPtr)
+	RF_Raid_t *raidPtr;
+{
+	RF_ComponentLabel_t *clabel;
+	int r,c;
+
+	MALLOC(clabel, RF_ComponentLabel_t *, sizeof(RF_ComponentLabel_t),
+	    M_RAIDFRAME, M_NOWAIT | M_ZERO);
+
+	if (clabel == NULL) {
+		printf("rf_markalldirty: Out of memory?\n");
+		return;
+	}
+
+	raidPtr->mod_counter++;
+	for (r = 0; r < raidPtr->numRow; r++) {
+		for (c = 0; c < raidPtr->numCol; c++) {
+			/* we don't want to touch (at all) a disk that has
+			   failed */
+			if (!RF_DEAD_DISK(raidPtr->Disks[r][c].status)) {
+				raidread_component_label(
+				    raidPtr->Disks[r][c].dev,
+				    raidPtr->raid_cinfo[r][c].ci_vp,
+				    clabel);
+				if (clabel->status == rf_ds_spared) {
+					/* XXX do something special...
+					   but whatever you do, don't
+					   try to access it!! */
+				} else {
+#if 0
+					clabel->status =
+					    raidPtr->Disks[r][c].status;
+					raidwrite_component_label(
+					    raidPtr->Disks[r][c].dev,
+					    raidPtr->raid_cinfo[r][c].ci_vp,
+					    clabel);
+#endif
+					raidmarkdirty(
+					    raidPtr->Disks[r][c].dev,
+					    raidPtr->raid_cinfo[r][c].ci_vp,
+					    raidPtr->mod_counter);
+				}
+			}
+		}
+	}
+	/* printf("Component labels marked dirty.\n"); */
+	/* NOTE(review): the disabled block below references variables (r,
+	 * sparecol, i, j, srow, scol) not declared in this function; it
+	 * will not compile as-is if re-enabled. */
+#if 0
+	for( c = 0; c < raidPtr->numSpare ; c++) {
+		sparecol = raidPtr->numCol + c;
+		if (raidPtr->Disks[r][sparecol].status == rf_ds_used_spare) {
+			/*
+
+			   XXX this is where we get fancy and map this spare
+			   into it's correct spot in the array.
+
+			 */
+			/*
+
+			   we claim this disk is "optimal" if it's
+			   rf_ds_used_spare, as that means it should be
+			   directly substitutable for the disk it replaced.
+			   We note that too...
+
+			 */
+
+			for(i=0;i<raidPtr->numRow;i++) {
+				for(j=0;j<raidPtr->numCol;j++) {
+					if ((raidPtr->Disks[i][j].spareRow ==
+					    r) &&
+					    (raidPtr->Disks[i][j].spareCol ==
+					    sparecol)) {
+						srow = r;
+						scol = sparecol;
+						break;
+					}
+				}
+			}
+
+			raidread_component_label(
+			    raidPtr->Disks[r][sparecol].dev,
+			    raidPtr->raid_cinfo[r][sparecol].ci_vp,
+			    &clabel);
+			/* make sure status is noted */
+			clabel.version = RF_COMPONENT_LABEL_VERSION;
+			clabel.mod_counter = raidPtr->mod_counter;
+			clabel.serial_number = raidPtr->serial_number;
+			clabel.row = srow;
+			clabel.column = scol;
+			clabel.num_rows = raidPtr->numRow;
+			clabel.num_columns = raidPtr->numCol;
+			clabel.clean = RF_RAID_DIRTY; /* changed in a bit*/
+			clabel.status = rf_ds_optimal;
+			raidwrite_component_label(
+			    raidPtr->Disks[r][sparecol].dev,
+			    raidPtr->raid_cinfo[r][sparecol].ci_vp,
+			    &clabel);
+			raidmarkclean( raidPtr->Disks[r][sparecol].dev,
+			    raidPtr->raid_cinfo[r][sparecol].ci_vp);
+		}
+	}
+
+#endif
+	FREE(clabel, M_RAIDFRAME);
+}
+
+
+/*
+ * Rewrite the component label of every optimal component (and every
+ * in-use spare) with a bumped modification counter.  When called with
+ * final == RF_FINAL_COMPONENT_UPDATE and parity is known clean, also
+ * set the clean bit on each label.  Silently returns on allocation
+ * failure.
+ */
+void
+rf_update_component_labels(raidPtr, final)
+	RF_Raid_t *raidPtr;
+	int final;
+{
+	RF_ComponentLabel_t *clabel;
+	int sparecol;
+	int r,c;
+	int i,j;
+	int srow, scol;
+
+	srow = -1;
+	scol = -1;
+
+	MALLOC(clabel, RF_ComponentLabel_t *, sizeof(RF_ComponentLabel_t),
+	    M_RAIDFRAME, M_NOWAIT | M_ZERO);
+	if (clabel == NULL) {
+		printf("rf_update_component_labels: Out of memory?\n");
+		return;
+	}
+
+	/* XXX should do extra checks to make sure things really are clean,
+	   rather than blindly setting the clean bit... */
+
+	raidPtr->mod_counter++;
+
+	/* Pass 1: regular components that are still optimal. */
+	for (r = 0; r < raidPtr->numRow; r++) {
+		for (c = 0; c < raidPtr->numCol; c++) {
+			if (raidPtr->Disks[r][c].status == rf_ds_optimal) {
+				raidread_component_label(
+				    raidPtr->Disks[r][c].dev,
+				    raidPtr->raid_cinfo[r][c].ci_vp,
+				    clabel);
+				/* make sure status is noted */
+				clabel->status = rf_ds_optimal;
+				/* bump the counter */
+				clabel->mod_counter = raidPtr->mod_counter;
+
+				raidwrite_component_label(
+				    raidPtr->Disks[r][c].dev,
+				    raidPtr->raid_cinfo[r][c].ci_vp,
+				    clabel);
+				if (final == RF_FINAL_COMPONENT_UPDATE) {
+					if (raidPtr->parity_good == RF_RAID_CLEAN) {
+						raidmarkclean(
+						    raidPtr->Disks[r][c].dev,
+						    raidPtr->raid_cinfo[r][c].ci_vp,
+						    raidPtr->mod_counter);
+					}
+				}
+			}
+			/* else we don't touch it.. */
+		}
+	}
+
+	/* Pass 2: spares that are actually substituting for a component.
+	 * Only row 0 is examined here. */
+	for( c = 0; c < raidPtr->numSpare ; c++) {
+		sparecol = raidPtr->numCol + c;
+		if (raidPtr->Disks[0][sparecol].status == rf_ds_used_spare) {
+			/*
+
+			   we claim this disk is "optimal" if it's
+			   rf_ds_used_spare, as that means it should be
+			   directly substitutable for the disk it replaced.
+			   We note that too...
+
+			 */
+
+			/* Find which (row, col) this spare stands in for. */
+			for(i=0;i<raidPtr->numRow;i++) {
+				for(j=0;j<raidPtr->numCol;j++) {
+					if ((raidPtr->Disks[i][j].spareRow ==
+					    0) &&
+					    (raidPtr->Disks[i][j].spareCol ==
+					    sparecol)) {
+						srow = i;
+						scol = j;
+						break;
+					}
+				}
+			}
+
+			/* XXX shouldn't *really* need this... */
+			raidread_component_label(
+			    raidPtr->Disks[0][sparecol].dev,
+			    raidPtr->raid_cinfo[0][sparecol].ci_vp,
+			    clabel);
+			/* make sure status is noted */
+
+			raid_init_component_label(raidPtr, clabel);
+
+			clabel->mod_counter = raidPtr->mod_counter;
+			clabel->row = srow;
+			clabel->column = scol;
+			clabel->status = rf_ds_optimal;
+
+			raidwrite_component_label(
+			    raidPtr->Disks[0][sparecol].dev,
+			    raidPtr->raid_cinfo[0][sparecol].ci_vp,
+			    clabel);
+			if (final == RF_FINAL_COMPONENT_UPDATE) {
+				if (raidPtr->parity_good == RF_RAID_CLEAN) {
+					raidmarkclean( raidPtr->Disks[0][sparecol].dev,
+					    raidPtr->raid_cinfo[0][sparecol].ci_vp,
+					    raidPtr->mod_counter);
+				}
+			}
+		}
+	}
+	FREE(clabel, M_RAIDFRAME);
+	rf_printf(1, "Component labels updated\n");
+}
+
+/*
+ * Close one component vnode.  Auto-configured components were opened
+ * with VOP_OPEN and need an explicit vrele; others go through vn_close.
+ */
+void
+rf_close_component(raidPtr, vp, auto_configured)
+	RF_Raid_t *raidPtr;
+	struct vnode *vp;
+	int auto_configured;
+{
+	struct thread *td;
+
+	td = raidPtr->engine_thread;
+
+	if (vp == NULL) {
+		rf_printf(1, "vnode was NULL\n");
+		return;
+	}
+
+	if (auto_configured == 1) {
+		VOP_CLOSE(vp, FREAD | FWRITE, td->td_ucred, td);
+		vrele(vp);
+	} else {
+		vn_close(vp, FREAD | FWRITE, td->td_ucred, td);
+	}
+}
+
+
+/*
+ * Close and forget every component vnode in the array, then every
+ * spare's vnode.
+ */
+void
+rf_UnconfigureVnodes(raidPtr)
+	RF_Raid_t *raidPtr;
+{
+	struct vnode *vp;
+	int row, col, acd;
+
+	for (row = 0; row < raidPtr->numRow; row++) {
+		for (col = 0; col < raidPtr->numCol; col++) {
+			rf_printf(1, "Closing vnode for row: %d col: %d\n", row, col);
+			vp = raidPtr->raid_cinfo[row][col].ci_vp;
+			acd = raidPtr->Disks[row][col].auto_configured;
+			rf_close_component(raidPtr, vp, acd);
+			raidPtr->raid_cinfo[row][col].ci_vp = NULL;
+			raidPtr->Disks[row][col].auto_configured = 0;
+		}
+	}
+	/* Spares live past numCol in row 0. */
+	for (col = 0; col < raidPtr->numSpare; col++) {
+		rf_printf(1, "Closing vnode for spare: %d\n", col);
+		vp = raidPtr->raid_cinfo[0][raidPtr->numCol + col].ci_vp;
+		acd = raidPtr->Disks[0][raidPtr->numCol + col].auto_configured;
+		rf_close_component(raidPtr, vp, acd);
+		raidPtr->raid_cinfo[0][raidPtr->numCol + col].ci_vp = NULL;
+		raidPtr->Disks[0][raidPtr->numCol + col].auto_configured = 0;
+	}
+}
+
+
+/*
+ * Kernel thread body: fail the requested disk (optionally starting
+ * reconstruction), free the request, and exit.
+ */
+void
+rf_ReconThread(req)
+	struct rf_recon_req *req;
+{
+	RF_Raid_t *raidPtr;
+	int initrecon;
+
+	mtx_lock(&Giant);
+	raidPtr = (RF_Raid_t *) req->raidPtr;
+	initrecon = (req->flags & RF_FDFLAGS_RECON) ? 1 : 0;
+	raidPtr->recon_in_progress = 1;
+
+	rf_FailDisk(raidPtr, req->row, req->col, initrecon);
+
+	/* The request is not needed once the disk has been failed. */
+	RF_Free(req, sizeof(*req));
+
+	raidPtr->recon_in_progress = 0;
+
+	/* Thread terminates here. */
+	RF_THREAD_EXIT(0);	/* does not return */
+}
+
+/*
+ * Kernel thread body: rewrite all parity, then record whether the
+ * array's parity is now known-good and wake any shutdown waiter.
+ */
+void
+rf_RewriteParityThread(raidPtr)
+	RF_Raid_t *raidPtr;
+{
+	int rc;
+
+	mtx_lock(&Giant);
+	raidPtr->parity_rewrite_in_progress = 1;
+	rc = rf_RewriteParity(raidPtr);
+	if (rc == 0) {
+		/* set the clean bit!  If we shutdown correctly,
+		   the clean bit on each component label will get
+		   set */
+		raidPtr->parity_good = RF_RAID_CLEAN;
+	} else {
+		rf_printf(0, "raid%d: Error re-writing parity!\n",raidPtr->raidid);
+	}
+	raidPtr->parity_rewrite_in_progress = 0;
+
+	/* Anyone waiting for us to stop?  If so, inform them... */
+	if (raidPtr->waitShutdown)
+		wakeup(&raidPtr->parity_rewrite_in_progress);
+
+	RF_THREAD_EXIT(0);	/* does not return */
+}
+
+
+/*
+ * Kernel thread body: copy reconstructed data back onto the replaced
+ * component, then exit.
+ */
+void
+rf_CopybackThread(raidPtr)
+	RF_Raid_t *raidPtr;
+{
+
+	mtx_lock(&Giant);
+	raidPtr->copyback_in_progress = 1;
+	rf_CopybackReconstructedData(raidPtr);
+	raidPtr->copyback_in_progress = 0;
+
+	RF_THREAD_EXIT(0);	/* does not return */
+}
+
+
+/*
+ * Kernel thread body: reconstruct the named component in place, free
+ * the request, and exit.
+ */
+void
+rf_ReconstructInPlaceThread(req)
+	struct rf_recon_req *req;
+{
+	RF_Raid_t *raidPtr;
+	int rc;
+
+	mtx_lock(&Giant);
+	raidPtr = req->raidPtr;
+	raidPtr->recon_in_progress = 1;
+	rc = rf_ReconstructInPlace(raidPtr, req->row, req->col);
+	RF_Free(req, sizeof(*req));
+	raidPtr->recon_in_progress = 0;
+
+	RF_THREAD_EXIT(0);	/* does not return */
+}
+
+/*
+ * Scan the system's disks for RAIDframe component labels and build an
+ * RF_AutoConfig_t list.  The entire scan is currently disabled
+ * (#if 0, pending GEOM integration), so this always returns NULL.
+ */
+RF_AutoConfig_t *
+rf_find_raid_components()
+{
+	RF_AutoConfig_t *ac_list = NULL;
+#if 0	/* XXX GEOM */
+	struct vnode *vp;
+	struct disklabel *label;
+	struct diskslice *slice;
+	struct diskslices *slices;
+	struct disk *disk;
+	struct thread *td;
+	dev_t dev;
+	char *devname;
+	int error, j;
+	int nslices;
+
+	td = curthread;
+
+	MALLOC(label, struct disklabel *, sizeof(struct disklabel),
+	    M_RAIDFRAME, M_NOWAIT|M_ZERO);
+	MALLOC(slices, struct diskslices *, sizeof(struct diskslices),
+	    M_RAIDFRAME, M_NOWAIT|M_ZERO);
+	if ((label == NULL) || (slices == NULL)) {
+		printf("rf_find_raid_components: Out of Memory?\n");
+		return (NULL);
+	}
+
+	/* initialize the AutoConfig list */
+	ac_list = NULL;
+
+	/* we begin by trolling through *all* the disk devices on the system */
+
+	disk = NULL;
+	while ((disk = disk_enumerate(disk))) {
+
+		/* we don't care about floppies... */
+		devname = disk->d_dev->si_name;
+		if (!strncmp(devname, "fd", 2) ||
+		    !strncmp(devname, "cd", 2) ||
+		    !strncmp(devname, "acd", 3))
+			continue;
+
+		rf_printf(1, "Examining %s\n", disk->d_dev->si_name);
+		if (bdevvp(disk->d_dev, &vp))
+			panic("RAIDframe can't alloc vnode");
+		vref(vp);
+
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+		error = VOP_OPEN(vp, FREAD, td->td_ucred, td);
+		VOP_UNLOCK(vp, 0, td);
+		if (error) {
+			vput(vp);
+			continue;
+		}
+
+		error = VOP_IOCTL(vp, DIOCGSLICEINFO, (caddr_t)slices,
+		    FREAD, td->td_ucred, td);
+		VOP_CLOSE(vp, FREAD | FWRITE, td->td_ucred, td);
+		vrele(vp);
+		if (error) {
+			/* No slice table. */
+			continue;
+		}
+
+		nslices = slices->dss_nslices;
+		if ((nslices == 0) || (nslices > MAX_SLICES))
+			continue;
+
+		/* Iterate through the slices */
+		for (j = 1; j < nslices; j++) {
+
+			rf_printf(1, "Examining slice %d\n", j);
+			slice = &slices->dss_slices[j - 1];
+			dev = dkmodslice(disk->d_dev, j);
+			if (bdevvp(dev, &vp))
+				panic("RAIDframe can't alloc vnode");
+
+			vref(vp);
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+			error = VOP_OPEN(vp, FREAD, td->td_ucred, td);
+			VOP_UNLOCK(vp, 0, td);
+			if (error) {
+				continue;
+			}
+
+			error = VOP_IOCTL(vp, DIOCGDINFO, (caddr_t)label,
+			    FREAD, td->td_ucred, td);
+			VOP_CLOSE(vp, FREAD | FWRITE, td->td_ucred, td);
+			vrele(vp);
+			if (error)
+				continue;
+
+			rf_search_label(dev, label, &ac_list);
+		}
+	}
+
+	FREE(label, M_RAIDFRAME);
+	FREE(slices, M_RAIDFRAME);
+#endif
+	return (ac_list);
+}
+
+/*
+ * Examine every FS_RAID partition in *label on device "dev"; any
+ * partition with a reasonable component label is prepended to
+ * *ac_list.  On success the malloc'd clabel and the opened, referenced
+ * vnode are owned by the new list entry; otherwise both are released
+ * here.
+ */
+static void
+rf_search_label(dev_t dev, struct disklabel *label, RF_AutoConfig_t **ac_list)
+{
+	RF_AutoConfig_t *ac;
+	RF_ComponentLabel_t *clabel;
+	struct vnode *vp;
+	struct thread *td;
+	dev_t dev1;
+	int i, error, good_one;
+
+	td = curthread;
+
+	/* Iterate through the partitions */
+	for (i=0; i < label->d_npartitions; i++) {
+		/* We only support partitions marked as RAID */
+		if (label->d_partitions[i].p_fstype != FS_RAID)
+			continue;
+
+		dev1 = dkmodpart(dev, i);
+		if (dev1 == NULL) {
+			rf_printf(1, "dev1 == null\n");
+			continue;
+		}
+		if (bdevvp(dev1, &vp))
+			panic("RAIDframe can't alloc vnode");
+
+		vref(vp);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+		error = VOP_OPEN(vp, FREAD, td->td_ucred, td);
+		VOP_UNLOCK(vp, 0, td);
+		if (error) {
+			/* Whatever... */
+			/* NOTE(review): the vnode was vref'd above but is not
+			 * released on this path -- possible reference leak;
+			 * compare the vput() in rf_find_raid_components. */
+			continue;
+		}
+
+		good_one = 0;
+
+		clabel = (RF_ComponentLabel_t *)
+		    malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME,
+			M_NOWAIT);
+		if (clabel == NULL) {
+			/* XXX CLEANUP HERE */
+			panic("RAID autoconfig: no memory!\n");
+		}
+
+		if (!raidread_component_label(dev1, vp, clabel)) {
+			/* Got the label.  Is it reasonable? */
+			if (rf_reasonable_label(clabel) &&
+			    (clabel->partitionSize <=
+			     label->d_partitions[i].p_size)) {
+				rf_printf(1, "Component on: %s: %d\n",
+				    dev1->si_name, label->d_partitions[i].p_size);
+				rf_print_component_label(clabel);
+				/* if it's reasonable, add it, else ignore it */
+				ac = (RF_AutoConfig_t *)
+				    malloc(sizeof(RF_AutoConfig_t),
+					M_RAIDFRAME, M_NOWAIT);
+				if (ac == NULL) {
+					/* XXX should panic? */
+					panic("RAID autoconfig: no memory!\n");
+				}
+
+				sprintf(ac->devname, "%s", dev->si_name);
+				ac->dev = dev1;
+				ac->vp = vp;
+				ac->clabel = clabel;
+				ac->next = *ac_list;
+				*ac_list = ac;
+				good_one = 1;
+			}
+		}
+		if (!good_one) {
+			/* cleanup: clabel and the vnode stay ours. */
+			free(clabel, M_RAIDFRAME);
+			VOP_CLOSE(vp, FREAD | FWRITE, td->td_ucred, td);
+			vrele(vp);
+		}
+	}
+}
+
+/*
+ * Sanity-check a component label: known version, sane clean flag, and
+ * self-consistent geometry.  Returns 1 if acceptable, 0 otherwise.
+ */
+static int
+rf_reasonable_label(clabel)
+	RF_ComponentLabel_t *clabel;
+{
+
+	if (clabel->version != RF_COMPONENT_LABEL_VERSION_1 &&
+	    clabel->version != RF_COMPONENT_LABEL_VERSION)
+		return(0);
+	if (clabel->clean != RF_RAID_CLEAN &&
+	    clabel->clean != RF_RAID_DIRTY)
+		return(0);
+	if (clabel->row < 0 || clabel->column < 0)
+		return(0);
+	if (clabel->num_rows <= 0 || clabel->num_columns <= 0)
+		return(0);
+	if (clabel->row >= clabel->num_rows ||
+	    clabel->column >= clabel->num_columns)
+		return(0);
+	if (clabel->blockSize <= 0 || clabel->numBlocks <= 0)
+		return(0);
+	/* label looks reasonable enough... */
+	return(1);
+}
+
+
+/*
+ * Dump the contents of a component label at rf_printf debug level 1.
+ * Purely informational; does not modify the label.
+ */
+void
+rf_print_component_label(clabel)
+	RF_ComponentLabel_t *clabel;
+{
+	rf_printf(1, "   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
+	    clabel->row, clabel->column,
+	    clabel->num_rows, clabel->num_columns);
+	rf_printf(1, "   Version: %d Serial Number: %d Mod Counter: %d\n",
+	    clabel->version, clabel->serial_number,
+	    clabel->mod_counter);
+	rf_printf(1, "   Clean: %s Status: %d\n",
+	    clabel->clean ? "Yes" : "No", clabel->status );
+	rf_printf(1, "   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
+	    clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
+	rf_printf(1, "   RAID Level: %c  blocksize: %d numBlocks: %d\n",
+	    (char) clabel->parityConfig, clabel->blockSize,
+	    clabel->numBlocks);
+	rf_printf(1, "   Autoconfig: %s\n", clabel->autoconfigure ? "Yes":"No");
+	rf_printf(1, "   Contains root partition: %s\n",
+	    clabel->root_partition ? "Yes" : "No" );
+	rf_printf(1, "   Last configured as: raid%d\n", clabel->last_unit );
+#if 0
+	rf_printf(1, "   Config order: %d\n", clabel->config_order);
+#endif
+
+}
+
+/*
+ * Partition an RF_AutoConfig_t list into config sets: each component
+ * is prepended to the first set it fits (per rf_does_it_fit), or a
+ * new set is created for it.  Returns the list of sets.
+ */
+RF_ConfigSet_t *
+rf_create_auto_sets(ac_list)
+	RF_AutoConfig_t *ac_list;
+{
+	RF_AutoConfig_t *ac;
+	RF_ConfigSet_t *config_sets;
+	RF_ConfigSet_t *cset;
+	RF_AutoConfig_t *ac_next;
+
+	config_sets = NULL;
+
+	for (ac = ac_list; ac != NULL; ac = ac_next) {
+		/* we're going to putz with ac->next, so save it here
+		   for use at the end of the loop */
+		ac_next = ac->next;
+
+		/* Look for an existing set this component belongs to. */
+		for (cset = config_sets; cset != NULL; cset = cset->next)
+			if (rf_does_it_fit(cset, ac))
+				break;
+
+		if (cset != NULL) {
+			/* Matched: prepend to that set's component list. */
+			ac->next = cset->ac;
+			cset->ac = ac;
+		} else {
+			/* No match (or no sets yet): start a new set. */
+			cset = (RF_ConfigSet_t *)
+			    malloc(sizeof(RF_ConfigSet_t),
+				M_RAIDFRAME, M_NOWAIT);
+			if (cset == NULL) {
+				panic("rf_create_auto_sets: No memory!\n");
+			}
+			cset->ac = ac;
+			ac->next = NULL;
+			cset->next = config_sets;
+			cset->rootable = 0;
+			config_sets = cset;
+		}
+	}
+
+	return(config_sets);
+}
+
+/*
+ * Decide whether component "ac" belongs to config set "cset".
+ * Returns 1 on a match, 0 otherwise.
+ */
+static int
+rf_does_it_fit(cset, ac)
+	RF_ConfigSet_t *cset;
+	RF_AutoConfig_t *ac;
+{
+	RF_ComponentLabel_t *clabel1, *clabel2;
+
+	/*
+	 * Comparing against the *first* member of the set suffices: every
+	 * other member already passed this same comparison when it was
+	 * added.  partitionSize and mod_counter are deliberately NOT
+	 * compared.  Failed disks carry stale (lower) mod_counters; if
+	 * they were excluded now they would form a competing set -- e.g.
+	 * two sets both wanting to be raid0, or both claiming the root
+	 * filesystem -- and the wrong one might win.  So they are pulled
+	 * into the set here and weeded out later in the autoconfiguration
+	 * process.
+	 */
+	clabel1 = cset->ac->clabel;
+	clabel2 = ac->clabel;
+
+	return ((clabel1->version == clabel2->version) &&
+	    (clabel1->serial_number == clabel2->serial_number) &&
+	    (clabel1->num_rows == clabel2->num_rows) &&
+	    (clabel1->num_columns == clabel2->num_columns) &&
+	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
+	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
+	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
+	    (clabel1->parityConfig == clabel2->parityConfig) &&
+	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
+	    (clabel1->blockSize == clabel2->blockSize) &&
+	    (clabel1->numBlocks == clabel2->numBlocks) &&
+	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
+	    (clabel1->root_partition == clabel2->root_partition) &&
+	    (clabel1->last_unit == clabel2->last_unit) &&
+	    (clabel1->config_order == clabel2->config_order));
+}
+
+/*
+ * Decide whether a config set has enough live components to be
+ * configured.  The highest mod_counter found in the set is taken as
+ * authoritative; components with stale counters count as missing.
+ * RAID-1 gets special pairing logic; RAID 0/4/5 use simple missing
+ * counts.  Returns 1 if configurable, 0 otherwise.
+ */
+int
+rf_have_enough_components(cset)
+	RF_ConfigSet_t *cset;
+{
+	RF_AutoConfig_t *ac;
+	RF_AutoConfig_t *auto_config;
+	RF_ComponentLabel_t *clabel;
+	int r,c;
+	int num_rows;
+	int num_cols;
+	int num_missing;
+	int mod_counter;
+	int mod_counter_found;
+	int even_pair_failed;
+	char parity_type;
+
+
+	/* check to see that we have enough 'live' components
+	   of this set.  If so, we can configure it if necessary */
+
+	num_rows = cset->ac->clabel->num_rows;
+	num_cols = cset->ac->clabel->num_columns;
+	parity_type = cset->ac->clabel->parityConfig;
+
+	/* XXX Check for duplicate components!?!?!? */
+
+	/* Determine what the mod_counter is supposed to be for this set. */
+
+	mod_counter_found = 0;
+	mod_counter = 0;
+	ac = cset->ac;
+	while(ac!=NULL) {
+		if (mod_counter_found==0) {
+			mod_counter = ac->clabel->mod_counter;
+			mod_counter_found = 1;
+		} else {
+			if (ac->clabel->mod_counter > mod_counter) {
+				mod_counter = ac->clabel->mod_counter;
+			}
+		}
+		ac = ac->next;
+	}
+
+	num_missing = 0;
+	auto_config = cset->ac;
+
+	/* For each (row, col) slot, look for a component whose label
+	 * claims that slot *and* carries the current mod_counter. */
+	for(r=0; r<num_rows; r++) {
+		even_pair_failed = 0;
+		for(c=0; c<num_cols; c++) {
+			ac = auto_config;
+			while(ac!=NULL) {
+				if ((ac->clabel->row == r) &&
+				    (ac->clabel->column == c) &&
+				    (ac->clabel->mod_counter == mod_counter)) {
+					/* it's this one... */
+					rf_printf(1, "Found: %s at %d,%d\n",
+					    ac->devname,r,c);
+					break;
+				}
+				ac=ac->next;
+			}
+			if (ac==NULL) {
+				/* Didn't find one here! */
+				/* special case for RAID 1, especially
+				   where there are more than 2
+				   components (where RAIDframe treats
+				   things a little differently :( ) */
+				if (parity_type == '1') {
+					if (c%2 == 0) {	/* even component */
+						even_pair_failed = 1;
+					} else {	/* odd component.  If
+							   we're failed, and
+							   so is the even
+							   component, it's
+							   "Good Night, Charlie" */
+						if (even_pair_failed == 1) {
+							return(0);
+						}
+					}
+				} else {
+					/* normal accounting */
+					num_missing++;
+				}
+			}
+			if ((parity_type == '1') && (c%2 == 1)) {
+				/* Just did an even component, and we didn't
+				   bail.. reset the even_pair_failed flag,
+				   and go on to the next component.... */
+				even_pair_failed = 0;
+			}
+		}
+	}
+
+	clabel = cset->ac->clabel;
+
+	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
+	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
+	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
+		/* XXX this needs to be made *much* more general */
+		/* Too many failures */
+		return(0);
+	}
+	/* otherwise, all is well, and we've got enough to take a kick
+	   at autoconfiguring this set */
+	return(1);
+}
+
+/*
+ * Build an RF_Config_t from an autoconfig component list: geometry and
+ * queue parameters come from the first component's label, and device
+ * names are filled in per (row, column) slot.
+ */
+void
+rf_create_configuration(ac,config,raidPtr)
+	RF_AutoConfig_t *ac;
+	RF_Config_t *config;
+	RF_Raid_t *raidPtr;
+{
+	RF_ComponentLabel_t *clabel;
+	int i;
+
+	clabel = ac->clabel;
+
+	/* 1. Fill in the common stuff */
+	config->numRow = clabel->num_rows;
+	config->numCol = clabel->num_columns;
+	config->numSpare = 0;	/* XXX should this be set here? */
+	config->sectPerSU = clabel->sectPerSU;
+	config->SUsPerPU = clabel->SUsPerPU;
+	config->SUsPerRU = clabel->SUsPerRU;
+	config->parityConfig = clabel->parityConfig;
+	/* XXX... */
+	strcpy(config->diskQueueType,"fifo");
+	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
+	config->layoutSpecificSize = 0;	/* XXX ? */
+
+	/* 2. Record each component's device name in its slot. */
+	while(ac!=NULL) {
+		/* row/col values will be in range due to the checks
+		   in reasonable_label() */
+		strcpy(config->devnames[ac->clabel->row][ac->clabel->column],
+		    ac->devname);
+		ac = ac->next;
+	}
+
+	/*
+	 * 3. Clear the debug variables.  These are character buffers, so
+	 * terminate with NUL ('\0'); the previous code assigned the
+	 * pointer constant NULL to a char.
+	 */
+	for(i=0;i<RF_MAXDBGV;i++) {
+		config->debugVars[i][0] = '\0';
+	}
+}
+
+/*
+ * Store the new autoconfigure setting in the softc and in the label
+ * of every optimal component.  Returns the value that was set.
+ */
+int
+rf_set_autoconfig(raidPtr, new_value)
+	RF_Raid_t *raidPtr;
+	int new_value;
+{
+	RF_ComponentLabel_t *clabel;
+	struct vnode *vp;
+	dev_t dev;
+	int r, c;
+
+	MALLOC(clabel, RF_ComponentLabel_t *, sizeof(RF_ComponentLabel_t),
+	    M_RAIDFRAME, M_WAITOK | M_ZERO);
+
+	raidPtr->autoconfigure = new_value;
+	for (r = 0; r < raidPtr->numRow; r++) {
+		for (c = 0; c < raidPtr->numCol; c++) {
+			if (raidPtr->Disks[r][c].status != rf_ds_optimal)
+				continue;
+			dev = raidPtr->Disks[r][c].dev;
+			vp = raidPtr->raid_cinfo[r][c].ci_vp;
+			raidread_component_label(dev, vp, clabel);
+			clabel->autoconfigure = new_value;
+			raidwrite_component_label(dev, vp, clabel);
+		}
+	}
+	FREE(clabel, M_RAIDFRAME);
+	return (new_value);
+}
+
+/*
+ * Set the array's root_partition flag and persist the new value into
+ * the component label of every component that is currently optimal.
+ * Mirror of rf_set_autoconfig() for the root-partition flag.
+ * Returns the value that was set.
+ */
+int
+rf_set_rootpartition(raidPtr, new_value)
+	RF_Raid_t *raidPtr;
+	int new_value;
+{
+	RF_ComponentLabel_t *clabel;
+	struct vnode *vp;
+	dev_t dev;
+	int row, column;
+
+	/* scratch label, zeroed; reused for every component below */
+	MALLOC(clabel, RF_ComponentLabel_t *, sizeof(RF_ComponentLabel_t),
+	       M_RAIDFRAME, M_WAITOK | M_ZERO);
+
+	raidPtr->root_partition = new_value;
+	for(row=0; row<raidPtr->numRow; row++) {
+		for(column=0; column<raidPtr->numCol; column++) {
+			if (raidPtr->Disks[row][column].status ==
+			    rf_ds_optimal) {
+				dev = raidPtr->Disks[row][column].dev;
+				vp = raidPtr->raid_cinfo[row][column].ci_vp;
+				/* read-modify-write of the on-disk label */
+				raidread_component_label(dev, vp, clabel);
+				clabel->root_partition = new_value;
+				raidwrite_component_label(dev, vp, clabel);
+			}
+		}
+	}
+	FREE(clabel, M_RAIDFRAME);
+	return(new_value);
+}
+
+/*
+ * Close and release every vnode still held by the auto-config
+ * structures of a config set.  Each vp is NULLed afterwards so a
+ * second call (or a later cleanup pass) cannot close it twice.
+ */
+void
+rf_release_all_vps(cset)
+	RF_ConfigSet_t *cset;
+{
+	RF_AutoConfig_t *ac;
+	struct thread *td;
+
+	td = curthread;
+	ac = cset->ac;
+	while(ac!=NULL) {
+		/* Close the vp, and give it back */
+		if (ac->vp) {
+			VOP_CLOSE(ac->vp, FREAD, td->td_ucred, td);
+			vrele(ac->vp);
+			ac->vp = NULL;
+		}
+		ac = ac->next;
+	}
+}
+
+
+/*
+ * Free all memory belonging to a config set: every RF_AutoConfig_t on
+ * the list together with its component label, then the set itself.
+ * Vnodes are not touched here -- they are expected to have been
+ * released already (see rf_release_all_vps()).
+ */
+void
+rf_cleanup_config_set(cset)
+	RF_ConfigSet_t *cset;
+{
+	RF_AutoConfig_t *ac;
+	RF_AutoConfig_t *next_ac;
+
+	ac = cset->ac;
+	while(ac!=NULL) {
+		/* save the link before freeing the node that holds it */
+		next_ac = ac->next;
+		/* nuke the label */
+		free(ac->clabel, M_RAIDFRAME);
+		/* cleanup the config structure */
+		free(ac, M_RAIDFRAME);
+		/* "next.." */
+		ac = next_ac;
+	}
+	/* and, finally, nuke the config set */
+	free(cset, M_RAIDFRAME);
+}
+
+
+/*
+ * Initialize a component label from the current state of the array.
+ * Only array-wide fields taken from raidPtr are assigned here; any
+ * per-component fields of *clabel are left unchanged for the caller
+ * to fill in.
+ */
+void
+raid_init_component_label(raidPtr, clabel)
+	RF_Raid_t *raidPtr;
+	RF_ComponentLabel_t *clabel;
+{
+	/* current version number */
+	clabel->version = RF_COMPONENT_LABEL_VERSION;
+	clabel->serial_number = raidPtr->serial_number;
+	clabel->mod_counter = raidPtr->mod_counter;
+	clabel->num_rows = raidPtr->numRow;
+	clabel->num_columns = raidPtr->numCol;
+	clabel->clean = RF_RAID_DIRTY; /* not clean */
+	clabel->status = rf_ds_optimal; /* "It's good!" */
+
+	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
+	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
+	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
+
+	clabel->blockSize = raidPtr->bytesPerSector;
+	clabel->numBlocks = raidPtr->sectorsPerDisk;
+
+	/* XXX not portable */
+	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
+	clabel->maxOutstanding = raidPtr->maxOutstanding;
+	clabel->autoconfigure = raidPtr->autoconfigure;
+	clabel->root_partition = raidPtr->root_partition;
+	clabel->last_unit = raidPtr->raidid;
+	clabel->config_order = raidPtr->config_order;
+}
+
+int
+rf_auto_config_set(cset, unit, parent_sc)
+ RF_ConfigSet_t *cset;
+ int *unit;
+ struct raidctl_softc *parent_sc;
+{
+ int retcode = 0;
+ RF_Raid_t *raidPtr;
+ RF_Config_t *config;
+ int raidID;
+
+ rf_printf(0, "RAIDframe autoconfigure\n");
+
+ *unit = -1;
+
+ /* 1. Create a config structure */
+
+ config = (RF_Config_t *)malloc(sizeof(RF_Config_t), M_RAIDFRAME,
+ M_NOWAIT|M_ZERO);
+ if (config==NULL) {
+ rf_printf(0, "Out of mem at rf_auto_config_set\n");
+ /* XXX do something more intelligent here. */
+ return(1);
+ }
+
+ /* XXX raidID needs to be set correctly.. */
+
+ /*
+ 2. Figure out what RAID ID this one is supposed to live at
+ See if we can get the same RAID dev that it was configured
+ on last time..
+ */
+
+ raidID = cset->ac->clabel->last_unit;
+ if (raidID < 0) {
+ /* let's not wander off into lala land. */
+ raidID = raidgetunit(parent_sc, 0);
+ } else {
+ raidID = raidgetunit(parent_sc, raidID);
+ }
+
+ if (raidID < 0) {
+ /* punt... */
+ rf_printf(0, "Unable to auto configure this set!\n");
+ rf_printf(1, "Out of RAID devs!\n");
+ return(1);
+ }
+ rf_printf(0, "Configuring raid%d:\n",raidID);
+ RF_Malloc(raidPtr, sizeof(*raidPtr), (RF_Raid_t *));
+ if (raidPtr == NULL) {
+ rf_printf(0, "Out of mem at rf_auto_config_set\n");
+ return (1);
+ }
+ bzero((char *)raidPtr, sizeof(RF_Raid_t));
+
+ /* XXX all this stuff should be done SOMEWHERE ELSE! */
+ raidPtr->raidid = raidID;
+ raidPtr->openings = RAIDOUTSTANDING;
+
+ /* 3. Build the configuration structure */
+ rf_create_configuration(cset->ac, config, raidPtr);
+
+ /* 4. Do the configuration */
+ retcode = rf_Configure(raidPtr, config, cset->ac);
+
+ if (retcode == 0) {
+
+ parent_sc->sc_raiddevs[raidID] = raidinit(raidPtr);
+ if (parent_sc->sc_raiddevs[raidID] == NULL) {
+ rf_printf(0, "Could not create RAID device\n");
+ RF_Free(raidPtr, sizeof(RF_Raid_t));
+ free(config, M_RAIDFRAME);
+ return (1);
+ }
+
+ parent_sc->sc_numraid++;
+ ((struct raid_softc *)raidPtr->sc)->sc_parent_dev =
+ parent_sc->sc_dev;
+ rf_markalldirty(raidPtr);
+ raidPtr->autoconfigure = 1; /* XXX do this here? */
+ if (cset->ac->clabel->root_partition==1) {
+ /* everything configured just fine. Make a note
+ that this set is eligible to be root. */
+ cset->rootable = 1;
+ /* XXX do this here? */
+ raidPtr->root_partition = 1;
+ }
+ }
+
+ /* 5. Cleanup */
+ free(config, M_RAIDFRAME);
+
+ *unit = raidID;
+ return(retcode);
+}
+
+/*
+ * Close out the devstat(9) transaction for a completed access: looks
+ * up the owning softc and the request's bio, then records completion
+ * in the per-array device statistics.
+ */
+void
+rf_disk_unbusy(desc)
+	RF_RaidAccessDesc_t *desc;
+{
+	struct raid_softc *sc;
+	struct bio *bp;
+
+	sc = desc->raidPtr->sc;
+	bp = (struct bio *)desc->bp;
+
+	devstat_end_transaction_bio(&sc->device_stats, bp);
+}
+
+/*
+ * Get the next available unit number from the bitmap. You can also request
+ * a particular unit number by passing it in the second arg. If it's not
+ * available, then grab the next free one. Return -1 if none are available.
+ */
+static int
+raidgetunit(struct raidctl_softc *parent_sc, int id)
+{
+ int i;
+
+ if (id >= RF_MAX_ARRAYS)
+ return (-1);
+
+ for (i = id; i < RF_MAX_ARRAYS; i++) {
+ if (parent_sc->sc_raiddevs[i] == NULL)
+ return (i);
+ }
+
+ if (id != 0) {
+ for (i = 0; i < id; i++) {
+ if (parent_sc->sc_raiddevs[i] == NULL)
+ return (i);
+ }
+ }
+
+ return (-1);
+}
+
+/*
+ * Module unload/shutdown hook.  Shuts down every configured array via
+ * the RAIDFRAME_SHUTDOWN ioctl, then destroys the raidctl control
+ * device.  Returns the first ioctl error, or 0.
+ *
+ * With XXX_KTHREAD_EXIT_RACE defined this instead returns EBUSY while
+ * any array is configured, and raidctl_dev is left in place.
+ */
+static int
+raidshutdown(void)
+{
+	struct raidctl_softc *parent_sc;
+	int i, error = 0;
+
+	parent_sc = raidctl_dev->si_drv1;
+
+	if (parent_sc->sc_numraid != 0) {
+#if XXX_KTHREAD_EXIT_RACE
+		return (EBUSY);
+#else
+		for (i = 0; i < RF_MAX_ARRAYS; i++) {
+			if (parent_sc->sc_raiddevs[i] != NULL) {
+				rf_printf(0, "Shutting down raid%d\n", i);
+				/* unit number is passed through the data arg */
+				error = raidctlioctl(raidctl_dev,
+				    RAIDFRAME_SHUTDOWN, (caddr_t)&i, 0, NULL);
+				if (error)
+					return (error);
+				/* stop early once the last array is gone */
+				if (parent_sc->sc_numraid == 0)
+					break;
+			}
+		}
+#endif
+	}
+
+	destroy_dev(raidctl_dev);
+
+	return (error);
+}
+
+/*
+ * (Re)discover the size of component [row][col] -- used on the rebuild
+ * path (see the "rebuilding:" message below).  Looks the device up by
+ * name, fetches its attributes and disklabel (DIOCGDINFO), and fills
+ * in the component's blockSize, numBlocks, vp and dev.
+ *
+ * Returns 0 on success or an errno from the lookup/getattr/ioctl.
+ *
+ * NOTE(review): the lookup and getattr failure paths decrement
+ * raidPtr->reconInProgress but the VOP_IOCTL failure path does not --
+ * confirm whether that asymmetry is intentional.
+ * NOTE(review): vp obtained from raidlookup() is not released on the
+ * error paths here -- verify who owns it.
+ */
+int
+raid_getcomponentsize(RF_Raid_t *raidPtr, RF_RowCol_t row, RF_RowCol_t col)
+{
+	struct disklabel *dlabel;
+	struct vnode *vp;
+	struct vattr va;
+	RF_Thread_t td;
+	int retcode;
+
+	td = raidPtr->engine_thread;
+
+	MALLOC(dlabel, struct disklabel *, sizeof(struct disklabel),
+	       M_RAIDFRAME, M_NOWAIT | M_ZERO);
+	if (dlabel == NULL) {
+		printf("rf_getcomponentsize: Out of memory?\n");
+		return (ENOMEM);
+	}
+
+	retcode = raidlookup(raidPtr->Disks[row][col].devname, td, &vp);
+
+	if (retcode) {
+		printf("raid%d: rebuilding: raidlookup on device: %s failed: %d!\n",raidPtr->raidid,
+		       raidPtr->Disks[row][col].devname, retcode);
+
+		/* XXX the component isn't responding properly...
+		   must be still dead :-( */
+		raidPtr->reconInProgress--;
+		FREE(dlabel, M_RAIDFRAME);
+		return(retcode);
+
+	} else {
+
+		/* Ok, so we can at least do a lookup...
+		   How about actually getting a vp for it? */
+
+		if ((retcode = VOP_GETATTR(vp, &va, rf_getucred(td),
+					   td)) != 0) {
+			raidPtr->reconInProgress--;
+			FREE(dlabel, M_RAIDFRAME);
+			return(retcode);
+		}
+
+		/* ask the device for its label to size the component */
+		retcode = VOP_IOCTL(vp, DIOCGDINFO, (caddr_t)dlabel,
+				    FREAD, rf_getucred(td), td);
+		if (retcode) {
+			FREE(dlabel, M_RAIDFRAME);
+			return(retcode);
+		}
+		raidPtr->Disks[row][col].blockSize = dlabel->d_secsize;
+		/* partition size minus the sectors reserved for the label */
+		raidPtr->Disks[row][col].numBlocks =
+			dlabel->d_partitions[dkpart(vn_todev(vp))].p_size -
+			rf_protectedSectors;
+
+		raidPtr->raid_cinfo[row][col].ci_vp = vp;
+		raidPtr->raid_cinfo[row][col].ci_dev = udev2dev(va.va_rdev, 0);
+		raidPtr->Disks[row][col].dev = udev2dev(va.va_rdev, 0);
+
+		/* we allow the user to specify that only a
+		   fraction of the disks should be used this is
+		   just for debug:  it speeds up
+		 * the parity scan */
+		raidPtr->Disks[row][col].numBlocks =
+			raidPtr->Disks[row][col].numBlocks *
+			rf_sizePercentage / 100;
+	}
+
+	FREE(dlabel, M_RAIDFRAME);
+	return(retcode);
+}
+
+static int
+raid_modevent(mod, type, data)
+ module_t mod;
+ int type;
+ void *data;
+{
+ int error = 0;
+
+ switch (type) {
+ case MOD_LOAD:
+ raidattach();
+ break;
+
+ case MOD_UNLOAD:
+ case MOD_SHUTDOWN:
+ error = raidshutdown();
+ break;
+
+ default:
+ break;
+ }
+
+ return (error);
+}
+
+/* module(9) glue: register the "raidframe" module and route its
+ * load/unload/shutdown events to raid_modevent(). */
+moduledata_t raid_mod = {
+	"raidframe",
+	(modeventhand_t) raid_modevent,
+	0};
+
+DECLARE_MODULE(raidframe, raid_mod, SI_SUB_RAID, SI_ORDER_MIDDLE);
diff --git a/sys/dev/raidframe/rf_freelist.h b/sys/dev/raidframe/rf_freelist.h
new file mode 100644
index 0000000..13a5e83
--- /dev/null
+++ b/sys/dev/raidframe/rf_freelist.h
@@ -0,0 +1,702 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_freelist.h,v 1.6 2002/08/08 02:53:01 oster Exp $ */
+/*
+ * rf_freelist.h
+ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * rf_freelist.h -- code to manage counted freelists
+ *
+ * Keep an arena of fixed-size objects. When a new object is needed,
+ * allocate it as necessary. When an object is freed, either put it
+ * in the arena, or really free it, depending on the maximum arena
+ * size.
+ */
+
+#ifndef _RF__RF_FREELIST_H_
+#define _RF__RF_FREELIST_H_
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_debugMem.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_threadstuff.h>
+
+#define RF_FREELIST_STATS 0
+
+#if RF_FREELIST_STATS > 0
+typedef struct RF_FreeListStats_s {
+ char *file;
+ int line;
+ int allocations;
+ int frees;
+ int max_free;
+ int grows;
+ int outstanding;
+ int max_outstanding;
+} RF_FreeListStats_t;
+#define RF_FREELIST_STAT_INIT(_fl_) { \
+ bzero((char *)&((_fl_)->stats), sizeof(RF_FreeListStats_t)); \
+ (_fl_)->stats.file = __FILE__; \
+ (_fl_)->stats.line = __LINE__; \
+}
+
+#define RF_FREELIST_STAT_ALLOC(_fl_) { \
+ (_fl_)->stats.allocations++; \
+ (_fl_)->stats.outstanding++; \
+ if ((_fl_)->stats.outstanding > (_fl_)->stats.max_outstanding) \
+ (_fl_)->stats.max_outstanding = (_fl_)->stats.outstanding; \
+}
+
+#define RF_FREELIST_STAT_FREE_UPDATE(_fl_) { \
+ if ((_fl_)->free_cnt > (_fl_)->stats.max_free) \
+ (_fl_)->stats.max_free = (_fl_)->free_cnt; \
+}
+
+#define RF_FREELIST_STAT_FREE(_fl_) { \
+ (_fl_)->stats.frees++; \
+ (_fl_)->stats.outstanding--; \
+ RF_FREELIST_STAT_FREE_UPDATE(_fl_); \
+}
+
+#define RF_FREELIST_STAT_GROW(_fl_) { \
+ (_fl_)->stats.grows++; \
+ RF_FREELIST_STAT_FREE_UPDATE(_fl_); \
+}
+
+#define RF_FREELIST_STAT_REPORT(_fl_) { \
+ printf("Freelist at %s %d (%s)\n", (_fl_)->stats.file, (_fl_)->stats.line, RF_STRING(_fl_)); \
+ printf(" %d allocations, %d frees\n", (_fl_)->stats.allocations, (_fl_)->stats.frees); \
+ printf(" %d grows\n", (_fl_)->stats.grows); \
+ printf(" %d outstanding\n", (_fl_)->stats.outstanding); \
+ printf(" %d free (max)\n", (_fl_)->stats.max_free); \
+ printf(" %d outstanding (max)\n", (_fl_)->stats.max_outstanding); \
+}
+
+#else /* RF_FREELIST_STATS > 0 */
+
+#define RF_FREELIST_STAT_INIT(_fl_)
+#define RF_FREELIST_STAT_ALLOC(_fl_)
+#define RF_FREELIST_STAT_FREE_UPDATE(_fl_)
+#define RF_FREELIST_STAT_FREE(_fl_)
+#define RF_FREELIST_STAT_GROW(_fl_)
+#define RF_FREELIST_STAT_REPORT(_fl_)
+
+#endif /* RF_FREELIST_STATS > 0 */
+
+/*
+ * A counted freelist: a singly-linked arena of equal-sized objects
+ * protected by its own mutex.  At most max_free_cnt objects are kept
+ * cached; frees beyond that go back to the system allocator (see
+ * RF_FREELIST_FREE below).
+ */
+struct RF_FreeList_s {
+	void   *objlist;	/* list of free obj */
+	int     free_cnt;	/* how many free obj */
+	int     max_free_cnt;	/* max free arena size */
+	int     obj_inc;	/* how many to allocate at a time */
+	int     obj_size;	/* size of objects */
+	RF_DECLARE_MUTEX(lock)
+#if RF_FREELIST_STATS > 0
+	RF_FreeListStats_t stats;	/* statistics */
+#endif				/* RF_FREELIST_STATS > 0 */
+};
+/*
+ * fl = freelist
+ * maxcnt = max number of items in arena
+ * inc = how many to allocate at a time
+ * size = size of object
+ */
/*
 * Allocate and initialize a freelist header.  On mutex-init failure the
 * header is freed and _fl_ is set to NULL; callers must check for that.
 *
 * Fix: RF_FREELIST_STAT_INIT used to run unconditionally, so with
 * RF_FREELIST_STATS > 0 it would dereference _fl_ right after the
 * failure path NULLed it.  It now runs only on success.
 */
#define RF_FREELIST_CREATE(_fl_,_maxcnt_,_inc_,_size_) { \
	int rc; \
	RF_ASSERT((_inc_) > 0); \
	RF_Malloc(_fl_, sizeof(RF_FreeList_t), (RF_FreeList_t *)); \
	(_fl_)->objlist = NULL; \
	(_fl_)->free_cnt = 0; \
	(_fl_)->max_free_cnt = _maxcnt_; \
	(_fl_)->obj_inc = _inc_; \
	(_fl_)->obj_size = _size_; \
	rc = rf_mutex_init(&(_fl_)->lock, "RF_FREELIST"); \
	if (rc) { \
		RF_Free(_fl_, sizeof(RF_FreeList_t)); \
		_fl_ = NULL; \
	} \
	else { \
		RF_FREELIST_STAT_INIT(_fl_); \
	} \
}
+
+/*
+ * fl = freelist
+ * cnt = number to prime with
+ * nextp = name of "next" pointer in obj
+ * cast = object cast
+ */
+#define RF_FREELIST_PRIME(_fl_,_cnt_,_nextp_,_cast_) { \
+ void *_p; \
+ int _i; \
+ for(_i=0;_i<(_cnt_);_i++) { \
+ RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \
+ if (_p) { \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ (_cast_(_p))->_nextp_ = (_fl_)->objlist; \
+ (_fl_)->objlist = _p; \
+ (_fl_)->free_cnt++; \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+ } \
+ else { \
+ break; \
+ } \
+ } \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ RF_FREELIST_STAT_FREE_UPDATE(_fl_); \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+}
+
+#define RF_FREELIST_MUTEX_OF(_fl_) ((_fl_)->lock)
+
+#define RF_FREELIST_DO_UNLOCK(_fl_) { \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+}
+
+#define RF_FREELIST_DO_LOCK(_fl_) { \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+}
+
+/*
+ * fl = freelist
+ * cnt = number to prime with
+ * nextp = name of "next" pointer in obj
+ * cast = object cast
+ * init = func to call to init obj
+ */
+#define RF_FREELIST_PRIME_INIT(_fl_,_cnt_,_nextp_,_cast_,_init_) { \
+ void *_p; \
+ int _i; \
+ for(_i=0;_i<(_cnt_);_i++) { \
+ RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \
+ if (_init_ (_cast_ _p)) { \
+ RF_Free(_p,(_fl_)->obj_size); \
+ _p = NULL; \
+ } \
+ if (_p) { \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ (_cast_(_p))->_nextp_ = (_fl_)->objlist; \
+ (_fl_)->objlist = _p; \
+ (_fl_)->free_cnt++; \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+ } \
+ else { \
+ break; \
+ } \
+ } \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ RF_FREELIST_STAT_FREE_UPDATE(_fl_); \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+}
+
+/*
+ * fl = freelist
+ * cnt = number to prime with
+ * nextp = name of "next" pointer in obj
+ * cast = object cast
+ * init = func to call to init obj
+ * arg = arg to init obj func
+ */
+#define RF_FREELIST_PRIME_INIT_ARG(_fl_,_cnt_,_nextp_,_cast_,_init_,_arg_) { \
+ void *_p; \
+ int _i; \
+ for(_i=0;_i<(_cnt_);_i++) { \
+ RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \
+ if (_init_ (_cast_ _p,_arg_)) { \
+ RF_Free(_p,(_fl_)->obj_size); \
+ _p = NULL; \
+ } \
+ if (_p) { \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ (_cast_(_p))->_nextp_ = (_fl_)->objlist; \
+ (_fl_)->objlist = _p; \
+ (_fl_)->free_cnt++; \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+ } \
+ else { \
+ break; \
+ } \
+ } \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ RF_FREELIST_STAT_FREE_UPDATE(_fl_); \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+}
+
+/*
+ * fl = freelist
+ * obj = object to allocate
+ * nextp = name of "next" pointer in obj
+ * cast = cast of obj assignment
+ * init = init obj func
+ */
+#define RF_FREELIST_GET_INIT(_fl_,_obj_,_nextp_,_cast_,_init_) { \
+ void *_p; \
+ int _i; \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ RF_ASSERT(sizeof(*(_obj_))==((_fl_)->obj_size)); \
+ if (_fl_->objlist) { \
+ _obj_ = _cast_((_fl_)->objlist); \
+ (_fl_)->objlist = (void *)((_obj_)->_nextp_); \
+ (_fl_)->free_cnt--; \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+ } \
+ else { \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+ /* \
+ * Allocate one at a time so we can free \
+ * one at a time without cleverness when arena \
+ * is full. \
+ */ \
+ RF_Calloc(_obj_,1,(_fl_)->obj_size,_cast_); \
+ if (_obj_) { \
+ if (_init_ (_obj_)) { \
+ RF_Free(_obj_,(_fl_)->obj_size); \
+ _obj_ = NULL; \
+ } \
+ else { \
+ for(_i=1;_i<(_fl_)->obj_inc;_i++) { \
+ RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \
+ if (_p) { \
+ if (_init_ (_p)) { \
+ RF_Free(_p,(_fl_)->obj_size); \
+ _p = NULL; \
+ break; \
+ } \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ (_cast_(_p))->_nextp_ = (_fl_)->objlist; \
+ (_fl_)->objlist = _p; \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+ } \
+ else { \
+ break; \
+ } \
+ } \
+ } \
+ } \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ RF_FREELIST_STAT_GROW(_fl_); \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+ } \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ RF_FREELIST_STAT_ALLOC(_fl_); \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+}
+
+/*
+ * fl = freelist
+ * obj = object to allocate
+ * nextp = name of "next" pointer in obj
+ * cast = cast of obj assignment
+ * init = init obj func
+ * arg = arg to init obj func
+ */
+#define RF_FREELIST_GET_INIT_ARG(_fl_,_obj_,_nextp_,_cast_,_init_,_arg_) { \
+ void *_p; \
+ int _i; \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ RF_ASSERT(sizeof(*(_obj_))==((_fl_)->obj_size)); \
+ if (_fl_->objlist) { \
+ _obj_ = _cast_((_fl_)->objlist); \
+ (_fl_)->objlist = (void *)((_obj_)->_nextp_); \
+ (_fl_)->free_cnt--; \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+ } \
+ else { \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+ /* \
+ * Allocate one at a time so we can free \
+ * one at a time without cleverness when arena \
+ * is full. \
+ */ \
+ RF_Calloc(_obj_,1,(_fl_)->obj_size,_cast_); \
+ if (_obj_) { \
+ if (_init_ (_obj_,_arg_)) { \
+ RF_Free(_obj_,(_fl_)->obj_size); \
+ _obj_ = NULL; \
+ } \
+ else { \
+ for(_i=1;_i<(_fl_)->obj_inc;_i++) { \
+ RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \
+ if (_p) { \
+ if (_init_ (_p,_arg_)) { \
+ RF_Free(_p,(_fl_)->obj_size); \
+ _p = NULL; \
+ break; \
+ } \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ (_cast_(_p))->_nextp_ = (_fl_)->objlist; \
+ (_fl_)->objlist = _p; \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+ } \
+ else { \
+ break; \
+ } \
+ } \
+ } \
+ } \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ RF_FREELIST_STAT_GROW(_fl_); \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+ } \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ RF_FREELIST_STAT_ALLOC(_fl_); \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+}
+
+/*
+ * fl = freelist
+ * obj = object to allocate
+ * nextp = name of "next" pointer in obj
+ * cast = cast of obj assignment
+ * init = init obj func
+ */
+#define RF_FREELIST_GET_INIT_NOUNLOCK(_fl_,_obj_,_nextp_,_cast_,_init_) { \
+ void *_p; \
+ int _i; \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ RF_ASSERT(sizeof(*(_obj_))==((_fl_)->obj_size)); \
+ if (_fl_->objlist) { \
+ _obj_ = _cast_((_fl_)->objlist); \
+ (_fl_)->objlist = (void *)((_obj_)->_nextp_); \
+ (_fl_)->free_cnt--; \
+ } \
+ else { \
+ /* \
+ * Allocate one at a time so we can free \
+ * one at a time without cleverness when arena \
+ * is full. \
+ */ \
+ RF_Calloc(_obj_,1,(_fl_)->obj_size,_cast_); \
+ if (_obj_) { \
+ if (_init_ (_obj_)) { \
+ RF_Free(_obj_,(_fl_)->obj_size); \
+ _obj_ = NULL; \
+ } \
+ else { \
+ for(_i=1;_i<(_fl_)->obj_inc;_i++) { \
+ RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \
+ if (_p) { \
+ if (_init_ (_p)) { \
+ RF_Free(_p,(_fl_)->obj_size); \
+ _p = NULL; \
+ break; \
+ } \
+ (_cast_(_p))->_nextp_ = (_fl_)->objlist; \
+ (_fl_)->objlist = _p; \
+ } \
+ else { \
+ break; \
+ } \
+ } \
+ } \
+ } \
+ RF_FREELIST_STAT_GROW(_fl_); \
+ } \
+ RF_FREELIST_STAT_ALLOC(_fl_); \
+}
+
+/*
+ * fl = freelist
+ * obj = object to allocate
+ * nextp = name of "next" pointer in obj
+ * cast = cast of obj assignment
+ */
+#define RF_FREELIST_GET(_fl_,_obj_,_nextp_,_cast_) { \
+ void *_p; \
+ int _i; \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ RF_ASSERT(sizeof(*(_obj_))==((_fl_)->obj_size)); \
+ if (_fl_->objlist) { \
+ _obj_ = _cast_((_fl_)->objlist); \
+ (_fl_)->objlist = (void *)((_obj_)->_nextp_); \
+ (_fl_)->free_cnt--; \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+ } \
+ else { \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+ /* \
+ * Allocate one at a time so we can free \
+ * one at a time without cleverness when arena \
+ * is full. \
+ */ \
+ RF_Calloc(_obj_,1,(_fl_)->obj_size,_cast_); \
+ if (_obj_) { \
+ for(_i=1;_i<(_fl_)->obj_inc;_i++) { \
+ RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \
+ if (_p) { \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ (_cast_(_p))->_nextp_ = (_fl_)->objlist; \
+ (_fl_)->objlist = _p; \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+ } \
+ else { \
+ break; \
+ } \
+ } \
+ } \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ RF_FREELIST_STAT_GROW(_fl_); \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+ } \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ RF_FREELIST_STAT_ALLOC(_fl_); \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+}
+
+/*
+ * fl = freelist
+ * obj = object to allocate
+ * nextp = name of "next" pointer in obj
+ * cast = cast of obj assignment
+ * num = num objs to return
+ */
+#define RF_FREELIST_GET_N(_fl_,_obj_,_nextp_,_cast_,_num_) { \
+ void *_p, *_l, *_f; \
+ int _i, _n; \
+ _l = _f = NULL; \
+ _n = 0; \
+ RF_ASSERT(sizeof(*(_obj_))==((_fl_)->obj_size)); \
+ for(_n=0;_n<_num_;_n++) { \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ if (_fl_->objlist) { \
+ _obj_ = _cast_((_fl_)->objlist); \
+ (_fl_)->objlist = (void *)((_obj_)->_nextp_); \
+ (_fl_)->free_cnt--; \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+ } \
+ else { \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+ /* \
+ * Allocate one at a time so we can free \
+ * one at a time without cleverness when arena \
+ * is full. \
+ */ \
+ RF_Calloc(_obj_,1,(_fl_)->obj_size,_cast_); \
+ if (_obj_) { \
+ for(_i=1;_i<(_fl_)->obj_inc;_i++) { \
+ RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \
+ if (_p) { \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ (_cast_(_p))->_nextp_ = (_fl_)->objlist; \
+ (_fl_)->objlist = _p; \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+ } \
+ else { \
+ break; \
+ } \
+ } \
+ } \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ RF_FREELIST_STAT_GROW(_fl_); \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+ } \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ if (_f == NULL) \
+ _f = _obj_; \
+ if (_obj_) { \
+ (_cast_(_obj_))->_nextp_ = _l; \
+ _l = _obj_; \
+ RF_FREELIST_STAT_ALLOC(_fl_); \
+ } \
+ else { \
+ (_cast_(_f))->_nextp_ = (_fl_)->objlist; \
+ (_fl_)->objlist = _l; \
+ _n = _num_; \
+ } \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+ } \
+}
+
+/*
+ * fl = freelist
+ * obj = object to free
+ * nextp = name of "next" pointer in obj
+ */
+/* Return _obj_ to the arena, or really free it if the arena already
+ * holds max_free_cnt cached objects.  _nextp_ is the object's link
+ * field.  Runs entirely under the freelist mutex. */
+#define RF_FREELIST_FREE(_fl_,_obj_,_nextp_) { \
+	RF_LOCK_MUTEX((_fl_)->lock); \
+	if ((_fl_)->free_cnt == (_fl_)->max_free_cnt) { \
+		RF_Free(_obj_,(_fl_)->obj_size); \
+	} \
+	else { \
+		RF_ASSERT((_fl_)->free_cnt < (_fl_)->max_free_cnt); \
+		(_obj_)->_nextp_ = (_fl_)->objlist; \
+		(_fl_)->objlist = (void *)(_obj_); \
+		(_fl_)->free_cnt++; \
+	} \
+	RF_FREELIST_STAT_FREE(_fl_); \
+	RF_UNLOCK_MUTEX((_fl_)->lock); \
+}
+
+/*
+ * fl = freelist
+ * obj = object to free
+ * nextp = name of "next" pointer in obj
+ * num = num to free (debugging)
+ */
+#define RF_FREELIST_FREE_N(_fl_,_obj_,_nextp_,_cast_,_num_) { \
+ void *_no; \
+ int _n; \
+ _n = 0; \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ while(_obj_) { \
+ _no = (_cast_(_obj_))->_nextp_; \
+ if ((_fl_)->free_cnt == (_fl_)->max_free_cnt) { \
+ RF_Free(_obj_,(_fl_)->obj_size); \
+ } \
+ else { \
+ RF_ASSERT((_fl_)->free_cnt < (_fl_)->max_free_cnt); \
+ (_obj_)->_nextp_ = (_fl_)->objlist; \
+ (_fl_)->objlist = (void *)(_obj_); \
+ (_fl_)->free_cnt++; \
+ } \
+ _n++; \
+ _obj_ = _no; \
+ RF_FREELIST_STAT_FREE(_fl_); \
+ } \
+ RF_ASSERT(_n==(_num_)); \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+}
+
+/*
+ * fl = freelist
+ * obj = object to free
+ * nextp = name of "next" pointer in obj
+ * clean = undo for init
+ */
+#define RF_FREELIST_FREE_CLEAN(_fl_,_obj_,_nextp_,_clean_) { \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ if ((_fl_)->free_cnt == (_fl_)->max_free_cnt) { \
+ _clean_ (_obj_); \
+ RF_Free(_obj_,(_fl_)->obj_size); \
+ } \
+ else { \
+ RF_ASSERT((_fl_)->free_cnt < (_fl_)->max_free_cnt); \
+ (_obj_)->_nextp_ = (_fl_)->objlist; \
+ (_fl_)->objlist = (void *)(_obj_); \
+ (_fl_)->free_cnt++; \
+ } \
+ RF_FREELIST_STAT_FREE(_fl_); \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+}
+
+/*
+ * fl = freelist
+ * obj = object to free
+ * nextp = name of "next" pointer in obj
+ * clean = undo for init
+ * arg = arg for undo func
+ */
+#define RF_FREELIST_FREE_CLEAN_ARG(_fl_,_obj_,_nextp_,_clean_,_arg_) { \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ if ((_fl_)->free_cnt == (_fl_)->max_free_cnt) { \
+ _clean_ (_obj_,_arg_); \
+ RF_Free(_obj_,(_fl_)->obj_size); \
+ } \
+ else { \
+ RF_ASSERT((_fl_)->free_cnt < (_fl_)->max_free_cnt); \
+ (_obj_)->_nextp_ = (_fl_)->objlist; \
+ (_fl_)->objlist = (void *)(_obj_); \
+ (_fl_)->free_cnt++; \
+ } \
+ RF_FREELIST_STAT_FREE(_fl_); \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+}
+
+/*
+ * fl = freelist
+ * obj = object to free
+ * nextp = name of "next" pointer in obj
+ * clean = undo for init
+ */
+#define RF_FREELIST_FREE_CLEAN_NOUNLOCK(_fl_,_obj_,_nextp_,_clean_) { \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ if ((_fl_)->free_cnt == (_fl_)->max_free_cnt) { \
+ _clean_ (_obj_); \
+ RF_Free(_obj_,(_fl_)->obj_size); \
+ } \
+ else { \
+ RF_ASSERT((_fl_)->free_cnt < (_fl_)->max_free_cnt); \
+ (_obj_)->_nextp_ = (_fl_)->objlist; \
+ (_fl_)->objlist = (void *)(_obj_); \
+ (_fl_)->free_cnt++; \
+ } \
+ RF_FREELIST_STAT_FREE(_fl_); \
+}
+
+/*
+ * fl = freelist
+ * nextp = name of "next" pointer in obj
+ * cast = cast to object type
+ */
+/* Tear down a freelist: free every cached object and then the header.
+ * NOTE(review): the mutex is destroyed before the object list is
+ * walked -- assumes no other thread can touch the list by this point;
+ * confirm at the call sites. */
+#define RF_FREELIST_DESTROY(_fl_,_nextp_,_cast_) { \
+	void *_cur, *_next; \
+	RF_FREELIST_STAT_REPORT(_fl_); \
+	rf_mutex_destroy(&((_fl_)->lock)); \
+	for(_cur=(_fl_)->objlist;_cur;_cur=_next) { \
+		_next = (_cast_ _cur)->_nextp_; \
+		RF_Free(_cur,(_fl_)->obj_size); \
+	} \
+	RF_Free(_fl_,sizeof(RF_FreeList_t)); \
+}
+
+/*
+ * fl = freelist
+ * nextp = name of "next" pointer in obj
+ * cast = cast to object type
+ * clean = func to undo obj init
+ */
+#define RF_FREELIST_DESTROY_CLEAN(_fl_,_nextp_,_cast_,_clean_) { \
+ void *_cur, *_next; \
+ RF_FREELIST_STAT_REPORT(_fl_); \
+ rf_mutex_destroy(&((_fl_)->lock)); \
+ for(_cur=(_fl_)->objlist;_cur;_cur=_next) { \
+ _next = (_cast_ _cur)->_nextp_; \
+ _clean_ (_cur); \
+ RF_Free(_cur,(_fl_)->obj_size); \
+ } \
+ RF_Free(_fl_,sizeof(RF_FreeList_t)); \
+}
+
+/*
+ * fl = freelist
+ * nextp = name of "next" pointer in obj
+ * cast = cast to object type
+ * clean = func to undo obj init
+ * arg = arg for undo func
+ */
+#define RF_FREELIST_DESTROY_CLEAN_ARG(_fl_,_nextp_,_cast_,_clean_,_arg_) { \
+ void *_cur, *_next; \
+ RF_FREELIST_STAT_REPORT(_fl_); \
+ rf_mutex_destroy(&((_fl_)->lock)); \
+ for(_cur=(_fl_)->objlist;_cur;_cur=_next) { \
+ _next = (_cast_ _cur)->_nextp_; \
+ _clean_ (_cur,_arg_); \
+ RF_Free(_cur,(_fl_)->obj_size); \
+ } \
+ RF_Free(_fl_,sizeof(RF_FreeList_t)); \
+}
+
+#endif /* !_RF__RF_FREELIST_H_ */
diff --git a/sys/dev/raidframe/rf_general.h b/sys/dev/raidframe/rf_general.h
new file mode 100644
index 0000000..e709899
--- /dev/null
+++ b/sys/dev/raidframe/rf_general.h
@@ -0,0 +1,107 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_general.h,v 1.6 2000/12/15 02:12:58 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * rf_general.h -- some general-use definitions
+ */
+
+/*#define NOASSERT*/
+
+#ifndef _RF__RF_GENERAL_H_
+#define _RF__RF_GENERAL_H_
+
+/* error reporting and handling */
+
+#ifdef _KERNEL
+#include<sys/systm.h> /* printf, sprintf, and friends */
+#endif
+
+/* Thin printf wrappers for error reporting with 0-3 arguments. */
+#define RF_ERRORMSG(s)            printf((s))
+#define RF_ERRORMSG1(s,a)         printf((s),(a))
+#define RF_ERRORMSG2(s,a,b)       printf((s),(a),(b))
+#define RF_ERRORMSG3(s,a,b,c)     printf((s),(a),(b),(c))
+
+void rf_print_panic_message(int, char *);
+void rf_print_assert_panic_message(int, char *, char *);
+
+/* Shared buffer the panic-message helpers format into. */
+extern char rf_panicbuf[];
+/* NOTE(review): rf_panicbuf is handed to panic() as its format string;
+ * if it ever contains '%' (e.g. via __FILE__) the output is mangled --
+ * consider panic("%s", rf_panicbuf). */
+#define RF_PANIC() {rf_print_panic_message(__LINE__,__FILE__); panic(rf_panicbuf);}
+
+#ifdef _KERNEL
+#ifdef RF_ASSERT
+#undef RF_ASSERT
+#endif /* RF_ASSERT */
+#ifndef NOASSERT
+/* Kernel assert: panics with file/line and the failed expression.
+ * Compiles to a no-op when NOASSERT is defined or outside the kernel. */
+#define RF_ASSERT(_x_) { \
+	if (!(_x_)) { \
+		rf_print_assert_panic_message(__LINE__, __FILE__, #_x_); \
+		panic(rf_panicbuf); \
+	} \
+}
+#else /* !NOASSERT */
+#define RF_ASSERT(x) {/*noop*/}
+#endif /* !NOASSERT */
+#else /* _KERNEL */
+#define RF_ASSERT(x) {/*noop*/}
+#endif /* _KERNEL */
+
+/* random stuff */
+#define RF_MAX(a,b) (((a) > (b)) ? (a) : (b))
+#define RF_MIN(a,b) (((a) < (b)) ? (a) : (b))
+
+/* divide-by-zero check */
+#define RF_DB0_CHECK(a,b) ( ((b)==0) ? 0 : (a)/(b) )
+
+/* get time of day */
+#define RF_GETTIME(_t) microtime(&(_t))
+
+/*
+ * zero memory- not all bzero calls go through here, only
+ * those which in the kernel may have a user address
+ */
+
+#define RF_BZERO(_bp,_b,_l) bzero(_b,_l) /* XXX This is likely
+ * incorrect. GO */
+
+#if defined(__FreeBSD__)
+#define NBPG PAGE_SIZE
+#endif
+
+#define RF_UL(x) ((unsigned long) (x))
+#define RF_PGMASK RF_UL(NBPG-1)
+#define RF_BLIP(x) (NBPG - (RF_UL(x) & RF_PGMASK)) /* bytes left in page */
+#define RF_PAGE_ALIGNED(x) ((RF_UL(x) & RF_PGMASK) == 0)
+
+#ifdef __STDC__
+#define RF_STRING(_str_) #_str_
+#else /* __STDC__ */
+#define RF_STRING(_str_) "_str_"
+#endif /* __STDC__ */
+
+#endif /* !_RF__RF_GENERAL_H_ */
diff --git a/sys/dev/raidframe/rf_geniq.c b/sys/dev/raidframe/rf_geniq.c
new file mode 100644
index 0000000..c839059
--- /dev/null
+++ b/sys/dev/raidframe/rf_geniq.c
@@ -0,0 +1,163 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_geniq.c,v 1.3 1999/02/05 00:06:12 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Daniel Stodolsky
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_geniq.c
+ * code which implements Reed-Solomon encoding for RAID level 6
+ */
+
+
+#define RF_UTILITY 1
+#include <dev/raidframe/rf_pqdeg.h>
+
+/*
+ * Advance a five-bit linear feedback shift register (LFSR) one step.
+ *
+ * val  - current register contents (only bits 0-4 are meaningful)
+ * poly - feedback connection mask; bit i set means stage i XORs in
+ *        the bit shifted out of the top
+ *
+ * Returns the new register value.
+ */
+int
+lsfr_shift(val, poly)
+	unsigned val, poly;
+{
+	unsigned new;
+	unsigned int i;
+	unsigned high = (val >> 4) & 1;	/* bit shifted out of the top */
+	unsigned bit;
+
+	new = (poly & 1) ? high : 0;	/* stage 0 gets the feedback bit if tapped */
+
+	for (i = 1; i <= 4; i++) {
+		bit = (val >> (i - 1)) & 1;
+		if (poly & (1 << i))	/* there is a feedback connection */
+			new = new | ((bit ^ high) << i);
+		else
+			new = new | (bit << i);
+	}
+	return new;
+}
+/* generate Q matricies for the data */
+
+RF_ua32_t rf_qfor[32];
+
+/*
+ * Host-side generator tool: writes rf_invertq.h (the Reed-Solomon Q
+ * encode/invert tables for RAID level 6) to stdout.  This program is
+ * run at build time and is never compiled into the kernel.
+ *
+ * NOTE(review): "void main()" is non-standard C; the process status
+ * comes from the exit(0) at the bottom.
+ *
+ * NOTE(review): the two #endif comment labels emitted near the end
+ * appear swapped (the first closes the KERNEL/NO_PQ conditional, the
+ * second closes the RF_INCLUDE_PQ/RAID6 one) -- the nesting count is
+ * correct, only the labels mislead.  Confirm before relying on them.
+ */
+void
+main()
+{
+	unsigned int i, j, l, a, b;
+	unsigned int val;
+	unsigned int r;			/* unused */
+	unsigned int m, p, q;		/* m is unused */
+
+	RF_ua32_t k;			/* k[j]: LFSR sequence, becomes rf_rn[] */
+
+	/* Emit the file banner and include guard. */
+	printf("/*\n");
+	printf(" * rf_invertq.h\n");
+	printf(" */\n");
+	printf("/*\n");
+	printf(" * GENERATED FILE -- DO NOT EDIT\n");
+	printf(" */\n");
+	printf("\n");
+	printf("#ifndef _RF__RF_INVERTQ_H_\n");
+	printf("#define _RF__RF_INVERTQ_H_\n");
+	printf("\n");
+	printf("/*\n");
+	printf(" * rf_geniq.c must include rf_archs.h before including\n");
+	printf(" * this file (to get VPATH magic right with the way we\n");
+	printf(" * generate this file in kernel trees)\n");
+	printf(" */\n");
+	printf("/* #include \"rf_archs.h\" */\n");
+	printf("\n");
+	printf("#if (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_RAID6 > 0)\n");
+	printf("\n");
+	printf("#define RF_Q_COLS 32\n");
+	printf("RF_ua32_t rf_rn = {\n");
+	/* rf_rn[]: successive states of the 5-bit LFSR with poly 5. */
+	k[0] = 1;
+	for (j = 0; j < 31; j++)
+		k[j + 1] = lsfr_shift(k[j], 5);
+	for (j = 0; j < 32; j++)
+		printf("%d, ", k[j]);
+	printf("};\n");
+
+	/* rf_qfor[i][j]: j advanced i LFSR steps (row 0 is identity). */
+	printf("RF_ua32_t rf_qfor[32] = {\n");
+	for (i = 0; i < 32; i++) {
+		printf("/* i = %d */ { 0, ", i);
+		rf_qfor[i][0] = 0;
+		for (j = 1; j < 32; j++) {
+			val = j;
+			for (l = 0; l < i; l++)
+				val = lsfr_shift(val, 5);
+			rf_qfor[i][j] = val;
+			printf("%d, ", val);
+		}
+		printf("},\n");
+	}
+	printf("};\n");
+	printf("#define RF_Q_DATA_COL(col_num) rf_rn[col_num],rf_qfor[28-(col_num)]\n");
+
+	/* generate the inverse tables. (i,j,p,q) */
+	/* The table just stores a. Get b back from the parity */
+	printf("#ifdef KERNEL\n");
+	printf("RF_ua1024_t rf_qinv[1]; /* don't compile monster table into kernel */\n");
+	printf("#elif defined(NO_PQ)\n");
+	printf("RF_ua1024_t rf_qinv[29*29];\n");
+	printf("#else /* !KERNEL && NO_PQ */\n");
+	printf("RF_ua1024_t rf_qinv[29*29] = {\n");
+	for (i = 0; i < 29; i++) {
+		for (j = 0; j < 29; j++) {
+			printf("/* i %d, j %d */{ ", i, j);
+			if (i == j)
+				/* Diagonal entries are never used; pad with zeros. */
+				for (l = 0; l < 1023; l++)
+					printf("0, ");
+			else {
+				for (p = 0; p < 32; p++)
+					for (q = 0; q < 32; q++) {
+						/* What are a, b such that a ^
+						 * b = p; and qfor[(28-i)][a
+						 * ^ rf_rn[i+1]] ^
+						 * qfor[(28-j)][b ^
+						 * rf_rn[j+1]] = q. Solve by
+						 * guessing a. Then testing. */
+						for (a = 0; a < 32; a++) {
+							b = a ^ p;
+							if ((rf_qfor[28 - i][a ^ k[i + 1]] ^ rf_qfor[28 - j][b ^ k[j + 1]]) == q)
+								break;
+						}
+						/* a == 32 means the exhaustive search failed;
+						 * the diagnostic lands in the generated file. */
+						if (a == 32)
+							printf("unable to solve %d %d %d %d\n", i, j, p, q);
+						printf("%d,", a);
+					}
+			}
+			printf("},\n");
+		}
+	}
+	printf("};\n");
+	printf("\n#endif /* (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */\n\n");
+	printf("#endif /* !KERNEL && NO_PQ */\n");
+	printf("#endif /* !_RF__RF_INVERTQ_H_ */\n");
+	exit(0);
+}
diff --git a/sys/dev/raidframe/rf_hist.h b/sys/dev/raidframe/rf_hist.h
new file mode 100644
index 0000000..b8b12c3
--- /dev/null
+++ b/sys/dev/raidframe/rf_hist.h
@@ -0,0 +1,57 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_hist.h,v 1.3 1999/02/05 00:06:12 oster Exp $ */
+/*
+ * rf_hist.h
+ *
+ * Histogram operations for RAIDframe stats
+ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#ifndef _RF__RF_HIST_H_
+#define _RF__RF_HIST_H_
+
+#include <dev/raidframe/rf_types.h>
+
+#define RF_HIST_RESOLUTION 5
+#define RF_HIST_MIN_VAL 0
+#define RF_HIST_MAX_VAL 1000
+#define RF_HIST_RANGE (RF_HIST_MAX_VAL - RF_HIST_MIN_VAL)
+#define RF_HIST_NUM_BUCKETS (RF_HIST_RANGE / RF_HIST_RESOLUTION + 1)
+
+typedef RF_uint32 RF_Hist_t;
+
+#define RF_HIST_ADD(_hist_,_val_) { \
+ RF_Hist_t val; \
+ val = ((RF_Hist_t)(_val_)) / 1000; \
+ if (val >= RF_HIST_MAX_VAL) \
+ _hist_[RF_HIST_NUM_BUCKETS-1]++; \
+ else \
+ _hist_[(val - RF_HIST_MIN_VAL) / RF_HIST_RESOLUTION]++; \
+}
+
+#endif /* !_RF__RF_HIST_H_ */
diff --git a/sys/dev/raidframe/rf_interdecluster.c b/sys/dev/raidframe/rf_interdecluster.c
new file mode 100644
index 0000000..c8bbff5
--- /dev/null
+++ b/sys/dev/raidframe/rf_interdecluster.c
@@ -0,0 +1,283 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_interdecluster.c,v 1.5 2001/01/26 05:09:13 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Khalil Amiri
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/************************************************************
+ *
+ * rf_interdecluster.c -- implements interleaved declustering
+ *
+ ************************************************************/
+
+#include <dev/raidframe/rf_archs.h>
+
+#if RF_INCLUDE_INTERDECLUSTER > 0
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_interdecluster.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_dagutils.h>
+#include <dev/raidframe/rf_dagfuncs.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_utils.h>
+#include <dev/raidframe/rf_dagffrd.h>
+#include <dev/raidframe/rf_dagdegrd.h>
+#include <dev/raidframe/rf_dagffwr.h>
+#include <dev/raidframe/rf_dagdegwr.h>
+
+typedef struct RF_InterdeclusterConfigInfo_s {
+ RF_RowCol_t **stripeIdentifier; /* filled in at config time and used
+ * by IdentifyStripe */
+ RF_StripeCount_t numSparingRegions;
+ RF_StripeCount_t stripeUnitsPerSparingRegion;
+ RF_SectorNum_t mirrorStripeOffset;
+} RF_InterdeclusterConfigInfo_t;
+
+/*
+ * Configure an Interleaved Declustering (RAID 1-style mirrored) layout.
+ * Allocates the layout-specific info structure, builds the per-stripe
+ * disk-identifier table, and derives the remaining RF_RaidLayout_t
+ * parameters from the array geometry.
+ *
+ * listp  - shutdown list (not referenced here; cleanup goes through
+ *          raidPtr->cleanupList)
+ * raidPtr - the array being configured
+ * cfgPtr - user configuration (not referenced directly here)
+ *
+ * Returns 0 on success, ENOMEM if an allocation fails.
+ */
+int
+rf_ConfigureInterDecluster(
+	RF_ShutdownList_t ** listp,
+	RF_Raid_t * raidPtr,
+	RF_Config_t * cfgPtr)
+{
+	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+	RF_StripeCount_t num_used_stripeUnitsPerDisk;
+	RF_InterdeclusterConfigInfo_t *info;
+	RF_RowCol_t i, tmp, SUs_per_region;
+
+	/* create an Interleaved Declustering configuration structure */
+	RF_MallocAndAdd(info, sizeof(RF_InterdeclusterConfigInfo_t), (RF_InterdeclusterConfigInfo_t *),
+	    raidPtr->cleanupList);
+	if (info == NULL)
+		return (ENOMEM);
+	layoutPtr->layoutSpecificInfo = (void *) info;
+
+	/* fill in the config structure. */
+	SUs_per_region = raidPtr->numCol * (raidPtr->numCol - 1);
+	info->stripeIdentifier = rf_make_2d_array(SUs_per_region, 2, raidPtr->cleanupList);
+	if (info->stripeIdentifier == NULL)
+		return (ENOMEM);
+	for (i = 0; i < SUs_per_region; i++) {
+		/* [0] = disk holding the primary copy, [1] = its mirror. */
+		info->stripeIdentifier[i][0] = i / (raidPtr->numCol - 1);
+		/* NOTE(review): divisor here is numCol, but the line above
+		 * uses numCol - 1 -- confirm this asymmetry is intended. */
+		tmp = i / raidPtr->numCol;
+		info->stripeIdentifier[i][1] = (i + 1 + tmp) % raidPtr->numCol;
+	}
+
+	/* no spare tables */
+	RF_ASSERT(raidPtr->numRow == 1);
+
+	/* fill in the remaining layout parameters */
+
+	/* total number of stripes should a multiple of 2*numCol: Each sparing
+	 * region consists of 2*numCol stripes: n-1 primary copy, n-1
+	 * secondary copy and 2 for spare .. */
+	num_used_stripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk - (layoutPtr->stripeUnitsPerDisk %
+	    (2 * raidPtr->numCol));
+	info->numSparingRegions = num_used_stripeUnitsPerDisk / (2 * raidPtr->numCol);
+	/* this is in fact the number of stripe units (that are primary data
+	 * copies) in the sparing region */
+	info->stripeUnitsPerSparingRegion = raidPtr->numCol * (raidPtr->numCol - 1);
+	info->mirrorStripeOffset = info->numSparingRegions * (raidPtr->numCol + 1);
+	layoutPtr->numStripe = info->numSparingRegions * info->stripeUnitsPerSparingRegion;
+	layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
+	layoutPtr->numDataCol = 1;
+	layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
+	layoutPtr->numParityCol = 1;
+
+	layoutPtr->dataStripeUnitsPerDisk = num_used_stripeUnitsPerDisk;
+
+	raidPtr->sectorsPerDisk =
+	    num_used_stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit;
+
+	raidPtr->totalSectors =
+	    (layoutPtr->numStripe) * layoutPtr->sectorsPerStripeUnit;
+
+	layoutPtr->stripeUnitsPerDisk = raidPtr->sectorsPerDisk / layoutPtr->sectorsPerStripeUnit;
+
+	return (0);
+}
+
+/*
+ * Default number of floating reconstruction buffers for this layout.
+ * Interleaved declustering always requests a fixed pool of 30,
+ * independent of the array geometry.
+ */
+int
+rf_GetDefaultNumFloatingReconBuffersInterDecluster(RF_Raid_t * raidPtr)
+{
+	return 30;
+}
+
+/*
+ * Default head-separation limit for reconstruction: a full disk's
+ * worth of sectors.
+ */
+RF_HeadSepLimit_t
+rf_GetDefaultHeadSepLimitInterDecluster(RF_Raid_t * raidPtr)
+{
+	return (raidPtr->sectorsPerDisk);
+}
+
+/*
+ * Total spare reconstruction units in the array: the layout reserves
+ * two stripe units per disk as spare within each sparing region.
+ */
+RF_ReconUnitCount_t
+rf_GetNumSpareRUsInterDecluster(
+	RF_Raid_t * raidPtr)
+{
+	RF_InterdeclusterConfigInfo_t *info = (RF_InterdeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+
+	return (2 * ((RF_ReconUnitCount_t) info->numSparingRegions));
+}
+/* Maps to the primary copy of the data, i.e. the first mirror pair */
+/*
+ * Translate a logical RAID sector into the physical (row, col,
+ * diskSector) of its primary copy.  When "remap" is set, the access
+ * is redirected into the spare stripe units of its sparing region,
+ * avoiding the column that held the primary copy.
+ */
+void
+rf_MapSectorInterDecluster(
+	RF_Raid_t * raidPtr,
+	RF_RaidAddr_t raidSector,
+	RF_RowCol_t * row,
+	RF_RowCol_t * col,
+	RF_SectorNum_t * diskSector,
+	int remap)
+{
+	RF_InterdeclusterConfigInfo_t *info = (RF_InterdeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+	RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
+	RF_StripeNum_t su_offset_into_disk, mirror_su_offset_into_disk;
+	RF_StripeNum_t sparing_region_id, index_within_region;
+	int col_before_remap;
+
+	*row = 0;		/* single-row layout (see RF_ASSERT in configure) */
+	sparing_region_id = SUID / info->stripeUnitsPerSparingRegion;
+	index_within_region = SUID % info->stripeUnitsPerSparingRegion;
+	su_offset_into_disk = index_within_region % (raidPtr->numCol - 1);
+	mirror_su_offset_into_disk = index_within_region / raidPtr->numCol;
+	col_before_remap = index_within_region / (raidPtr->numCol - 1);
+
+	if (!remap) {
+		/* was "*col = col_before_remap;;" -- stray second
+		 * semicolon (harmless empty statement) removed */
+		*col = col_before_remap;
+		*diskSector = (su_offset_into_disk + ((raidPtr->numCol - 1) * sparing_region_id)) *
+		    raidPtr->Layout.sectorsPerStripeUnit;
+		*diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+	} else {
+		/* remap sector to spare space... */
+		*diskSector = sparing_region_id * (raidPtr->numCol + 1) * raidPtr->Layout.sectorsPerStripeUnit;
+		*diskSector += (raidPtr->numCol - 1) * raidPtr->Layout.sectorsPerStripeUnit;
+		*diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+		*col = (index_within_region + 1 + mirror_su_offset_into_disk) % raidPtr->numCol;
+		*col = (*col + 1) % raidPtr->numCol;
+		if (*col == col_before_remap)
+			*col = (*col + 1) % raidPtr->numCol;
+	}
+}
+/* Maps to the second copy of the mirror pair. */
+/*
+ * Translate a logical RAID sector into the physical (row, col,
+ * diskSector) of its mirror (secondary) copy, which lives in the
+ * mirror half of the sparing region.  When "remap" is set, the access
+ * is redirected into the spare stripe units instead, avoiding the
+ * column that held the mirror copy.
+ */
+void
+rf_MapParityInterDecluster(
+	RF_Raid_t * raidPtr,
+	RF_RaidAddr_t raidSector,
+	RF_RowCol_t * row,
+	RF_RowCol_t * col,
+	RF_SectorNum_t * diskSector,
+	int remap)
+{
+	RF_InterdeclusterConfigInfo_t *info = (RF_InterdeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+	RF_StripeNum_t sparing_region_id, index_within_region, mirror_su_offset_into_disk;
+	RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
+	int col_before_remap;
+
+	sparing_region_id = SUID / info->stripeUnitsPerSparingRegion;
+	index_within_region = SUID % info->stripeUnitsPerSparingRegion;
+	mirror_su_offset_into_disk = index_within_region / raidPtr->numCol;
+	/* column of the mirror copy (matches stripeIdentifier[i][1]) */
+	col_before_remap = (index_within_region + 1 + mirror_su_offset_into_disk) % raidPtr->numCol;
+
+	*row = 0;		/* single-row layout */
+	if (!remap) {
+		*col = col_before_remap;
+		/* mirror copies start at mirrorStripeOffset stripe units in */
+		*diskSector = info->mirrorStripeOffset * raidPtr->Layout.sectorsPerStripeUnit;
+		*diskSector += sparing_region_id * (raidPtr->numCol - 1) * raidPtr->Layout.sectorsPerStripeUnit;
+		*diskSector += mirror_su_offset_into_disk * raidPtr->Layout.sectorsPerStripeUnit;
+		*diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+	} else {
+		/* remap parity to spare space ... */
+		*diskSector = sparing_region_id * (raidPtr->numCol + 1) * raidPtr->Layout.sectorsPerStripeUnit;
+		*diskSector += (raidPtr->numCol) * raidPtr->Layout.sectorsPerStripeUnit;
+		*diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+		*col = index_within_region / (raidPtr->numCol - 1);
+		*col = (*col + 1) % raidPtr->numCol;
+		if (*col == col_before_remap)
+			*col = (*col + 1) % raidPtr->numCol;
+	}
+}
+
+/*
+ * Report which disks hold the stripe containing "addr": hand back the
+ * precomputed two-entry disk list for the stripe unit's position
+ * within its sparing region.
+ */
+void
+rf_IdentifyStripeInterDecluster(
+	RF_Raid_t * raidPtr,
+	RF_RaidAddr_t addr,
+	RF_RowCol_t ** diskids,
+	RF_RowCol_t * outRow)
+{
+	RF_InterdeclusterConfigInfo_t *info;
+	RF_StripeNum_t su_within_region;
+
+	info = (RF_InterdeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+	su_within_region = (addr / raidPtr->Layout.sectorsPerStripeUnit) %
+	    info->stripeUnitsPerSparingRegion;
+
+	*outRow = 0;
+	*diskids = info->stripeIdentifier[su_within_region];
+}
+
+/*
+ * Stripe-ID to parity-stripe-ID mapping.  Interleaved declustering
+ * has one reconstruction unit per parity stripe, so the mapping is
+ * the identity and the reconstruction unit is always 0.
+ */
+void
+rf_MapSIDToPSIDInterDecluster(
+	RF_RaidLayout_t * layoutPtr,
+	RF_StripeNum_t stripeID,
+	RF_StripeNum_t * psID,
+	RF_ReconUnitNum_t * which_ru)
+{
+	*psID = stripeID;
+	*which_ru = 0;
+}
+/******************************************************************************
+ * select a graph to perform a single-stripe access
+ *
+ * Parameters:  raidPtr    - description of the physical array
+ *              type       - type of operation (read or write) requested
+ *              asmap      - logical & physical addresses for this access
+ *              createFunc - name of function to use to create the graph
+ *****************************************************************************/
+
+void
+rf_RAIDIDagSelect(
+	RF_Raid_t * raidPtr,
+	RF_IoType_t type,
+	RF_AccessStripeMap_t * asmap,
+	RF_VoidFuncPtr * createFunc)
+{
+	RF_ASSERT(RF_IO_IS_R_OR_W(type));
+
+	/* more than one failure in the stripe cannot be handled */
+	if (asmap->numDataFailed + asmap->numParityFailed > 1) {
+		RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n");
+		*createFunc = NULL;
+		return;
+	}
+	/* The unconditional pre-assignment that used to sit here was dead
+	 * code: every path below overwrites *createFunc. */
+	if (type == RF_IO_TYPE_READ) {
+		if (asmap->numDataFailed == 0)
+			*createFunc = (RF_VoidFuncPtr) rf_CreateMirrorPartitionReadDAG;
+		else
+			*createFunc = (RF_VoidFuncPtr) rf_CreateRaidOneDegradedReadDAG;
+	} else
+		*createFunc = (RF_VoidFuncPtr) rf_CreateRaidOneWriteDAG;
+}
+#endif /* RF_INCLUDE_INTERDECLUSTER > 0 */
diff --git a/sys/dev/raidframe/rf_interdecluster.h b/sys/dev/raidframe/rf_interdecluster.h
new file mode 100644
index 0000000..9bf3825
--- /dev/null
+++ b/sys/dev/raidframe/rf_interdecluster.h
@@ -0,0 +1,60 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_interdecluster.h,v 1.3 1999/02/05 00:06:12 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Khalil Amiri
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_interdecluster.h
+ * header file for Interleaved Declustering
+ */
+
+#ifndef _RF__RF_INTERDECLUSTER_H_
+#define _RF__RF_INTERDECLUSTER_H_
+
+int
+rf_ConfigureInterDecluster(RF_ShutdownList_t ** listp, RF_Raid_t * raidPtr,
+ RF_Config_t * cfgPtr);
+int rf_GetDefaultNumFloatingReconBuffersInterDecluster(RF_Raid_t * raidPtr);
+RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitInterDecluster(RF_Raid_t * raidPtr);
+RF_ReconUnitCount_t rf_GetNumSpareRUsInterDecluster(RF_Raid_t * raidPtr);
+void
+rf_MapSectorInterDecluster(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
+void
+rf_MapParityInterDecluster(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
+void
+rf_IdentifyStripeInterDecluster(RF_Raid_t * raidPtr, RF_RaidAddr_t addr,
+ RF_RowCol_t ** diskids, RF_RowCol_t * outRow);
+void
+rf_MapSIDToPSIDInterDecluster(RF_RaidLayout_t * layoutPtr,
+ RF_StripeNum_t stripeID, RF_StripeNum_t * psID,
+ RF_ReconUnitNum_t * which_ru);
+void
+rf_RAIDIDagSelect(RF_Raid_t * raidPtr, RF_IoType_t type,
+ RF_AccessStripeMap_t * asmap, RF_VoidFuncPtr * createFunc);
+
+#endif /* !_RF__RF_INTERDECLUSTER_H_ */
diff --git a/sys/dev/raidframe/rf_invertq.c b/sys/dev/raidframe/rf_invertq.c
new file mode 100644
index 0000000..fa4f8d7
--- /dev/null
+++ b/sys/dev/raidframe/rf_invertq.c
@@ -0,0 +1,32 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_invertq.c,v 1.3 1999/02/05 00:06:12 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Daniel Stodolsky
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#include <dev/raidframe/rf_archs.h>
+#include <dev/raidframe/rf_pqdeg.h>
+#include <dev/raidframe/rf_invertq.h>
diff --git a/sys/dev/raidframe/rf_invertq.h b/sys/dev/raidframe/rf_invertq.h
new file mode 100644
index 0000000..fde2cae
--- /dev/null
+++ b/sys/dev/raidframe/rf_invertq.h
@@ -0,0 +1,64 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_invertq.h,v 1.3 1999/02/05 00:06:12 oster Exp $ */
+/*
+ * rf_invertq.h
+ */
+/*
+ * This is normally a generated file. Not so for NetBSD.
+ */
+
+#ifndef _RF__RF_INVERTQ_H_
+#define _RF__RF_INVERTQ_H_
+
+/*
+ * rf_geniq.c must include rf_archs.h before including
+ * this file (to get VPATH magic right with the way we
+ * generate this file in kernel trees)
+ */
+/* #include <dev/raidframe/rf_archs.h> */
+
+#if (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
+
+#define RF_Q_COLS 32
+RF_ua32_t rf_rn = {
+1, 2, 4, 8, 16, 5, 10, 20, 13, 26, 17, 7, 14, 28, 29, 31, 27, 19, 3, 6, 12, 24, 21, 15, 30, 25, 23, 11, 22, 9, 18, 1,};
+RF_ua32_t rf_qfor[32] = {
+ /* i = 0 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,},
+ /* i = 1 */ {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 5, 7, 1, 3, 13, 15, 9, 11, 21, 23, 17, 19, 29, 31, 25, 27,},
+ /* i = 2 */ {0, 4, 8, 12, 16, 20, 24, 28, 5, 1, 13, 9, 21, 17, 29, 25, 10, 14, 2, 6, 26, 30, 18, 22, 15, 11, 7, 3, 31, 27, 23, 19,},
+ /* i = 3 */ {0, 8, 16, 24, 5, 13, 21, 29, 10, 2, 26, 18, 15, 7, 31, 23, 20, 28, 4, 12, 17, 25, 1, 9, 30, 22, 14, 6, 27, 19, 11, 3,},
+ /* i = 4 */ {0, 16, 5, 21, 10, 26, 15, 31, 20, 4, 17, 1, 30, 14, 27, 11, 13, 29, 8, 24, 7, 23, 2, 18, 25, 9, 28, 12, 19, 3, 22, 6,},
+ /* i = 5 */ {0, 5, 10, 15, 20, 17, 30, 27, 13, 8, 7, 2, 25, 28, 19, 22, 26, 31, 16, 21, 14, 11, 4, 1, 23, 18, 29, 24, 3, 6, 9, 12,},
+ /* i = 6 */ {0, 10, 20, 30, 13, 7, 25, 19, 26, 16, 14, 4, 23, 29, 3, 9, 17, 27, 5, 15, 28, 22, 8, 2, 11, 1, 31, 21, 6, 12, 18, 24,},
+ /* i = 7 */ {0, 20, 13, 25, 26, 14, 23, 3, 17, 5, 28, 8, 11, 31, 6, 18, 7, 19, 10, 30, 29, 9, 16, 4, 22, 2, 27, 15, 12, 24, 1, 21,},
+ /* i = 8 */ {0, 13, 26, 23, 17, 28, 11, 6, 7, 10, 29, 16, 22, 27, 12, 1, 14, 3, 20, 25, 31, 18, 5, 8, 9, 4, 19, 30, 24, 21, 2, 15,},
+ /* i = 9 */ {0, 26, 17, 11, 7, 29, 22, 12, 14, 20, 31, 5, 9, 19, 24, 2, 28, 6, 13, 23, 27, 1, 10, 16, 18, 8, 3, 25, 21, 15, 4, 30,},
+ /* i = 10 */ {0, 17, 7, 22, 14, 31, 9, 24, 28, 13, 27, 10, 18, 3, 21, 4, 29, 12, 26, 11, 19, 2, 20, 5, 1, 16, 6, 23, 15, 30, 8, 25,},
+ /* i = 11 */ {0, 7, 14, 9, 28, 27, 18, 21, 29, 26, 19, 20, 1, 6, 15, 8, 31, 24, 17, 22, 3, 4, 13, 10, 2, 5, 12, 11, 30, 25, 16, 23,},
+ /* i = 12 */ {0, 14, 28, 18, 29, 19, 1, 15, 31, 17, 3, 13, 2, 12, 30, 16, 27, 21, 7, 9, 6, 8, 26, 20, 4, 10, 24, 22, 25, 23, 5, 11,},
+ /* i = 13 */ {0, 28, 29, 1, 31, 3, 2, 30, 27, 7, 6, 26, 4, 24, 25, 5, 19, 15, 14, 18, 12, 16, 17, 13, 8, 20, 21, 9, 23, 11, 10, 22,},
+ /* i = 14 */ {0, 29, 31, 2, 27, 6, 4, 25, 19, 14, 12, 17, 8, 21, 23, 10, 3, 30, 28, 1, 24, 5, 7, 26, 16, 13, 15, 18, 11, 22, 20, 9,},
+ /* i = 15 */ {0, 31, 27, 4, 19, 12, 8, 23, 3, 28, 24, 7, 16, 15, 11, 20, 6, 25, 29, 2, 21, 10, 14, 17, 5, 26, 30, 1, 22, 9, 13, 18,},
+ /* i = 16 */ {0, 27, 19, 8, 3, 24, 16, 11, 6, 29, 21, 14, 5, 30, 22, 13, 12, 23, 31, 4, 15, 20, 28, 7, 10, 17, 25, 2, 9, 18, 26, 1,},
+ /* i = 17 */ {0, 19, 3, 16, 6, 21, 5, 22, 12, 31, 15, 28, 10, 25, 9, 26, 24, 11, 27, 8, 30, 13, 29, 14, 20, 7, 23, 4, 18, 1, 17, 2,},
+ /* i = 18 */ {0, 3, 6, 5, 12, 15, 10, 9, 24, 27, 30, 29, 20, 23, 18, 17, 21, 22, 19, 16, 25, 26, 31, 28, 13, 14, 11, 8, 1, 2, 7, 4,},
+ /* i = 19 */ {0, 6, 12, 10, 24, 30, 20, 18, 21, 19, 25, 31, 13, 11, 1, 7, 15, 9, 3, 5, 23, 17, 27, 29, 26, 28, 22, 16, 2, 4, 14, 8,},
+ /* i = 20 */ {0, 12, 24, 20, 21, 25, 13, 1, 15, 3, 23, 27, 26, 22, 2, 14, 30, 18, 6, 10, 11, 7, 19, 31, 17, 29, 9, 5, 4, 8, 28, 16,},
+ /* i = 21 */ {0, 24, 21, 13, 15, 23, 26, 2, 30, 6, 11, 19, 17, 9, 4, 28, 25, 1, 12, 20, 22, 14, 3, 27, 7, 31, 18, 10, 8, 16, 29, 5,},
+ /* i = 22 */ {0, 21, 15, 26, 30, 11, 17, 4, 25, 12, 22, 3, 7, 18, 8, 29, 23, 2, 24, 13, 9, 28, 6, 19, 14, 27, 1, 20, 16, 5, 31, 10,},
+ /* i = 23 */ {0, 15, 30, 17, 25, 22, 7, 8, 23, 24, 9, 6, 14, 1, 16, 31, 11, 4, 21, 26, 18, 29, 12, 3, 28, 19, 2, 13, 5, 10, 27, 20,},
+ /* i = 24 */ {0, 30, 25, 7, 23, 9, 14, 16, 11, 21, 18, 12, 28, 2, 5, 27, 22, 8, 15, 17, 1, 31, 24, 6, 29, 3, 4, 26, 10, 20, 19, 13,},
+ /* i = 25 */ {0, 25, 23, 14, 11, 18, 28, 5, 22, 15, 1, 24, 29, 4, 10, 19, 9, 16, 30, 7, 2, 27, 21, 12, 31, 6, 8, 17, 20, 13, 3, 26,},
+ /* i = 26 */ {0, 23, 11, 28, 22, 1, 29, 10, 9, 30, 2, 21, 31, 8, 20, 3, 18, 5, 25, 14, 4, 19, 15, 24, 27, 12, 16, 7, 13, 26, 6, 17,},
+ /* i = 27 */ {0, 11, 22, 29, 9, 2, 31, 20, 18, 25, 4, 15, 27, 16, 13, 6, 1, 10, 23, 28, 8, 3, 30, 21, 19, 24, 5, 14, 26, 17, 12, 7,},
+ /* i = 28 */ {0, 22, 9, 31, 18, 4, 27, 13, 1, 23, 8, 30, 19, 5, 26, 12, 2, 20, 11, 29, 16, 6, 25, 15, 3, 21, 10, 28, 17, 7, 24, 14,},
+ /* i = 29 */ {0, 9, 18, 27, 1, 8, 19, 26, 2, 11, 16, 25, 3, 10, 17, 24, 4, 13, 22, 31, 5, 12, 23, 30, 6, 15, 20, 29, 7, 14, 21, 28,},
+ /* i = 30 */ {0, 18, 1, 19, 2, 16, 3, 17, 4, 22, 5, 23, 6, 20, 7, 21, 8, 26, 9, 27, 10, 24, 11, 25, 12, 30, 13, 31, 14, 28, 15, 29,},
+ /* i = 31 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,},
+};
+#define RF_Q_DATA_COL(col_num) rf_rn[col_num],rf_qfor[28-(col_num)]
+RF_ua1024_t rf_qinv[1]; /* don't compile monster table into kernel */
+
+#endif /* (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_RAID6 >
+ * 0) */
+#endif /* !_RF__RF_INVERTQ_H_ */
diff --git a/sys/dev/raidframe/rf_kintf.h b/sys/dev/raidframe/rf_kintf.h
new file mode 100644
index 0000000..ae2697b
--- /dev/null
+++ b/sys/dev/raidframe/rf_kintf.h
@@ -0,0 +1,82 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_kintf.h,v 1.15 2000/10/20 02:24:45 oster Exp $ */
+/*
+ * rf_kintf.h
+ *
+ * RAIDframe exported kernel interface
+ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#ifndef _RF__RF_KINTF_H_
+#define _RF__RF_KINTF_H_
+
+#include <dev/raidframe/rf_types.h>
+
+#if defined(__NetBSD__)
+#define RF_LTSLEEP(cond, pri, text, time, mutex) \
+	ltsleep(cond, pri, text, time, mutex)
+#elif defined(__FreeBSD__)
+#if __FreeBSD_version > 500005
+/*
+ * Fixed: the macro body previously ended in a semicolon, so every
+ * expansion produced a double statement -- that breaks use in
+ * if/else bodies and in expression contexts.  The caller supplies
+ * the terminating semicolon.
+ */
+#define RF_LTSLEEP(cond, pri, text, time, mutex) \
+	msleep(cond, mutex, pri, text, time)
+#else
+/* Pre-5.x fallback: drop the simplelock by hand around tsleep(). */
+static __inline int
+RF_LTSLEEP(void *cond, int pri, const char *text, int time, struct simplelock *mutex)
+{
+	int ret;
+	if (mutex != NULL)
+		simple_unlock(mutex);
+	ret = tsleep(cond, pri, text, time);
+	if (mutex != NULL)
+		simple_lock(mutex);
+	return (ret);
+}
+#endif
+#endif
+
+int rf_GetSpareTableFromDaemon(RF_SparetWait_t * req);
+
+void raidstart(RF_Raid_t * raidPtr);
+int rf_DispatchKernelIO(RF_DiskQueue_t * queue, RF_DiskQueueData_t * req);
+
+int raidwrite_component_label(dev_t, struct vnode *, RF_ComponentLabel_t *);
+int raidread_component_label(dev_t, struct vnode *, RF_ComponentLabel_t *);
+
+#define RF_NORMAL_COMPONENT_UPDATE 0
+#define RF_FINAL_COMPONENT_UPDATE 1
+void rf_update_component_labels(RF_Raid_t *, int);
+int raidlookup(char *, RF_Thread_t, struct vnode **);
+int raidmarkclean(dev_t dev, struct vnode *b_vp, int);
+int raidmarkdirty(dev_t dev, struct vnode *b_vp, int);
+void raid_init_component_label(RF_Raid_t *, RF_ComponentLabel_t *);
+void rf_print_component_label(RF_ComponentLabel_t *);
+void rf_UnconfigureVnodes( RF_Raid_t * );
+void rf_close_component( RF_Raid_t *, struct vnode *, int);
+void rf_disk_unbusy(RF_RaidAccessDesc_t *);
+int raid_getcomponentsize(RF_Raid_t *, RF_RowCol_t, RF_RowCol_t);
+#endif /* _RF__RF_KINTF_H_ */
diff --git a/sys/dev/raidframe/rf_layout.c b/sys/dev/raidframe/rf_layout.c
new file mode 100644
index 0000000..539db67
--- /dev/null
+++ b/sys/dev/raidframe/rf_layout.c
@@ -0,0 +1,490 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_layout.c,v 1.9 2001/01/27 19:34:43 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_layout.c -- driver code dealing with layout and mapping issues
+ */
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_archs.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_configure.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_desc.h>
+#include <dev/raidframe/rf_decluster.h>
+#include <dev/raidframe/rf_pq.h>
+#include <dev/raidframe/rf_declusterPQ.h>
+#include <dev/raidframe/rf_raid0.h>
+#include <dev/raidframe/rf_raid1.h>
+#include <dev/raidframe/rf_raid4.h>
+#include <dev/raidframe/rf_raid5.h>
+#include <dev/raidframe/rf_states.h>
+#if RF_INCLUDE_RAID5_RS > 0
+#include <dev/raidframe/rf_raid5_rotatedspare.h>
+#endif /* RF_INCLUDE_RAID5_RS > 0 */
+#if RF_INCLUDE_CHAINDECLUSTER > 0
+#include <dev/raidframe/rf_chaindecluster.h>
+#endif /* RF_INCLUDE_CHAINDECLUSTER > 0 */
+#if RF_INCLUDE_INTERDECLUSTER > 0
+#include <dev/raidframe/rf_interdecluster.h>
+#endif /* RF_INCLUDE_INTERDECLUSTER > 0 */
+#if RF_INCLUDE_PARITYLOGGING > 0
+#include <dev/raidframe/rf_paritylogging.h>
+#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
+#if RF_INCLUDE_EVENODD > 0
+#include <dev/raidframe/rf_evenodd.h>
+#endif /* RF_INCLUDE_EVENODD > 0 */
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_driver.h>
+#include <dev/raidframe/rf_parityscan.h>
+#include <dev/raidframe/rf_reconbuffer.h>
+#include <dev/raidframe/rf_reconutil.h>
+
+/***********************************************************************
+ *
+ * the layout switch defines all the layouts that are supported.
+ * fields are: layout ID, init routine, shutdown routine, map
+ * sector, map parity, identify stripe, dag selection, map stripeid
+ * to parity stripe id (optional), num faults tolerated, special
+ * flags.
+ *
+ ***********************************************************************/
+
+static RF_AccessState_t DefaultStates[] = {rf_QuiesceState,
+ rf_IncrAccessesCountState,
+ rf_MapState,
+ rf_LockState,
+ rf_CreateDAGState,
+ rf_ExecuteDAGState,
+ rf_ProcessDAGState,
+ rf_DecrAccessesCountState,
+ rf_CleanupState,
+ rf_LastState};
+
+#define RF_NU(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p
+
+/* Note that if you add any new RAID types to this list, that you must
+ also update the mapsw[] table in the raidctl sources */
+
+static RF_LayoutSW_t mapsw[] = {
+#if RF_INCLUDE_PARITY_DECLUSTERING > 0
+ /* parity declustering */
+ {'T', "Parity declustering",
+ RF_NU(
+ rf_ConfigureDeclustered,
+ rf_MapSectorDeclustered, rf_MapParityDeclustered, NULL,
+ rf_IdentifyStripeDeclustered,
+ rf_RaidFiveDagSelect,
+ rf_MapSIDToPSIDDeclustered,
+ rf_GetDefaultHeadSepLimitDeclustered,
+ rf_GetDefaultNumFloatingReconBuffersDeclustered,
+ NULL, NULL,
+ rf_SubmitReconBufferBasic,
+ rf_VerifyParityBasic,
+ 1,
+ DefaultStates,
+ 0)
+ },
+#endif
+
+#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
+ /* parity declustering with distributed sparing */
+ {'D', "Distributed sparing parity declustering",
+ RF_NU(
+ rf_ConfigureDeclusteredDS,
+ rf_MapSectorDeclustered, rf_MapParityDeclustered, NULL,
+ rf_IdentifyStripeDeclustered,
+ rf_RaidFiveDagSelect,
+ rf_MapSIDToPSIDDeclustered,
+ rf_GetDefaultHeadSepLimitDeclustered,
+ rf_GetDefaultNumFloatingReconBuffersDeclustered,
+ rf_GetNumSpareRUsDeclustered, rf_InstallSpareTable,
+ rf_SubmitReconBufferBasic,
+ rf_VerifyParityBasic,
+ 1,
+ DefaultStates,
+ RF_DISTRIBUTE_SPARE | RF_BD_DECLUSTERED)
+ },
+#endif
+
+#if RF_INCLUDE_DECL_PQ > 0
+ /* declustered P+Q */
+ {'Q', "Declustered P+Q",
+ RF_NU(
+ rf_ConfigureDeclusteredPQ,
+ rf_MapSectorDeclusteredPQ, rf_MapParityDeclusteredPQ, rf_MapQDeclusteredPQ,
+ rf_IdentifyStripeDeclusteredPQ,
+ rf_PQDagSelect,
+ rf_MapSIDToPSIDDeclustered,
+ rf_GetDefaultHeadSepLimitDeclustered,
+ rf_GetDefaultNumFloatingReconBuffersPQ,
+ NULL, NULL,
+ NULL,
+ rf_VerifyParityBasic,
+ 2,
+ DefaultStates,
+ 0)
+ },
+#endif /* RF_INCLUDE_DECL_PQ > 0 */
+
+#if RF_INCLUDE_RAID5_RS > 0
+ /* RAID 5 with rotated sparing */
+ {'R', "RAID Level 5 rotated sparing",
+ RF_NU(
+ rf_ConfigureRAID5_RS,
+ rf_MapSectorRAID5_RS, rf_MapParityRAID5_RS, NULL,
+ rf_IdentifyStripeRAID5_RS,
+ rf_RaidFiveDagSelect,
+ rf_MapSIDToPSIDRAID5_RS,
+ rf_GetDefaultHeadSepLimitRAID5,
+ rf_GetDefaultNumFloatingReconBuffersRAID5,
+ rf_GetNumSpareRUsRAID5_RS, NULL,
+ rf_SubmitReconBufferBasic,
+ rf_VerifyParityBasic,
+ 1,
+ DefaultStates,
+ RF_DISTRIBUTE_SPARE)
+ },
+#endif /* RF_INCLUDE_RAID5_RS > 0 */
+
+#if RF_INCLUDE_CHAINDECLUSTER > 0
+ /* Chained Declustering */
+ {'C', "Chained Declustering",
+ RF_NU(
+ rf_ConfigureChainDecluster,
+ rf_MapSectorChainDecluster, rf_MapParityChainDecluster, NULL,
+ rf_IdentifyStripeChainDecluster,
+ rf_RAIDCDagSelect,
+ rf_MapSIDToPSIDChainDecluster,
+ NULL,
+ NULL,
+ rf_GetNumSpareRUsChainDecluster, NULL,
+ rf_SubmitReconBufferBasic,
+ rf_VerifyParityBasic,
+ 1,
+ DefaultStates,
+ 0)
+ },
+#endif /* RF_INCLUDE_CHAINDECLUSTER > 0 */
+
+#if RF_INCLUDE_INTERDECLUSTER > 0
+ /* Interleaved Declustering */
+ {'I', "Interleaved Declustering",
+ RF_NU(
+ rf_ConfigureInterDecluster,
+ rf_MapSectorInterDecluster, rf_MapParityInterDecluster, NULL,
+ rf_IdentifyStripeInterDecluster,
+ rf_RAIDIDagSelect,
+ rf_MapSIDToPSIDInterDecluster,
+ rf_GetDefaultHeadSepLimitInterDecluster,
+ rf_GetDefaultNumFloatingReconBuffersInterDecluster,
+ rf_GetNumSpareRUsInterDecluster, NULL,
+ rf_SubmitReconBufferBasic,
+ rf_VerifyParityBasic,
+ 1,
+ DefaultStates,
+ RF_DISTRIBUTE_SPARE)
+ },
+#endif /* RF_INCLUDE_INTERDECLUSTER > 0 */
+
+#if RF_INCLUDE_RAID0 > 0
+ /* RAID level 0 */
+ {'0', "RAID Level 0",
+ RF_NU(
+ rf_ConfigureRAID0,
+ rf_MapSectorRAID0, rf_MapParityRAID0, NULL,
+ rf_IdentifyStripeRAID0,
+ rf_RAID0DagSelect,
+ rf_MapSIDToPSIDRAID0,
+ NULL,
+ NULL,
+ NULL, NULL,
+ NULL,
+ rf_VerifyParityRAID0,
+ 0,
+ DefaultStates,
+ 0)
+ },
+#endif /* RF_INCLUDE_RAID0 > 0 */
+
+#if RF_INCLUDE_RAID1 > 0
+ /* RAID level 1 */
+ {'1', "RAID Level 1",
+ RF_NU(
+ rf_ConfigureRAID1,
+ rf_MapSectorRAID1, rf_MapParityRAID1, NULL,
+ rf_IdentifyStripeRAID1,
+ rf_RAID1DagSelect,
+ rf_MapSIDToPSIDRAID1,
+ NULL,
+ NULL,
+ NULL, NULL,
+ rf_SubmitReconBufferRAID1,
+ rf_VerifyParityRAID1,
+ 1,
+ DefaultStates,
+ 0)
+ },
+#endif /* RF_INCLUDE_RAID1 > 0 */
+
+#if RF_INCLUDE_RAID4 > 0
+ /* RAID level 4 */
+ {'4', "RAID Level 4",
+ RF_NU(
+ rf_ConfigureRAID4,
+ rf_MapSectorRAID4, rf_MapParityRAID4, NULL,
+ rf_IdentifyStripeRAID4,
+ rf_RaidFiveDagSelect,
+ rf_MapSIDToPSIDRAID4,
+ rf_GetDefaultHeadSepLimitRAID4,
+ rf_GetDefaultNumFloatingReconBuffersRAID4,
+ NULL, NULL,
+ rf_SubmitReconBufferBasic,
+ rf_VerifyParityBasic,
+ 1,
+ DefaultStates,
+ 0)
+ },
+#endif /* RF_INCLUDE_RAID4 > 0 */
+
+#if RF_INCLUDE_RAID5 > 0
+ /* RAID level 5 */
+ {'5', "RAID Level 5",
+ RF_NU(
+ rf_ConfigureRAID5,
+ rf_MapSectorRAID5, rf_MapParityRAID5, NULL,
+ rf_IdentifyStripeRAID5,
+ rf_RaidFiveDagSelect,
+ rf_MapSIDToPSIDRAID5,
+ rf_GetDefaultHeadSepLimitRAID5,
+ rf_GetDefaultNumFloatingReconBuffersRAID5,
+ NULL, NULL,
+ rf_SubmitReconBufferBasic,
+ rf_VerifyParityBasic,
+ 1,
+ DefaultStates,
+ 0)
+ },
+#endif /* RF_INCLUDE_RAID5 > 0 */
+
+#if RF_INCLUDE_EVENODD > 0
+ /* Evenodd */
+ {'E', "EvenOdd",
+ RF_NU(
+ rf_ConfigureEvenOdd,
+ rf_MapSectorRAID5, rf_MapParityEvenOdd, rf_MapEEvenOdd,
+ rf_IdentifyStripeEvenOdd,
+ rf_EODagSelect,
+ rf_MapSIDToPSIDRAID5,
+ NULL,
+ NULL,
+ NULL, NULL,
+ NULL, /* no reconstruction, yet */
+ rf_VerifyParityEvenOdd,
+ 2,
+ DefaultStates,
+ 0)
+ },
+#endif /* RF_INCLUDE_EVENODD > 0 */
+
+#if RF_INCLUDE_EVENODD > 0
+ /* Declustered Evenodd */
+ {'e', "Declustered EvenOdd",
+ RF_NU(
+ rf_ConfigureDeclusteredPQ,
+ rf_MapSectorDeclusteredPQ, rf_MapParityDeclusteredPQ, rf_MapQDeclusteredPQ,
+ rf_IdentifyStripeDeclusteredPQ,
+ rf_EODagSelect,
+ rf_MapSIDToPSIDRAID5,
+ rf_GetDefaultHeadSepLimitDeclustered,
+ rf_GetDefaultNumFloatingReconBuffersPQ,
+ NULL, NULL,
+ NULL, /* no reconstruction, yet */
+ rf_VerifyParityEvenOdd,
+ 2,
+ DefaultStates,
+ 0)
+ },
+#endif /* RF_INCLUDE_EVENODD > 0 */
+
+#if RF_INCLUDE_PARITYLOGGING > 0
+ /* parity logging */
+ {'L', "Parity logging",
+ RF_NU(
+ rf_ConfigureParityLogging,
+ rf_MapSectorParityLogging, rf_MapParityParityLogging, NULL,
+ rf_IdentifyStripeParityLogging,
+ rf_ParityLoggingDagSelect,
+ rf_MapSIDToPSIDParityLogging,
+ rf_GetDefaultHeadSepLimitParityLogging,
+ rf_GetDefaultNumFloatingReconBuffersParityLogging,
+ NULL, NULL,
+ rf_SubmitReconBufferBasic,
+ NULL,
+ 1,
+ DefaultStates,
+ 0)
+ },
+#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
+
+ /* end-of-list marker */
+ {'\0', NULL,
+ RF_NU(
+ NULL,
+ NULL, NULL, NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL, NULL,
+ NULL,
+ NULL,
+ 0,
+ NULL,
+ 0)
+ }
+};
+
+RF_LayoutSW_t *
+rf_GetLayout(RF_ParityConfig_t parityConfig)
+{
+ RF_LayoutSW_t *p;
+
+ /* look up the specific layout */
+ for (p = &mapsw[0]; p->parityConfig; p++)
+ if (p->parityConfig == parityConfig)
+ break;
+ if (!p->parityConfig)
+ return (NULL);
+ RF_ASSERT(p->parityConfig == parityConfig);
+ return (p);
+}
+
+/*****************************************************************************
+ *
+ * ConfigureLayout --
+ *
+ * read the configuration file and set up the RAID layout parameters.
+ * After reading common params, invokes the layout-specific
+ * configuration routine to finish the configuration.
+ *
+ ****************************************************************************/
+int
+rf_ConfigureLayout(
+ RF_ShutdownList_t ** listp,
+ RF_Raid_t * raidPtr,
+ RF_Config_t * cfgPtr)
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ RF_ParityConfig_t parityConfig;
+ RF_LayoutSW_t *p;
+ int retval;
+
+ layoutPtr->sectorsPerStripeUnit = cfgPtr->sectPerSU;
+ layoutPtr->SUsPerPU = cfgPtr->SUsPerPU;
+ layoutPtr->SUsPerRU = cfgPtr->SUsPerRU;
+ parityConfig = cfgPtr->parityConfig;
+
+ if (layoutPtr->sectorsPerStripeUnit <= 0) {
+ RF_ERRORMSG2("raid%d: Invalid sectorsPerStripeUnit: %d\n",
+ raidPtr->raidid,
+ (int)layoutPtr->sectorsPerStripeUnit );
+ return (EINVAL);
+ }
+
+ layoutPtr->stripeUnitsPerDisk = raidPtr->sectorsPerDisk / layoutPtr->sectorsPerStripeUnit;
+
+ p = rf_GetLayout(parityConfig);
+ if (p == NULL) {
+ RF_ERRORMSG1("Unknown parity configuration '%c'\n", parityConfig);
+ return (EINVAL);
+ }
+ RF_ASSERT(p->parityConfig == parityConfig);
+ layoutPtr->map = p;
+
+ /* initialize the specific layout */
+
+ retval = (p->Configure) (listp, raidPtr, cfgPtr);
+
+ if (retval)
+ return (retval);
+
+ layoutPtr->dataBytesPerStripe = layoutPtr->dataSectorsPerStripe << raidPtr->logBytesPerSector;
+ raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit;
+
+ if (rf_forceNumFloatingReconBufs >= 0) {
+ raidPtr->numFloatingReconBufs = rf_forceNumFloatingReconBufs;
+ } else {
+ raidPtr->numFloatingReconBufs = rf_GetDefaultNumFloatingReconBuffers(raidPtr);
+ }
+
+ if (rf_forceHeadSepLimit >= 0) {
+ raidPtr->headSepLimit = rf_forceHeadSepLimit;
+ } else {
+ raidPtr->headSepLimit = rf_GetDefaultHeadSepLimit(raidPtr);
+ }
+
+ printf("RAIDFRAME: Configure (%s): total number of sectors is %lu (%lu MB)\n",
+ layoutPtr->map->configName,
+ (unsigned long) raidPtr->totalSectors,
+ (unsigned long) (raidPtr->totalSectors / 1024 * (1 << raidPtr->logBytesPerSector) / 1024));
+ if (raidPtr->headSepLimit >= 0) {
+ printf("RAIDFRAME(%s): Using %ld floating recon bufs with head sep limit %ld\n",
+ layoutPtr->map->configName, (long) raidPtr->numFloatingReconBufs, (long) raidPtr->headSepLimit);
+ } else {
+ printf("RAIDFRAME(%s): Using %ld floating recon bufs with no head sep limit\n",
+ layoutPtr->map->configName, (long) raidPtr->numFloatingReconBufs);
+ }
+
+ return (0);
+}
+/* typically there is a 1-1 mapping between stripes and parity stripes.
+ * however, the declustering code supports packing multiple stripes into
+ * a single parity stripe, so as to increase the size of the reconstruction
+ * unit without affecting the size of the stripe unit. This routine finds
+ * the parity stripe identifier associated with a stripe ID. There is also
+ * a RaidAddressToParityStripeID macro in layout.h
+ */
+RF_StripeNum_t
+rf_MapStripeIDToParityStripeID(layoutPtr, stripeID, which_ru)
+ RF_RaidLayout_t *layoutPtr;
+ RF_StripeNum_t stripeID;
+ RF_ReconUnitNum_t *which_ru;
+{
+ RF_StripeNum_t parityStripeID;
+
+ /* quick exit in the common case of SUsPerPU==1 */
+ if ((layoutPtr->SUsPerPU == 1) || !layoutPtr->map->MapSIDToPSID) {
+ *which_ru = 0;
+ return (stripeID);
+ } else {
+ (layoutPtr->map->MapSIDToPSID) (layoutPtr, stripeID, &parityStripeID, which_ru);
+ }
+ return (parityStripeID);
+}
diff --git a/sys/dev/raidframe/rf_layout.h b/sys/dev/raidframe/rf_layout.h
new file mode 100644
index 0000000..2482556
--- /dev/null
+++ b/sys/dev/raidframe/rf_layout.h
@@ -0,0 +1,349 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_layout.h,v 1.5 2001/01/26 04:14:14 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_layout.h -- header file defining layout data structures
+ */
+
+#ifndef _RF__RF_LAYOUT_H_
+#define _RF__RF_LAYOUT_H_
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_archs.h>
+#include <dev/raidframe/rf_alloclist.h>
+
+#ifndef _KERNEL
+#include <stdio.h>
+#endif
+
+/*****************************************************************************************
+ *
+ * This structure identifies all layout-specific operations and parameters.
+ *
+ ****************************************************************************************/
+
+typedef struct RF_LayoutSW_s {
+ RF_ParityConfig_t parityConfig;
+ const char *configName;
+
+#ifndef _KERNEL
+ /* layout-specific parsing */
+ int (*MakeLayoutSpecific) (FILE * fp, RF_Config_t * cfgPtr, void *arg);
+ void *makeLayoutSpecificArg;
+#endif /* !_KERNEL */
+
+#if RF_UTILITY == 0
+ /* initialization routine */
+ int (*Configure) (RF_ShutdownList_t ** shutdownListp, RF_Raid_t * raidPtr, RF_Config_t * cfgPtr);
+
+ /* routine to map RAID sector address -> physical (row, col, offset) */
+ void (*MapSector) (RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
+
+ /* routine to map RAID sector address -> physical (r,c,o) of parity
+ * unit */
+ void (*MapParity) (RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
+
+ /* routine to map RAID sector address -> physical (r,c,o) of Q unit */
+ void (*MapQ) (RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector, RF_RowCol_t * row,
+ RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
+
+ /* routine to identify the disks comprising a stripe */
+ void (*IdentifyStripe) (RF_Raid_t * raidPtr, RF_RaidAddr_t addr,
+ RF_RowCol_t ** diskids, RF_RowCol_t * outRow);
+
+ /* routine to select a dag */
+ void (*SelectionFunc) (RF_Raid_t * raidPtr, RF_IoType_t type,
+ RF_AccessStripeMap_t * asmap,
+ RF_VoidFuncPtr *);
+#if 0
+ void (**createFunc) (RF_Raid_t *,
+ RF_AccessStripeMap_t *,
+ RF_DagHeader_t *, void *,
+ RF_RaidAccessFlags_t,
+ RF_AllocListElem_t *);
+
+#endif
+
+ /* map a stripe ID to a parity stripe ID. This is typically the
+ * identity mapping */
+ void (*MapSIDToPSID) (RF_RaidLayout_t * layoutPtr, RF_StripeNum_t stripeID,
+ RF_StripeNum_t * psID, RF_ReconUnitNum_t * which_ru);
+
+ /* get default head separation limit (may be NULL) */
+ RF_HeadSepLimit_t(*GetDefaultHeadSepLimit) (RF_Raid_t * raidPtr);
+
+ /* get default num recon buffers (may be NULL) */
+ int (*GetDefaultNumFloatingReconBuffers) (RF_Raid_t * raidPtr);
+
+ /* get number of spare recon units (may be NULL) */
+ RF_ReconUnitCount_t(*GetNumSpareRUs) (RF_Raid_t * raidPtr);
+
+ /* spare table installation (may be NULL) */
+ int (*InstallSpareTable) (RF_Raid_t * raidPtr, RF_RowCol_t frow, RF_RowCol_t fcol);
+
+ /* recon buffer submission function */
+ int (*SubmitReconBuffer) (RF_ReconBuffer_t * rbuf, int keep_it,
+ int use_committed);
+
+ /*
+ * verify that parity information for a stripe is correct
+ * see rf_parityscan.h for return vals
+ */
+ int (*VerifyParity) (RF_Raid_t * raidPtr, RF_RaidAddr_t raidAddr,
+ RF_PhysDiskAddr_t * parityPDA, int correct_it, RF_RaidAccessFlags_t flags);
+
+ /* number of faults tolerated by this mapping */
+ int faultsTolerated;
+
+ /* states to step through in an access. Must end with "LastState". The
+ * default is DefaultStates in rf_layout.c */
+ RF_AccessState_t *states;
+
+ RF_AccessStripeMapFlags_t flags;
+#endif /* RF_UTILITY == 0 */
+} RF_LayoutSW_t;
+/* enables remapping to spare location under dist sparing */
+#define RF_REMAP 1
+#define RF_DONT_REMAP 0
+
+/*
+ * Flags values for RF_AccessStripeMapFlags_t
+ */
+#define RF_NO_STRIPE_LOCKS 0x0001 /* suppress stripe locks */
+#define RF_DISTRIBUTE_SPARE 0x0002 /* distribute spare space in archs
+ * that support it */
+#define RF_BD_DECLUSTERED 0x0004 /* declustering uses block designs */
+
+/*************************************************************************
+ *
+ * this structure forms the layout component of the main Raid
+ * structure. It describes everything needed to define and perform
+ * the mapping of logical RAID addresses <-> physical disk addresses.
+ *
+ *************************************************************************/
+struct RF_RaidLayout_s {
+ /* configuration parameters */
+ RF_SectorCount_t sectorsPerStripeUnit; /* number of sectors in one
+ * stripe unit */
+ RF_StripeCount_t SUsPerPU; /* stripe units per parity unit */
+ RF_StripeCount_t SUsPerRU; /* stripe units per reconstruction
+ * unit */
+
+ /* redundant-but-useful info computed from the above, used in all
+ * layouts */
+ RF_StripeCount_t numStripe; /* total number of stripes in the
+ * array */
+ RF_SectorCount_t dataSectorsPerStripe;
+ RF_StripeCount_t dataStripeUnitsPerDisk;
+ u_int bytesPerStripeUnit;
+ u_int dataBytesPerStripe;
+ RF_StripeCount_t numDataCol; /* number of SUs of data per stripe
+ * (name here is a la RAID4) */
+ RF_StripeCount_t numParityCol; /* number of SUs of parity per stripe.
+ * Always 1 for now */
+ RF_StripeCount_t numParityLogCol; /* number of SUs of parity log
+ * per stripe. Always 1 for
+ * now */
+ RF_StripeCount_t stripeUnitsPerDisk;
+
+ RF_LayoutSW_t *map; /* ptr to struct holding mapping fns and
+ * information */
+ void *layoutSpecificInfo; /* ptr to a structure holding
+ * layout-specific params */
+};
+/*****************************************************************************************
+ *
+ * The mapping code returns a pointer to a list of AccessStripeMap structures, which
+ * describes all the mapping information about an access. The list contains one
+ * AccessStripeMap structure per stripe touched by the access. Each element in the list
+ * contains a stripe identifier and a pointer to a list of PhysDiskAddr structures. Each
+ * element in this latter list describes the physical location of a stripe unit accessed
+ * within the corresponding stripe.
+ *
+ ****************************************************************************************/
+
+#define RF_PDA_TYPE_DATA 0
+#define RF_PDA_TYPE_PARITY 1
+#define RF_PDA_TYPE_Q 2
+
+struct RF_PhysDiskAddr_s {
+ RF_RowCol_t row, col; /* disk identifier */
+ RF_SectorNum_t startSector; /* sector offset into the disk */
+ RF_SectorCount_t numSector; /* number of sectors accessed */
+ int type; /* used by higher levels: currently, data,
+ * parity, or q */
+ caddr_t bufPtr; /* pointer to buffer supplying/receiving data */
+ RF_RaidAddr_t raidAddress; /* raid address corresponding to this
+ * physical disk address */
+ RF_PhysDiskAddr_t *next;
+};
+#define RF_MAX_FAILED_PDA RF_MAXCOL
+
+struct RF_AccessStripeMap_s {
+ RF_StripeNum_t stripeID;/* the stripe index */
+ RF_RaidAddr_t raidAddress; /* the starting raid address within
+ * this stripe */
+ RF_RaidAddr_t endRaidAddress; /* raid address one sector past the
+ * end of the access */
+ RF_SectorCount_t totalSectorsAccessed; /* total num sectors
+ * identified in physInfo list */
+ RF_StripeCount_t numStripeUnitsAccessed; /* total num elements in
+ * physInfo list */
+ int numDataFailed; /* number of failed data disks accessed */
+ int numParityFailed;/* number of failed parity disks accessed (0
+ * or 1) */
+ int numQFailed; /* number of failed Q units accessed (0 or 1) */
+ RF_AccessStripeMapFlags_t flags; /* various flags */
+#if 0
+ RF_PhysDiskAddr_t *failedPDA; /* points to the PDA that has failed */
+ RF_PhysDiskAddr_t *failedPDAtwo; /* points to the second PDA
+ * that has failed, if any */
+#else
+ int numFailedPDAs; /* number of failed phys addrs */
+ RF_PhysDiskAddr_t *failedPDAs[RF_MAX_FAILED_PDA]; /* array of failed phys
+ * addrs */
+#endif
+ RF_PhysDiskAddr_t *physInfo; /* a list of PhysDiskAddr structs */
+ RF_PhysDiskAddr_t *parityInfo; /* list of physical addrs for the
+ * parity (P of P + Q ) */
+ RF_PhysDiskAddr_t *qInfo; /* list of physical addrs for the Q of
+ * P + Q */
+ RF_LockReqDesc_t lockReqDesc; /* used for stripe locking */
+ RF_RowCol_t origRow; /* the original row: we may redirect the acc
+ * to a different row */
+ RF_AccessStripeMap_t *next;
+};
+/* flag values */
+#define RF_ASM_REDIR_LARGE_WRITE 0x00000001 /* allows large-write creation
+ * code to redirect failed
+ * accs */
+#define RF_ASM_BAILOUT_DAG_USED 0x00000002 /* allows us to detect
+ * recursive calls to the
+ * bailout write dag */
+#define RF_ASM_FLAGS_LOCK_TRIED 0x00000004 /* we've acquired the lock on
+ * the first parity range in
+ * this parity stripe */
+#define RF_ASM_FLAGS_LOCK_TRIED2 0x00000008 /* we've acquired the lock on
+ * the 2nd parity range in
+ * this parity stripe */
+#define RF_ASM_FLAGS_FORCE_TRIED 0x00000010 /* we've done the force-recon
+ * call on this parity stripe */
+#define RF_ASM_FLAGS_RECON_BLOCKED 0x00000020 /* we blocked recon => we must
+ * unblock it later */
+
+struct RF_AccessStripeMapHeader_s {
+ RF_StripeCount_t numStripes; /* total number of stripes touched by
+ * this acc */
+ RF_AccessStripeMap_t *stripeMap; /* pointer to the actual map.
+ * Also used for making lists */
+ RF_AccessStripeMapHeader_t *next;
+};
+/*****************************************************************************************
+ *
+ * various routines mapping addresses in the RAID address space. These work across
+ * all layouts. DON'T PUT ANY LAYOUT-SPECIFIC CODE HERE.
+ *
+ ****************************************************************************************/
+
+/* return the identifier of the stripe containing the given address */
+#define rf_RaidAddressToStripeID(_layoutPtr_, _addr_) \
+ ( ((_addr_) / (_layoutPtr_)->sectorsPerStripeUnit) / (_layoutPtr_)->numDataCol )
+
+/* return the raid address of the start of the indicated stripe ID */
+#define rf_StripeIDToRaidAddress(_layoutPtr_, _sid_) \
+ ( ((_sid_) * (_layoutPtr_)->sectorsPerStripeUnit) * (_layoutPtr_)->numDataCol )
+
+/* return the identifier of the stripe containing the given stripe unit id */
+#define rf_StripeUnitIDToStripeID(_layoutPtr_, _addr_) \
+ ( (_addr_) / (_layoutPtr_)->numDataCol )
+
+/* return the identifier of the stripe unit containing the given address */
+#define rf_RaidAddressToStripeUnitID(_layoutPtr_, _addr_) \
+ ( ((_addr_) / (_layoutPtr_)->sectorsPerStripeUnit) )
+
+/* return the RAID address of next stripe boundary beyond the given address */
+#define rf_RaidAddressOfNextStripeBoundary(_layoutPtr_, _addr_) \
+ ( (((_addr_)/(_layoutPtr_)->dataSectorsPerStripe)+1) * (_layoutPtr_)->dataSectorsPerStripe )
+
+/* return the RAID address of the start of the stripe containing the given address */
+#define rf_RaidAddressOfPrevStripeBoundary(_layoutPtr_, _addr_) \
+ ( (((_addr_)/(_layoutPtr_)->dataSectorsPerStripe)+0) * (_layoutPtr_)->dataSectorsPerStripe )
+
+/* return the RAID address of next stripe unit boundary beyond the given address */
+#define rf_RaidAddressOfNextStripeUnitBoundary(_layoutPtr_, _addr_) \
+ ( (((_addr_)/(_layoutPtr_)->sectorsPerStripeUnit)+1L)*(_layoutPtr_)->sectorsPerStripeUnit )
+
+/* return the RAID address of the start of the stripe unit containing RAID address _addr_ */
+#define rf_RaidAddressOfPrevStripeUnitBoundary(_layoutPtr_, _addr_) \
+ ( (((_addr_)/(_layoutPtr_)->sectorsPerStripeUnit)+0)*(_layoutPtr_)->sectorsPerStripeUnit )
+
+/* returns the offset into the stripe. used by RaidAddressStripeAligned */
+#define rf_RaidAddressStripeOffset(_layoutPtr_, _addr_) \
+ ( (_addr_) % ((_layoutPtr_)->dataSectorsPerStripe) )
+
+/* returns the offset into the stripe unit. */
+#define rf_StripeUnitOffset(_layoutPtr_, _addr_) \
+ ( (_addr_) % ((_layoutPtr_)->sectorsPerStripeUnit) )
+
+/* returns nonzero if the given RAID address is stripe-aligned */
+#define rf_RaidAddressStripeAligned( __layoutPtr__, __addr__ ) \
+ ( rf_RaidAddressStripeOffset(__layoutPtr__, __addr__) == 0 )
+
+/* returns nonzero if the given address is stripe-unit aligned */
+#define rf_StripeUnitAligned( __layoutPtr__, __addr__ ) \
+ ( rf_StripeUnitOffset(__layoutPtr__, __addr__) == 0 )
+
+/* convert an address expressed in RAID blocks to/from an addr expressed in bytes */
+#define rf_RaidAddressToByte(_raidPtr_, _addr_) \
+ ( (_addr_) << ( (_raidPtr_)->logBytesPerSector ) )
+
+#define rf_ByteToRaidAddress(_raidPtr_, _addr_) \
+ ( (_addr_) >> ( (_raidPtr_)->logBytesPerSector ) )
+
+/* convert a raid address to/from a parity stripe ID. Conversion to raid address is easy,
+ * since we're asking for the address of the first sector in the parity stripe. Conversion to a
+ * parity stripe ID is more complex, since stripes are not contiguously allocated in
+ * parity stripes.
+ */
+#define rf_RaidAddressToParityStripeID(_layoutPtr_, _addr_, _ru_num_) \
+ rf_MapStripeIDToParityStripeID( (_layoutPtr_), rf_RaidAddressToStripeID( (_layoutPtr_), (_addr_) ), (_ru_num_) )
+
+#define rf_ParityStripeIDToRaidAddress(_layoutPtr_, _psid_) \
+ ( (_psid_) * (_layoutPtr_)->SUsPerPU * (_layoutPtr_)->numDataCol * (_layoutPtr_)->sectorsPerStripeUnit )
+
+RF_LayoutSW_t *rf_GetLayout(RF_ParityConfig_t parityConfig);
+int
+rf_ConfigureLayout(RF_ShutdownList_t ** listp, RF_Raid_t * raidPtr,
+ RF_Config_t * cfgPtr);
+RF_StripeNum_t
+rf_MapStripeIDToParityStripeID(RF_RaidLayout_t * layoutPtr,
+ RF_StripeNum_t stripeID, RF_ReconUnitNum_t * which_ru);
+
+#endif /* !_RF__RF_LAYOUT_H_ */
diff --git a/sys/dev/raidframe/rf_map.c b/sys/dev/raidframe/rf_map.c
new file mode 100644
index 0000000..98f455f
--- /dev/null
+++ b/sys/dev/raidframe/rf_map.c
@@ -0,0 +1,907 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_map.c,v 1.5 2000/06/29 00:22:27 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/**************************************************************************
+ *
+ * map.c -- main code for mapping RAID addresses to physical disk addresses
+ *
+ **************************************************************************/
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_threadstuff.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_map.h>
+#include <dev/raidframe/rf_freelist.h>
+#include <dev/raidframe/rf_shutdown.h>
+
+static void rf_FreePDAList(RF_PhysDiskAddr_t * start, RF_PhysDiskAddr_t * end, int count);
+static void
+rf_FreeASMList(RF_AccessStripeMap_t * start, RF_AccessStripeMap_t * end,
+ int count);
+
+/*****************************************************************************************
+ *
+ * MapAccess -- main 1st order mapping routine.
+ *
+ * Maps an access in the RAID address space to the corresponding set of physical disk
+ * addresses. The result is returned as a list of AccessStripeMap structures, one per
+ * stripe accessed. Each ASM structure contains a pointer to a list of PhysDiskAddr
+ * structures, which describe the physical locations touched by the user access. Note
+ * that this routine returns only static mapping information, i.e. the list of physical
+ * addresses returned does not necessarily identify the set of physical locations that
+ * will actually be read or written.
+ *
+ * The routine also maps the parity. The physical disk location returned always
+ * indicates the entire parity unit, even when only a subset of it is being accessed.
+ * This is because an access that is not stripe unit aligned but that spans a stripe
+ * unit boundary may require accessing two distinct portions of the parity unit, and we
+ * can't yet tell which portion(s) we'll actually need. We leave it up to the algorithm
+ * selection code to decide what subset of the parity unit to access.
+ *
+ * Note that addresses in the RAID address space must always be maintained as
+ * longs, instead of ints.
+ *
+ * This routine returns NULL if numBlocks is 0
+ *
+ ****************************************************************************************/
+
+RF_AccessStripeMapHeader_t *
+rf_MapAccess(raidPtr, raidAddress, numBlocks, buffer, remap)
+ RF_Raid_t *raidPtr;
+ RF_RaidAddr_t raidAddress; /* starting address in RAID address
+ * space */
+ RF_SectorCount_t numBlocks; /* number of blocks in RAID address
+ * space to access */
+ caddr_t buffer; /* buffer to supply/receive data */
+ int remap; /* 1 => remap addresses to spare space */
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ RF_AccessStripeMapHeader_t *asm_hdr = NULL;
+ RF_AccessStripeMap_t *asm_list = NULL, *asm_p = NULL;
+ int faultsTolerated = layoutPtr->map->faultsTolerated;
+ RF_RaidAddr_t startAddress = raidAddress; /* we'll change
+ * raidAddress along the
+ * way */
+ RF_RaidAddr_t endAddress = raidAddress + numBlocks;
+ RF_RaidDisk_t **disks = raidPtr->Disks;
+
+ RF_PhysDiskAddr_t *pda_p, *pda_q;
+ RF_StripeCount_t numStripes = 0;
+ RF_RaidAddr_t stripeRealEndAddress, stripeEndAddress, nextStripeUnitAddress;
+ RF_RaidAddr_t startAddrWithinStripe, lastRaidAddr;
+ RF_StripeCount_t totStripes;
+ RF_StripeNum_t stripeID, lastSID, SUID, lastSUID;
+ RF_AccessStripeMap_t *asmList, *t_asm;
+ RF_PhysDiskAddr_t *pdaList, *t_pda;
+
+ /* Reject out-of-range requests before allocating anything.  The
+ * previous ordering allocated the ASM and PDA lists first and then
+ * returned NULL on this error, leaking both lists. */
+ if (raidAddress + numBlocks > raidPtr->totalSectors) {
+ RF_ERRORMSG1("Unable to map access because offset (%d) was invalid\n",
+ (int) raidAddress);
+ return (NULL);
+ }
+
+ /* allocate all the ASMs and PDAs up front */
+ lastRaidAddr = raidAddress + numBlocks - 1;
+ stripeID = rf_RaidAddressToStripeID(layoutPtr, raidAddress);
+ lastSID = rf_RaidAddressToStripeID(layoutPtr, lastRaidAddr);
+ totStripes = lastSID - stripeID + 1;
+ SUID = rf_RaidAddressToStripeUnitID(layoutPtr, raidAddress);
+ lastSUID = rf_RaidAddressToStripeUnitID(layoutPtr, lastRaidAddr);
+
+ asmList = rf_AllocASMList(totStripes);
+ pdaList = rf_AllocPDAList(lastSUID - SUID + 1 + faultsTolerated * totStripes); /* may also need pda(s)
+ * per stripe for parity */
+
+ if (rf_mapDebug)
+ rf_PrintRaidAddressInfo(raidPtr, raidAddress, numBlocks);
+ for (; raidAddress < endAddress;) {
+ /* make the next stripe structure */
+ RF_ASSERT(asmList);
+ t_asm = asmList;
+ asmList = asmList->next;
+ bzero((char *) t_asm, sizeof(RF_AccessStripeMap_t));
+ if (!asm_p)
+ asm_list = asm_p = t_asm;
+ else {
+ asm_p->next = t_asm;
+ asm_p = asm_p->next;
+ }
+ numStripes++;
+
+ /* map SUs from current location to the end of the stripe */
+ asm_p->stripeID = /* rf_RaidAddressToStripeID(layoutPtr,
+ raidAddress) */ stripeID++;
+ stripeRealEndAddress = rf_RaidAddressOfNextStripeBoundary(layoutPtr, raidAddress);
+ stripeEndAddress = RF_MIN(endAddress, stripeRealEndAddress);
+ asm_p->raidAddress = raidAddress;
+ asm_p->endRaidAddress = stripeEndAddress;
+
+ /* map each stripe unit in the stripe */
+ pda_p = NULL;
+ startAddrWithinStripe = raidAddress; /* Raid addr of start of
+ * portion of access
+ * that is within this
+ * stripe */
+ for (; raidAddress < stripeEndAddress;) {
+ RF_ASSERT(pdaList);
+ t_pda = pdaList;
+ pdaList = pdaList->next;
+ bzero((char *) t_pda, sizeof(RF_PhysDiskAddr_t));
+ if (!pda_p)
+ asm_p->physInfo = pda_p = t_pda;
+ else {
+ pda_p->next = t_pda;
+ pda_p = pda_p->next;
+ }
+
+ pda_p->type = RF_PDA_TYPE_DATA;
+ (layoutPtr->map->MapSector) (raidPtr, raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), remap);
+
+ /* mark any failures we find. failedPDA is don't-care
+ * if there is more than one failure */
+ pda_p->raidAddress = raidAddress; /* the RAID address
+ * corresponding to this
+ * physical disk address */
+ nextStripeUnitAddress = rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, raidAddress);
+ pda_p->numSector = RF_MIN(endAddress, nextStripeUnitAddress) - raidAddress;
+ RF_ASSERT(pda_p->numSector != 0);
+ rf_ASMCheckStatus(raidPtr, pda_p, asm_p, disks, 0);
+ pda_p->bufPtr = buffer + rf_RaidAddressToByte(raidPtr, (raidAddress - startAddress));
+ asm_p->totalSectorsAccessed += pda_p->numSector;
+ asm_p->numStripeUnitsAccessed++;
+ asm_p->origRow = pda_p->row; /* redundant but
+ * harmless to do this
+ * in every loop
+ * iteration */
+
+ raidAddress = RF_MIN(endAddress, nextStripeUnitAddress);
+ }
+
+ /* Map the parity. At this stage, the startSector and
+ * numSector fields for the parity unit are always set to
+ * indicate the entire parity unit. We may modify this after
+ * mapping the data portion. */
+ switch (faultsTolerated) {
+ case 0:
+ break;
+ case 1: /* single fault tolerant */
+ RF_ASSERT(pdaList);
+ t_pda = pdaList;
+ pdaList = pdaList->next;
+ bzero((char *) t_pda, sizeof(RF_PhysDiskAddr_t));
+ pda_p = asm_p->parityInfo = t_pda;
+ pda_p->type = RF_PDA_TYPE_PARITY;
+ (layoutPtr->map->MapParity) (raidPtr, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe),
+ &(pda_p->row), &(pda_p->col), &(pda_p->startSector), remap);
+ pda_p->numSector = layoutPtr->sectorsPerStripeUnit;
+ /* raidAddr may be needed to find unit to redirect to */
+ pda_p->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe);
+ rf_ASMCheckStatus(raidPtr, pda_p, asm_p, disks, 1);
+ rf_ASMParityAdjust(asm_p->parityInfo, startAddrWithinStripe, endAddress, layoutPtr, asm_p);
+
+ break;
+ case 2: /* two fault tolerant */
+ RF_ASSERT(pdaList && pdaList->next);
+ t_pda = pdaList;
+ pdaList = pdaList->next;
+ bzero((char *) t_pda, sizeof(RF_PhysDiskAddr_t));
+ pda_p = asm_p->parityInfo = t_pda;
+ pda_p->type = RF_PDA_TYPE_PARITY;
+ t_pda = pdaList;
+ pdaList = pdaList->next;
+ bzero((char *) t_pda, sizeof(RF_PhysDiskAddr_t));
+ pda_q = asm_p->qInfo = t_pda;
+ pda_q->type = RF_PDA_TYPE_Q;
+ (layoutPtr->map->MapParity) (raidPtr, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe),
+ &(pda_p->row), &(pda_p->col), &(pda_p->startSector), remap);
+ (layoutPtr->map->MapQ) (raidPtr, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe),
+ &(pda_q->row), &(pda_q->col), &(pda_q->startSector), remap);
+ pda_q->numSector = pda_p->numSector = layoutPtr->sectorsPerStripeUnit;
+ /* raidAddr may be needed to find unit to redirect to */
+ pda_p->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe);
+ pda_q->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe);
+ /* failure mode stuff */
+ rf_ASMCheckStatus(raidPtr, pda_p, asm_p, disks, 1);
+ rf_ASMCheckStatus(raidPtr, pda_q, asm_p, disks, 1);
+ rf_ASMParityAdjust(asm_p->parityInfo, startAddrWithinStripe, endAddress, layoutPtr, asm_p);
+ rf_ASMParityAdjust(asm_p->qInfo, startAddrWithinStripe, endAddress, layoutPtr, asm_p);
+ break;
+ }
+ }
+ RF_ASSERT(asmList == NULL && pdaList == NULL);
+ /* make the header structure */
+ asm_hdr = rf_AllocAccessStripeMapHeader();
+ RF_ASSERT(numStripes == totStripes);
+ asm_hdr->numStripes = numStripes;
+ asm_hdr->stripeMap = asm_list;
+
+ if (rf_mapDebug)
+ rf_PrintAccessStripeMap(asm_hdr);
+ return (asm_hdr);
+}
+/*****************************************************************************************
+ * This routine walks through an ASM list and marks the PDAs that have failed.
+ * It's called only when a disk failure causes an in-flight DAG to fail.
+ * The parity may consist of two components, but we want to use only one failedPDA
+ * pointer. Thus we set failedPDA to point to the first parity component, and rely
+ * on the rest of the code to do the right thing with this.
+ ****************************************************************************************/
+
+void
+rf_MarkFailuresInASMList(raidPtr, asm_h)
+ RF_Raid_t *raidPtr;
+ RF_AccessStripeMapHeader_t *asm_h;
+{
+ RF_RaidDisk_t **disks = raidPtr->Disks;
+ RF_AccessStripeMap_t *asmap;
+ RF_PhysDiskAddr_t *pda;
+
+ for (asmap = asm_h->stripeMap; asmap; asmap = asmap->next) {
+ /* reset the per-stripe failure accounting before re-scanning */
+ asmap->numDataFailed = asmap->numParityFailed = asmap->numQFailed = 0;
+ asmap->numFailedPDAs = 0;
+ bzero((char *) asmap->failedPDAs,
+ RF_MAX_FAILED_PDA * sizeof(RF_PhysDiskAddr_t *));
+ /* scan the data PDAs for dead disks */
+ for (pda = asmap->physInfo; pda; pda = pda->next) {
+ if (RF_DEAD_DISK(disks[pda->row][pda->col].status)) {
+ asmap->numDataFailed++;
+ asmap->failedPDAs[asmap->numFailedPDAs] = pda;
+ asmap->numFailedPDAs++;
+ }
+ }
+ /* check only the first parity component; failedPDAs points at it */
+ pda = asmap->parityInfo;
+ if (pda && RF_DEAD_DISK(disks[pda->row][pda->col].status)) {
+ asmap->numParityFailed++;
+ asmap->failedPDAs[asmap->numFailedPDAs] = pda;
+ asmap->numFailedPDAs++;
+ }
+ /* the Q PDA exists only in dual-fault-tolerant layouts */
+ pda = asmap->qInfo;
+ if (pda && RF_DEAD_DISK(disks[pda->row][pda->col].status)) {
+ asmap->numQFailed++;
+ asmap->failedPDAs[asmap->numFailedPDAs] = pda;
+ asmap->numFailedPDAs++;
+ }
+ }
+}
+/*****************************************************************************************
+ *
+ * DuplicateASM -- duplicates an ASM and returns the new one
+ *
+ ****************************************************************************************/
+RF_AccessStripeMap_t *
+rf_DuplicateASM(asmap)
+ RF_AccessStripeMap_t *asmap;
+{
+ RF_AccessStripeMap_t *new_asm;
+ RF_PhysDiskAddr_t *pda, *new_pda, *t_pda;
+
+ new_pda = NULL;
+ /* start from a shallow copy, then deep-copy the PDA lists below */
+ new_asm = rf_AllocAccessStripeMapComponent();
+ bcopy((char *) asmap, (char *) new_asm, sizeof(RF_AccessStripeMap_t));
+ new_asm->numFailedPDAs = 0; /* ??? */
+ new_asm->failedPDAs[0] = NULL;
+ new_asm->physInfo = NULL;
+ new_asm->parityInfo = NULL;
+ new_asm->next = NULL;
+
+ for (pda = asmap->physInfo; pda; pda = pda->next) { /* copy the physInfo
+ * list */
+ t_pda = rf_AllocPhysDiskAddr();
+ bcopy((char *) pda, (char *) t_pda, sizeof(RF_PhysDiskAddr_t));
+ t_pda->next = NULL;
+ if (!new_asm->physInfo) {
+ new_asm->physInfo = t_pda;
+ new_pda = t_pda;
+ } else {
+ new_pda->next = t_pda;
+ new_pda = new_pda->next;
+ }
+ /* keep failedPDAs[0] pointing at the duplicate, not the original */
+ if (pda == asmap->failedPDAs[0])
+ new_asm->failedPDAs[0] = t_pda;
+ }
+ for (pda = asmap->parityInfo; pda; pda = pda->next) { /* copy the parityInfo
+ * list */
+ t_pda = rf_AllocPhysDiskAddr();
+ bcopy((char *) pda, (char *) t_pda, sizeof(RF_PhysDiskAddr_t));
+ t_pda->next = NULL;
+ if (!new_asm->parityInfo) {
+ new_asm->parityInfo = t_pda;
+ new_pda = t_pda;
+ } else {
+ new_pda->next = t_pda;
+ new_pda = new_pda->next;
+ }
+ if (pda == asmap->failedPDAs[0])
+ new_asm->failedPDAs[0] = t_pda;
+ }
+ return (new_asm);
+}
+/*****************************************************************************************
+ *
+ * DuplicatePDA -- duplicates a PDA and returns the new one
+ *
+ ****************************************************************************************/
+RF_PhysDiskAddr_t *
+rf_DuplicatePDA(pda)
+ RF_PhysDiskAddr_t *pda;
+{
+ RF_PhysDiskAddr_t *new;
+
+ /* shallow copy: the next pointer is copied too, so the duplicate
+ * still references the original's successor */
+ new = rf_AllocPhysDiskAddr();
+ bcopy((char *) pda, (char *) new, sizeof(RF_PhysDiskAddr_t));
+ return (new);
+}
+/*****************************************************************************************
+ *
+ * routines to allocate and free list elements. All allocation routines zero the
+ * structure before returning it.
+ *
+ * FreePhysDiskAddr should normally not be called directly, because
+ * FreeAccessStripeMap takes care of freeing the PhysDiskAddr list.
+ *
+ ****************************************************************************************/
+
+static RF_FreeList_t *rf_asmhdr_freelist;
+#define RF_MAX_FREE_ASMHDR 128
+#define RF_ASMHDR_INC 16
+#define RF_ASMHDR_INITIAL 32
+
+static RF_FreeList_t *rf_asm_freelist;
+#define RF_MAX_FREE_ASM 192
+#define RF_ASM_INC 24
+#define RF_ASM_INITIAL 64
+
+static RF_FreeList_t *rf_pda_freelist;
+#define RF_MAX_FREE_PDA 192
+#define RF_PDA_INC 24
+#define RF_PDA_INITIAL 64
+
+/* called at shutdown time. So far, all that is necessary is to release all the free lists */
+static void rf_ShutdownMapModule(void *);
+static void
+rf_ShutdownMapModule(ignored)
+ void *ignored;
+{
+ /* release the three static free lists built by rf_ConfigureMapModule */
+ RF_FREELIST_DESTROY(rf_asmhdr_freelist, next, (RF_AccessStripeMapHeader_t *));
+ RF_FREELIST_DESTROY(rf_pda_freelist, next, (RF_PhysDiskAddr_t *));
+ RF_FREELIST_DESTROY(rf_asm_freelist, next, (RF_AccessStripeMap_t *));
+}
+
+int
+rf_ConfigureMapModule(listp)
+ RF_ShutdownList_t **listp;
+{
+ int rc;
+
+ RF_FREELIST_CREATE(rf_asmhdr_freelist, RF_MAX_FREE_ASMHDR,
+ RF_ASMHDR_INC, sizeof(RF_AccessStripeMapHeader_t));
+ if (rf_asmhdr_freelist == NULL) {
+ return (ENOMEM);
+ }
+ RF_FREELIST_CREATE(rf_asm_freelist, RF_MAX_FREE_ASM,
+ RF_ASM_INC, sizeof(RF_AccessStripeMap_t));
+ if (rf_asm_freelist == NULL) {
+ RF_FREELIST_DESTROY(rf_asmhdr_freelist, next, (RF_AccessStripeMapHeader_t *));
+ return (ENOMEM);
+ }
+ RF_FREELIST_CREATE(rf_pda_freelist, RF_MAX_FREE_PDA,
+ RF_PDA_INC, sizeof(RF_PhysDiskAddr_t));
+ if (rf_pda_freelist == NULL) {
+ /* Destroy the two lists that were successfully created.  The
+ * previous code destroyed rf_pda_freelist (NULL here) and
+ * leaked rf_asm_freelist. */
+ RF_FREELIST_DESTROY(rf_asmhdr_freelist, next, (RF_AccessStripeMapHeader_t *));
+ RF_FREELIST_DESTROY(rf_asm_freelist, next, (RF_AccessStripeMap_t *));
+ return (ENOMEM);
+ }
+ rc = rf_ShutdownCreate(listp, rf_ShutdownMapModule, NULL);
+ if (rc) {
+ RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ rf_ShutdownMapModule(NULL);
+ return (rc);
+ }
+ /* pre-populate each free list so steady-state allocation avoids malloc */
+ RF_FREELIST_PRIME(rf_asmhdr_freelist, RF_ASMHDR_INITIAL, next,
+ (RF_AccessStripeMapHeader_t *));
+ RF_FREELIST_PRIME(rf_asm_freelist, RF_ASM_INITIAL, next,
+ (RF_AccessStripeMap_t *));
+ RF_FREELIST_PRIME(rf_pda_freelist, RF_PDA_INITIAL, next,
+ (RF_PhysDiskAddr_t *));
+
+ return (0);
+}
+
+RF_AccessStripeMapHeader_t *
+rf_AllocAccessStripeMapHeader()
+{
+ RF_AccessStripeMapHeader_t *p;
+
+ /* pull a header off the static free list and zero it.
+ * NOTE(review): p is not NULL-checked before bzero -- presumably
+ * RF_FREELIST_GET falls back to allocation; verify. */
+ RF_FREELIST_GET(rf_asmhdr_freelist, p, next, (RF_AccessStripeMapHeader_t *));
+ bzero((char *) p, sizeof(RF_AccessStripeMapHeader_t));
+
+ return (p);
+}
+
+
+void
+rf_FreeAccessStripeMapHeader(p)
+ RF_AccessStripeMapHeader_t *p;
+{
+ /* return the header to the static free list */
+ RF_FREELIST_FREE(rf_asmhdr_freelist, p, next);
+}
+
+RF_PhysDiskAddr_t *
+rf_AllocPhysDiskAddr()
+{
+ RF_PhysDiskAddr_t *p;
+
+ /* pull a PDA off the static free list and zero it */
+ RF_FREELIST_GET(rf_pda_freelist, p, next, (RF_PhysDiskAddr_t *));
+ bzero((char *) p, sizeof(RF_PhysDiskAddr_t));
+
+ return (p);
+}
+/* allocates a list of PDAs, locking the free list only once
+ * when we have to call calloc, we do it one component at a time to simplify
+ * the process of freeing the list at program shutdown. This should not be
+ * much of a performance hit, because it should be very infrequently executed.
+ */
+RF_PhysDiskAddr_t *
+rf_AllocPDAList(count)
+ int count;
+{
+ RF_PhysDiskAddr_t *p = NULL;
+
+ /* grab count PDAs in one free-list operation; elements are NOT zeroed */
+ RF_FREELIST_GET_N(rf_pda_freelist, p, next, (RF_PhysDiskAddr_t *), count);
+ return (p);
+}
+
+void
+rf_FreePhysDiskAddr(p)
+ RF_PhysDiskAddr_t *p;
+{
+ /* return a single PDA to the static free list */
+ RF_FREELIST_FREE(rf_pda_freelist, p, next);
+}
+
+static void
+rf_FreePDAList(l_start, l_end, count)
+ RF_PhysDiskAddr_t *l_start, *l_end; /* pointers to start and end
+ * of list */
+ int count; /* number of elements in list */
+{
+ /* return a whole pre-linked PDA list in one free-list operation */
+ RF_FREELIST_FREE_N(rf_pda_freelist, l_start, next, (RF_PhysDiskAddr_t *), count);
+}
+
+RF_AccessStripeMap_t *
+rf_AllocAccessStripeMapComponent()
+{
+ RF_AccessStripeMap_t *p;
+
+ /* pull an ASM component off the static free list and zero it */
+ RF_FREELIST_GET(rf_asm_freelist, p, next, (RF_AccessStripeMap_t *));
+ bzero((char *) p, sizeof(RF_AccessStripeMap_t));
+
+ return (p);
+}
+/* this is essentially identical to AllocPDAList. I should combine the two.
+ * when we have to call calloc, we do it one component at a time to simplify
+ * the process of freeing the list at program shutdown. This should not be
+ * much of a performance hit, because it should be very infrequently executed.
+ */
+RF_AccessStripeMap_t *
+rf_AllocASMList(count)
+ int count;
+{
+ RF_AccessStripeMap_t *p = NULL;
+
+ /* grab count ASMs in one free-list operation; elements are NOT zeroed */
+ RF_FREELIST_GET_N(rf_asm_freelist, p, next, (RF_AccessStripeMap_t *), count);
+ return (p);
+}
+
+void
+rf_FreeAccessStripeMapComponent(p)
+ RF_AccessStripeMap_t *p;
+{
+ /* return a single ASM component to the static free list */
+ RF_FREELIST_FREE(rf_asm_freelist, p, next);
+}
+
+static void
+rf_FreeASMList(l_start, l_end, count)
+ RF_AccessStripeMap_t *l_start, *l_end;
+ int count;
+{
+ /* return a whole pre-linked ASM list in one free-list operation */
+ RF_FREELIST_FREE_N(rf_asm_freelist, l_start, next, (RF_AccessStripeMap_t *), count);
+}
+
+void
+rf_FreeAccessStripeMap(hdr)
+ RF_AccessStripeMapHeader_t *hdr;
+{
+ RF_AccessStripeMap_t *p, *pt = NULL;
+ RF_PhysDiskAddr_t *pdp, *trailer, *pdaList = NULL, *pdaEnd = NULL;
+ int count = 0, t, asm_count = 0;
+
+ /* Chain the q/parity/phys PDA lists of every stripe into one long
+ * list so they can all be returned with a single rf_FreePDAList call. */
+ for (p = hdr->stripeMap; p; p = p->next) {
+
+ /* link the 3 pda lists into the accumulating pda list */
+
+ if (!pdaList)
+ pdaList = p->qInfo;
+ else
+ pdaEnd->next = p->qInfo;
+ for (trailer = NULL, pdp = p->qInfo; pdp;) {
+ trailer = pdp;
+ pdp = pdp->next;
+ count++;
+ }
+ if (trailer)
+ pdaEnd = trailer;
+
+ if (!pdaList)
+ pdaList = p->parityInfo;
+ else
+ pdaEnd->next = p->parityInfo;
+ for (trailer = NULL, pdp = p->parityInfo; pdp;) {
+ trailer = pdp;
+ pdp = pdp->next;
+ count++;
+ }
+ if (trailer)
+ pdaEnd = trailer;
+
+ if (!pdaList)
+ pdaList = p->physInfo;
+ else
+ pdaEnd->next = p->physInfo;
+ for (trailer = NULL, pdp = p->physInfo; pdp;) {
+ trailer = pdp;
+ pdp = pdp->next;
+ count++;
+ }
+ if (trailer)
+ pdaEnd = trailer;
+
+ /* remember the last ASM so rf_FreeASMList gets both ends */
+ pt = p;
+ asm_count++;
+ }
+
+ /* debug only */
+ for (t = 0, pdp = pdaList; pdp; pdp = pdp->next)
+ t++;
+ RF_ASSERT(t == count);
+
+ if (pdaList)
+ rf_FreePDAList(pdaList, pdaEnd, count);
+ rf_FreeASMList(hdr->stripeMap, pt, asm_count);
+ rf_FreeAccessStripeMapHeader(hdr);
+}
+/* We can't use the large write optimization if there are any failures in the stripe.
+ * In the declustered layout, there is no way to immediately determine what disks
+ * constitute a stripe, so we actually have to hunt through the stripe looking for failures.
+ * The reason we map the parity instead of just using asm->parityInfo->col is because
+ * the latter may have been already redirected to a spare drive, which would
+ * mess up the computation of the stripe offset.
+ *
+ * ASSUMES AT MOST ONE FAILURE IN THE STRIPE.
+ */
+int
+rf_CheckStripeForFailures(raidPtr, asmap)
+ RF_Raid_t *raidPtr;
+ RF_AccessStripeMap_t *asmap;
+{
+ RF_RowCol_t trow, tcol, prow, pcol, *diskids, row, i;
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_StripeCount_t stripeOffset;
+ int numFailures;
+ RF_RaidAddr_t sosAddr;
+ RF_SectorNum_t diskOffset, poffset;
+ RF_RowCol_t testrow;
+
+ /* quick out in the fault-free case. */
+ RF_LOCK_MUTEX(raidPtr->mutex);
+ numFailures = raidPtr->numFailures;
+ RF_UNLOCK_MUTEX(raidPtr->mutex);
+ if (numFailures == 0)
+ return (0);
+
+ sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
+ row = asmap->physInfo->row;
+ (layoutPtr->map->IdentifyStripe) (raidPtr, asmap->raidAddress, &diskids, &testrow);
+ (layoutPtr->map->MapParity) (raidPtr, asmap->raidAddress, &prow, &pcol, &poffset, 0); /* get pcol */
+
+ /* this need not be true if we've redirected the access to a spare in
+ * another row RF_ASSERT(row == testrow); */
+ /* walk the stripe's disks (skipping the parity disk), hunting for a
+ * dead one; return 1 if the large-write optimization must be disabled */
+ stripeOffset = 0;
+ for (i = 0; i < layoutPtr->numDataCol + layoutPtr->numParityCol; i++) {
+ if (diskids[i] != pcol) {
+ if (RF_DEAD_DISK(raidPtr->Disks[testrow][diskids[i]].status)) {
+ if (raidPtr->status[testrow] != rf_rs_reconstructing)
+ return (1);
+ RF_ASSERT(raidPtr->reconControl[testrow]->fcol == diskids[i]);
+ layoutPtr->map->MapSector(raidPtr,
+ sosAddr + stripeOffset * layoutPtr->sectorsPerStripeUnit,
+ &trow, &tcol, &diskOffset, 0);
+ RF_ASSERT((trow == testrow) && (tcol == diskids[i]));
+ /* dead disk under reconstruction: usable only if
+ * this reconstruction unit has been rebuilt */
+ if (!rf_CheckRUReconstructed(raidPtr->reconControl[testrow]->reconMap, diskOffset))
+ return (1);
+ asmap->flags |= RF_ASM_REDIR_LARGE_WRITE;
+ return (0);
+ }
+ stripeOffset++;
+ }
+ }
+ return (0);
+}
+/*
+ return the number of failed data units in the stripe.
+*/
+
+int
+rf_NumFailedDataUnitsInStripe(raidPtr, asmap)
+ RF_Raid_t *raidPtr;
+ RF_AccessStripeMap_t *asmap;
+{
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_RowCol_t trow, tcol, row, i;
+ RF_SectorNum_t diskOffset;
+ RF_RaidAddr_t sosAddr;
+ int numFailures;
+
+ /* quick out in the fault-free case. */
+ RF_LOCK_MUTEX(raidPtr->mutex);
+ numFailures = raidPtr->numFailures;
+ RF_UNLOCK_MUTEX(raidPtr->mutex);
+ if (numFailures == 0)
+ return (0);
+ /* reuse the variable as the per-stripe failed-data-unit counter */
+ numFailures = 0;
+
+ sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
+ row = asmap->physInfo->row;
+ /* map each data stripe unit in turn and count those on dead disks */
+ for (i = 0; i < layoutPtr->numDataCol; i++) {
+ (layoutPtr->map->MapSector) (raidPtr, sosAddr + i * layoutPtr->sectorsPerStripeUnit,
+ &trow, &tcol, &diskOffset, 0);
+ if (RF_DEAD_DISK(raidPtr->Disks[trow][tcol].status))
+ numFailures++;
+ }
+
+ return numFailures;
+}
+
+
+/*****************************************************************************************
+ *
+ * debug routines
+ *
+ ****************************************************************************************/
+
+void
+rf_PrintAccessStripeMap(asm_h)
+ RF_AccessStripeMapHeader_t *asm_h;
+{
+ /* convenience wrapper: print the ASM without buffer pointers */
+ rf_PrintFullAccessStripeMap(asm_h, 0);
+}
+
+void
+rf_PrintFullAccessStripeMap(asm_h, prbuf)
+ RF_AccessStripeMapHeader_t *asm_h;
+ int prbuf; /* flag to print buffer pointers */
+{
+ int i;
+ RF_AccessStripeMap_t *asmap = asm_h->stripeMap;
+ RF_PhysDiskAddr_t *p;
+ printf("%d stripes total\n", (int) asm_h->numStripes);
+ for (; asmap; asmap = asmap->next) {
+ /* printf("Num failures: %d\n",asmap->numDataFailed); */
+ /* printf("Num sectors:
+ * %d\n",(int)asmap->totalSectorsAccessed); */
+ printf("Stripe %d (%d sectors), failures: %d data, %d parity: ",
+ (int) asmap->stripeID,
+ (int) asmap->totalSectorsAccessed,
+ (int) asmap->numDataFailed,
+ (int) asmap->numParityFailed);
+ /* parity PDA(s): at most two components (see rf_ASMParityAdjust) */
+ if (asmap->parityInfo) {
+ printf("Parity [r%d c%d s%d-%d", asmap->parityInfo->row, asmap->parityInfo->col,
+ (int) asmap->parityInfo->startSector,
+ (int) (asmap->parityInfo->startSector +
+ asmap->parityInfo->numSector - 1));
+ if (prbuf)
+ printf(" b0x%lx", (unsigned long) asmap->parityInfo->bufPtr);
+ if (asmap->parityInfo->next) {
+ printf(", r%d c%d s%d-%d", asmap->parityInfo->next->row,
+ asmap->parityInfo->next->col,
+ (int) asmap->parityInfo->next->startSector,
+ (int) (asmap->parityInfo->next->startSector +
+ asmap->parityInfo->next->numSector - 1));
+ if (prbuf)
+ printf(" b0x%lx", (unsigned long) asmap->parityInfo->next->bufPtr);
+ RF_ASSERT(asmap->parityInfo->next->next == NULL);
+ }
+ printf("]\n\t");
+ }
+ /* data stripe units, two per output line */
+ for (i = 0, p = asmap->physInfo; p; p = p->next, i++) {
+ printf("SU r%d c%d s%d-%d ", p->row, p->col, (int) p->startSector,
+ (int) (p->startSector + p->numSector - 1));
+ if (prbuf)
+ printf("b0x%lx ", (unsigned long) p->bufPtr);
+ if (i && !(i & 1))
+ printf("\n\t");
+ }
+ printf("\n");
+ /* NOTE(review): failure summary always reads the FIRST stripe's
+ * counters/failedPDAs even inside this per-stripe loop -- verify
+ * whether asmap was intended here instead of asm_h->stripeMap */
+ p = asm_h->stripeMap->failedPDAs[0];
+ if (asm_h->stripeMap->numDataFailed + asm_h->stripeMap->numParityFailed > 1)
+ printf("[multiple failures]\n");
+ else
+ if (asm_h->stripeMap->numDataFailed + asm_h->stripeMap->numParityFailed > 0)
+ printf("\t[Failed PDA: r%d c%d s%d-%d]\n", p->row, p->col,
+ (int) p->startSector, (int) (p->startSector + p->numSector - 1));
+ }
+}
+
+void
+rf_PrintRaidAddressInfo(raidPtr, raidAddr, numBlocks)
+ RF_Raid_t *raidPtr;
+ RF_RaidAddr_t raidAddr;
+ RF_SectorCount_t numBlocks;
+{
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_RaidAddr_t ra, sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, raidAddr);
+
+ /* debug aid: dump the stripe-unit boundaries that the access spans */
+ printf("Raid addrs of SU boundaries from start of stripe to end of access:\n\t");
+ for (ra = sosAddr; ra <= raidAddr + numBlocks; ra += layoutPtr->sectorsPerStripeUnit) {
+ printf("%d (0x%x), ", (int) ra, (int) ra);
+ }
+ printf("\n");
+ printf("Offset into stripe unit: %d (0x%x)\n",
+ (int) (raidAddr % layoutPtr->sectorsPerStripeUnit),
+ (int) (raidAddr % layoutPtr->sectorsPerStripeUnit));
+}
+/*
+ given a parity descriptor and the starting address within a stripe,
+ range restrict the parity descriptor to touch only the correct stuff.
+*/
+void
+rf_ASMParityAdjust(
+ RF_PhysDiskAddr_t * toAdjust,
+ RF_StripeNum_t startAddrWithinStripe,
+ RF_SectorNum_t endAddress,
+ RF_RaidLayout_t * layoutPtr,
+ RF_AccessStripeMap_t * asm_p)
+{
+ RF_PhysDiskAddr_t *new_pda;
+
+ /* when we're accessing only a portion of one stripe unit, we want the
+ * parity descriptor to identify only the chunk of parity associated
+ * with the data. When the access spans exactly one stripe unit
+ * boundary and is less than a stripe unit in size, it uses two
+ * disjoint regions of the parity unit. When an access spans more
+ * than one stripe unit boundary, it uses all of the parity unit.
+ *
+ * To better handle the case where stripe units are small, we may
+ * eventually want to change the 2nd case so that if the SU size is
+ * below some threshold, we just read/write the whole thing instead of
+ * breaking it up into two accesses. */
+ if (asm_p->numStripeUnitsAccessed == 1) {
+ /* case 1: shrink the parity PDA to match the data region */
+ int x = (startAddrWithinStripe % layoutPtr->sectorsPerStripeUnit);
+ toAdjust->startSector += x;
+ toAdjust->raidAddress += x;
+ toAdjust->numSector = asm_p->physInfo->numSector;
+ RF_ASSERT(toAdjust->numSector != 0);
+ } else
+ if (asm_p->numStripeUnitsAccessed == 2 && asm_p->totalSectorsAccessed < layoutPtr->sectorsPerStripeUnit) {
+ /* case 2: two disjoint parity regions -> chain a second PDA */
+ int x = (startAddrWithinStripe % layoutPtr->sectorsPerStripeUnit);
+
+ /* create a second pda and copy the parity map info
+ * into it */
+ RF_ASSERT(toAdjust->next == NULL);
+ new_pda = toAdjust->next = rf_AllocPhysDiskAddr();
+ *new_pda = *toAdjust; /* structure assignment */
+ new_pda->next = NULL;
+
+ /* adjust the start sector & number of blocks for the
+ * first parity pda */
+ toAdjust->startSector += x;
+ toAdjust->raidAddress += x;
+ toAdjust->numSector = rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, startAddrWithinStripe) - startAddrWithinStripe;
+ RF_ASSERT(toAdjust->numSector != 0);
+
+ /* adjust the second pda */
+ new_pda->numSector = endAddress - rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, endAddress);
+ /* new_pda->raidAddress =
+ * rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr,
+ * toAdjust->raidAddress); */
+ RF_ASSERT(new_pda->numSector != 0);
+ }
+}
+/*
+ Check if a disk has been spared or failed. If spared,
+ redirect the I/O.
+ If it has been failed, record it in the asm pointer.
+ Fourth arg is whether data or parity.
+*/
+void
+rf_ASMCheckStatus(
+ RF_Raid_t * raidPtr,
+ RF_PhysDiskAddr_t * pda_p,
+ RF_AccessStripeMap_t * asm_p,
+ RF_RaidDisk_t ** disks,
+ int parity)
+{
+ RF_DiskStatus_t dstatus;
+ RF_RowCol_t frow, fcol;
+
+ dstatus = disks[pda_p->row][pda_p->col].status;
+
+ if (dstatus == rf_ds_spared) {
+ /* if the disk has been spared, redirect access to the spare */
+ frow = pda_p->row;
+ fcol = pda_p->col;
+ pda_p->row = disks[frow][fcol].spareRow;
+ pda_p->col = disks[frow][fcol].spareCol;
+ } else
+ if (dstatus == rf_ds_dist_spared) {
+ /* ditto if disk has been spared to dist spare space */
+ RF_RowCol_t or = pda_p->row, oc = pda_p->col;
+ RF_SectorNum_t oo = pda_p->startSector;
+
+ /* re-map the PDA with RF_REMAP to land in spare space */
+ if (pda_p->type == RF_PDA_TYPE_DATA)
+ raidPtr->Layout.map->MapSector(raidPtr, pda_p->raidAddress, &pda_p->row, &pda_p->col, &pda_p->startSector, RF_REMAP);
+ else
+ raidPtr->Layout.map->MapParity(raidPtr, pda_p->raidAddress, &pda_p->row, &pda_p->col, &pda_p->startSector, RF_REMAP);
+
+ if (rf_mapDebug) {
+ printf("Redirected r %d c %d o %d -> r%d c %d o %d\n", or, oc, (int) oo,
+ pda_p->row, pda_p->col, (int) pda_p->startSector);
+ }
+ } else
+ if (RF_DEAD_DISK(dstatus)) {
+ /* if the disk is inaccessible, mark the
+ * failure */
+ if (parity)
+ asm_p->numParityFailed++;
+ else {
+ asm_p->numDataFailed++;
+#if 0
+ /* XXX Do we really want this spewing
+ * out on the console? GO */
+ printf("DATA_FAILED!\n");
+#endif
+ }
+ asm_p->failedPDAs[asm_p->numFailedPDAs] = pda_p;
+ asm_p->numFailedPDAs++;
+#if 0
+ switch (asm_p->numParityFailed + asm_p->numDataFailed) {
+ case 1:
+ asm_p->failedPDAs[0] = pda_p;
+ break;
+ case 2:
+ asm_p->failedPDAs[1] = pda_p;
+ default:
+ break;
+ }
+#endif
+ }
+ /* the redirected access should never span a stripe unit boundary */
+ RF_ASSERT(rf_RaidAddressToStripeUnitID(&raidPtr->Layout, pda_p->raidAddress) ==
+ rf_RaidAddressToStripeUnitID(&raidPtr->Layout, pda_p->raidAddress + pda_p->numSector - 1));
+ RF_ASSERT(pda_p->col != -1);
+}
diff --git a/sys/dev/raidframe/rf_map.h b/sys/dev/raidframe/rf_map.h
new file mode 100644
index 0000000..d7c6d19
--- /dev/null
+++ b/sys/dev/raidframe/rf_map.h
@@ -0,0 +1,94 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_map.h,v 1.3 1999/02/05 00:06:12 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_map.h */
+
+#ifndef _RF__RF_MAP_H_
+#define _RF__RF_MAP_H_
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_alloclist.h>
+#include <dev/raidframe/rf_raid.h>
+
+/* mapping structure allocation and free routines */
+RF_AccessStripeMapHeader_t *
+rf_MapAccess(RF_Raid_t * raidPtr,
+ RF_RaidAddr_t raidAddress, RF_SectorCount_t numBlocks,
+ caddr_t buffer, int remap);
+
+void
+rf_MarkFailuresInASMList(RF_Raid_t * raidPtr,
+ RF_AccessStripeMapHeader_t * asm_h);
+
+RF_AccessStripeMap_t *rf_DuplicateASM(RF_AccessStripeMap_t * asmap);
+
+RF_PhysDiskAddr_t *rf_DuplicatePDA(RF_PhysDiskAddr_t * pda);
+
+int rf_ConfigureMapModule(RF_ShutdownList_t ** listp);
+
+RF_AccessStripeMapHeader_t *rf_AllocAccessStripeMapHeader(void);
+
+void rf_FreeAccessStripeMapHeader(RF_AccessStripeMapHeader_t * p);
+
+RF_PhysDiskAddr_t *rf_AllocPhysDiskAddr(void);
+
+RF_PhysDiskAddr_t *rf_AllocPDAList(int count);
+
+void rf_FreePhysDiskAddr(RF_PhysDiskAddr_t * p);
+
+RF_AccessStripeMap_t *rf_AllocAccessStripeMapComponent(void);
+
+RF_AccessStripeMap_t *rf_AllocASMList(int count);
+
+void rf_FreeAccessStripeMapComponent(RF_AccessStripeMap_t * p);
+
+void rf_FreeAccessStripeMap(RF_AccessStripeMapHeader_t * hdr);
+
+int rf_CheckStripeForFailures(RF_Raid_t * raidPtr, RF_AccessStripeMap_t * asmap);
+
+int rf_NumFailedDataUnitsInStripe(RF_Raid_t * raidPtr, RF_AccessStripeMap_t * asmap);
+
+void rf_PrintAccessStripeMap(RF_AccessStripeMapHeader_t * asm_h);
+
+void rf_PrintFullAccessStripeMap(RF_AccessStripeMapHeader_t * asm_h, int prbuf);
+
+void
+rf_PrintRaidAddressInfo(RF_Raid_t * raidPtr, RF_RaidAddr_t raidAddr,
+ RF_SectorCount_t numBlocks);
+
+void
+rf_ASMParityAdjust(RF_PhysDiskAddr_t * toAdjust,
+ RF_StripeNum_t startAddrWithinStripe, RF_SectorNum_t endAddress,
+ RF_RaidLayout_t * layoutPtr, RF_AccessStripeMap_t * asm_p);
+
+void
+rf_ASMCheckStatus(RF_Raid_t * raidPtr, RF_PhysDiskAddr_t * pda_p,
+ RF_AccessStripeMap_t * asm_p, RF_RaidDisk_t ** disks, int parity);
+
+#endif /* !_RF__RF_MAP_H_ */
diff --git a/sys/dev/raidframe/rf_mcpair.c b/sys/dev/raidframe/rf_mcpair.c
new file mode 100644
index 0000000..ff233fe
--- /dev/null
+++ b/sys/dev/raidframe/rf_mcpair.c
@@ -0,0 +1,141 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_mcpair.c,v 1.4 2000/09/11 02:23:14 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_mcpair.c
+ * an mcpair is a structure containing a mutex and a condition variable.
+ * it's used to block the current thread until some event occurs.
+ */
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_threadstuff.h>
+#include <dev/raidframe/rf_mcpair.h>
+#include <dev/raidframe/rf_debugMem.h>
+#include <dev/raidframe/rf_freelist.h>
+#include <dev/raidframe/rf_shutdown.h>
+
+#include <sys/proc.h>
+
+static RF_FreeList_t *rf_mcpair_freelist;
+
+#define RF_MAX_FREE_MCPAIR 128
+#define RF_MCPAIR_INC 16
+#define RF_MCPAIR_INITIAL 24
+
+static int init_mcpair(RF_MCPair_t *);
+static void clean_mcpair(RF_MCPair_t *);
+static void rf_ShutdownMCPair(void *);
+
+
+
+/*
+ * Freelist "init" hook: set up the mutex and condition variable of a
+ * freshly allocated mcpair.  Returns 0 on success, otherwise the error
+ * code from the failing primitive; if the cond init fails the mutex is
+ * destroyed again, so a non-zero return leaves nothing to clean up.
+ */
+static int
+init_mcpair(t)
+ RF_MCPair_t *t;
+{
+ int rc;
+
+ rc = rf_mutex_init(&t->mutex, __FUNCTION__);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ return (rc);
+ }
+ rc = rf_cond_init(&t->cond);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ /* undo the mutex so the caller sees an all-or-nothing result */
+ rf_mutex_destroy(&t->mutex);
+ return (rc);
+ }
+ return (0);
+}
+
+/*
+ * Freelist "clean" hook: tear down the synchronization primitives of an
+ * mcpair before its memory is returned to the system (inverse of
+ * init_mcpair above).
+ */
+static void
+clean_mcpair(t)
+ RF_MCPair_t *t;
+{
+ rf_mutex_destroy(&t->mutex);
+ rf_cond_destroy(&t->cond);
+}
+
+/*
+ * Shutdown-list callback: destroy the mcpair freelist, running
+ * clean_mcpair on every cached element so no mutex/cond is leaked.
+ */
+static void
+rf_ShutdownMCPair(ignored)
+ void *ignored;
+{
+ RF_FREELIST_DESTROY_CLEAN(rf_mcpair_freelist, next, (RF_MCPair_t *), clean_mcpair);
+}
+
+/*
+ * Module initialization: create the mcpair freelist, register
+ * rf_ShutdownMCPair on the shutdown list, and prime the freelist with
+ * RF_MCPAIR_INITIAL pre-initialized elements.  Returns 0 on success or
+ * the rf_ShutdownCreate error code (after tearing the freelist back
+ * down) on failure.
+ */
+int
+rf_ConfigureMCPair(listp)
+ RF_ShutdownList_t **listp;
+{
+ int rc;
+
+ RF_FREELIST_CREATE(rf_mcpair_freelist, RF_MAX_FREE_MCPAIR,
+ RF_MCPAIR_INC, sizeof(RF_MCPair_t));
+ rc = rf_ShutdownCreate(listp, rf_ShutdownMCPair, NULL);
+ if (rc) {
+ RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
+ __FILE__, __LINE__, rc);
+ /* invoke the shutdown handler by hand to undo the create */
+ rf_ShutdownMCPair(NULL);
+ return (rc);
+ }
+ RF_FREELIST_PRIME_INIT(rf_mcpair_freelist, RF_MCPAIR_INITIAL, next,
+ (RF_MCPair_t *), init_mcpair);
+ return (0);
+}
+
+/*
+ * Allocate an mcpair from the freelist (init_mcpair is run for elements
+ * that are freshly malloc'd rather than recycled).  The event flag and
+ * freelist link are reset before the element is handed out.  May return
+ * NULL if allocation fails.
+ */
+RF_MCPair_t *
+rf_AllocMCPair()
+{
+ RF_MCPair_t *t;
+
+ RF_FREELIST_GET_INIT(rf_mcpair_freelist, t, next, (RF_MCPair_t *), init_mcpair);
+ if (t) {
+ t->flag = 0;
+ t->next = NULL;
+ }
+ return (t);
+}
+
+/*
+ * Return an mcpair to the freelist; clean_mcpair is run only if the
+ * freelist is full and the element is actually released to the system.
+ */
+void
+rf_FreeMCPair(t)
+ RF_MCPair_t *t;
+{
+ RF_FREELIST_FREE_CLEAN(rf_mcpair_freelist, t, next, clean_mcpair);
+}
+/* the callback function used to wake you up when you use an mcpair to wait for something */
+void
+rf_MCPairWakeupFunc(mcpair)
+ RF_MCPair_t *mcpair;
+{
+ RF_LOCK_MUTEX(mcpair->mutex);
+ /* set the flag under the mutex so a waiter testing it cannot miss
+ * the event, then wake sleepers on the same channel that
+ * RF_WAIT_MCPAIR sleeps on (&mcpair->cond) */
+ mcpair->flag = 1;
+ wakeup(&(mcpair->cond));
+ RF_UNLOCK_MUTEX(mcpair->mutex);
+}
diff --git a/sys/dev/raidframe/rf_mcpair.h b/sys/dev/raidframe/rf_mcpair.h
new file mode 100644
index 0000000..d43c728
--- /dev/null
+++ b/sys/dev/raidframe/rf_mcpair.h
@@ -0,0 +1,54 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_mcpair.h,v 1.6 2000/09/21 01:45:46 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_mcpair.h
+ * see comments in rf_mcpair.c
+ */
+
+#ifndef _RF__RF_MCPAIR_H_
+#define _RF__RF_MCPAIR_H_
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_threadstuff.h>
+
+struct RF_MCPair_s {
+ RF_DECLARE_MUTEX(mutex) /* protects "flag"; held across the wakeup */
+ RF_DECLARE_COND(cond) /* sleep channel used by RF_WAIT_MCPAIR */
+ int flag; /* set to 1 by rf_MCPairWakeupFunc when the event fires */
+ RF_MCPair_t *next; /* freelist linkage (see rf_mcpair.c) */
+};
+#define RF_WAIT_MCPAIR(_mcp) \
+ RF_LTSLEEP(&((_mcp)->cond), PRIBIO, "mcpair", 0, &((_mcp)->mutex))
+
+int rf_ConfigureMCPair(RF_ShutdownList_t ** listp);
+RF_MCPair_t *rf_AllocMCPair(void);
+void rf_FreeMCPair(RF_MCPair_t * t);
+void rf_MCPairWakeupFunc(RF_MCPair_t * t);
+
+#endif /* !_RF__RF_MCPAIR_H_ */
diff --git a/sys/dev/raidframe/rf_memchunk.c b/sys/dev/raidframe/rf_memchunk.c
new file mode 100644
index 0000000..b4aae57
--- /dev/null
+++ b/sys/dev/raidframe/rf_memchunk.c
@@ -0,0 +1,211 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_memchunk.c,v 1.4 1999/08/13 03:41:56 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*********************************************************************************
+ * rf_memchunk.c
+ *
+ * experimental code. I've found that the malloc and free calls in the DAG
+ * creation code are very expensive. Since for any given workload the DAGs
+ * created for different accesses are likely to be similar to each other, the
+ * amount of memory used for any given DAG data structure is likely to be one
+ * of a small number of values. For example, in UNIX, all reads and writes will
+ * be less than 8k and will not span stripe unit boundaries. Thus in the absence
+ * of failure, the only DAGs that will ever get created are single-node reads
+ * and single-stripe-unit atomic read-modify-writes. So, I'm very likely to
+ * be continually asking for chunks of memory equal to the sizes of these two
+ * DAGs.
+ *
+ * This leads to the idea of holding on to these chunks of memory when the DAG is
+ * freed and then, when a new DAG is created, trying to find such a chunk before
+ * calling malloc.
+ *
+ * the "chunk list" is a list of lists. Each header node contains a size value
+ * and a pointer to a list of chunk descriptors, each of which holds a pointer
+ * to a chunk of memory of the indicated size.
+ *
+ * There is currently no way to purge memory out of the chunk list. My
+ * initial thought on this is to have a low-priority thread that wakes up every
+ * 1 or 2 seconds, purges all the chunks with low reuse counts, and sets all
+ * the reuse counts to zero.
+ *
+ * This whole idea may be bad, since malloc may be able to do this more efficiently.
+ * It's worth a try, though, and it can be turned off by setting useMemChunks to 0.
+ *
+ ********************************************************************************/
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_threadstuff.h>
+#include <dev/raidframe/rf_debugMem.h>
+#include <dev/raidframe/rf_memchunk.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_options.h>
+#include <dev/raidframe/rf_shutdown.h>
+
+typedef struct RF_ChunkHdr_s RF_ChunkHdr_t;
+struct RF_ChunkHdr_s {
+ int size;
+ RF_ChunkDesc_t *list;
+ RF_ChunkHdr_t *next;
+};
+
+static RF_ChunkHdr_t *chunklist, *chunk_hdr_free_list;
+static RF_ChunkDesc_t *chunk_desc_free_list;
+RF_DECLARE_STATIC_MUTEX(chunkmutex)
+ static void rf_ShutdownMemChunk(void *);
+ static RF_ChunkDesc_t *NewMemChunk(int, char *);
+
+
+ /*
+ * Shutdown-list callback: walk the two-level chunk list, freeing every
+ * cached buffer, its descriptor, and each size-bucket header, then
+ * destroy the list mutex.  Optionally dumps reuse statistics when the
+ * rf_memChunkDebug option is set.
+ */
+ static void rf_ShutdownMemChunk(ignored)
+ void *ignored;
+{
+ RF_ChunkDesc_t *pt, *p;
+ RF_ChunkHdr_t *hdr, *ht;
+
+ if (rf_memChunkDebug)
+ printf("Chunklist:\n");
+ for (hdr = chunklist; hdr;) {
+ for (p = hdr->list; p;) {
+ if (rf_memChunkDebug)
+ printf("Size %d reuse count %d\n", p->size, p->reuse_count);
+ pt = p;
+ p = p->next;
+ RF_Free(pt->buf, pt->size);
+ RF_Free(pt, sizeof(*pt));
+ }
+ ht = hdr;
+ hdr = hdr->next;
+ RF_Free(ht, sizeof(*ht));
+ }
+
+ rf_mutex_destroy(&chunkmutex);
+}
+
+/*
+ * Module initialization: reset the chunk lists, create the list mutex,
+ * and register rf_ShutdownMemChunk so the cached chunks are freed at
+ * shutdown.  Returns 0 on success or an error code; no
+ * partially-initialized state is left behind on an error return.
+ */
+int
+rf_ConfigureMemChunk(listp)
+ RF_ShutdownList_t **listp;
+{
+ int rc;
+
+ chunklist = NULL;
+ chunk_hdr_free_list = NULL;
+ chunk_desc_free_list = NULL;
+ rc = rf_mutex_init(&chunkmutex, __FUNCTION__);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ /* Bail out now.  Falling through (as the code previously did)
+ * would register a shutdown handler that destroys a mutex which
+ * was never initialized, and a subsequent successful
+ * rf_ShutdownCreate() would overwrite rc and falsely report
+ * success to the caller. */
+ return (rc);
+ }
+ rc = rf_ShutdownCreate(listp, rf_ShutdownMemChunk, NULL);
+ if (rc) {
+ RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ rf_mutex_destroy(&chunkmutex);
+ }
+ return (rc);
+}
+/* called to get a chunk descriptor for a newly-allocated chunk of memory
+ * MUTEX MUST BE LOCKED
+ *
+ * free list is not currently used
+ */
+static RF_ChunkDesc_t *
+NewMemChunk(size, buf)
+ int size;
+ char *buf;
+{
+ RF_ChunkDesc_t *p;
+
+ /* recycle a descriptor if one is cached, else allocate a new one.
+ * NOTE(review): the RF_Malloc result is used unchecked below --
+ * presumably RF_Malloc cannot return NULL in this kernel
+ * configuration; confirm against rf_debugMem.h */
+ if (chunk_desc_free_list) {
+ p = chunk_desc_free_list;
+ chunk_desc_free_list = p->next;
+ } else
+ RF_Malloc(p, sizeof(RF_ChunkDesc_t), (RF_ChunkDesc_t *));
+ p->size = size;
+ p->buf = buf;
+ p->next = NULL;
+ p->reuse_count = 0;
+ return (p);
+}
+/* looks for a chunk of memory of acceptable size. If none, allocates one and returns
+ * a chunk descriptor for it, but does not install anything in the list. This is done
+ * when the chunk is released.
+ *
+ * First-fit search: the first size bucket with size >= the request is
+ * used (buckets are kept sorted ascending by rf_ReleaseMemChunk).  The
+ * returned buffer is always zeroed.
+ */
+RF_ChunkDesc_t *
+rf_GetMemChunk(size)
+ int size;
+{
+ RF_ChunkHdr_t *hdr = chunklist;
+ RF_ChunkDesc_t *p = NULL;
+ char *buf;
+
+ RF_LOCK_MUTEX(chunkmutex);
+ for (hdr = chunklist; hdr; hdr = hdr->next)
+ if (hdr->size >= size) {
+ p = hdr->list;
+ if (p) {
+ /* unlink from the bucket; the chunk is
+ * reinstalled on release */
+ hdr->list = p->next;
+ p->next = NULL;
+ p->reuse_count++;
+ }
+ break;
+ }
+ if (!p) {
+ /* no cached chunk big enough: allocate a fresh buffer and
+ * wrap it in a descriptor */
+ RF_Malloc(buf, size, (char *));
+ p = NewMemChunk(size, buf);
+ }
+ RF_UNLOCK_MUTEX(chunkmutex);
+ /* zero outside the lock; the chunk is now privately owned */
+ (void) bzero(p->buf, size);
+ return (p);
+}
+
+/*
+ * Return a chunk to the cache.  The bucket list is kept sorted by
+ * ascending size: if a bucket of exactly this size exists the chunk is
+ * pushed onto it, otherwise a new bucket header is spliced in at the
+ * sorted position (ht = predecessor, hdr = first bucket >= our size).
+ */
+void
+rf_ReleaseMemChunk(chunk)
+ RF_ChunkDesc_t *chunk;
+{
+ RF_ChunkHdr_t *hdr, *ht = NULL, *new;
+
+ RF_LOCK_MUTEX(chunkmutex);
+ for (hdr = chunklist; hdr && hdr->size < chunk->size; ht = hdr, hdr = hdr->next);
+ if (hdr && hdr->size == chunk->size) {
+ chunk->next = hdr->list;
+ hdr->list = chunk;
+ } else {
+ RF_Malloc(new, sizeof(RF_ChunkHdr_t), (RF_ChunkHdr_t *));
+ new->size = chunk->size;
+ new->list = chunk;
+ chunk->next = NULL;
+ if (ht) {
+ /* insert between predecessor and hdr */
+ new->next = ht->next;
+ ht->next = new;
+ } else {
+ /* smallest size seen so far: new list head */
+ new->next = hdr;
+ chunklist = new;
+ }
+ }
+ RF_UNLOCK_MUTEX(chunkmutex);
+}
diff --git a/sys/dev/raidframe/rf_memchunk.h b/sys/dev/raidframe/rf_memchunk.h
new file mode 100644
index 0000000..5806d20
--- /dev/null
+++ b/sys/dev/raidframe/rf_memchunk.h
@@ -0,0 +1,48 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_memchunk.h,v 1.3 1999/02/05 00:06:13 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* header file for rf_memchunk.c. See comments there */
+
+#ifndef _RF__RF_MEMCHUNK_H_
+#define _RF__RF_MEMCHUNK_H_
+
+#include <dev/raidframe/rf_types.h>
+
+struct RF_ChunkDesc_s {
+ int size; /* usable size of buf, in bytes */
+ int reuse_count; /* times this chunk was handed out by rf_GetMemChunk */
+ char *buf; /* the cached memory itself */
+ RF_ChunkDesc_t *next; /* next chunk in the same size bucket */
+};
+
+int rf_ConfigureMemChunk(RF_ShutdownList_t ** listp);
+RF_ChunkDesc_t *rf_GetMemChunk(int size);
+void rf_ReleaseMemChunk(RF_ChunkDesc_t * chunk);
+
+#endif /* !_RF__RF_MEMCHUNK_H_ */
diff --git a/sys/dev/raidframe/rf_nwayxor.c b/sys/dev/raidframe/rf_nwayxor.c
new file mode 100644
index 0000000..c5d142b
--- /dev/null
+++ b/sys/dev/raidframe/rf_nwayxor.c
@@ -0,0 +1,449 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_nwayxor.c,v 1.4 2000/03/30 12:45:41 augustss Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, Daniel Stodolsky
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/************************************************************
+ *
+ * nwayxor.c -- code to do N-way xors for reconstruction
+ *
+ * nWayXorN xors N input buffers into the destination buffer.
+ * adapted from danner's longword_bxor code.
+ *
+ ************************************************************/
+
+#include <dev/raidframe/rf_nwayxor.h>
+#include <dev/raidframe/rf_shutdown.h>
+
+static int callcount[10];
+static void rf_ShutdownNWayXor(void *);
+
+/*
+ * Shutdown-list callback: optionally (when the rf_showXorCallCounts
+ * debug option is set) print how often each rf_nWayXorN routine ran.
+ */
+static void
+rf_ShutdownNWayXor(ignored)
+ void *ignored;
+{
+ int i;
+
+ if (rf_showXorCallCounts == 0)
+ return;
+ printf("Call counts for n-way xor routines: ");
+ for (i = 0; i < 10; i++)
+ printf("%d ", callcount[i]);
+ printf("\n");
+}
+
+/*
+ * Module initialization: zero the per-routine call counters and
+ * register the statistics dump on the shutdown list.  Returns the
+ * rf_ShutdownCreate result (0 on success).
+ */
+int
+rf_ConfigureNWayXor(listp)
+ RF_ShutdownList_t **listp;
+{
+ int i, rc;
+
+ for (i = 0; i < 10; i++)
+ callcount[i] = 0;
+ rc = rf_ShutdownCreate(listp, rf_ShutdownNWayXor, NULL);
+ return (rc);
+}
+
+/*
+ * XOR one source buffer into the destination (dest ^= src).  "len" is
+ * in words (unsigned longs), as shown by "end = src + len".  The main
+ * loop is unrolled 4-wide with explicit load/store scheduling; "end"
+ * caps the scalar tail loop that finishes the remaining 0-3 words.
+ * Unlike the other rf_nWayXorN routines there is no cache-line
+ * alignment pass here.
+ */
+void
+rf_nWayXor1(src_rbs, dest_rb, len)
+ RF_ReconBuffer_t **src_rbs;
+ RF_ReconBuffer_t *dest_rb;
+ int len;
+{
+ unsigned long *src = (unsigned long *) src_rbs[0]->buffer;
+ unsigned long *dest = (unsigned long *) dest_rb->buffer;
+ unsigned long *end = src + len;
+ unsigned long d0, d1, d2, d3, s0, s1, s2, s3;
+
+ callcount[1]++;
+ while (len >= 4) {
+ d0 = dest[0];
+ d1 = dest[1];
+ d2 = dest[2];
+ d3 = dest[3];
+ s0 = src[0];
+ s1 = src[1];
+ s2 = src[2];
+ s3 = src[3];
+ dest[0] = d0 ^ s0;
+ dest[1] = d1 ^ s1;
+ dest[2] = d2 ^ s2;
+ dest[3] = d3 ^ s3;
+ src += 4;
+ dest += 4;
+ len -= 4;
+ }
+ while (src < end) {
+ *dest++ ^= *src++;
+ }
+}
+
+/*
+ * XOR two source buffers into the destination (dest ^= src0 ^ src1);
+ * "a" aliases dst so the loads read the current destination contents.
+ * "len" is in words (unsigned longs).  dst is first advanced word by
+ * word to a 32-byte (cache-line) boundary, then a hand-scheduled
+ * 4-word loop interleaves loads and XORs for dual issue, and a scalar
+ * loop finishes the tail.
+ *
+ * NOTE(review): the alignment loop decrements len without a lower
+ * bound -- it assumes len covers the misalignment and that all buffers
+ * share alignment; confirm against the recon-buffer allocator.
+ */
+void
+rf_nWayXor2(src_rbs, dest_rb, len)
+ RF_ReconBuffer_t **src_rbs;
+ RF_ReconBuffer_t *dest_rb;
+ int len;
+{
+ unsigned long *dst = (unsigned long *) dest_rb->buffer;
+ unsigned long *a = dst;
+ unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
+ unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
+ unsigned long a0, a1, a2, a3, b0, b1, b2, b3;
+
+ callcount[2]++;
+ /* align dest to cache line */
+ while ((((unsigned long) dst) & 0x1f)) {
+ *dst++ = *a++ ^ *b++ ^ *c++;
+ len--;
+ }
+ while (len > 4) {
+ a0 = a[0];
+ len -= 4;
+
+ a1 = a[1];
+ a2 = a[2];
+
+ a3 = a[3];
+ a += 4;
+
+ b0 = b[0];
+ b1 = b[1];
+
+ b2 = b[2];
+ b3 = b[3];
+ /* start dual issue */
+ a0 ^= b0;
+ b0 = c[0];
+
+ b += 4;
+ a1 ^= b1;
+
+ a2 ^= b2;
+ a3 ^= b3;
+
+ b1 = c[1];
+ a0 ^= b0;
+
+ b2 = c[2];
+ a1 ^= b1;
+
+ b3 = c[3];
+ a2 ^= b2;
+
+ dst[0] = a0;
+ a3 ^= b3;
+ dst[1] = a1;
+ c += 4;
+ dst[2] = a2;
+ dst[3] = a3;
+ dst += 4;
+ }
+ while (len) {
+ *dst++ = *a++ ^ *b++ ^ *c++;
+ len--;
+ }
+}
+/* note that first arg is not incremented but 2nd arg is */
+/* These three macros expand inside the rf_nWayXor3..9 bodies and rely
+ * on locals declared there: the accumulators a0..a3, the temporaries
+ * b0..b3, and the word count "len" (LOAD_FIRST also consumes 4 from
+ * len).  Together they form one unrolled 4-word step:
+ * load dst + first source, fold in each further source, store back. */
+#define LOAD_FIRST(_dst,_b) \
+ a0 = _dst[0]; len -= 4; \
+ a1 = _dst[1]; \
+ a2 = _dst[2]; \
+ a3 = _dst[3]; \
+ b0 = _b[0]; \
+ b1 = _b[1]; \
+ b2 = _b[2]; \
+ b3 = _b[3]; _b += 4;
+
+/* note: arg is incremented */
+#define XOR_AND_LOAD_NEXT(_n) \
+ a0 ^= b0; b0 = _n[0]; \
+ a1 ^= b1; b1 = _n[1]; \
+ a2 ^= b2; b2 = _n[2]; \
+ a3 ^= b3; b3 = _n[3]; \
+ _n += 4;
+
+/* arg is incremented */
+#define XOR_AND_STORE(_dst) \
+ a0 ^= b0; _dst[0] = a0; \
+ a1 ^= b1; _dst[1] = a1; \
+ a2 ^= b2; _dst[2] = a2; \
+ a3 ^= b3; _dst[3] = a3; \
+ _dst += 4;
+
+
+/*
+ * dest ^= src0 ^ src1 ^ src2; len in words.  Word-steps dst to a
+ * 32-byte boundary, runs the 4-word unrolled loop built from the
+ * LOAD_FIRST/XOR_AND_* macros above, then a scalar tail loop.
+ */
+void
+rf_nWayXor3(src_rbs, dest_rb, len)
+ RF_ReconBuffer_t **src_rbs;
+ RF_ReconBuffer_t *dest_rb;
+ int len;
+{
+ unsigned long *dst = (unsigned long *) dest_rb->buffer;
+ unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
+ unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
+ unsigned long *d = (unsigned long *) src_rbs[2]->buffer;
+ unsigned long a0, a1, a2, a3, b0, b1, b2, b3;
+
+ callcount[3]++;
+ /* align dest to cache line */
+ while ((((unsigned long) dst) & 0x1f)) {
+ *dst++ ^= *b++ ^ *c++ ^ *d++;
+ len--;
+ }
+ while (len > 4) {
+ LOAD_FIRST(dst, b);
+ XOR_AND_LOAD_NEXT(c);
+ XOR_AND_LOAD_NEXT(d);
+ XOR_AND_STORE(dst);
+ }
+ while (len) {
+ *dst++ ^= *b++ ^ *c++ ^ *d++;
+ len--;
+ }
+}
+
+/* dest ^= src0..src3; same structure as rf_nWayXor3, one more source. */
+void
+rf_nWayXor4(src_rbs, dest_rb, len)
+ RF_ReconBuffer_t **src_rbs;
+ RF_ReconBuffer_t *dest_rb;
+ int len;
+{
+ unsigned long *dst = (unsigned long *) dest_rb->buffer;
+ unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
+ unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
+ unsigned long *d = (unsigned long *) src_rbs[2]->buffer;
+ unsigned long *e = (unsigned long *) src_rbs[3]->buffer;
+ unsigned long a0, a1, a2, a3, b0, b1, b2, b3;
+
+ callcount[4]++;
+ /* align dest to cache line */
+ while ((((unsigned long) dst) & 0x1f)) {
+ *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++;
+ len--;
+ }
+ while (len > 4) {
+ LOAD_FIRST(dst, b);
+ XOR_AND_LOAD_NEXT(c);
+ XOR_AND_LOAD_NEXT(d);
+ XOR_AND_LOAD_NEXT(e);
+ XOR_AND_STORE(dst);
+ }
+ while (len) {
+ *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++;
+ len--;
+ }
+}
+
+/* dest ^= src0..src4; same structure as rf_nWayXor3, two more sources. */
+void
+rf_nWayXor5(src_rbs, dest_rb, len)
+ RF_ReconBuffer_t **src_rbs;
+ RF_ReconBuffer_t *dest_rb;
+ int len;
+{
+ unsigned long *dst = (unsigned long *) dest_rb->buffer;
+ unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
+ unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
+ unsigned long *d = (unsigned long *) src_rbs[2]->buffer;
+ unsigned long *e = (unsigned long *) src_rbs[3]->buffer;
+ unsigned long *f = (unsigned long *) src_rbs[4]->buffer;
+ unsigned long a0, a1, a2, a3, b0, b1, b2, b3;
+
+ callcount[5]++;
+ /* align dest to cache line */
+ while ((((unsigned long) dst) & 0x1f)) {
+ *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++;
+ len--;
+ }
+ while (len > 4) {
+ LOAD_FIRST(dst, b);
+ XOR_AND_LOAD_NEXT(c);
+ XOR_AND_LOAD_NEXT(d);
+ XOR_AND_LOAD_NEXT(e);
+ XOR_AND_LOAD_NEXT(f);
+ XOR_AND_STORE(dst);
+ }
+ while (len) {
+ *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++;
+ len--;
+ }
+}
+
+/* dest ^= src0..src5; same structure as rf_nWayXor3 with six sources. */
+void
+rf_nWayXor6(src_rbs, dest_rb, len)
+ RF_ReconBuffer_t **src_rbs;
+ RF_ReconBuffer_t *dest_rb;
+ int len;
+{
+ unsigned long *dst = (unsigned long *) dest_rb->buffer;
+ unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
+ unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
+ unsigned long *d = (unsigned long *) src_rbs[2]->buffer;
+ unsigned long *e = (unsigned long *) src_rbs[3]->buffer;
+ unsigned long *f = (unsigned long *) src_rbs[4]->buffer;
+ unsigned long *g = (unsigned long *) src_rbs[5]->buffer;
+ unsigned long a0, a1, a2, a3, b0, b1, b2, b3;
+
+ callcount[6]++;
+ /* align dest to cache line */
+ while ((((unsigned long) dst) & 0x1f)) {
+ *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++;
+ len--;
+ }
+ while (len > 4) {
+ LOAD_FIRST(dst, b);
+ XOR_AND_LOAD_NEXT(c);
+ XOR_AND_LOAD_NEXT(d);
+ XOR_AND_LOAD_NEXT(e);
+ XOR_AND_LOAD_NEXT(f);
+ XOR_AND_LOAD_NEXT(g);
+ XOR_AND_STORE(dst);
+ }
+ while (len) {
+ *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++;
+ len--;
+ }
+}
+
+/* dest ^= src0..src6; same structure as rf_nWayXor3 with seven sources. */
+void
+rf_nWayXor7(src_rbs, dest_rb, len)
+ RF_ReconBuffer_t **src_rbs;
+ RF_ReconBuffer_t *dest_rb;
+ int len;
+{
+ unsigned long *dst = (unsigned long *) dest_rb->buffer;
+ unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
+ unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
+ unsigned long *d = (unsigned long *) src_rbs[2]->buffer;
+ unsigned long *e = (unsigned long *) src_rbs[3]->buffer;
+ unsigned long *f = (unsigned long *) src_rbs[4]->buffer;
+ unsigned long *g = (unsigned long *) src_rbs[5]->buffer;
+ unsigned long *h = (unsigned long *) src_rbs[6]->buffer;
+ unsigned long a0, a1, a2, a3, b0, b1, b2, b3;
+
+ callcount[7]++;
+ /* align dest to cache line */
+ while ((((unsigned long) dst) & 0x1f)) {
+ *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++;
+ len--;
+ }
+ while (len > 4) {
+ LOAD_FIRST(dst, b);
+ XOR_AND_LOAD_NEXT(c);
+ XOR_AND_LOAD_NEXT(d);
+ XOR_AND_LOAD_NEXT(e);
+ XOR_AND_LOAD_NEXT(f);
+ XOR_AND_LOAD_NEXT(g);
+ XOR_AND_LOAD_NEXT(h);
+ XOR_AND_STORE(dst);
+ }
+ while (len) {
+ *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++;
+ len--;
+ }
+}
+
+/* dest ^= src0..src7; same structure as rf_nWayXor3 with eight sources. */
+void
+rf_nWayXor8(src_rbs, dest_rb, len)
+ RF_ReconBuffer_t **src_rbs;
+ RF_ReconBuffer_t *dest_rb;
+ int len;
+{
+ unsigned long *dst = (unsigned long *) dest_rb->buffer;
+ unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
+ unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
+ unsigned long *d = (unsigned long *) src_rbs[2]->buffer;
+ unsigned long *e = (unsigned long *) src_rbs[3]->buffer;
+ unsigned long *f = (unsigned long *) src_rbs[4]->buffer;
+ unsigned long *g = (unsigned long *) src_rbs[5]->buffer;
+ unsigned long *h = (unsigned long *) src_rbs[6]->buffer;
+ unsigned long *i = (unsigned long *) src_rbs[7]->buffer;
+ unsigned long a0, a1, a2, a3, b0, b1, b2, b3;
+
+ callcount[8]++;
+ /* align dest to cache line */
+ while ((((unsigned long) dst) & 0x1f)) {
+ *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++ ^ *i++;
+ len--;
+ }
+ while (len > 4) {
+ LOAD_FIRST(dst, b);
+ XOR_AND_LOAD_NEXT(c);
+ XOR_AND_LOAD_NEXT(d);
+ XOR_AND_LOAD_NEXT(e);
+ XOR_AND_LOAD_NEXT(f);
+ XOR_AND_LOAD_NEXT(g);
+ XOR_AND_LOAD_NEXT(h);
+ XOR_AND_LOAD_NEXT(i);
+ XOR_AND_STORE(dst);
+ }
+ while (len) {
+ *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++ ^ *i++;
+ len--;
+ }
+}
+
+
+/* dest ^= src0..src8; same structure as rf_nWayXor3 with nine sources. */
+void
+rf_nWayXor9(src_rbs, dest_rb, len)
+ RF_ReconBuffer_t **src_rbs;
+ RF_ReconBuffer_t *dest_rb;
+ int len;
+{
+ unsigned long *dst = (unsigned long *) dest_rb->buffer;
+ unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
+ unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
+ unsigned long *d = (unsigned long *) src_rbs[2]->buffer;
+ unsigned long *e = (unsigned long *) src_rbs[3]->buffer;
+ unsigned long *f = (unsigned long *) src_rbs[4]->buffer;
+ unsigned long *g = (unsigned long *) src_rbs[5]->buffer;
+ unsigned long *h = (unsigned long *) src_rbs[6]->buffer;
+ unsigned long *i = (unsigned long *) src_rbs[7]->buffer;
+ unsigned long *j = (unsigned long *) src_rbs[8]->buffer;
+ unsigned long a0, a1, a2, a3, b0, b1, b2, b3;
+
+ callcount[9]++;
+ /* align dest to cache line */
+ while ((((unsigned long) dst) & 0x1f)) {
+ *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++ ^ *i++ ^ *j++;
+ len--;
+ }
+ while (len > 4) {
+ LOAD_FIRST(dst, b);
+ XOR_AND_LOAD_NEXT(c);
+ XOR_AND_LOAD_NEXT(d);
+ XOR_AND_LOAD_NEXT(e);
+ XOR_AND_LOAD_NEXT(f);
+ XOR_AND_LOAD_NEXT(g);
+ XOR_AND_LOAD_NEXT(h);
+ XOR_AND_LOAD_NEXT(i);
+ XOR_AND_LOAD_NEXT(j);
+ XOR_AND_STORE(dst);
+ }
+ while (len) {
+ *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++ ^ *i++ ^ *j++;
+ len--;
+ }
+}
diff --git a/sys/dev/raidframe/rf_nwayxor.h b/sys/dev/raidframe/rf_nwayxor.h
new file mode 100644
index 0000000..1460d9b
--- /dev/null
+++ b/sys/dev/raidframe/rf_nwayxor.h
@@ -0,0 +1,54 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_nwayxor.h,v 1.3 1999/02/05 00:06:13 oster Exp $ */
+/*
+ * rf_nwayxor.h
+ */
+/*
+ * Copyright (c) 1996 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+/*
+ * rf_nwayxor.h -- types and prototypes for nwayxor module
+ */
+
+#ifndef _RF__RF_NWAYXOR_H_
+#define _RF__RF_NWAYXOR_H_
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_reconstruct.h>
+
+int rf_ConfigureNWayXor(RF_ShutdownList_t ** listp);
+void rf_nWayXor1(RF_ReconBuffer_t ** src_rbs, RF_ReconBuffer_t * dest_rb, int len);
+void rf_nWayXor2(RF_ReconBuffer_t ** src_rbs, RF_ReconBuffer_t * dest_rb, int len);
+void rf_nWayXor3(RF_ReconBuffer_t ** src_rbs, RF_ReconBuffer_t * dest_rb, int len);
+void rf_nWayXor4(RF_ReconBuffer_t ** src_rbs, RF_ReconBuffer_t * dest_rb, int len);
+void rf_nWayXor5(RF_ReconBuffer_t ** src_rbs, RF_ReconBuffer_t * dest_rb, int len);
+void rf_nWayXor6(RF_ReconBuffer_t ** src_rbs, RF_ReconBuffer_t * dest_rb, int len);
+void rf_nWayXor7(RF_ReconBuffer_t ** src_rbs, RF_ReconBuffer_t * dest_rb, int len);
+void rf_nWayXor8(RF_ReconBuffer_t ** src_rbs, RF_ReconBuffer_t * dest_rb, int len);
+void rf_nWayXor9(RF_ReconBuffer_t ** src_rbs, RF_ReconBuffer_t * dest_rb, int len);
+
+#endif /* !_RF__RF_NWAYXOR_H_ */
diff --git a/sys/dev/raidframe/rf_options.c b/sys/dev/raidframe/rf_options.c
new file mode 100644
index 0000000..9ead8b2
--- /dev/null
+++ b/sys/dev/raidframe/rf_options.c
@@ -0,0 +1,76 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_options.c,v 1.3 1999/02/05 00:06:13 oster Exp $ */
+/*
+ * rf_options.c
+ */
+/*
+ * Copyright (c) 1996 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+
+#include <dev/raidframe/rf_threadstuff.h>
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_archs.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_options.h>
+
+#ifdef RF_DBG_OPTION
+#undef RF_DBG_OPTION
+#endif /* RF_DBG_OPTION */
+
+#ifdef __STDC__
+#define RF_DBG_OPTION(_option_,_defval_) long rf_##_option_ = _defval_;
+#else /* __STDC__ */
+#define RF_DBG_OPTION(_option_,_defval_) long rf_/**/_option_ = _defval_;
+#endif /* __STDC__ */
+
+#include <dev/raidframe/rf_optnames.h>
+
+#undef RF_DBG_OPTION
+
+#ifdef __STDC__
+#define RF_DBG_OPTION(_option_,_defval_) { RF_STRING(_option_), &rf_##_option_ },
+#else /* __STDC__ */
+#define RF_DBG_OPTION(_option_,_defval_) { RF_STRING(_option_), &rf_/**/_option_ },
+#endif /* __STDC__ */
+
+RF_DebugName_t rf_debugNames[] = {
+#include <dev/raidframe/rf_optnames.h>
+ {NULL, NULL}
+};
+#undef RF_DBG_OPTION
+
+#ifdef __STDC__
+#define RF_DBG_OPTION(_option_,_defval_) rf_##_option_ = _defval_ ;
+#else /* __STDC__ */
+#define RF_DBG_OPTION(_option_,_defval_) rf_/**/_option_ = _defval_ ;
+#endif /* __STDC__ */
+
+void
+rf_ResetDebugOptions()
+{
+#include <dev/raidframe/rf_optnames.h>
+}
diff --git a/sys/dev/raidframe/rf_options.h b/sys/dev/raidframe/rf_options.h
new file mode 100644
index 0000000..22b6341
--- /dev/null
+++ b/sys/dev/raidframe/rf_options.h
@@ -0,0 +1,58 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_options.h,v 1.3 1999/02/05 00:06:13 oster Exp $ */
+/*
+ * rf_options.h
+ */
+/*
+ * Copyright (c) 1996 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#ifndef _RF__RF_OPTIONS_H_
+#define _RF__RF_OPTIONS_H_
+
+#define RF_DEFAULT_LOCK_TABLE_SIZE 256
+
+typedef struct RF_DebugNames_s {
+ char *name;
+ long *ptr;
+} RF_DebugName_t;
+
+extern RF_DebugName_t rf_debugNames[];
+
+#ifdef RF_DBG_OPTION
+#undef RF_DBG_OPTION
+#endif /* RF_DBG_OPTION */
+
+#ifdef __STDC__
+#define RF_DBG_OPTION(_option_,_defval_) extern long rf_##_option_;
+#else /* __STDC__ */
+#define RF_DBG_OPTION(_option_,_defval_) extern long rf_/**/_option_;
+#endif /* __STDC__ */
+#include <dev/raidframe/rf_optnames.h>
+
+void rf_ResetDebugOptions(void);
+
+#endif /* !_RF__RF_OPTIONS_H_ */
diff --git a/sys/dev/raidframe/rf_optnames.h b/sys/dev/raidframe/rf_optnames.h
new file mode 100644
index 0000000..f04fbc1
--- /dev/null
+++ b/sys/dev/raidframe/rf_optnames.h
@@ -0,0 +1,105 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_optnames.h,v 1.6 1999/12/07 02:54:08 oster Exp $ */
+/*
+ * rf_optnames.h
+ */
+/*
+ * Copyright (c) 1996 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * Don't protect against multiple inclusion here- we actually want this.
+ */
+
+RF_DBG_OPTION(accessDebug, 0)
+RF_DBG_OPTION(accessTraceBufSize, 0)
+RF_DBG_OPTION(cscanDebug, 0) /* debug CSCAN sorting */
+RF_DBG_OPTION(dagDebug, 0)
+RF_DBG_OPTION(debugPrintUseBuffer, 0)
+RF_DBG_OPTION(degDagDebug, 0)
+RF_DBG_OPTION(disableAsyncAccs, 0)
+RF_DBG_OPTION(diskDebug, 0)
+RF_DBG_OPTION(enableAtomicRMW, 0) /* this debug var enables locking of
+ * the disk arm during small-write
+ * operations. Setting this variable
+ * to anything other than 0 will
+ * result in deadlock. (wvcii) */
+RF_DBG_OPTION(engineDebug, 0)
+RF_DBG_OPTION(fifoDebug, 0) /* debug fifo queueing */
+RF_DBG_OPTION(floatingRbufDebug, 0)
+RF_DBG_OPTION(forceHeadSepLimit, -1)
+RF_DBG_OPTION(forceNumFloatingReconBufs, -1) /* wire down number of
+ * extra recon buffers
+ * to use */
+RF_DBG_OPTION(keepAccTotals, 0) /* turn on keep_acc_totals */
+RF_DBG_OPTION(lockTableSize, RF_DEFAULT_LOCK_TABLE_SIZE)
+RF_DBG_OPTION(mapDebug, 0)
+RF_DBG_OPTION(maxNumTraces, -1)
+
+RF_DBG_OPTION(memChunkDebug, 0)
+RF_DBG_OPTION(memDebug, 0)
+RF_DBG_OPTION(memDebugAddress, 0)
+RF_DBG_OPTION(numBufsToAccumulate, 1) /* number of buffers to
+ * accumulate before doing XOR */
+RF_DBG_OPTION(prReconSched, 0)
+RF_DBG_OPTION(printDAGsDebug, 0)
+RF_DBG_OPTION(printStatesDebug, 0)
+RF_DBG_OPTION(protectedSectors, 64L) /* # of sectors at start of
+ * disk to exclude from RAID
+ * address space */
+RF_DBG_OPTION(pssDebug, 0)
+RF_DBG_OPTION(queueDebug, 0)
+RF_DBG_OPTION(quiesceDebug, 0)
+RF_DBG_OPTION(raidSectorOffset, 0) /* added to all incoming sectors to
+ * debug alignment problems */
+RF_DBG_OPTION(reconDebug, 0)
+RF_DBG_OPTION(reconbufferDebug, 0)
+RF_DBG_OPTION(scanDebug, 0) /* debug SCAN sorting */
+RF_DBG_OPTION(showXorCallCounts, 0) /* show n-way Xor call counts */
+RF_DBG_OPTION(shutdownDebug, 0) /* show shutdown calls */
+RF_DBG_OPTION(sizePercentage, 100)
+RF_DBG_OPTION(sstfDebug, 0) /* turn on debugging info for sstf queueing */
+RF_DBG_OPTION(stripeLockDebug, 0)
+RF_DBG_OPTION(suppressLocksAndLargeWrites, 0)
+RF_DBG_OPTION(suppressTraceDelays, 0)
+RF_DBG_OPTION(useMemChunks, 1)
+RF_DBG_OPTION(validateDAGDebug, 0)
+RF_DBG_OPTION(validateVisitedDebug, 1) /* XXX turn to zero by
+ * default? */
+RF_DBG_OPTION(verifyParityDebug, 0)
+RF_DBG_OPTION(debugKernelAccess, 0) /* DoAccessKernel debugging */
+
+#if RF_INCLUDE_PARITYLOGGING > 0
+RF_DBG_OPTION(forceParityLogReint, 0)
+RF_DBG_OPTION(numParityRegions, 0) /* number of regions in the array */
+RF_DBG_OPTION(numReintegrationThreads, 1)
+RF_DBG_OPTION(parityLogDebug, 0) /* if nonzero, enables debugging of
+ * parity logging */
+RF_DBG_OPTION(totalInCoreLogCapacity, 1024 * 1024) /* target bytes
+ * available for in-core
+ * logs */
+#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
+
diff --git a/sys/dev/raidframe/rf_paritylog.c b/sys/dev/raidframe/rf_paritylog.c
new file mode 100644
index 0000000..6c56c95
--- /dev/null
+++ b/sys/dev/raidframe/rf_paritylog.c
@@ -0,0 +1,869 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_paritylog.c,v 1.5 2000/01/07 03:41:01 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* Code for manipulating in-core parity logs
+ *
+ */
+
+#include <dev/raidframe/rf_archs.h>
+
+#if RF_INCLUDE_PARITYLOGGING > 0
+
+/*
+ * Append-only log for recording parity "update" and "overwrite" records
+ */
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_threadstuff.h>
+#include <dev/raidframe/rf_mcpair.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_dagfuncs.h>
+#include <dev/raidframe/rf_desc.h>
+#include <dev/raidframe/rf_layout.h>
+#include <dev/raidframe/rf_diskqueue.h>
+#include <dev/raidframe/rf_etimer.h>
+#include <dev/raidframe/rf_paritylog.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_map.h>
+#include <dev/raidframe/rf_paritylogging.h>
+#include <dev/raidframe/rf_paritylogDiskMgr.h>
+
+static RF_CommonLogData_t *
+AllocParityLogCommonData(RF_Raid_t * raidPtr)
+{
+ RF_CommonLogData_t *common = NULL;
+ int rc;
+
+ /* Return a struct for holding common parity log information from the
+ * free list (rf_parityLogDiskQueue.freeCommonList). If the free list
+ * is empty, call RF_Malloc to create a new structure. NON-BLOCKING */
+
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ if (raidPtr->parityLogDiskQueue.freeCommonList) {
+ common = raidPtr->parityLogDiskQueue.freeCommonList;
+ raidPtr->parityLogDiskQueue.freeCommonList = raidPtr->parityLogDiskQueue.freeCommonList->next;
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ } else {
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ RF_Malloc(common, sizeof(RF_CommonLogData_t), (RF_CommonLogData_t *));
+ rc = rf_mutex_init(&common->mutex, __FUNCTION__);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ RF_Free(common, sizeof(RF_CommonLogData_t));
+ common = NULL;
+ }
+ }
+ common->next = NULL;
+ return (common);
+}
+
+static void
+FreeParityLogCommonData(RF_CommonLogData_t * common)
+{
+ RF_Raid_t *raidPtr;
+
+ /* Insert a single struct for holding parity log information (data)
+ * into the free list (rf_parityLogDiskQueue.freeCommonList).
+ * NON-BLOCKING */
+
+ raidPtr = common->raidPtr;
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ common->next = raidPtr->parityLogDiskQueue.freeCommonList;
+ raidPtr->parityLogDiskQueue.freeCommonList = common;
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+}
+
+static RF_ParityLogData_t *
+AllocParityLogData(RF_Raid_t * raidPtr)
+{
+ RF_ParityLogData_t *data = NULL;
+
+ /* Return a struct for holding parity log information from the free
+ * list (rf_parityLogDiskQueue.freeList). If the free list is empty,
+ * call RF_Malloc to create a new structure. NON-BLOCKING */
+
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ if (raidPtr->parityLogDiskQueue.freeDataList) {
+ data = raidPtr->parityLogDiskQueue.freeDataList;
+ raidPtr->parityLogDiskQueue.freeDataList = raidPtr->parityLogDiskQueue.freeDataList->next;
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ } else {
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ RF_Malloc(data, sizeof(RF_ParityLogData_t), (RF_ParityLogData_t *));
+ }
+ data->next = NULL;
+ data->prev = NULL;
+ return (data);
+}
+
+
+static void
+FreeParityLogData(RF_ParityLogData_t * data)
+{
+ RF_ParityLogData_t *nextItem;
+ RF_Raid_t *raidPtr;
+
+ /* Insert a linked list of structs for holding parity log information
+ * (data) into the free list (parityLogDiskQueue.freeList).
+ * NON-BLOCKING */
+
+ raidPtr = data->common->raidPtr;
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ while (data) {
+ nextItem = data->next;
+ data->next = raidPtr->parityLogDiskQueue.freeDataList;
+ raidPtr->parityLogDiskQueue.freeDataList = data;
+ data = nextItem;
+ }
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+}
+
+
+static void
+EnqueueParityLogData(
+ RF_ParityLogData_t * data,
+ RF_ParityLogData_t ** head,
+ RF_ParityLogData_t ** tail)
+{
+ RF_Raid_t *raidPtr;
+
+ /* Insert an in-core parity log (*data) into the head of a disk queue
+ * (*head, *tail). NON-BLOCKING */
+
+ raidPtr = data->common->raidPtr;
+ if (rf_parityLogDebug)
+ printf("[enqueueing parity log data, region %d, raidAddress %d, numSector %d]\n", data->regionID, (int) data->diskAddress.raidAddress, (int) data->diskAddress.numSector);
+ RF_ASSERT(data->prev == NULL);
+ RF_ASSERT(data->next == NULL);
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ if (*head) {
+ /* insert into head of queue */
+ RF_ASSERT((*head)->prev == NULL);
+ RF_ASSERT((*tail)->next == NULL);
+ data->next = *head;
+ (*head)->prev = data;
+ *head = data;
+ } else {
+ /* insert into empty list */
+ RF_ASSERT(*head == NULL);
+ RF_ASSERT(*tail == NULL);
+ *head = data;
+ *tail = data;
+ }
+ RF_ASSERT((*head)->prev == NULL);
+ RF_ASSERT((*tail)->next == NULL);
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+}
+
+static RF_ParityLogData_t *
+DequeueParityLogData(
+ RF_Raid_t * raidPtr,
+ RF_ParityLogData_t ** head,
+ RF_ParityLogData_t ** tail,
+ int ignoreLocks)
+{
+ RF_ParityLogData_t *data;
+
+ /* Remove and return an in-core parity log from the tail of a disk
+ * queue (*head, *tail). NON-BLOCKING */
+
+ /* remove from tail, preserving FIFO order */
+ if (!ignoreLocks)
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ data = *tail;
+ if (data) {
+ if (*head == *tail) {
+ /* removing last item from queue */
+ *head = NULL;
+ *tail = NULL;
+ } else {
+ *tail = (*tail)->prev;
+ (*tail)->next = NULL;
+ RF_ASSERT((*head)->prev == NULL);
+ RF_ASSERT((*tail)->next == NULL);
+ }
+ data->next = NULL;
+ data->prev = NULL;
+ if (rf_parityLogDebug)
+ printf("[dequeueing parity log data, region %d, raidAddress %d, numSector %d]\n", data->regionID, (int) data->diskAddress.raidAddress, (int) data->diskAddress.numSector);
+ }
+ if (*head) {
+ RF_ASSERT((*head)->prev == NULL);
+ RF_ASSERT((*tail)->next == NULL);
+ }
+ if (!ignoreLocks)
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ return (data);
+}
+
+
+static void
+RequeueParityLogData(
+ RF_ParityLogData_t * data,
+ RF_ParityLogData_t ** head,
+ RF_ParityLogData_t ** tail)
+{
+ RF_Raid_t *raidPtr;
+
+ /* Insert an in-core parity log (*data) into the tail of a disk queue
+ * (*head, *tail). NON-BLOCKING */
+
+ raidPtr = data->common->raidPtr;
+ RF_ASSERT(data);
+ if (rf_parityLogDebug)
+ printf("[requeueing parity log data, region %d, raidAddress %d, numSector %d]\n", data->regionID, (int) data->diskAddress.raidAddress, (int) data->diskAddress.numSector);
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ if (*tail) {
+ /* append to tail of list */
+ data->prev = *tail;
+ data->next = NULL;
+ (*tail)->next = data;
+ *tail = data;
+ } else {
+ /* inserting into an empty list */
+ *head = data;
+ *tail = data;
+ (*head)->prev = NULL;
+ (*tail)->next = NULL;
+ }
+ RF_ASSERT((*head)->prev == NULL);
+ RF_ASSERT((*tail)->next == NULL);
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+}
+
+RF_ParityLogData_t *
+rf_CreateParityLogData(
+ RF_ParityRecordType_t operation,
+ RF_PhysDiskAddr_t * pda,
+ caddr_t bufPtr,
+ RF_Raid_t * raidPtr,
+ int (*wakeFunc) (RF_DagNode_t * node, int status),
+ void *wakeArg,
+ RF_AccTraceEntry_t * tracerec,
+ RF_Etimer_t startTime)
+{
+ RF_ParityLogData_t *data, *resultHead = NULL, *resultTail = NULL;
+ RF_CommonLogData_t *common;
+ RF_PhysDiskAddr_t *diskAddress;
+ int boundary, offset = 0;
+
+ /* Return an initialized struct of info to be logged. Build one item
+ * per physical disk address, one item per region.
+ *
+ * NON-BLOCKING */
+
+ diskAddress = pda;
+ common = AllocParityLogCommonData(raidPtr);
+ RF_ASSERT(common);
+
+ common->operation = operation;
+ common->bufPtr = bufPtr;
+ common->raidPtr = raidPtr;
+ common->wakeFunc = wakeFunc;
+ common->wakeArg = wakeArg;
+ common->tracerec = tracerec;
+ common->startTime = startTime;
+ common->cnt = 0;
+
+ if (rf_parityLogDebug)
+ printf("[entering CreateParityLogData]\n");
+ while (diskAddress) {
+ common->cnt++;
+ data = AllocParityLogData(raidPtr);
+ RF_ASSERT(data);
+ data->common = common;
+ data->next = NULL;
+ data->prev = NULL;
+ data->regionID = rf_MapRegionIDParityLogging(raidPtr, diskAddress->startSector);
+ if (data->regionID == rf_MapRegionIDParityLogging(raidPtr, diskAddress->startSector + diskAddress->numSector - 1)) {
+ /* disk address does not cross a region boundary */
+ data->diskAddress = *diskAddress;
+ data->bufOffset = offset;
+ offset = offset + diskAddress->numSector;
+ EnqueueParityLogData(data, &resultHead, &resultTail);
+ /* adjust disk address */
+ diskAddress = diskAddress->next;
+ } else {
+ /* disk address crosses a region boundary */
+ /* find address where region is crossed */
+ boundary = 0;
+ while (data->regionID == rf_MapRegionIDParityLogging(raidPtr, diskAddress->startSector + boundary))
+ boundary++;
+
+ /* enter data before the boundary */
+ data->diskAddress = *diskAddress;
+ data->diskAddress.numSector = boundary;
+ data->bufOffset = offset;
+ offset += boundary;
+ EnqueueParityLogData(data, &resultHead, &resultTail);
+ /* adjust disk address */
+ diskAddress->startSector += boundary;
+ diskAddress->numSector -= boundary;
+ }
+ }
+ if (rf_parityLogDebug)
+ printf("[leaving CreateParityLogData]\n");
+ return (resultHead);
+}
+
+
+RF_ParityLogData_t *
+rf_SearchAndDequeueParityLogData(
+ RF_Raid_t * raidPtr,
+ int regionID,
+ RF_ParityLogData_t ** head,
+ RF_ParityLogData_t ** tail,
+ int ignoreLocks)
+{
+ RF_ParityLogData_t *w;
+
+ /* Remove and return an in-core parity log from a specified region
+ * (regionID). If a matching log is not found, return NULL.
+ *
+ * NON-BLOCKING. */
+
+ /* walk backward through a list, looking for an entry with a matching
+ * region ID */
+ if (!ignoreLocks)
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ w = (*tail);
+ while (w) {
+ if (w->regionID == regionID) {
+ /* remove an element from the list */
+ if (w == *tail) {
+ if (*head == *tail) {
+ /* removing only element in the list */
+ *head = NULL;
+ *tail = NULL;
+ } else {
+ /* removing last item in the list */
+ *tail = (*tail)->prev;
+ (*tail)->next = NULL;
+ RF_ASSERT((*head)->prev == NULL);
+ RF_ASSERT((*tail)->next == NULL);
+ }
+ } else {
+ if (w == *head) {
+ /* removing first item in the list */
+ *head = (*head)->next;
+ (*head)->prev = NULL;
+ RF_ASSERT((*head)->prev == NULL);
+ RF_ASSERT((*tail)->next == NULL);
+ } else {
+ /* removing an item from the middle of
+ * the list */
+ w->prev->next = w->next;
+ w->next->prev = w->prev;
+ RF_ASSERT((*head)->prev == NULL);
+ RF_ASSERT((*tail)->next == NULL);
+ }
+ }
+ w->prev = NULL;
+ w->next = NULL;
+ if (rf_parityLogDebug)
+ printf("[dequeueing parity log data, region %d, raidAddress %d, numSector %d]\n", w->regionID, (int) w->diskAddress.raidAddress, (int) w->diskAddress.numSector);
+ return (w);
+ } else
+ w = w->prev;
+ }
+ if (!ignoreLocks)
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ return (NULL);
+}
+
+static RF_ParityLogData_t *
+DequeueMatchingLogData(
+ RF_Raid_t * raidPtr,
+ RF_ParityLogData_t ** head,
+ RF_ParityLogData_t ** tail)
+{
+ RF_ParityLogData_t *logDataList, *logData;
+ int regionID;
+
+ /* Remove and return an in-core parity log from the tail of a disk
+ * queue (*head, *tail). Then remove all matching (identical
+ * regionIDs) logData and return as a linked list.
+ *
+ * NON-BLOCKING */
+
+ logDataList = DequeueParityLogData(raidPtr, head, tail, RF_TRUE);
+ if (logDataList) {
+ regionID = logDataList->regionID;
+ logData = logDataList;
+ logData->next = rf_SearchAndDequeueParityLogData(raidPtr, regionID, head, tail, RF_TRUE);
+ while (logData->next) {
+ logData = logData->next;
+ logData->next = rf_SearchAndDequeueParityLogData(raidPtr, regionID, head, tail, RF_TRUE);
+ }
+ }
+ return (logDataList);
+}
+
+
+static RF_ParityLog_t *
+AcquireParityLog(
+ RF_ParityLogData_t * logData,
+ int finish)
+{
+ RF_ParityLog_t *log = NULL;
+ RF_Raid_t *raidPtr;
+
+ /* Grab a log buffer from the pool and return it. If no buffers are
+ * available, return NULL. NON-BLOCKING */
+ raidPtr = logData->common->raidPtr;
+ RF_LOCK_MUTEX(raidPtr->parityLogPool.mutex);
+ if (raidPtr->parityLogPool.parityLogs) {
+ log = raidPtr->parityLogPool.parityLogs;
+ raidPtr->parityLogPool.parityLogs = raidPtr->parityLogPool.parityLogs->next;
+ log->regionID = logData->regionID;
+ log->numRecords = 0;
+ log->next = NULL;
+ raidPtr->logsInUse++;
+ RF_ASSERT(raidPtr->logsInUse >= 0 && raidPtr->logsInUse <= raidPtr->numParityLogs);
+ } else {
+ /* no logs available, so place ourselves on the queue of work
+ * waiting on log buffers this is done while
+ * parityLogPool.mutex is held, to ensure synchronization with
+ * ReleaseParityLogs. */
+ if (rf_parityLogDebug)
+ printf("[blocked on log, region %d, finish %d]\n", logData->regionID, finish);
+ if (finish)
+ RequeueParityLogData(logData, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
+ else
+ EnqueueParityLogData(logData, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
+ }
+ RF_UNLOCK_MUTEX(raidPtr->parityLogPool.mutex);
+ return (log);
+}
+
+void
+rf_ReleaseParityLogs(
+ RF_Raid_t * raidPtr,
+ RF_ParityLog_t * firstLog)
+{
+ RF_ParityLogData_t *logDataList;
+ RF_ParityLog_t *log, *lastLog;
+ int cnt;
+
+	/* Insert a linked list of parity logs (firstLog) to the free list
+	 * (parityLogPool.parityLogs)
+ *
+ * NON-BLOCKING. */
+
+ RF_ASSERT(firstLog);
+
+ /* Before returning logs to global free list, service all requests
+ * which are blocked on logs. Holding mutexes for parityLogPool and
+ * parityLogDiskQueue forces synchronization with AcquireParityLog(). */
+ RF_LOCK_MUTEX(raidPtr->parityLogPool.mutex);
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ logDataList = DequeueMatchingLogData(raidPtr, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
+ log = firstLog;
+ if (firstLog)
+ firstLog = firstLog->next;
+ log->numRecords = 0;
+ log->next = NULL;
+ while (logDataList && log) {
+ RF_UNLOCK_MUTEX(raidPtr->parityLogPool.mutex);
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ rf_ParityLogAppend(logDataList, RF_TRUE, &log, RF_FALSE);
+ if (rf_parityLogDebug)
+ printf("[finishing up buf-blocked log data, region %d]\n", logDataList->regionID);
+ if (log == NULL) {
+ log = firstLog;
+ if (firstLog) {
+ firstLog = firstLog->next;
+ log->numRecords = 0;
+ log->next = NULL;
+ }
+ }
+ RF_LOCK_MUTEX(raidPtr->parityLogPool.mutex);
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ if (log)
+ logDataList = DequeueMatchingLogData(raidPtr, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
+ }
+ /* return remaining logs to pool */
+ if (log) {
+ log->next = firstLog;
+ firstLog = log;
+ }
+ if (firstLog) {
+ lastLog = firstLog;
+ raidPtr->logsInUse--;
+ RF_ASSERT(raidPtr->logsInUse >= 0 && raidPtr->logsInUse <= raidPtr->numParityLogs);
+ while (lastLog->next) {
+ lastLog = lastLog->next;
+ raidPtr->logsInUse--;
+ RF_ASSERT(raidPtr->logsInUse >= 0 && raidPtr->logsInUse <= raidPtr->numParityLogs);
+ }
+ lastLog->next = raidPtr->parityLogPool.parityLogs;
+ raidPtr->parityLogPool.parityLogs = firstLog;
+ cnt = 0;
+ log = raidPtr->parityLogPool.parityLogs;
+ while (log) {
+ cnt++;
+ log = log->next;
+ }
+ RF_ASSERT(cnt + raidPtr->logsInUse == raidPtr->numParityLogs);
+ }
+ RF_UNLOCK_MUTEX(raidPtr->parityLogPool.mutex);
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+}
+
+static void
+ReintLog(
+ RF_Raid_t * raidPtr,
+ int regionID,
+ RF_ParityLog_t * log)
+{
+ RF_ASSERT(log);
+
+ /* Insert an in-core parity log (log) into the disk queue of
+ * reintegration work. Set the flag (reintInProgress) for the
+ * specified region (regionID) to indicate that reintegration is in
+ * progress for this region. NON-BLOCKING */
+
+ RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
+ raidPtr->regionInfo[regionID].reintInProgress = RF_TRUE; /* cleared when reint
+ * complete */
+
+ if (rf_parityLogDebug)
+ printf("[requesting reintegration of region %d]\n", log->regionID);
+ /* move record to reintegration queue */
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ log->next = raidPtr->parityLogDiskQueue.reintQueue;
+ raidPtr->parityLogDiskQueue.reintQueue = log;
+ RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond);
+}
+
+static void
+FlushLog(
+ RF_Raid_t * raidPtr,
+ RF_ParityLog_t * log)
+{
+ /* insert a core log (log) into a list of logs
+ * (parityLogDiskQueue.flushQueue) waiting to be written to disk.
+ * NON-BLOCKING */
+
+ RF_ASSERT(log);
+ RF_ASSERT(log->numRecords == raidPtr->numSectorsPerLog);
+ RF_ASSERT(log->next == NULL);
+ /* move log to flush queue */
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ log->next = raidPtr->parityLogDiskQueue.flushQueue;
+ raidPtr->parityLogDiskQueue.flushQueue = log;
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond);
+}
+
+static int
+DumpParityLogToDisk(
+ int finish,
+ RF_ParityLogData_t * logData)
+{
+ int i, diskCount, regionID = logData->regionID;
+ RF_ParityLog_t *log;
+ RF_Raid_t *raidPtr;
+
+ raidPtr = logData->common->raidPtr;
+
+ /* Move a core log to disk. If the log disk is full, initiate
+ * reintegration.
+ *
+ * Return (0) if we can enqueue the dump immediately, otherwise return
+ * (1) to indicate we are blocked on reintegration and control of the
+ * thread should be relinquished.
+ *
+ * Caller must hold regionInfo[regionID].mutex
+ *
+ * NON-BLOCKING */
+
+ if (rf_parityLogDebug)
+ printf("[dumping parity log to disk, region %d]\n", regionID);
+ log = raidPtr->regionInfo[regionID].coreLog;
+ RF_ASSERT(log->numRecords == raidPtr->numSectorsPerLog);
+ RF_ASSERT(log->next == NULL);
+
+ /* if reintegration is in progress, must queue work */
+ RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
+ if (raidPtr->regionInfo[regionID].reintInProgress) {
+ /* Can not proceed since this region is currently being
+ * reintegrated. We can not block, so queue remaining work and
+ * return */
+ if (rf_parityLogDebug)
+ printf("[region %d waiting on reintegration]\n", regionID);
+ /* XXX not sure about the use of finish - shouldn't this
+ * always be "Enqueue"? */
+ if (finish)
+ RequeueParityLogData(logData, &raidPtr->parityLogDiskQueue.reintBlockHead, &raidPtr->parityLogDiskQueue.reintBlockTail);
+ else
+ EnqueueParityLogData(logData, &raidPtr->parityLogDiskQueue.reintBlockHead, &raidPtr->parityLogDiskQueue.reintBlockTail);
+ RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
+		return (1);	/* relinquish control of this thread */
+ }
+ RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
+ raidPtr->regionInfo[regionID].coreLog = NULL;
+ if ((raidPtr->regionInfo[regionID].diskCount) < raidPtr->regionInfo[regionID].capacity)
+ /* IMPORTANT!! this loop bound assumes region disk holds an
+ * integral number of core logs */
+ {
+ /* update disk map for this region */
+ diskCount = raidPtr->regionInfo[regionID].diskCount;
+ for (i = 0; i < raidPtr->numSectorsPerLog; i++) {
+ raidPtr->regionInfo[regionID].diskMap[i + diskCount].operation = log->records[i].operation;
+ raidPtr->regionInfo[regionID].diskMap[i + diskCount].parityAddr = log->records[i].parityAddr;
+ }
+ log->diskOffset = diskCount;
+ raidPtr->regionInfo[regionID].diskCount += raidPtr->numSectorsPerLog;
+ FlushLog(raidPtr, log);
+ } else {
+ /* no room for log on disk, send it to disk manager and
+ * request reintegration */
+ RF_ASSERT(raidPtr->regionInfo[regionID].diskCount == raidPtr->regionInfo[regionID].capacity);
+ ReintLog(raidPtr, regionID, log);
+ }
+ if (rf_parityLogDebug)
+ printf("[finished dumping parity log to disk, region %d]\n", regionID);
+ return (0);
+}
+
+int
+rf_ParityLogAppend(
+ RF_ParityLogData_t * logData,
+ int finish,
+ RF_ParityLog_t ** incomingLog,
+ int clearReintFlag)
+{
+ int regionID, logItem, itemDone;
+ RF_ParityLogData_t *item;
+ int punt, done = RF_FALSE;
+ RF_ParityLog_t *log;
+ RF_Raid_t *raidPtr;
+ RF_Etimer_t timer;
+ int (*wakeFunc) (RF_DagNode_t * node, int status);
+ void *wakeArg;
+
+ /* Add parity to the appropriate log, one sector at a time. This
+	 * routine is called by dag functions ParityLogUpdateFunc
+ * and ParityLogOverwriteFunc and therefore MUST BE NONBLOCKING.
+ *
+ * Parity to be logged is contained in a linked-list (logData). When
+ * this routine returns, every sector in the list will be in one of
+ * three places: 1) entered into the parity log 2) queued, waiting on
+ * reintegration 3) queued, waiting on a core log
+ *
+ * Blocked work is passed to the ParityLoggingDiskManager for completion.
+ * Later, as conditions which required the block are removed, the work
+ * reenters this routine with the "finish" parameter set to "RF_TRUE."
+ *
+ * NON-BLOCKING */
+
+ raidPtr = logData->common->raidPtr;
+ /* lock the region for the first item in logData */
+ RF_ASSERT(logData != NULL);
+ regionID = logData->regionID;
+ RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
+ RF_ASSERT(raidPtr->regionInfo[regionID].loggingEnabled);
+
+ if (clearReintFlag) {
+ /* Enable flushing for this region. Holding both locks
+ * provides a synchronization barrier with DumpParityLogToDisk */
+ RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ RF_ASSERT(raidPtr->regionInfo[regionID].reintInProgress == RF_TRUE);
+ raidPtr->regionInfo[regionID].diskCount = 0;
+ raidPtr->regionInfo[regionID].reintInProgress = RF_FALSE;
+ RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex); /* flushing is now
+ * enabled */
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ }
+ /* process each item in logData */
+ while (logData) {
+ /* remove an item from logData */
+ item = logData;
+ logData = logData->next;
+ item->next = NULL;
+ item->prev = NULL;
+
+ if (rf_parityLogDebug)
+ printf("[appending parity log data, region %d, raidAddress %d, numSector %d]\n", item->regionID, (int) item->diskAddress.raidAddress, (int) item->diskAddress.numSector);
+
+ /* see if we moved to a new region */
+ if (regionID != item->regionID) {
+ RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
+ regionID = item->regionID;
+ RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
+ RF_ASSERT(raidPtr->regionInfo[regionID].loggingEnabled);
+ }
+ punt = RF_FALSE;/* Set to RF_TRUE if work is blocked. This
+ * can happen in one of two ways: 1) no core
+ * log (AcquireParityLog) 2) waiting on
+ * reintegration (DumpParityLogToDisk) If punt
+ * is RF_TRUE, the dataItem was queued, so
+ * skip to next item. */
+
+ /* process item, one sector at a time, until all sectors
+ * processed or we punt */
+ if (item->diskAddress.numSector > 0)
+ done = RF_FALSE;
+ else
+ RF_ASSERT(0);
+ while (!punt && !done) {
+ /* verify that a core log exists for this region */
+ if (!raidPtr->regionInfo[regionID].coreLog) {
+ /* Attempt to acquire a parity log. If
+ * acquisition fails, queue remaining work in
+ * data item and move to nextItem. */
+ if (incomingLog)
+ if (*incomingLog) {
+ RF_ASSERT((*incomingLog)->next == NULL);
+ raidPtr->regionInfo[regionID].coreLog = *incomingLog;
+ raidPtr->regionInfo[regionID].coreLog->regionID = regionID;
+ *incomingLog = NULL;
+ } else
+ raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
+ else
+ raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
+ /* Note: AcquireParityLog either returns a log
+ * or enqueues currentItem */
+ }
+ if (!raidPtr->regionInfo[regionID].coreLog)
+ punt = RF_TRUE; /* failed to find a core log */
+ else {
+ RF_ASSERT(raidPtr->regionInfo[regionID].coreLog->next == NULL);
+ /* verify that the log has room for new
+ * entries */
+ /* if log is full, dump it to disk and grab a
+ * new log */
+ if (raidPtr->regionInfo[regionID].coreLog->numRecords == raidPtr->numSectorsPerLog) {
+ /* log is full, dump it to disk */
+ if (DumpParityLogToDisk(finish, item))
+ punt = RF_TRUE; /* dump unsuccessful,
+ * blocked on
+ * reintegration */
+ else {
+ /* dump was successful */
+ if (incomingLog)
+ if (*incomingLog) {
+ RF_ASSERT((*incomingLog)->next == NULL);
+ raidPtr->regionInfo[regionID].coreLog = *incomingLog;
+ raidPtr->regionInfo[regionID].coreLog->regionID = regionID;
+ *incomingLog = NULL;
+ } else
+ raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
+ else
+ raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
+ /* if a core log is not
+ * available, must queue work
+ * and return */
+ if (!raidPtr->regionInfo[regionID].coreLog)
+ punt = RF_TRUE; /* blocked on log
+ * availability */
+ }
+ }
+ }
+ /* if we didn't punt on this item, attempt to add a
+ * sector to the core log */
+ if (!punt) {
+ RF_ASSERT(raidPtr->regionInfo[regionID].coreLog->next == NULL);
+ /* at this point, we have a core log with
+ * enough room for a sector */
+ /* copy a sector into the log */
+ log = raidPtr->regionInfo[regionID].coreLog;
+ RF_ASSERT(log->numRecords < raidPtr->numSectorsPerLog);
+ logItem = log->numRecords++;
+ log->records[logItem].parityAddr = item->diskAddress;
+ RF_ASSERT(log->records[logItem].parityAddr.startSector >= raidPtr->regionInfo[regionID].parityStartAddr);
+ RF_ASSERT(log->records[logItem].parityAddr.startSector < raidPtr->regionInfo[regionID].parityStartAddr + raidPtr->regionInfo[regionID].numSectorsParity);
+ log->records[logItem].parityAddr.numSector = 1;
+ log->records[logItem].operation = item->common->operation;
+ bcopy((item->common->bufPtr + (item->bufOffset++ * (1 << item->common->raidPtr->logBytesPerSector))), log->bufPtr + (logItem * (1 << item->common->raidPtr->logBytesPerSector)), (1 << item->common->raidPtr->logBytesPerSector));
+ item->diskAddress.numSector--;
+ item->diskAddress.startSector++;
+ if (item->diskAddress.numSector == 0)
+ done = RF_TRUE;
+ }
+ }
+
+ if (!punt) {
+ /* Processed this item completely, decrement count of
+ * items to be processed. */
+ RF_ASSERT(item->diskAddress.numSector == 0);
+ RF_LOCK_MUTEX(item->common->mutex);
+ item->common->cnt--;
+ if (item->common->cnt == 0)
+ itemDone = RF_TRUE;
+ else
+ itemDone = RF_FALSE;
+ RF_UNLOCK_MUTEX(item->common->mutex);
+ if (itemDone) {
+ /* Finished processing all log data for this
+ * IO Return structs to free list and invoke
+ * wakeup function. */
+ timer = item->common->startTime; /* grab initial value of
+ * timer */
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ item->common->tracerec->plog_us += RF_ETIMER_VAL_US(timer);
+ if (rf_parityLogDebug)
+ printf("[waking process for region %d]\n", item->regionID);
+ wakeFunc = item->common->wakeFunc;
+ wakeArg = item->common->wakeArg;
+ FreeParityLogCommonData(item->common);
+ FreeParityLogData(item);
+ (wakeFunc) (wakeArg, 0);
+ } else
+ FreeParityLogData(item);
+ }
+ }
+ RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
+ if (rf_parityLogDebug)
+ printf("[exiting ParityLogAppend]\n");
+ return (0);
+}
+
+
+/* Turn parity logging on for every region.  Each region's
+ * loggingEnabled flag is flipped under that region's own mutex,
+ * one region at a time. */
+void
+rf_EnableParityLogging(RF_Raid_t * raidPtr)
+{
+	int r;
+
+	r = 0;
+	while (r < rf_numParityRegions) {
+		RF_LOCK_MUTEX(raidPtr->regionInfo[r].mutex);
+		raidPtr->regionInfo[r].loggingEnabled = RF_TRUE;
+		RF_UNLOCK_MUTEX(raidPtr->regionInfo[r].mutex);
+		r++;
+	}
+	if (rf_parityLogDebug)
+		printf("[parity logging enabled]\n");
+}
+#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
diff --git a/sys/dev/raidframe/rf_paritylog.h b/sys/dev/raidframe/rf_paritylog.h
new file mode 100644
index 0000000..1f2b80d
--- /dev/null
+++ b/sys/dev/raidframe/rf_paritylog.h
@@ -0,0 +1,181 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_paritylog.h,v 1.3 1999/02/05 00:06:14 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* header file for parity log
+ *
+ */
+
+#ifndef _RF__RF_PARITYLOG_H_
+#define _RF__RF_PARITYLOG_H_
+
+#include <dev/raidframe/rf_types.h>
+
+#define RF_DEFAULT_NUM_SECTORS_PER_LOG 64
+
+typedef int RF_RegionId_t;
+
+typedef enum RF_ParityRecordType_e {
+ RF_STOP,
+ RF_UPDATE,
+ RF_OVERWRITE
+} RF_ParityRecordType_t;
+
+struct RF_CommonLogData_s {
+ RF_DECLARE_MUTEX(mutex) /* protects cnt */
+ int cnt; /* when 0, time to call wakeFunc */
+ RF_Raid_t *raidPtr;
+/* int (*wakeFunc)(RF_Buf_t); */
+ int (*wakeFunc) (RF_DagNode_t * node, int status);
+ void *wakeArg;
+ RF_AccTraceEntry_t *tracerec;
+ RF_Etimer_t startTime;
+ caddr_t bufPtr;
+ RF_ParityRecordType_t operation;
+ RF_CommonLogData_t *next;
+};
+
+struct RF_ParityLogData_s {
+ RF_RegionId_t regionID; /* this struct guaranteed to span a single
+ * region */
+ int bufOffset; /* offset from common->bufPtr */
+ RF_PhysDiskAddr_t diskAddress;
+ RF_CommonLogData_t *common; /* info shared by one or more
+ * parityLogData structs */
+ RF_ParityLogData_t *next;
+ RF_ParityLogData_t *prev;
+};
+
+struct RF_ParityLogAppendQueue_s {
+ RF_DECLARE_MUTEX(mutex)
+};
+
+struct RF_ParityLogRecord_s {
+ RF_PhysDiskAddr_t parityAddr;
+ RF_ParityRecordType_t operation;
+};
+
+struct RF_ParityLog_s {
+ RF_RegionId_t regionID;
+ int numRecords;
+ int diskOffset;
+ RF_ParityLogRecord_t *records;
+ caddr_t bufPtr;
+ RF_ParityLog_t *next;
+};
+
+struct RF_ParityLogQueue_s {
+ RF_DECLARE_MUTEX(mutex)
+ RF_ParityLog_t *parityLogs;
+};
+
+struct RF_RegionBufferQueue_s {
+ RF_DECLARE_MUTEX(mutex)
+ RF_DECLARE_COND(cond)
+ int bufferSize;
+ int totalBuffers; /* size of array 'buffers' */
+ int availableBuffers; /* num available 'buffers' */
+ int emptyBuffersIndex; /* stick next freed buffer here */
+ int availBuffersIndex; /* grab next buffer from here */
+ caddr_t *buffers; /* array buffers used to hold parity */
+};
+#define RF_PLOG_CREATED (1<<0)/* thread is created */
+#define RF_PLOG_RUNNING (1<<1)/* thread is running */
+#define RF_PLOG_TERMINATE (1<<2)/* thread is terminated (should exit) */
+#define RF_PLOG_SHUTDOWN (1<<3)/* thread is aware and exiting/exited */
+
+struct RF_ParityLogDiskQueue_s {
+ RF_DECLARE_MUTEX(mutex) /* protects all vars in this struct */
+ RF_DECLARE_COND(cond)
+ int threadState; /* is thread running, should it shutdown (see
+ * above) */
+ RF_ParityLog_t *flushQueue; /* list of parity logs to be flushed
+ * to log disk */
+ RF_ParityLog_t *reintQueue; /* list of parity logs waiting to be
+ * reintegrated */
+ RF_ParityLogData_t *bufHead; /* head of FIFO list of log data,
+ * waiting on a buffer */
+ RF_ParityLogData_t *bufTail; /* tail of FIFO list of log data,
+ * waiting on a buffer */
+ RF_ParityLogData_t *reintHead; /* head of FIFO list of log data,
+ * waiting on reintegration */
+ RF_ParityLogData_t *reintTail; /* tail of FIFO list of log data,
+ * waiting on reintegration */
+ RF_ParityLogData_t *logBlockHead; /* queue of work, blocked
+ * until a log is available */
+ RF_ParityLogData_t *logBlockTail;
+ RF_ParityLogData_t *reintBlockHead; /* queue of work, blocked
+ * until reintegration is
+ * complete */
+ RF_ParityLogData_t *reintBlockTail;
+ RF_CommonLogData_t *freeCommonList; /* list of unused common data
+ * structs */
+ RF_ParityLogData_t *freeDataList; /* list of unused log data
+ * structs */
+};
+
+struct RF_DiskMap_s {
+ RF_PhysDiskAddr_t parityAddr;
+ RF_ParityRecordType_t operation;
+};
+
+struct RF_RegionInfo_s {
+	RF_DECLARE_MUTEX(mutex)	/* protects: diskCount, diskMap,
+				 * loggingEnabled, coreLog */
+	RF_DECLARE_MUTEX(reintMutex)	/* protects: reintInProgress */
+	int reintInProgress;	/* flag used to suspend flushing operations */
+	RF_SectorCount_t capacity;	/* capacity of this region in sectors */
+	RF_SectorNum_t regionStartAddr;	/* starting disk address for this
+					 * region */
+	RF_SectorNum_t parityStartAddr;	/* starting disk address of the
+					 * parity covered by this region */
+	RF_SectorCount_t numSectorsParity;	/* number of parity sectors
+						 * protected by this region */
+	RF_SectorCount_t diskCount;	/* num of sectors written to this
+					 * region's disk log */
+	RF_DiskMap_t *diskMap;	/* in-core map of what's in this region's disk
+				 * log */
+	int loggingEnabled;	/* logging enabled for this region */
+	RF_ParityLog_t *coreLog;	/* in-core log for this region */
+};
+
+RF_ParityLogData_t *
+rf_CreateParityLogData(RF_ParityRecordType_t operation,
+ RF_PhysDiskAddr_t * pda, caddr_t bufPtr, RF_Raid_t * raidPtr,
+ int (*wakeFunc) (RF_DagNode_t * node, int status),
+ void *wakeArg, RF_AccTraceEntry_t * tracerec,
+ RF_Etimer_t startTime);
+ RF_ParityLogData_t *rf_SearchAndDequeueParityLogData(RF_Raid_t * raidPtr,
+ RF_RegionId_t regionID, RF_ParityLogData_t ** head,
+ RF_ParityLogData_t ** tail, int ignoreLocks);
+ void rf_ReleaseParityLogs(RF_Raid_t * raidPtr, RF_ParityLog_t * firstLog);
+ int rf_ParityLogAppend(RF_ParityLogData_t * logData, int finish,
+ RF_ParityLog_t ** incomingLog, int clearReintFlag);
+ void rf_EnableParityLogging(RF_Raid_t * raidPtr);
+
+#endif /* !_RF__RF_PARITYLOG_H_ */
diff --git a/sys/dev/raidframe/rf_paritylogDiskMgr.c b/sys/dev/raidframe/rf_paritylogDiskMgr.c
new file mode 100644
index 0000000..5eadad8
--- /dev/null
+++ b/sys/dev/raidframe/rf_paritylogDiskMgr.c
@@ -0,0 +1,701 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_paritylogDiskMgr.c,v 1.10 2000/01/15 01:57:57 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+/* Code for flushing and reintegration operations related to parity logging.
+ *
+ */
+
+#include <dev/raidframe/rf_archs.h>
+
+#if RF_INCLUDE_PARITYLOGGING > 0
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_threadstuff.h>
+#include <dev/raidframe/rf_mcpair.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_dagfuncs.h>
+#include <dev/raidframe/rf_desc.h>
+#include <dev/raidframe/rf_layout.h>
+#include <dev/raidframe/rf_diskqueue.h>
+#include <dev/raidframe/rf_paritylog.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_etimer.h>
+#include <dev/raidframe/rf_paritylogging.h>
+#include <dev/raidframe/rf_engine.h>
+#include <dev/raidframe/rf_dagutils.h>
+#include <dev/raidframe/rf_map.h>
+#include <dev/raidframe/rf_parityscan.h>
+#include <dev/raidframe/rf_kintf.h>
+
+#include <dev/raidframe/rf_paritylogDiskMgr.h>
+
+static caddr_t AcquireReintBuffer(RF_RegionBufferQueue_t *);
+
+static caddr_t
+AcquireReintBuffer(pool)
+	RF_RegionBufferQueue_t *pool;
+{
+	caddr_t bufPtr = NULL;
+
+	/* Return a region buffer from the free list (pool). If the free list
+	 * is empty, WAIT. BLOCKING */
+
+	RF_LOCK_MUTEX(pool->mutex);
+	if (pool->availableBuffers > 0) {
+		/* pop the next buffer from the circular free list */
+		bufPtr = pool->buffers[pool->availBuffersIndex];
+		pool->availableBuffers--;
+		pool->availBuffersIndex++;
+		if (pool->availBuffersIndex == pool->totalBuffers)
+			pool->availBuffersIndex = 0;	/* wrap around */
+		RF_UNLOCK_MUTEX(pool->mutex);
+	} else {
+		/* NOTE(review): with a single reintegration thread the pool
+		 * should never be empty, hence the panic.  The RF_WAIT_COND
+		 * below is unreachable; were it ever to run, the function
+		 * would return NULL without retrying the pool -- confirm
+		 * this path is intentionally dead. */
+		RF_PANIC();	/* should never happen in correct config,
+				 * single reint */
+		RF_WAIT_COND(pool->cond, pool->mutex);
+	}
+	return (bufPtr);
+}
+
+static void
+ReleaseReintBuffer(
+	RF_RegionBufferQueue_t * pool,
+	caddr_t bufPtr)
+{
+	/* Hand a region buffer (bufPtr) back to the free pool and signal
+	 * anyone waiting for one.  NON-BLOCKING */
+
+	RF_LOCK_MUTEX(pool->mutex);
+	/* store the buffer at the tail slot of the circular list */
+	pool->buffers[pool->emptyBuffersIndex] = bufPtr;
+	pool->emptyBuffersIndex = (pool->emptyBuffersIndex + 1) %
+	    pool->totalBuffers;
+	pool->availableBuffers++;
+	RF_ASSERT(pool->availableBuffers <= pool->totalBuffers);
+	RF_UNLOCK_MUTEX(pool->mutex);
+	RF_SIGNAL_COND(pool->cond);
+}
+
+
+
+static void
+ReadRegionLog(
+	RF_RegionId_t regionID,
+	RF_MCPair_t * rrd_mcpair,
+	caddr_t regionBuffer,
+	RF_Raid_t * raidPtr,
+	RF_DagHeader_t ** rrd_dag_h,
+	RF_AllocListElem_t ** rrd_alloclist,
+	RF_PhysDiskAddr_t ** rrd_pda)
+{
+	/* Initiate the read of a region log from disk into regionBuffer.
+	 * Completion is reported through rrd_mcpair (rf_MCPairWakeupFunc);
+	 * the DAG, alloc list and PDA are returned via the out parameters
+	 * so the caller can wait on them and free them afterwards.
+	 *
+	 * NON-BLOCKING */
+
+	RF_AccTraceEntry_t *tracerec;
+	RF_DagNode_t *rrd_rdNode;
+
+	/* create DAG to read region log from disk */
+	rf_MakeAllocList(*rrd_alloclist);
+	*rrd_dag_h = rf_MakeSimpleDAG(raidPtr, 1, 0, regionBuffer,
+	    rf_DiskReadFunc, rf_DiskReadUndoFunc,
+	    "Rrl", *rrd_alloclist,
+	    RF_DAG_FLAGS_NONE,
+	    RF_IO_NORMAL_PRIORITY);
+
+	/* create and initialize PDA for the core log */
+	/* RF_Malloc(*rrd_pda, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t
+	 * *)); */
+	*rrd_pda = rf_AllocPDAList(1);
+	/* map (regionID, offset 0) onto the log disk's row/col/sector */
+	rf_MapLogParityLogging(raidPtr, regionID, 0, &((*rrd_pda)->row),
+	    &((*rrd_pda)->col), &((*rrd_pda)->startSector));
+	(*rrd_pda)->numSector = raidPtr->regionInfo[regionID].capacity;
+
+	if ((*rrd_pda)->next) {
+		/* defensive: PDA came off a list and may still be linked */
+		(*rrd_pda)->next = NULL;
+		printf("set rrd_pda->next to NULL\n");
+	}
+	/* initialize DAG parameters */
+	RF_Malloc(tracerec,sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *));
+	bzero((char *) tracerec, sizeof(RF_AccTraceEntry_t));
+	(*rrd_dag_h)->tracerec = tracerec;
+	rrd_rdNode = (*rrd_dag_h)->succedents[0]->succedents[0];
+	rrd_rdNode->params[0].p = *rrd_pda;
+/*	rrd_rdNode->params[1] = regionBuffer; */
+	/* NOTE(review): params[1] (the buffer) appears to be filled in by
+	 * rf_MakeSimpleDAG from regionBuffer -- confirm */
+	rrd_rdNode->params[2].v = 0;
+	rrd_rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
+	    0, 0, 0);
+
+	/* launch region log read dag */
+	rf_DispatchDAG(*rrd_dag_h, (void (*) (void *)) rf_MCPairWakeupFunc,
+	    (void *) rrd_mcpair);
+}
+
+
+
+static void
+WriteCoreLog(
+	RF_ParityLog_t * log,
+	RF_MCPair_t * fwr_mcpair,
+	RF_Raid_t * raidPtr,
+	RF_DagHeader_t ** fwr_dag_h,
+	RF_AllocListElem_t ** fwr_alloclist,
+	RF_PhysDiskAddr_t ** fwr_pda)
+{
+	RF_RegionId_t regionID = log->regionID;
+	RF_AccTraceEntry_t *tracerec;
+	RF_SectorNum_t regionOffset;
+	RF_DagNode_t *fwr_wrNode;
+
+	/* Initiate the write of a core log to its region's log disk, at the
+	 * offset recorded in log->diskOffset.  Completion is reported
+	 * through fwr_mcpair.  Once initiated, return to the calling
+	 * routine.
+	 *
+	 * NON-BLOCKING */
+
+	/* create DAG to write a core log to a region log disk */
+	rf_MakeAllocList(*fwr_alloclist);
+	*fwr_dag_h = rf_MakeSimpleDAG(raidPtr, 1, 0, log->bufPtr,
+	    rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
+	    "Wcl", *fwr_alloclist, RF_DAG_FLAGS_NONE, RF_IO_NORMAL_PRIORITY);
+
+	/* create and initialize PDA for the region log */
+	/* RF_Malloc(*fwr_pda, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t
+	 * *)); */
+	*fwr_pda = rf_AllocPDAList(1);
+	regionOffset = log->diskOffset;
+	rf_MapLogParityLogging(raidPtr, regionID, regionOffset,
+	    &((*fwr_pda)->row), &((*fwr_pda)->col),
+	    &((*fwr_pda)->startSector));
+	(*fwr_pda)->numSector = raidPtr->numSectorsPerLog;
+
+	/* initialize DAG parameters */
+	RF_Malloc(tracerec,sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *));
+	bzero((char *) tracerec, sizeof(RF_AccTraceEntry_t));
+	(*fwr_dag_h)->tracerec = tracerec;
+	fwr_wrNode = (*fwr_dag_h)->succedents[0]->succedents[0];
+	fwr_wrNode->params[0].p = *fwr_pda;
+/*	fwr_wrNode->params[1] = log->bufPtr; */
+	/* NOTE(review): params[1] (the buffer) appears to be filled in by
+	 * rf_MakeSimpleDAG from log->bufPtr -- confirm */
+	fwr_wrNode->params[2].v = 0;
+	fwr_wrNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
+	    0, 0, 0);
+
+	/* launch the dag to write the core log to disk */
+	rf_DispatchDAG(*fwr_dag_h, (void (*) (void *)) rf_MCPairWakeupFunc,
+	    (void *) fwr_mcpair);
+}
+
+
+static void
+ReadRegionParity(
+	RF_RegionId_t regionID,
+	RF_MCPair_t * prd_mcpair,
+	caddr_t parityBuffer,
+	RF_Raid_t * raidPtr,
+	RF_DagHeader_t ** prd_dag_h,
+	RF_AllocListElem_t ** prd_alloclist,
+	RF_PhysDiskAddr_t ** prd_pda)
+{
+	/* Initiate the read of a region's parity from disk into
+	 * parityBuffer.  Completion is reported through prd_mcpair.
+	 * Once initiated, return to the calling routine.
+	 *
+	 * NON-BLOCKING */
+
+	RF_AccTraceEntry_t *tracerec;
+	RF_DagNode_t *prd_rdNode;
+
+	/* create DAG to read region parity from disk */
+	rf_MakeAllocList(*prd_alloclist);
+	*prd_dag_h = rf_MakeSimpleDAG(raidPtr, 1, 0, NULL, rf_DiskReadFunc,
+	    rf_DiskReadUndoFunc, "Rrp",
+	    *prd_alloclist, RF_DAG_FLAGS_NONE,
+	    RF_IO_NORMAL_PRIORITY);
+
+	/* create and initialize PDA for region parity */
+	/* RF_Malloc(*prd_pda, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t
+	 * *)); */
+	*prd_pda = rf_AllocPDAList(1);
+	/* map the region's parity range to row/col/sector/count */
+	rf_MapRegionParity(raidPtr, regionID, &((*prd_pda)->row),
+	    &((*prd_pda)->col), &((*prd_pda)->startSector),
+	    &((*prd_pda)->numSector));
+	if (rf_parityLogDebug)
+		printf("[reading %d sectors of parity from region %d]\n",
+		    (int) (*prd_pda)->numSector, regionID);
+	if ((*prd_pda)->next) {
+		/* defensive: PDA came off a list and may still be linked */
+		(*prd_pda)->next = NULL;
+		printf("set prd_pda->next to NULL\n");
+	}
+	/* initialize DAG parameters */
+	RF_Malloc(tracerec,sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *));
+	bzero((char *) tracerec, sizeof(RF_AccTraceEntry_t));
+	(*prd_dag_h)->tracerec = tracerec;
+	prd_rdNode = (*prd_dag_h)->succedents[0]->succedents[0];
+	prd_rdNode->params[0].p = *prd_pda;
+	/* buffer passed explicitly here (the DAG was built with NULL) */
+	prd_rdNode->params[1].p = parityBuffer;
+	prd_rdNode->params[2].v = 0;
+	prd_rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
+	    0, 0, 0);
+	if (rf_validateDAGDebug)
+		rf_ValidateDAG(*prd_dag_h);
+	/* launch region parity read dag */
+	rf_DispatchDAG(*prd_dag_h, (void (*) (void *)) rf_MCPairWakeupFunc,
+	    (void *) prd_mcpair);
+}
+
+static void
+WriteRegionParity(
+	RF_RegionId_t regionID,
+	RF_MCPair_t * pwr_mcpair,
+	caddr_t parityBuffer,
+	RF_Raid_t * raidPtr,
+	RF_DagHeader_t ** pwr_dag_h,
+	RF_AllocListElem_t ** pwr_alloclist,
+	RF_PhysDiskAddr_t ** pwr_pda)
+{
+	/* Initiate the write of region parity (from parityBuffer) to disk.
+	 * Completion is reported through pwr_mcpair.  Once initiated,
+	 * return to the calling routine.
+	 *
+	 * NON-BLOCKING */
+
+	RF_AccTraceEntry_t *tracerec;
+	RF_DagNode_t *pwr_wrNode;
+
+	/* create DAG to write region parity to disk */
+	rf_MakeAllocList(*pwr_alloclist);
+	*pwr_dag_h = rf_MakeSimpleDAG(raidPtr, 1, 0, parityBuffer,
+	    rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
+	    "Wrp", *pwr_alloclist,
+	    RF_DAG_FLAGS_NONE,
+	    RF_IO_NORMAL_PRIORITY);
+
+	/* create and initialize PDA for region parity */
+	/* RF_Malloc(*pwr_pda, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t
+	 * *)); */
+	*pwr_pda = rf_AllocPDAList(1);
+	rf_MapRegionParity(raidPtr, regionID, &((*pwr_pda)->row),
+	    &((*pwr_pda)->col), &((*pwr_pda)->startSector),
+	    &((*pwr_pda)->numSector));
+
+	/* initialize DAG parameters */
+	RF_Malloc(tracerec,sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *));
+	bzero((char *) tracerec, sizeof(RF_AccTraceEntry_t));
+	(*pwr_dag_h)->tracerec = tracerec;
+	pwr_wrNode = (*pwr_dag_h)->succedents[0]->succedents[0];
+	pwr_wrNode->params[0].p = *pwr_pda;
+/*	pwr_wrNode->params[1] = parityBuffer; */
+	/* NOTE(review): params[1] (the buffer) appears to be filled in by
+	 * rf_MakeSimpleDAG from parityBuffer -- confirm */
+	pwr_wrNode->params[2].v = 0;
+	pwr_wrNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
+	    0, 0, 0);
+
+	/* launch the dag to write region parity to disk */
+	rf_DispatchDAG(*pwr_dag_h, (void (*) (void *)) rf_MCPairWakeupFunc,
+	    (void *) pwr_mcpair);
+}
+
+static void
+FlushLogsToDisk(
+	RF_Raid_t * raidPtr,
+	RF_ParityLog_t * logList)
+{
+	/* Flush a linked list of core logs to the log disk. Logs contain the
+	 * disk location where they should be written. Logs were written in
+	 * FIFO order and that order must be preserved.
+	 *
+	 * Recommended optimizations: 1) allow multiple flushes to occur
+	 * simultaneously 2) coalesce contiguous flush operations
+	 *
+	 * BLOCKING */
+
+	RF_ParityLog_t *log;
+	RF_RegionId_t regionID;
+	RF_MCPair_t *fwr_mcpair;
+	RF_DagHeader_t *fwr_dag_h;
+	RF_AllocListElem_t *fwr_alloclist;
+	RF_PhysDiskAddr_t *fwr_pda;
+
+	/* one mcpair is reused for every log in the list; its mutex is
+	 * held across the whole loop so the flag/cond handshake with the
+	 * DAG completion callback is race-free */
+	fwr_mcpair = rf_AllocMCPair();
+	RF_LOCK_MUTEX(fwr_mcpair->mutex);
+
+	RF_ASSERT(logList);
+	log = logList;
+	while (log) {
+		regionID = log->regionID;
+
+		/* create and launch a DAG to write the core log */
+		if (rf_parityLogDebug)
+			printf("[initiating write of core log for region %d]\n", regionID);
+		fwr_mcpair->flag = RF_FALSE;
+		WriteCoreLog(log, fwr_mcpair, raidPtr, &fwr_dag_h,
+		    &fwr_alloclist, &fwr_pda);
+
+		/* wait for the DAG to complete (preserves FIFO ordering:
+		 * each log is fully on disk before the next is started) */
+		while (!fwr_mcpair->flag)
+			RF_WAIT_COND(fwr_mcpair->cond, fwr_mcpair->mutex);
+		if (fwr_dag_h->status != rf_enable) {
+			RF_ERRORMSG1("Unable to write core log to disk (region %d)\n", regionID);
+			RF_ASSERT(0);
+		}
+		/* RF_Free(fwr_pda, sizeof(RF_PhysDiskAddr_t)); */
+		rf_FreePhysDiskAddr(fwr_pda);
+		rf_FreeDAG(fwr_dag_h);
+		rf_FreeAllocList(fwr_alloclist);
+
+		log = log->next;
+	}
+	RF_UNLOCK_MUTEX(fwr_mcpair->mutex);
+	rf_FreeMCPair(fwr_mcpair);
+	/* return the flushed logs to the free pool */
+	rf_ReleaseParityLogs(raidPtr, logList);
+}
+
+static void
+ReintegrateRegion(
+	RF_Raid_t * raidPtr,
+	RF_RegionId_t regionID,
+	RF_ParityLog_t * coreLog)
+{
+	RF_MCPair_t *rrd_mcpair = NULL, *prd_mcpair, *pwr_mcpair;
+	RF_DagHeader_t *rrd_dag_h, *prd_dag_h, *pwr_dag_h;
+	RF_AllocListElem_t *rrd_alloclist, *prd_alloclist, *pwr_alloclist;
+	RF_PhysDiskAddr_t *rrd_pda, *prd_pda, *pwr_pda;
+	caddr_t parityBuffer, regionBuffer = NULL;
+
+	/* Reintegrate a region (regionID).
+	 *
+	 * 1. acquire region and parity buffers
+	 * 2. read log from disk
+	 * 3. read parity from disk
+	 * 4. apply log to parity
+	 * 5. apply core log to parity
+	 * 6. write new parity to disk
+	 *
+	 * Note: the rrd_* variables are only initialized (and freed) when
+	 * the region's disk log is non-empty (diskCount > 0); both uses
+	 * are guarded by the same test.
+	 *
+	 * BLOCKING */
+
+	if (rf_parityLogDebug)
+		printf("[reintegrating region %d]\n", regionID);
+
+	/* initiate read of region parity */
+	if (rf_parityLogDebug)
+		printf("[initiating read of parity for region %d]\n",regionID);
+	parityBuffer = AcquireReintBuffer(&raidPtr->parityBufferPool);
+	prd_mcpair = rf_AllocMCPair();
+	RF_LOCK_MUTEX(prd_mcpair->mutex);
+	prd_mcpair->flag = RF_FALSE;
+	ReadRegionParity(regionID, prd_mcpair, parityBuffer, raidPtr,
+	    &prd_dag_h, &prd_alloclist, &prd_pda);
+
+	/* if region log nonempty, initiate read (runs concurrently with
+	 * the parity read launched above) */
+	if (raidPtr->regionInfo[regionID].diskCount > 0) {
+		if (rf_parityLogDebug)
+			printf("[initiating read of disk log for region %d]\n",
+			    regionID);
+		regionBuffer = AcquireReintBuffer(&raidPtr->regionBufferPool);
+		rrd_mcpair = rf_AllocMCPair();
+		RF_LOCK_MUTEX(rrd_mcpair->mutex);
+		rrd_mcpair->flag = RF_FALSE;
+		ReadRegionLog(regionID, rrd_mcpair, regionBuffer, raidPtr,
+		    &rrd_dag_h, &rrd_alloclist, &rrd_pda);
+	}
+	/* wait on read of region parity to complete */
+	while (!prd_mcpair->flag) {
+		RF_WAIT_COND(prd_mcpair->cond, prd_mcpair->mutex);
+	}
+	RF_UNLOCK_MUTEX(prd_mcpair->mutex);
+	if (prd_dag_h->status != rf_enable) {
+		RF_ERRORMSG("Unable to read parity from disk\n");
+		/* add code to fail the parity disk */
+		RF_ASSERT(0);
+	}
+	/* apply core log to parity */
+	/* if (coreLog) ApplyLogsToParity(coreLog, parityBuffer); */
+
+	if (raidPtr->regionInfo[regionID].diskCount > 0) {
+		/* wait on read of region log to complete */
+		while (!rrd_mcpair->flag)
+			RF_WAIT_COND(rrd_mcpair->cond, rrd_mcpair->mutex);
+		RF_UNLOCK_MUTEX(rrd_mcpair->mutex);
+		if (rrd_dag_h->status != rf_enable) {
+			RF_ERRORMSG("Unable to read region log from disk\n");
+			/* add code to fail the log disk */
+			RF_ASSERT(0);
+		}
+		/* apply region log to parity */
+		/* ApplyRegionToParity(regionID, regionBuffer, parityBuffer); */
+		/* release resources associated with region log */
+		/* RF_Free(rrd_pda, sizeof(RF_PhysDiskAddr_t)); */
+		rf_FreePhysDiskAddr(rrd_pda);
+		rf_FreeDAG(rrd_dag_h);
+		rf_FreeAllocList(rrd_alloclist);
+		rf_FreeMCPair(rrd_mcpair);
+		ReleaseReintBuffer(&raidPtr->regionBufferPool, regionBuffer);
+	}
+	/* write reintegrated parity to disk */
+	if (rf_parityLogDebug)
+		printf("[initiating write of parity for region %d]\n",
+		    regionID);
+	pwr_mcpair = rf_AllocMCPair();
+	RF_LOCK_MUTEX(pwr_mcpair->mutex);
+	pwr_mcpair->flag = RF_FALSE;
+	WriteRegionParity(regionID, pwr_mcpair, parityBuffer, raidPtr,
+	    &pwr_dag_h, &pwr_alloclist, &pwr_pda);
+	while (!pwr_mcpair->flag)
+		RF_WAIT_COND(pwr_mcpair->cond, pwr_mcpair->mutex);
+	RF_UNLOCK_MUTEX(pwr_mcpair->mutex);
+	if (pwr_dag_h->status != rf_enable) {
+		RF_ERRORMSG("Unable to write parity to disk\n");
+		/* add code to fail the parity disk */
+		RF_ASSERT(0);
+	}
+	/* release resources associated with read of old parity */
+	/* RF_Free(prd_pda, sizeof(RF_PhysDiskAddr_t)); */
+	rf_FreePhysDiskAddr(prd_pda);
+	rf_FreeDAG(prd_dag_h);
+	rf_FreeAllocList(prd_alloclist);
+	rf_FreeMCPair(prd_mcpair);
+
+	/* release resources associated with write of new parity */
+	ReleaseReintBuffer(&raidPtr->parityBufferPool, parityBuffer);
+	/* RF_Free(pwr_pda, sizeof(RF_PhysDiskAddr_t)); */
+	rf_FreePhysDiskAddr(pwr_pda);
+	rf_FreeDAG(pwr_dag_h);
+	rf_FreeAllocList(pwr_alloclist);
+	rf_FreeMCPair(pwr_mcpair);
+
+	if (rf_parityLogDebug)
+		printf("[finished reintegrating region %d]\n", regionID);
+}
+
+
+
+static void
+ReintegrateLogs(
+	RF_Raid_t * raidPtr,
+	RF_ParityLog_t * logList)
+{
+	/* Reintegrate each full core log in logList into its region's
+	 * on-disk parity, then re-process any appends that were blocked
+	 * waiting on that reintegration, and finally return unused logs
+	 * to the free pool.  BLOCKING (via ReintegrateRegion). */
+
+	RF_ParityLog_t *log, *freeLogList = NULL;
+	RF_ParityLogData_t *logData, *logDataList;
+	RF_RegionId_t regionID;
+
+	RF_ASSERT(logList);
+	while (logList) {
+		log = logList;
+		logList = logList->next;
+		log->next = NULL;
+		regionID = log->regionID;
+		ReintegrateRegion(raidPtr, regionID, log);
+		log->numRecords = 0;
+
+		/* remove all items which are blocked on reintegration of this
+		 * region */
+		RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+		logData = rf_SearchAndDequeueParityLogData(raidPtr, regionID,
+		    &raidPtr->parityLogDiskQueue.reintBlockHead,
+		    &raidPtr->parityLogDiskQueue.reintBlockTail,
+		    RF_TRUE);
+		logDataList = logData;
+		while (logData) {
+			logData->next = rf_SearchAndDequeueParityLogData(
+			    raidPtr, regionID,
+			    &raidPtr->parityLogDiskQueue.reintBlockHead,
+			    &raidPtr->parityLogDiskQueue.reintBlockTail,
+			    RF_TRUE);
+			logData = logData->next;
+		}
+		RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+
+		/* process blocked log data and clear reintInProgress flag for
+		 * this region (rf_ParityLogAppend consumes the log via &log
+		 * and clears the flag itself when clearReintFlag is set) */
+		if (logDataList)
+			rf_ParityLogAppend(logDataList, RF_TRUE, &log, RF_TRUE);
+		else {
+			/* Enable flushing for this region.  Holding both
+			 * locks provides a synchronization barrier with
+			 * DumpParityLogToDisk */
+			RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
+			RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
+			RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+			raidPtr->regionInfo[regionID].diskCount = 0;
+			raidPtr->regionInfo[regionID].reintInProgress = RF_FALSE;
+			RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
+			RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);	/* flushing is now
+											 * enabled */
+			RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+		}
+		/* if log wasn't used, attach it to the list of logs to be
+		 * returned */
+		if (log) {
+			log->next = freeLogList;
+			freeLogList = log;
+		}
+	}
+	if (freeLogList)
+		rf_ReleaseParityLogs(raidPtr, freeLogList);
+}
+
+int
+rf_ShutdownLogging(RF_Raid_t * raidPtr)
+{
+	/* Shut down parity logging:
+	 * 1) disable parity logging in all regions
+	 * 2) reintegrate all regions
+	 * Always returns 0. */
+
+	RF_SectorCount_t diskCount;
+	RF_RegionId_t regionID;
+	RF_ParityLog_t *log;
+
+	if (rf_parityLogDebug)
+		printf("[shutting down parity logging]\n");
+	/* Since parity log maps are volatile, we must reintegrate all
+	 * regions. */
+	if (rf_forceParityLogReint) {
+		for (regionID = 0; regionID < rf_numParityRegions; regionID++) {
+			/* snapshot and detach the region's core log under
+			 * the region mutex, then reintegrate outside it */
+			RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
+			raidPtr->regionInfo[regionID].loggingEnabled =
+			    RF_FALSE;
+			log = raidPtr->regionInfo[regionID].coreLog;
+			raidPtr->regionInfo[regionID].coreLog = NULL;
+			diskCount = raidPtr->regionInfo[regionID].diskCount;
+			RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
+			if (diskCount > 0 || log != NULL)
+				ReintegrateRegion(raidPtr, regionID, log);
+			if (log != NULL)
+				rf_ReleaseParityLogs(raidPtr, log);
+		}
+	}
+	if (rf_parityLogDebug) {
+		printf("[parity logging disabled]\n");
+		printf("[should be done!]\n");
+	}
+	return (0);
+}
+
+int
+rf_ParityLoggingDiskManager(RF_Raid_t * raidPtr)
+{
+	RF_ParityLog_t *reintQueue, *flushQueue;
+	int workNeeded, done = RF_FALSE;
+	int s;
+
+	/* Main program for parity logging disk thread. This routine waits
+	 * for work to appear in either the flush or reintegration queues and
+	 * is responsible for flushing core logs to the log disk as well as
+	 * reintegrating parity regions.
+	 *
+	 * Runs until RF_PLOG_TERMINATE is set in threadState, then performs
+	 * rf_ShutdownLogging and exits via kthread_exit (never returns).
+	 *
+	 * BLOCKING */
+
+	s = splbio();
+
+	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+
+	/*
+	 * Inform our creator that we're running. Don't bother doing the
+	 * mutex lock/unlock dance- we locked above, and we'll unlock
+	 * below with nothing to do, yet.
+	 */
+	raidPtr->parityLogDiskQueue.threadState |= RF_PLOG_RUNNING;
+	RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond);
+
+	/* empty the work queues */
+	flushQueue = raidPtr->parityLogDiskQueue.flushQueue;
+	raidPtr->parityLogDiskQueue.flushQueue = NULL;
+	reintQueue = raidPtr->parityLogDiskQueue.reintQueue;
+	raidPtr->parityLogDiskQueue.reintQueue = NULL;
+	workNeeded = (flushQueue || reintQueue);
+
+	while (!done) {
+		while (workNeeded) {
+			/* First, flush all logs in the flush queue, freeing
+			 * buffers Second, reintegrate all regions which are
+			 * reported as full. Third, append queued log data
+			 * until blocked.
+			 *
+			 * Note: Incoming appends (ParityLogAppend) can block on
+			 * either 1. empty buffer pool 2. region under
+			 * reintegration To preserve a global FIFO ordering of
+			 * appends, buffers are not released to the world
+			 * until those appends blocked on buffers are removed
+			 * from the append queue. Similarly, regions which
+			 * are reintegrated are not opened for general use
+			 * until the append queue has been emptied. */
+
+			/* drop the queue lock while doing (blocking) disk
+			 * work, so producers can keep enqueueing */
+			RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+
+			/* empty flushQueue, using free'd log buffers to
+			 * process bufTail */
+			if (flushQueue)
+				FlushLogsToDisk(raidPtr, flushQueue);
+
+			/* empty reintQueue, flushing from reintTail as we go */
+			if (reintQueue)
+				ReintegrateLogs(raidPtr, reintQueue);
+
+			/* re-check the queues for work that arrived while
+			 * the lock was released */
+			RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+			flushQueue = raidPtr->parityLogDiskQueue.flushQueue;
+			raidPtr->parityLogDiskQueue.flushQueue = NULL;
+			reintQueue = raidPtr->parityLogDiskQueue.reintQueue;
+			raidPtr->parityLogDiskQueue.reintQueue = NULL;
+			workNeeded = (flushQueue || reintQueue);
+		}
+		/* no work is needed at this point */
+		if (raidPtr->parityLogDiskQueue.threadState & RF_PLOG_TERMINATE) {
+			/* shutdown parity logging 1. disable parity logging
+			 * in all regions 2. reintegrate all regions */
+			done = RF_TRUE;	/* thread disabled, no work needed */
+			RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+			rf_ShutdownLogging(raidPtr);
+		}
+		if (!done) {
+			/* thread enabled, no work needed, so sleep */
+			if (rf_parityLogDebug)
+				printf("[parity logging disk manager sleeping]\n");
+			RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond,
+			    raidPtr->parityLogDiskQueue.mutex);
+			if (rf_parityLogDebug)
+				printf("[parity logging disk manager just woke up]\n");
+			flushQueue = raidPtr->parityLogDiskQueue.flushQueue;
+			raidPtr->parityLogDiskQueue.flushQueue = NULL;
+			reintQueue = raidPtr->parityLogDiskQueue.reintQueue;
+			raidPtr->parityLogDiskQueue.reintQueue = NULL;
+			workNeeded = (flushQueue || reintQueue);
+		}
+	}
+	/*
+	 * Announce that we're done.
+	 */
+	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+	raidPtr->parityLogDiskQueue.threadState |= RF_PLOG_SHUTDOWN;
+	RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+	RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond);
+
+	splx(s);
+
+	/*
+	 * In the NetBSD kernel, the thread must exit; returning would
+	 * cause the proc trampoline to attempt to return to userspace.
+	 */
+	kthread_exit(0);	/* does not return */
+}
+#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
diff --git a/sys/dev/raidframe/rf_paritylogDiskMgr.h b/sys/dev/raidframe/rf_paritylogDiskMgr.h
new file mode 100644
index 0000000..bdcc2a5
--- /dev/null
+++ b/sys/dev/raidframe/rf_paritylogDiskMgr.h
@@ -0,0 +1,42 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_paritylogDiskMgr.h,v 1.3 1999/02/05 00:06:14 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* header file for parity log disk mgr code
+ *
+ */
+
+#ifndef _RF__RF_PARITYLOGDISKMGR_H_
+#define _RF__RF_PARITYLOGDISKMGR_H_
+
+#include <dev/raidframe/rf_types.h>
+
+int rf_ShutdownLogging(RF_Raid_t * raidPtr);
+int rf_ParityLoggingDiskManager(RF_Raid_t * raidPtr);
+
+#endif /* !_RF__RF_PARITYLOGDISKMGR_H_ */
diff --git a/sys/dev/raidframe/rf_paritylogging.c b/sys/dev/raidframe/rf_paritylogging.c
new file mode 100644
index 0000000..f318655
--- /dev/null
+++ b/sys/dev/raidframe/rf_paritylogging.c
@@ -0,0 +1,1074 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_paritylogging.c,v 1.10 2000/02/12 16:06:27 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+
+/*
+ parity logging configuration, dag selection, and mapping is implemented here
+ */
+
+#include <dev/raidframe/rf_archs.h>
+
+#if RF_INCLUDE_PARITYLOGGING > 0
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_dagutils.h>
+#include <dev/raidframe/rf_dagfuncs.h>
+#include <dev/raidframe/rf_dagffrd.h>
+#include <dev/raidframe/rf_dagffwr.h>
+#include <dev/raidframe/rf_dagdegrd.h>
+#include <dev/raidframe/rf_dagdegwr.h>
+#include <dev/raidframe/rf_paritylog.h>
+#include <dev/raidframe/rf_paritylogDiskMgr.h>
+#include <dev/raidframe/rf_paritylogging.h>
+#include <dev/raidframe/rf_parityloggingdags.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_map.h>
+#include <dev/raidframe/rf_utils.h>
+#include <dev/raidframe/rf_shutdown.h>
+#include <dev/raidframe/rf_kintf.h>
+
+typedef struct RF_ParityLoggingConfigInfo_s {
+ RF_RowCol_t **stripeIdentifier; /* filled in at config time & used by
+ * IdentifyStripe */
+} RF_ParityLoggingConfigInfo_t;
+
+static void FreeRegionInfo(RF_Raid_t * raidPtr, RF_RegionId_t regionID);
+static void rf_ShutdownParityLogging(RF_ThreadArg_t arg);
+static void rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg);
+static void rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg);
+static void rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg);
+static void rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg);
+static void rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg);
+
+/*
+ * Configure parity logging for this array: build the layout info,
+ * size the on-disk log regions, allocate the in-core log pool and the
+ * region/parity staging buffer pools, initialize the log disk queue,
+ * and start the parity-logging disk manager thread.  Returns 0 on
+ * success or an errno; on failure, resources acquired so far are
+ * released here or via the shutdown-list entries already registered.
+ */
+int
+rf_ConfigureParityLogging(
+    RF_ShutdownList_t ** listp,
+    RF_Raid_t * raidPtr,
+    RF_Config_t * cfgPtr)
+{
+ int i, j, startdisk, rc;
+ RF_SectorCount_t totalLogCapacity, fragmentation, lastRegionCapacity;
+ RF_SectorCount_t parityBufferCapacity, maxRegionParityRange;
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_ParityLoggingConfigInfo_t *info;
+ RF_ParityLog_t *l = NULL, *next;
+ caddr_t lHeapPtr;
+
+ if (rf_numParityRegions <= 0)
+ return(EINVAL);
+
+ /*
+ * We create multiple entries on the shutdown list here, since
+ * this configuration routine is fairly complicated in and of
+ * itself, and this makes backing out of a failed configuration
+ * much simpler.
+ */
+
+ raidPtr->numSectorsPerLog = RF_DEFAULT_NUM_SECTORS_PER_LOG;
+
+ /* create a parity logging configuration structure */
+ RF_MallocAndAdd(info, sizeof(RF_ParityLoggingConfigInfo_t),
+ (RF_ParityLoggingConfigInfo_t *),
+ raidPtr->cleanupList);
+ if (info == NULL)
+ return (ENOMEM);
+ layoutPtr->layoutSpecificInfo = (void *) info;
+
+ RF_ASSERT(raidPtr->numRow == 1);
+
+ /* the stripe identifier must identify the disks in each stripe, IN
+ * THE ORDER THAT THEY APPEAR IN THE STRIPE. */
+ info->stripeIdentifier = rf_make_2d_array((raidPtr->numCol),
+ (raidPtr->numCol),
+ raidPtr->cleanupList);
+ if (info->stripeIdentifier == NULL)
+ return (ENOMEM);
+
+ startdisk = 0;
+ for (i = 0; i < (raidPtr->numCol); i++) {
+ for (j = 0; j < (raidPtr->numCol); j++) {
+ info->stripeIdentifier[i][j] = (startdisk + j) %
+ (raidPtr->numCol - 1);
+ }
+ if ((--startdisk) < 0)
+ startdisk = raidPtr->numCol - 1 - 1;
+ }
+
+ /* fill in the remaining layout parameters */
+ layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk;
+ layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit <<
+ raidPtr->logBytesPerSector;
+ layoutPtr->numParityCol = 1;
+ layoutPtr->numParityLogCol = 1;
+ layoutPtr->numDataCol = raidPtr->numCol - layoutPtr->numParityCol -
+ layoutPtr->numParityLogCol;
+ layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol *
+ layoutPtr->sectorsPerStripeUnit;
+ layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk;
+ raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk *
+ layoutPtr->sectorsPerStripeUnit;
+
+ raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk *
+ layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
+
+ /* configure parity log parameters
+ *
+ * parameter comment/constraints
+ * -------------------------------------------
+ * numParityRegions* all regions (except possibly last)
+ * of equal size
+ * totalInCoreLogCapacity* amount of memory in bytes available
+ * for in-core logs (default 1 MB)
+ * numSectorsPerLog# capacity of an in-core log in sectors
+ * (1 * disk track)
+ * numParityLogs total number of in-core logs,
+ * should be at least numParityRegions
+ * regionLogCapacity size of a region log (except possibly
+ * last one) in sectors
+ * totalLogCapacity total amount of log space in sectors
+ *
+ * where '*' denotes a user settable parameter.
+ * Note that logs are fixed to be the size of a disk track,
+ * value #defined in rf_paritylog.h
+ *
+ */
+
+ totalLogCapacity = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit * layoutPtr->numParityLogCol;
+ raidPtr->regionLogCapacity = totalLogCapacity / rf_numParityRegions;
+ if (rf_parityLogDebug)
+ printf("bytes per sector %d\n", raidPtr->bytesPerSector);
+
+ /* reduce fragmentation within a disk region by adjusting the number
+ * of regions in an attempt to allow an integral number of logs to fit
+ * into a disk region */
+ fragmentation = raidPtr->regionLogCapacity % raidPtr->numSectorsPerLog;
+ if (fragmentation > 0)
+ for (i = 1; i < (raidPtr->numSectorsPerLog / 2); i++) {
+ if (((totalLogCapacity / (rf_numParityRegions + i)) %
+ raidPtr->numSectorsPerLog) < fragmentation) {
+ rf_numParityRegions++;
+ raidPtr->regionLogCapacity = totalLogCapacity /
+ rf_numParityRegions;
+ fragmentation = raidPtr->regionLogCapacity %
+ raidPtr->numSectorsPerLog;
+ }
+ if (((totalLogCapacity / (rf_numParityRegions - i)) %
+ raidPtr->numSectorsPerLog) < fragmentation) {
+ rf_numParityRegions--;
+ raidPtr->regionLogCapacity = totalLogCapacity /
+ rf_numParityRegions;
+ fragmentation = raidPtr->regionLogCapacity %
+ raidPtr->numSectorsPerLog;
+ }
+ }
+ /* ensure integral number of regions per log */
+ raidPtr->regionLogCapacity = (raidPtr->regionLogCapacity /
+ raidPtr->numSectorsPerLog) *
+ raidPtr->numSectorsPerLog;
+
+ raidPtr->numParityLogs = rf_totalInCoreLogCapacity /
+ (raidPtr->bytesPerSector * raidPtr->numSectorsPerLog);
+ /* to avoid deadlock, must ensure that enough logs exist for each
+ * region to have one simultaneously */
+ if (raidPtr->numParityLogs < rf_numParityRegions)
+ raidPtr->numParityLogs = rf_numParityRegions;
+
+ /* create region information structs */
+ printf("Allocating %d bytes for in-core parity region info\n",
+ (int) (rf_numParityRegions * sizeof(RF_RegionInfo_t)));
+ RF_Malloc(raidPtr->regionInfo,
+ (rf_numParityRegions * sizeof(RF_RegionInfo_t)),
+ (RF_RegionInfo_t *));
+ if (raidPtr->regionInfo == NULL)
+ return (ENOMEM);
+
+ /* last region may not be full capacity */
+ lastRegionCapacity = raidPtr->regionLogCapacity;
+ while ((rf_numParityRegions - 1) * raidPtr->regionLogCapacity +
+ lastRegionCapacity > totalLogCapacity)
+ lastRegionCapacity = lastRegionCapacity -
+ raidPtr->numSectorsPerLog;
+
+ raidPtr->regionParityRange = raidPtr->sectorsPerDisk /
+ rf_numParityRegions;
+ maxRegionParityRange = raidPtr->regionParityRange;
+
+/* i can't remember why this line is in the code -wvcii 6/30/95 */
+/* if (raidPtr->sectorsPerDisk % rf_numParityRegions > 0)
+ regionParityRange++; */
+
+ /* build pool of unused parity logs */
+ printf("Allocating %d bytes for %d parity logs\n",
+ raidPtr->numParityLogs * raidPtr->numSectorsPerLog *
+ raidPtr->bytesPerSector,
+ raidPtr->numParityLogs);
+ RF_Malloc(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs *
+ raidPtr->numSectorsPerLog * raidPtr->bytesPerSector,
+ (caddr_t));
+ if (raidPtr->parityLogBufferHeap == NULL)
+ return (ENOMEM);
+ lHeapPtr = raidPtr->parityLogBufferHeap;
+ rc = rf_mutex_init(&raidPtr->parityLogPool.mutex, "RF_PARITYLOGGING1");
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n",
+ __FILE__, __LINE__, rc);
+ RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs *
+ raidPtr->numSectorsPerLog * raidPtr->bytesPerSector);
+ return (ENOMEM);
+ }
+ for (i = 0; i < raidPtr->numParityLogs; i++) {
+ if (i == 0) {
+ RF_Calloc(raidPtr->parityLogPool.parityLogs, 1,
+ sizeof(RF_ParityLog_t), (RF_ParityLog_t *));
+ if (raidPtr->parityLogPool.parityLogs == NULL) {
+ RF_Free(raidPtr->parityLogBufferHeap,
+ raidPtr->numParityLogs *
+ raidPtr->numSectorsPerLog *
+ raidPtr->bytesPerSector);
+ return (ENOMEM);
+ }
+ l = raidPtr->parityLogPool.parityLogs;
+ } else {
+ RF_Calloc(l->next, 1, sizeof(RF_ParityLog_t),
+ (RF_ParityLog_t *));
+ if (l->next == NULL) {
+ RF_Free(raidPtr->parityLogBufferHeap,
+ raidPtr->numParityLogs *
+ raidPtr->numSectorsPerLog *
+ raidPtr->bytesPerSector);
+ for (l = raidPtr->parityLogPool.parityLogs;
+ l;
+ l = next) {
+ next = l->next;
+ if (l->records)
+ RF_Free(l->records, (raidPtr->numSectorsPerLog * sizeof(RF_ParityLogRecord_t)));
+ RF_Free(l, sizeof(RF_ParityLog_t));
+ }
+ return (ENOMEM);
+ }
+ l = l->next;
+ }
+ l->bufPtr = lHeapPtr;
+ lHeapPtr += raidPtr->numSectorsPerLog *
+ raidPtr->bytesPerSector;
+ RF_Malloc(l->records, (raidPtr->numSectorsPerLog *
+ sizeof(RF_ParityLogRecord_t)),
+ (RF_ParityLogRecord_t *));
+ if (l->records == NULL) {
+ RF_Free(raidPtr->parityLogBufferHeap,
+ raidPtr->numParityLogs *
+ raidPtr->numSectorsPerLog *
+ raidPtr->bytesPerSector);
+ for (l = raidPtr->parityLogPool.parityLogs;
+ l;
+ l = next) {
+ next = l->next;
+ if (l->records)
+ RF_Free(l->records,
+ (raidPtr->numSectorsPerLog *
+ sizeof(RF_ParityLogRecord_t)));
+ RF_Free(l, sizeof(RF_ParityLog_t));
+ }
+ return (ENOMEM);
+ }
+ }
+ rc = rf_ShutdownCreate(listp, rf_ShutdownParityLoggingPool, raidPtr);
+ if (rc) {
+ RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ rf_ShutdownParityLoggingPool(raidPtr);
+ return (rc);
+ }
+ /* build pool of region buffers */
+ rc = rf_mutex_init(&raidPtr->regionBufferPool.mutex, "RF_PARITYLOGGING3");
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n",
+ __FILE__, __LINE__, rc);
+ return (ENOMEM);
+ }
+ rc = rf_cond_init(&raidPtr->regionBufferPool.cond);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n",
+ __FILE__, __LINE__, rc);
+ rf_mutex_destroy(&raidPtr->regionBufferPool.mutex);
+ return (ENOMEM);
+ }
+ raidPtr->regionBufferPool.bufferSize = raidPtr->regionLogCapacity *
+ raidPtr->bytesPerSector;
+ printf("regionBufferPool.bufferSize %d\n",
+ raidPtr->regionBufferPool.bufferSize);
+
+ /* for now, only one region at a time may be reintegrated */
+ raidPtr->regionBufferPool.totalBuffers = 1;
+
+ raidPtr->regionBufferPool.availableBuffers =
+ raidPtr->regionBufferPool.totalBuffers;
+ raidPtr->regionBufferPool.availBuffersIndex = 0;
+ raidPtr->regionBufferPool.emptyBuffersIndex = 0;
+ printf("Allocating %d bytes for regionBufferPool\n",
+ (int) (raidPtr->regionBufferPool.totalBuffers *
+ sizeof(caddr_t)));
+ RF_Malloc(raidPtr->regionBufferPool.buffers,
+ raidPtr->regionBufferPool.totalBuffers * sizeof(caddr_t),
+ (caddr_t *));
+ if (raidPtr->regionBufferPool.buffers == NULL) {
+ rf_mutex_destroy(&raidPtr->regionBufferPool.mutex);
+ rf_cond_destroy(&raidPtr->regionBufferPool.cond);
+ return (ENOMEM);
+ }
+ for (i = 0; i < raidPtr->regionBufferPool.totalBuffers; i++) {
+ printf("Allocating %d bytes for regionBufferPool#%d\n",
+ (int) (raidPtr->regionBufferPool.bufferSize *
+ sizeof(char)), i);
+ RF_Malloc(raidPtr->regionBufferPool.buffers[i],
+ raidPtr->regionBufferPool.bufferSize * sizeof(char),
+ (caddr_t));
+ if (raidPtr->regionBufferPool.buffers[i] == NULL) {
+ rf_mutex_destroy(&raidPtr->regionBufferPool.mutex);
+ rf_cond_destroy(&raidPtr->regionBufferPool.cond);
+ /* free only the buffers already allocated (0..i-1) */
+ for (j = 0; j < i; j++) {
+ RF_Free(raidPtr->regionBufferPool.buffers[j],
+ raidPtr->regionBufferPool.bufferSize *
+ sizeof(char));
+ }
+ RF_Free(raidPtr->regionBufferPool.buffers,
+ raidPtr->regionBufferPool.totalBuffers *
+ sizeof(caddr_t));
+ return (ENOMEM);
+ }
+ printf("raidPtr->regionBufferPool.buffers[%d] = %lx\n", i,
+ (long) raidPtr->regionBufferPool.buffers[i]);
+ }
+ rc = rf_ShutdownCreate(listp,
+ rf_ShutdownParityLoggingRegionBufferPool,
+ raidPtr);
+ if (rc) {
+ RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ rf_ShutdownParityLoggingRegionBufferPool(raidPtr);
+ return (rc);
+ }
+ /* build pool of parity buffers */
+ parityBufferCapacity = maxRegionParityRange;
+ rc = rf_mutex_init(&raidPtr->parityBufferPool.mutex, "RF_PARITYLOGGING3");
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n",
+ __FILE__, __LINE__, rc);
+ return (rc);
+ }
+ rc = rf_cond_init(&raidPtr->parityBufferPool.cond);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n",
+ __FILE__, __LINE__, rc);
+ rf_mutex_destroy(&raidPtr->parityBufferPool.mutex);
+ return (ENOMEM);
+ }
+ raidPtr->parityBufferPool.bufferSize = parityBufferCapacity *
+ raidPtr->bytesPerSector;
+ printf("parityBufferPool.bufferSize %d\n",
+ raidPtr->parityBufferPool.bufferSize);
+
+ /* for now, only one region at a time may be reintegrated */
+ raidPtr->parityBufferPool.totalBuffers = 1;
+
+ raidPtr->parityBufferPool.availableBuffers =
+ raidPtr->parityBufferPool.totalBuffers;
+ raidPtr->parityBufferPool.availBuffersIndex = 0;
+ raidPtr->parityBufferPool.emptyBuffersIndex = 0;
+ printf("Allocating %d bytes for parityBufferPool of %d units\n",
+ (int) (raidPtr->parityBufferPool.totalBuffers *
+ sizeof(caddr_t)),
+ raidPtr->parityBufferPool.totalBuffers );
+ RF_Malloc(raidPtr->parityBufferPool.buffers,
+ raidPtr->parityBufferPool.totalBuffers * sizeof(caddr_t),
+ (caddr_t *));
+ if (raidPtr->parityBufferPool.buffers == NULL) {
+ rf_mutex_destroy(&raidPtr->parityBufferPool.mutex);
+ rf_cond_destroy(&raidPtr->parityBufferPool.cond);
+ return (ENOMEM);
+ }
+ for (i = 0; i < raidPtr->parityBufferPool.totalBuffers; i++) {
+ printf("Allocating %d bytes for parityBufferPool#%d\n",
+ (int) (raidPtr->parityBufferPool.bufferSize *
+ sizeof(char)),i);
+ RF_Malloc(raidPtr->parityBufferPool.buffers[i],
+ raidPtr->parityBufferPool.bufferSize * sizeof(char),
+ (caddr_t));
+ /* check the buffer just allocated, not the array pointer */
+ if (raidPtr->parityBufferPool.buffers[i] == NULL) {
+ rf_mutex_destroy(&raidPtr->parityBufferPool.mutex);
+ rf_cond_destroy(&raidPtr->parityBufferPool.cond);
+ /* free only the buffers already allocated (0..i-1),
+ * using this pool's sizes, not regionBufferPool's */
+ for (j = 0; j < i; j++) {
+ RF_Free(raidPtr->parityBufferPool.buffers[j],
+ raidPtr->parityBufferPool.bufferSize *
+ sizeof(char));
+ }
+ RF_Free(raidPtr->parityBufferPool.buffers,
+ raidPtr->parityBufferPool.totalBuffers *
+ sizeof(caddr_t));
+ return (ENOMEM);
+ }
+ printf("parityBufferPool.buffers[%d] = %lx\n", i,
+ (long) raidPtr->parityBufferPool.buffers[i]);
+ }
+ rc = rf_ShutdownCreate(listp,
+ rf_ShutdownParityLoggingParityBufferPool,
+ raidPtr);
+ if (rc) {
+ RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ rf_ShutdownParityLoggingParityBufferPool(raidPtr);
+ return (rc);
+ }
+ /* initialize parityLogDiskQueue */
+ rc = rf_create_managed_mutex(listp,
+ &raidPtr->parityLogDiskQueue.mutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n",
+ __FILE__, __LINE__, rc);
+ return (rc);
+ }
+ rc = rf_create_managed_cond(listp, &raidPtr->parityLogDiskQueue.cond);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n",
+ __FILE__, __LINE__, rc);
+ return (rc);
+ }
+ raidPtr->parityLogDiskQueue.flushQueue = NULL;
+ raidPtr->parityLogDiskQueue.reintQueue = NULL;
+ raidPtr->parityLogDiskQueue.bufHead = NULL;
+ raidPtr->parityLogDiskQueue.bufTail = NULL;
+ raidPtr->parityLogDiskQueue.reintHead = NULL;
+ raidPtr->parityLogDiskQueue.reintTail = NULL;
+ raidPtr->parityLogDiskQueue.logBlockHead = NULL;
+ raidPtr->parityLogDiskQueue.logBlockTail = NULL;
+ raidPtr->parityLogDiskQueue.reintBlockHead = NULL;
+ raidPtr->parityLogDiskQueue.reintBlockTail = NULL;
+ raidPtr->parityLogDiskQueue.freeDataList = NULL;
+ raidPtr->parityLogDiskQueue.freeCommonList = NULL;
+
+ rc = rf_ShutdownCreate(listp,
+ rf_ShutdownParityLoggingDiskQueue,
+ raidPtr);
+ if (rc) {
+ RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ return (rc);
+ }
+ for (i = 0; i < rf_numParityRegions; i++) {
+ rc = rf_mutex_init(&raidPtr->regionInfo[i].mutex, "RF_PARITYLOGGING3");
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ for (j = 0; j < i; j++)
+ FreeRegionInfo(raidPtr, j);
+ RF_Free(raidPtr->regionInfo,
+ (rf_numParityRegions *
+ sizeof(RF_RegionInfo_t)));
+ return (ENOMEM);
+ }
+ rc = rf_mutex_init(&raidPtr->regionInfo[i].reintMutex, "RF_PARITYLOGGING4");
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ rf_mutex_destroy(&raidPtr->regionInfo[i].mutex);
+ for (j = 0; j < i; j++)
+ FreeRegionInfo(raidPtr, j);
+ RF_Free(raidPtr->regionInfo,
+ (rf_numParityRegions *
+ sizeof(RF_RegionInfo_t)));
+ return (ENOMEM);
+ }
+ raidPtr->regionInfo[i].reintInProgress = RF_FALSE;
+ raidPtr->regionInfo[i].regionStartAddr =
+ raidPtr->regionLogCapacity * i;
+ raidPtr->regionInfo[i].parityStartAddr =
+ raidPtr->regionParityRange * i;
+ if (i < rf_numParityRegions - 1) {
+ raidPtr->regionInfo[i].capacity =
+ raidPtr->regionLogCapacity;
+ raidPtr->regionInfo[i].numSectorsParity =
+ raidPtr->regionParityRange;
+ } else {
+ raidPtr->regionInfo[i].capacity =
+ lastRegionCapacity;
+ raidPtr->regionInfo[i].numSectorsParity =
+ raidPtr->sectorsPerDisk -
+ raidPtr->regionParityRange * i;
+ if (raidPtr->regionInfo[i].numSectorsParity >
+ maxRegionParityRange)
+ maxRegionParityRange =
+ raidPtr->regionInfo[i].numSectorsParity;
+ }
+ raidPtr->regionInfo[i].diskCount = 0;
+ RF_ASSERT(raidPtr->regionInfo[i].capacity +
+ raidPtr->regionInfo[i].regionStartAddr <=
+ totalLogCapacity);
+ RF_ASSERT(raidPtr->regionInfo[i].parityStartAddr +
+ raidPtr->regionInfo[i].numSectorsParity <=
+ raidPtr->sectorsPerDisk);
+ printf("Allocating %d bytes for region %d\n",
+ (int) (raidPtr->regionInfo[i].capacity *
+ sizeof(RF_DiskMap_t)), i);
+ RF_Malloc(raidPtr->regionInfo[i].diskMap,
+ (raidPtr->regionInfo[i].capacity *
+ sizeof(RF_DiskMap_t)),
+ (RF_DiskMap_t *));
+ if (raidPtr->regionInfo[i].diskMap == NULL) {
+ rf_mutex_destroy(&raidPtr->regionInfo[i].mutex);
+ rf_mutex_destroy(&raidPtr->regionInfo[i].reintMutex);
+ for (j = 0; j < i; j++)
+ FreeRegionInfo(raidPtr, j);
+ RF_Free(raidPtr->regionInfo,
+ (rf_numParityRegions *
+ sizeof(RF_RegionInfo_t)));
+ return (ENOMEM);
+ }
+ raidPtr->regionInfo[i].loggingEnabled = RF_FALSE;
+ raidPtr->regionInfo[i].coreLog = NULL;
+ }
+ rc = rf_ShutdownCreate(listp,
+ rf_ShutdownParityLoggingRegionInfo,
+ raidPtr);
+ if (rc) {
+ RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ rf_ShutdownParityLoggingRegionInfo(raidPtr);
+ return (rc);
+ }
+ RF_ASSERT(raidPtr->parityLogDiskQueue.threadState == 0);
+ raidPtr->parityLogDiskQueue.threadState = RF_PLOG_CREATED;
+ rc = RF_CREATE_THREAD(raidPtr->pLogDiskThreadHandle,
+ rf_ParityLoggingDiskManager, raidPtr,"rf_log");
+ if (rc) {
+ raidPtr->parityLogDiskQueue.threadState = 0;
+ RF_ERRORMSG3("Unable to create parity logging disk thread file %s line %d rc=%d\n",
+ __FILE__, __LINE__, rc);
+ return (ENOMEM);
+ }
+ /* wait for thread to start */
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_RUNNING)) {
+ RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond,
+ raidPtr->parityLogDiskQueue.mutex);
+ }
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+
+ rc = rf_ShutdownCreate(listp, rf_ShutdownParityLogging, raidPtr);
+ if (rc) {
+ RF_ERRORMSG1("Got rc=%d adding parity logging shutdown event\n", rc);
+ rf_ShutdownParityLogging(raidPtr);
+ return (rc);
+ }
+ if (rf_parityLogDebug) {
+ printf(" size of disk log in sectors: %d\n",
+ (int) totalLogCapacity);
+ printf(" total number of parity regions is %d\n", (int) rf_numParityRegions);
+ printf(" nominal sectors of log per parity region is %d\n", (int) raidPtr->regionLogCapacity);
+ printf(" nominal region fragmentation is %d sectors\n", (int) fragmentation);
+ printf(" total number of parity logs is %d\n", raidPtr->numParityLogs);
+ printf(" parity log size is %d sectors\n", raidPtr->numSectorsPerLog);
+ printf(" total in-core log space is %d bytes\n", (int) rf_totalInCoreLogCapacity);
+ }
+ rf_EnableParityLogging(raidPtr);
+
+ return (0);
+}
+
+/*
+ * Release the per-region state for regionID: free the disk map, and
+ * hand any core log still attached to the region back to the pool via
+ * rf_ReleaseParityLogs() (only when rf_forceParityLogReint is clear;
+ * otherwise the region is asserted to be already drained).  Finally
+ * destroy the region's two mutexes.
+ */
+static void
+FreeRegionInfo(
+ RF_Raid_t * raidPtr,
+ RF_RegionId_t regionID)
+{
+ RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
+ RF_Free(raidPtr->regionInfo[regionID].diskMap,
+ (raidPtr->regionInfo[regionID].capacity *
+ sizeof(RF_DiskMap_t)));
+ if (!rf_forceParityLogReint && raidPtr->regionInfo[regionID].coreLog) {
+ rf_ReleaseParityLogs(raidPtr,
+ raidPtr->regionInfo[regionID].coreLog);
+ raidPtr->regionInfo[regionID].coreLog = NULL;
+ } else {
+ /* forced reintegration should have left no core log and
+ * no outstanding disk log entries */
+ RF_ASSERT(raidPtr->regionInfo[regionID].coreLog == NULL);
+ RF_ASSERT(raidPtr->regionInfo[regionID].diskCount == 0);
+ }
+ RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
+ rf_mutex_destroy(&raidPtr->regionInfo[regionID].mutex);
+ rf_mutex_destroy(&raidPtr->regionInfo[regionID].reintMutex);
+}
+
+
+/*
+ * Walk the queue's linked list of parity logs, freeing each log's
+ * record array and then the log structure itself, then destroy the
+ * queue mutex.  The shared per-log data buffers are NOT freed here;
+ * they live in raidPtr->parityLogBufferHeap (freed by the caller).
+ */
+static void
+FreeParityLogQueue(
+ RF_Raid_t * raidPtr,
+ RF_ParityLogQueue_t * queue)
+{
+ RF_ParityLog_t *l1, *l2;
+
+ RF_LOCK_MUTEX(queue->mutex);
+ l1 = queue->parityLogs;
+ while (l1) {
+ l2 = l1;
+ l1 = l2->next;
+ RF_Free(l2->records, (raidPtr->numSectorsPerLog *
+ sizeof(RF_ParityLogRecord_t)));
+ RF_Free(l2, sizeof(RF_ParityLog_t));
+ }
+ RF_UNLOCK_MUTEX(queue->mutex);
+ rf_mutex_destroy(&queue->mutex);
+}
+
+
+/*
+ * Free all buffers in a region/parity buffer queue and the buffer
+ * pointer array, then destroy the queue mutex.  Asserts that every
+ * buffer has been returned (availableBuffers == totalBuffers) before
+ * tearing the queue down.
+ */
+static void
+FreeRegionBufferQueue(RF_RegionBufferQueue_t * queue)
+{
+ int i;
+
+ RF_LOCK_MUTEX(queue->mutex);
+ if (queue->availableBuffers != queue->totalBuffers) {
+ printf("Attempt to free region queue which is still in use!\n");
+ RF_ASSERT(0);
+ }
+ for (i = 0; i < queue->totalBuffers; i++)
+ RF_Free(queue->buffers[i], queue->bufferSize);
+ RF_Free(queue->buffers, queue->totalBuffers * sizeof(caddr_t));
+ RF_UNLOCK_MUTEX(queue->mutex);
+ rf_mutex_destroy(&queue->mutex);
+}
+
+/*
+ * Shutdown-list hook: free the per-region info structs and then the
+ * regionInfo array itself.  The array was allocated in
+ * rf_ConfigureParityLogging() as rf_numParityRegions *
+ * sizeof(RF_RegionInfo_t); the free below uses the same size (the
+ * original passed sizeof(raidPtr->regionInfo), the size of a pointer,
+ * which mismatched RF_Free's accounting of the allocation).
+ */
+static void
+rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg)
+{
+ RF_Raid_t *raidPtr;
+ RF_RegionId_t i;
+
+ raidPtr = (RF_Raid_t *) arg;
+ if (rf_parityLogDebug) {
+ printf("raid%d: ShutdownParityLoggingRegionInfo\n",
+ raidPtr->raidid);
+ }
+ /* free region information structs */
+ for (i = 0; i < rf_numParityRegions; i++)
+ FreeRegionInfo(raidPtr, i);
+ RF_Free(raidPtr->regionInfo, (rf_numParityRegions *
+ sizeof(RF_RegionInfo_t)));
+ raidPtr->regionInfo = NULL;
+}
+
+/*
+ * Shutdown-list hook: release the pool of in-core parity logs (record
+ * arrays and log structures) and the single contiguous heap that backs
+ * all of the log data buffers.
+ */
+static void
+rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg)
+{
+ RF_Raid_t *raidPtr;
+
+ raidPtr = (RF_Raid_t *) arg;
+ if (rf_parityLogDebug) {
+ printf("raid%d: ShutdownParityLoggingPool\n", raidPtr->raidid);
+ }
+ /* free contents of parityLogPool */
+ FreeParityLogQueue(raidPtr, &raidPtr->parityLogPool);
+ RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs *
+ raidPtr->numSectorsPerLog * raidPtr->bytesPerSector);
+}
+
+/*
+ * Shutdown-list hook: free the region (reintegration staging) buffer
+ * pool via FreeRegionBufferQueue().
+ */
+static void
+rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg)
+{
+ RF_Raid_t *raidPtr;
+
+ raidPtr = (RF_Raid_t *) arg;
+ if (rf_parityLogDebug) {
+ printf("raid%d: ShutdownParityLoggingRegionBufferPool\n",
+ raidPtr->raidid);
+ }
+ FreeRegionBufferQueue(&raidPtr->regionBufferPool);
+}
+
+/*
+ * Shutdown-list hook: free the parity staging buffer pool; it shares
+ * the RF_RegionBufferQueue_t representation, so the same helper is
+ * used as for the region buffer pool.
+ */
+static void
+rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg)
+{
+ RF_Raid_t *raidPtr;
+
+ raidPtr = (RF_Raid_t *) arg;
+ if (rf_parityLogDebug) {
+ printf("raid%d: ShutdownParityLoggingParityBufferPool\n",
+ raidPtr->raidid);
+ }
+ FreeRegionBufferQueue(&raidPtr->parityBufferPool);
+}
+
+/*
+ * Shutdown-list hook: tear down the parity log disk queue free lists.
+ * Asserts that the active work queues (bufHead/Tail, reintHead/Tail)
+ * are empty — the disk manager thread must have terminated first —
+ * then frees every entry on the freeDataList and freeCommonList,
+ * destroying each common-data mutex before freeing it.
+ */
+static void
+rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg)
+{
+ RF_ParityLogData_t *d;
+ RF_CommonLogData_t *c;
+ RF_Raid_t *raidPtr;
+
+ raidPtr = (RF_Raid_t *) arg;
+ if (rf_parityLogDebug) {
+ printf("raid%d: ShutdownParityLoggingDiskQueue\n",
+ raidPtr->raidid);
+ }
+ /* free disk manager stuff */
+ RF_ASSERT(raidPtr->parityLogDiskQueue.bufHead == NULL);
+ RF_ASSERT(raidPtr->parityLogDiskQueue.bufTail == NULL);
+ RF_ASSERT(raidPtr->parityLogDiskQueue.reintHead == NULL);
+ RF_ASSERT(raidPtr->parityLogDiskQueue.reintTail == NULL);
+ while (raidPtr->parityLogDiskQueue.freeDataList) {
+ d = raidPtr->parityLogDiskQueue.freeDataList;
+ raidPtr->parityLogDiskQueue.freeDataList =
+ raidPtr->parityLogDiskQueue.freeDataList->next;
+ RF_Free(d, sizeof(RF_ParityLogData_t));
+ }
+ while (raidPtr->parityLogDiskQueue.freeCommonList) {
+ c = raidPtr->parityLogDiskQueue.freeCommonList;
+ rf_mutex_destroy(&c->mutex);
+ raidPtr->parityLogDiskQueue.freeCommonList =
+ raidPtr->parityLogDiskQueue.freeCommonList->next;
+ RF_Free(c, sizeof(RF_CommonLogData_t));
+ }
+}
+
+/*
+ * Shutdown-list hook: stop the parity logging disk manager thread.
+ * Sets RF_PLOG_TERMINATE under the queue mutex, signals the thread,
+ * then blocks on the queue condition variable until the thread
+ * acknowledges with RF_PLOG_SHUTDOWN.
+ */
+static void
+rf_ShutdownParityLogging(RF_ThreadArg_t arg)
+{
+ RF_Raid_t *raidPtr;
+
+ raidPtr = (RF_Raid_t *) arg;
+ if (rf_parityLogDebug) {
+ printf("raid%d: ShutdownParityLogging\n", raidPtr->raidid);
+ }
+ /* shutdown disk thread */
+ /* This has the desirable side-effect of forcing all regions to be
+ * reintegrated. This is necessary since all parity log maps are
+ * currently held in volatile memory. */
+
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ raidPtr->parityLogDiskQueue.threadState |= RF_PLOG_TERMINATE;
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond);
+ /*
+ * pLogDiskThread will now terminate when queues are cleared
+ * now wait for it to be done
+ */
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_SHUTDOWN)) {
+ RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond,
+ raidPtr->parityLogDiskQueue.mutex);
+ }
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ if (rf_parityLogDebug) {
+ printf("raid%d: ShutdownParityLogging done (thread completed)\n", raidPtr->raidid);
+ }
+}
+
+/* layout hook: default number of floating reconstruction buffers for
+ * the parity logging layout (fixed at 20) */
+int
+rf_GetDefaultNumFloatingReconBuffersParityLogging(RF_Raid_t * raidPtr)
+{
+ return (20);
+}
+
+/* layout hook: default head-separation limit for the parity logging
+ * layout (fixed at 10) */
+RF_HeadSepLimit_t
+rf_GetDefaultHeadSepLimitParityLogging(RF_Raid_t * raidPtr)
+{
+ return (10);
+}
+/* return the region ID for a given RAID address */
+RF_RegionId_t
+rf_MapRegionIDParityLogging(
+ RF_Raid_t * raidPtr,
+ RF_SectorNum_t address)
+{
+ RF_RegionId_t regionID;
+
+/* regionID = address / (raidPtr->regionParityRange * raidPtr->Layout.numDataCol); */
+ regionID = address / raidPtr->regionParityRange;
+ if (regionID == rf_numParityRegions) {
+ /* last region may be larger than other regions */
+ regionID--;
+ }
+ /* sanity: the address must fall inside the parity range covered by
+ * the region we computed */
+ RF_ASSERT(address >= raidPtr->regionInfo[regionID].parityStartAddr);
+ RF_ASSERT(address < raidPtr->regionInfo[regionID].parityStartAddr +
+ raidPtr->regionInfo[regionID].numSectorsParity);
+ RF_ASSERT(regionID < rf_numParityRegions);
+ return (regionID);
+}
+
+
+/* given a logical RAID sector, determine physical disk address of data */
+void
+rf_MapSectorParityLogging(
+ RF_Raid_t * raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t * row,
+ RF_RowCol_t * col,
+ RF_SectorNum_t * diskSector,
+ int remap)
+{
+ RF_StripeNum_t SUID = raidSector /
+ raidPtr->Layout.sectorsPerStripeUnit;
+ *row = 0;
+ /* *col = (SUID % (raidPtr->numCol -
+ * raidPtr->Layout.numParityLogCol)); */
+ *col = SUID % raidPtr->Layout.numDataCol;
+ *diskSector = (SUID / (raidPtr->Layout.numDataCol)) *
+ raidPtr->Layout.sectorsPerStripeUnit +
+ (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+}
+
+
+/* given a logical RAID sector, determine physical disk address of parity */
+void
+rf_MapParityParityLogging(
+ RF_Raid_t * raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t * row,
+ RF_RowCol_t * col,
+ RF_SectorNum_t * diskSector,
+ int remap)
+{
+ RF_StripeNum_t SUID = raidSector /
+ raidPtr->Layout.sectorsPerStripeUnit;
+
+ *row = 0;
+ /* *col =
+ * raidPtr->Layout.numDataCol-(SUID/raidPtr->Layout.numDataCol)%(raidPt
+ * r->numCol - raidPtr->Layout.numParityLogCol); */
+ *col = raidPtr->Layout.numDataCol;
+ *diskSector = (SUID / (raidPtr->Layout.numDataCol)) *
+ raidPtr->Layout.sectorsPerStripeUnit +
+ (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+}
+
+
+/* given a regionID and sector offset, determine the physical disk address of the parity log */
+void
+rf_MapLogParityLogging(
+ RF_Raid_t * raidPtr,
+ RF_RegionId_t regionID,
+ RF_SectorNum_t regionOffset,
+ RF_RowCol_t * row,
+ RF_RowCol_t * col,
+ RF_SectorNum_t * startSector)
+{
+ *row = 0;
+ *col = raidPtr->numCol - 1;
+ *startSector = raidPtr->regionInfo[regionID].regionStartAddr + regionOffset;
+}
+
+
+/* given a regionID, determine the physical disk address of the logged
+ parity for that region */
+void
+rf_MapRegionParity(
+ RF_Raid_t * raidPtr,
+ RF_RegionId_t regionID,
+ RF_RowCol_t * row,
+ RF_RowCol_t * col,
+ RF_SectorNum_t * startSector,
+ RF_SectorCount_t * numSector)
+{
+ *row = 0;
+ *col = raidPtr->numCol - 2;
+ *startSector = raidPtr->regionInfo[regionID].parityStartAddr;
+ *numSector = raidPtr->regionInfo[regionID].numSectorsParity;
+}
+
+
+/* given a logical RAID address, determine the participating disks in
+ the stripe */
+void
+rf_IdentifyStripeParityLogging(
+ RF_Raid_t * raidPtr,
+ RF_RaidAddr_t addr,
+ RF_RowCol_t ** diskids,
+ RF_RowCol_t * outRow)
+{
+ RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout,
+ addr);
+ RF_ParityLoggingConfigInfo_t *info = (RF_ParityLoggingConfigInfo_t *)
+ raidPtr->Layout.layoutSpecificInfo;
+ *outRow = 0;
+ *diskids = info->stripeIdentifier[stripeID % raidPtr->numCol];
+}
+
+
+/* map a stripe ID to a parity stripe ID: identity mapping, and the
+ reconstruction unit number is always 0 */
+void
+rf_MapSIDToPSIDParityLogging(
+ RF_RaidLayout_t * layoutPtr,
+ RF_StripeNum_t stripeID,
+ RF_StripeNum_t * psID,
+ RF_ReconUnitNum_t * which_ru)
+{
+ *which_ru = 0;
+ *psID = stripeID;
+}
+
+
+/* select an algorithm for performing an access. Returns two pointers,
+ * one to a function that will return information about the DAG, and
+ * another to a function that will create the dag.
+ *
+ * On multiple failures *createFunc is set to NULL.  On a single
+ * failure that has already been reconstructed, the failed PDA is
+ * redirected to spare space (distributed or dedicated) and the
+ * failure counts are cleared before the DAG creator is chosen.
+ */
+void
+rf_ParityLoggingDagSelect(
+ RF_Raid_t * raidPtr,
+ RF_IoType_t type,
+ RF_AccessStripeMap_t * asmp,
+ RF_VoidFuncPtr * createFunc)
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ RF_PhysDiskAddr_t *failedPDA = NULL;
+ RF_RowCol_t frow, fcol;
+ RF_RowStatus_t rstat;
+ int prior_recon;
+
+ RF_ASSERT(RF_IO_IS_R_OR_W(type));
+
+ if (asmp->numDataFailed + asmp->numParityFailed > 1) {
+ RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n");
+ /* *infoFunc = */ *createFunc = NULL;
+ return;
+ } else
+ if (asmp->numDataFailed + asmp->numParityFailed == 1) {
+
+ /* if under recon & already reconstructed, redirect
+ * the access to the spare drive and eliminate the
+ * failure indication */
+ failedPDA = asmp->failedPDAs[0];
+ frow = failedPDA->row;
+ fcol = failedPDA->col;
+ rstat = raidPtr->status[failedPDA->row];
+ prior_recon = (rstat == rf_rs_reconfigured) || (
+ (rstat == rf_rs_reconstructing) ?
+ rf_CheckRUReconstructed(raidPtr->reconControl[frow]->reconMap, failedPDA->startSector) : 0
+ );
+ if (prior_recon) {
+ /* remember the original (failed) location for the
+ * debug message printed below */
+ RF_RowCol_t or = failedPDA->row, oc = failedPDA->col;
+ RF_SectorNum_t oo = failedPDA->startSector;
+ if (layoutPtr->map->flags &
+ RF_DISTRIBUTE_SPARE) {
+ /* redirect to dist spare space */
+
+ if (failedPDA == asmp->parityInfo) {
+
+ /* parity has failed */
+ (layoutPtr->map->MapParity) (raidPtr, failedPDA->raidAddress, &failedPDA->row,
+ &failedPDA->col, &failedPDA->startSector, RF_REMAP);
+
+ if (asmp->parityInfo->next) { /* redir 2nd component,
+ * if any */
+ RF_PhysDiskAddr_t *p = asmp->parityInfo->next;
+ RF_SectorNum_t SUoffs = p->startSector % layoutPtr->sectorsPerStripeUnit;
+ p->row = failedPDA->row;
+ p->col = failedPDA->col;
+ p->startSector = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, failedPDA->startSector) +
+ SUoffs; /* cheating:
+ * startSector is not
+ * really a RAID address */
+ }
+ } else
+ if (asmp->parityInfo->next && failedPDA == asmp->parityInfo->next) {
+ RF_ASSERT(0); /* should not ever
+ * happen */
+ } else {
+
+ /* data has failed */
+ (layoutPtr->map->MapSector) (raidPtr, failedPDA->raidAddress, &failedPDA->row,
+ &failedPDA->col, &failedPDA->startSector, RF_REMAP);
+
+ }
+
+ } else {
+ /* redirect to dedicated spare space */
+
+ failedPDA->row = raidPtr->Disks[frow][fcol].spareRow;
+ failedPDA->col = raidPtr->Disks[frow][fcol].spareCol;
+
+ /* the parity may have two distinct
+ * components, both of which may need
+ * to be redirected */
+ if (asmp->parityInfo->next) {
+ if (failedPDA == asmp->parityInfo) {
+ failedPDA->next->row = failedPDA->row;
+ failedPDA->next->col = failedPDA->col;
+ } else
+ if (failedPDA == asmp->parityInfo->next) { /* paranoid: should never occur */
+ asmp->parityInfo->row = failedPDA->row;
+ asmp->parityInfo->col = failedPDA->col;
+ }
+ }
+ }
+
+ RF_ASSERT(failedPDA->col != -1);
+
+ if (rf_dagDebug || rf_mapDebug) {
+ printf("raid%d: Redirected type '%c' r %d c %d o %ld -> r %d c %d o %ld\n",
+ raidPtr->raidid, type, or, oc, (long) oo, failedPDA->row, failedPDA->col, (long) failedPDA->startSector);
+ }
+ asmp->numDataFailed = asmp->numParityFailed = 0;
+ }
+ }
+ /* now choose the DAG creator based on access type and remaining
+ * failure state */
+ if (type == RF_IO_TYPE_READ) {
+
+ if (asmp->numDataFailed == 0)
+ *createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG;
+ else
+ *createFunc = (RF_VoidFuncPtr) rf_CreateRaidFiveDegradedReadDAG;
+
+ } else {
+
+
+ /* if mirroring, always use large writes. If the access
+ * requires two distinct parity updates, always do a small
+ * write. If the stripe contains a failure but the access
+ * does not, do a small write. The first conditional
+ * (numStripeUnitsAccessed <= numDataCol/2) uses a
+ * less-than-or-equal rather than just a less-than because
+ * when G is 3 or 4, numDataCol/2 is 1, and I want
+ * single-stripe-unit updates to use just one disk. */
+ if ((asmp->numDataFailed + asmp->numParityFailed) == 0) {
+ if (((asmp->numStripeUnitsAccessed <=
+ (layoutPtr->numDataCol / 2)) &&
+ (layoutPtr->numDataCol != 1)) ||
+ (asmp->parityInfo->next != NULL) ||
+ rf_CheckStripeForFailures(raidPtr, asmp)) {
+ *createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingSmallWriteDAG;
+ } else
+ *createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingLargeWriteDAG;
+ } else
+ if (asmp->numParityFailed == 1)
+ *createFunc = (RF_VoidFuncPtr) rf_CreateNonRedundantWriteDAG;
+ else
+ if (asmp->numStripeUnitsAccessed != 1 && failedPDA->numSector != layoutPtr->sectorsPerStripeUnit)
+ *createFunc = NULL;
+ else
+ *createFunc = (RF_VoidFuncPtr) rf_CreateDegradedWriteDAG;
+ }
+}
+#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
diff --git a/sys/dev/raidframe/rf_paritylogging.h b/sys/dev/raidframe/rf_paritylogging.h
new file mode 100644
index 0000000..5b7dd25
--- /dev/null
+++ b/sys/dev/raidframe/rf_paritylogging.h
@@ -0,0 +1,70 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_paritylogging.h,v 1.3 1999/02/05 00:06:14 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* header file for Parity Logging */
+
+#ifndef _RF__RF_PARITYLOGGING_H_
+#define _RF__RF_PARITYLOGGING_H_
+
+int
+rf_ConfigureParityLogging(RF_ShutdownList_t ** listp, RF_Raid_t * raidPtr,
+ RF_Config_t * cfgPtr);
+int rf_GetDefaultNumFloatingReconBuffersParityLogging(RF_Raid_t * raidPtr);
+RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitParityLogging(RF_Raid_t * raidPtr);
+RF_RegionId_t
+rf_MapRegionIDParityLogging(RF_Raid_t * raidPtr,
+ RF_SectorNum_t address);
+void
+rf_MapSectorParityLogging(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector,
+ int remap);
+void
+rf_MapParityParityLogging(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector,
+ int remap);
+void
+rf_MapLogParityLogging(RF_Raid_t * raidPtr, RF_RegionId_t regionID,
+ RF_SectorNum_t regionOffset, RF_RowCol_t * row, RF_RowCol_t * col,
+ RF_SectorNum_t * startSector);
+void
+rf_MapRegionParity(RF_Raid_t * raidPtr, RF_RegionId_t regionID,
+ RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * startSector,
+ RF_SectorCount_t * numSector);
+void
+rf_IdentifyStripeParityLogging(RF_Raid_t * raidPtr, RF_RaidAddr_t addr,
+ RF_RowCol_t ** diskids, RF_RowCol_t * outRow);
+void
+rf_MapSIDToPSIDParityLogging(RF_RaidLayout_t * layoutPtr,
+ RF_StripeNum_t stripeID, RF_StripeNum_t * psID,
+ RF_ReconUnitNum_t * which_ru);
+void
+rf_ParityLoggingDagSelect(RF_Raid_t * raidPtr, RF_IoType_t type,
+ RF_AccessStripeMap_t * asmap, RF_VoidFuncPtr * createFunc);
+
+#endif /* !_RF__RF_PARITYLOGGING_H_ */
diff --git a/sys/dev/raidframe/rf_parityloggingdags.c b/sys/dev/raidframe/rf_parityloggingdags.c
new file mode 100644
index 0000000..7ccef55
--- /dev/null
+++ b/sys/dev/raidframe/rf_parityloggingdags.c
@@ -0,0 +1,673 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_parityloggingdags.c,v 1.4 2000/01/07 03:41:04 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#include <dev/raidframe/rf_archs.h>
+
+#if RF_INCLUDE_PARITYLOGGING > 0
+
+/*
+ DAGs specific to parity logging are created here
+ */
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_dagutils.h>
+#include <dev/raidframe/rf_dagfuncs.h>
+#include <dev/raidframe/rf_debugMem.h>
+#include <dev/raidframe/rf_paritylog.h>
+#include <dev/raidframe/rf_memchunk.h>
+#include <dev/raidframe/rf_general.h>
+
+#include <dev/raidframe/rf_parityloggingdags.h>
+
+/******************************************************************************
+ *
+ * creates a DAG to perform a large-write operation:
+ *
+ * / Rod \ / Wnd \
+ * H -- NIL- Rod - NIL - Wnd ------ NIL - T
+ * \ Rod / \ Xor - Lpo /
+ *
+ * The writes are not done until the reads complete because if they were done in
+ * parallel, a failure on one of the reads could leave the parity in an inconsistent
+ * state, so that the retry with a new DAG would produce erroneous parity.
+ *
+ * Note: this DAG has the nasty property that none of the buffers allocated for reading
+ * old data can be freed until the XOR node fires. Need to fix this.
+ *
+ * The last two arguments are the number of faults tolerated, and function for the
+ * redundancy calculation. The undo for the redundancy calc is assumed to be null
+ *
+ *****************************************************************************/
+
+/* builds the large-write DAG sketched in the banner above:
+ * H -> Nil -> (Rod...) -> Nil -> (Wnd..., Xor -> Lpo) -> Nil -> Trm.
+ * NB: the bp and flags parameters are part of the common DAG-creation
+ * signature but are not referenced in this routine. */
+void
+rf_CommonCreateParityLoggingLargeWriteDAG(
+ RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList,
+ int nfaults,
+ int (*redFunc) (RF_DagNode_t *))
+{
+ RF_DagNode_t *nodes, *wndNodes, *rodNodes = NULL, *syncNode, *xorNode,
+ *lpoNode, *blockNode, *unblockNode, *termNode;
+ int nWndNodes, nRodNodes, i;
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ RF_AccessStripeMapHeader_t *new_asm_h[2];
+ int nodeNum, asmNum;
+ RF_ReconUnitNum_t which_ru;
+ char *sosBuffer, *eosBuffer;
+ RF_PhysDiskAddr_t *pda;
+ RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru);
+
+ if (rf_dagDebug)
+ printf("[Creating parity-logging large-write DAG]\n");
+ RF_ASSERT(nfaults == 1);/* this arch only single fault tolerant */
+ dag_h->creator = "ParityLoggingLargeWriteDAG";
+
+ /* alloc the Wnd nodes, the xor node, and the Lpo node */
+ nWndNodes = asmap->numStripeUnitsAccessed;
+ RF_CallocAndAdd(nodes, nWndNodes + 6, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
+ i = 0;
+ wndNodes = &nodes[i];
+ i += nWndNodes;
+ xorNode = &nodes[i];
+ i += 1;
+ lpoNode = &nodes[i];
+ i += 1;
+ blockNode = &nodes[i];
+ i += 1;
+ syncNode = &nodes[i];
+ i += 1;
+ unblockNode = &nodes[i];
+ i += 1;
+ termNode = &nodes[i];
+ i += 1;
+
+ /* commit nodes: all data writes plus the xor */
+ dag_h->numCommitNodes = nWndNodes + 1;
+ dag_h->numCommits = 0;
+ dag_h->numSuccedents = 1;
+
+ rf_MapUnaccessedPortionOfStripe(raidPtr, layoutPtr, asmap, dag_h, new_asm_h, &nRodNodes, &sosBuffer, &eosBuffer, allocList);
+ if (nRodNodes > 0)
+ RF_CallocAndAdd(rodNodes, nRodNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
+
+ /* begin node initialization */
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nRodNodes + 1, 0, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nWndNodes + 1, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(syncNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nWndNodes + 1, nRodNodes + 1, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
+
+ /* initialize the Rod nodes: one per PDA of the unaccessed portion
+ * of the stripe (at most two sub-maps: before and after the access) */
+ for (nodeNum = asmNum = 0; asmNum < 2; asmNum++) {
+ if (new_asm_h[asmNum]) {
+ pda = new_asm_h[asmNum]->stripeMap->physInfo;
+ while (pda) {
+ rf_InitNode(&rodNodes[nodeNum], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rod", allocList);
+ rodNodes[nodeNum].params[0].p = pda;
+ rodNodes[nodeNum].params[1].p = pda->bufPtr;
+ rodNodes[nodeNum].params[2].v = parityStripeID;
+ rodNodes[nodeNum].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ nodeNum++;
+ pda = pda->next;
+ }
+ }
+ }
+ RF_ASSERT(nodeNum == nRodNodes);
+
+ /* initialize the wnd nodes */
+ pda = asmap->physInfo;
+ for (i = 0; i < nWndNodes; i++) {
+ rf_InitNode(&wndNodes[i], rf_wait, RF_TRUE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
+ RF_ASSERT(pda != NULL);
+ wndNodes[i].params[0].p = pda;
+ wndNodes[i].params[1].p = pda->bufPtr;
+ wndNodes[i].params[2].v = parityStripeID;
+ wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ pda = pda->next;
+ }
+
+ /* initialize the redundancy node */
+ rf_InitNode(xorNode, rf_wait, RF_TRUE, redFunc, rf_NullNodeUndoFunc, NULL, 1, 1, 2 * (nWndNodes + nRodNodes) + 1, 1, dag_h, "Xr ", allocList);
+ xorNode->flags |= RF_DAGNODE_FLAG_YIELD;
+ /* xor params are {pda, buf} pairs from every Wnd and Rod node,
+ * followed by the raidPtr */
+ for (i = 0; i < nWndNodes; i++) {
+ xorNode->params[2 * i + 0] = wndNodes[i].params[0]; /* pda */
+ xorNode->params[2 * i + 1] = wndNodes[i].params[1]; /* buf ptr */
+ }
+ for (i = 0; i < nRodNodes; i++) {
+ xorNode->params[2 * (nWndNodes + i) + 0] = rodNodes[i].params[0]; /* pda */
+ xorNode->params[2 * (nWndNodes + i) + 1] = rodNodes[i].params[1]; /* buf ptr */
+ }
+ xorNode->params[2 * (nWndNodes + nRodNodes)].p = raidPtr; /* xor node needs to get
+ * at RAID information */
+
+ /* look for an Rod node that reads a complete SU. If none, alloc a
+ * buffer to receive the parity info. Note that we can't use a new
+ * data buffer because it will not have gotten written when the xor
+ * occurs. */
+ for (i = 0; i < nRodNodes; i++)
+ if (((RF_PhysDiskAddr_t *) rodNodes[i].params[0].p)->numSector == raidPtr->Layout.sectorsPerStripeUnit)
+ break;
+ if (i == nRodNodes) {
+ RF_CallocAndAdd(xorNode->results[0], 1, rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit), (void *), allocList);
+ } else {
+ xorNode->results[0] = rodNodes[i].params[1].p;
+ }
+
+ /* initialize the Lpo node */
+ rf_InitNode(lpoNode, rf_wait, RF_FALSE, rf_ParityLogOverwriteFunc, rf_ParityLogOverwriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Lpo", allocList);
+
+ lpoNode->params[0].p = asmap->parityInfo;
+ lpoNode->params[1].p = xorNode->results[0];
+ RF_ASSERT(asmap->parityInfo->next == NULL); /* parityInfo must
+ * describe entire
+ * parity unit */
+
+ /* connect nodes to form graph */
+
+ /* connect dag header to block node */
+ RF_ASSERT(dag_h->numSuccedents == 1);
+ RF_ASSERT(blockNode->numAntecedents == 0);
+ dag_h->succedents[0] = blockNode;
+
+ /* connect the block node to the Rod nodes */
+ RF_ASSERT(blockNode->numSuccedents == nRodNodes + 1);
+ for (i = 0; i < nRodNodes; i++) {
+ RF_ASSERT(rodNodes[i].numAntecedents == 1);
+ blockNode->succedents[i] = &rodNodes[i];
+ rodNodes[i].antecedents[0] = blockNode;
+ rodNodes[i].antType[0] = rf_control;
+ }
+
+ /* connect the block node to the sync node */
+ /* necessary if nRodNodes == 0 */
+ RF_ASSERT(syncNode->numAntecedents == nRodNodes + 1);
+ blockNode->succedents[nRodNodes] = syncNode;
+ syncNode->antecedents[0] = blockNode;
+ syncNode->antType[0] = rf_control;
+
+ /* connect the Rod nodes to the syncNode */
+ for (i = 0; i < nRodNodes; i++) {
+ rodNodes[i].succedents[0] = syncNode;
+ syncNode->antecedents[1 + i] = &rodNodes[i];
+ syncNode->antType[1 + i] = rf_control;
+ }
+
+ /* connect the sync node to the xor node */
+ RF_ASSERT(syncNode->numSuccedents == nWndNodes + 1);
+ RF_ASSERT(xorNode->numAntecedents == 1);
+ syncNode->succedents[0] = xorNode;
+ xorNode->antecedents[0] = syncNode;
+ xorNode->antType[0] = rf_trueData; /* carry forward from sync */
+
+ /* connect the sync node to the Wnd nodes */
+ for (i = 0; i < nWndNodes; i++) {
+ RF_ASSERT(wndNodes->numAntecedents == 1);
+ syncNode->succedents[1 + i] = &wndNodes[i];
+ wndNodes[i].antecedents[0] = syncNode;
+ wndNodes[i].antType[0] = rf_control;
+ }
+
+ /* connect the xor node to the Lpo node */
+ RF_ASSERT(xorNode->numSuccedents == 1);
+ RF_ASSERT(lpoNode->numAntecedents == 1);
+ xorNode->succedents[0] = lpoNode;
+ lpoNode->antecedents[0] = xorNode;
+ lpoNode->antType[0] = rf_trueData;
+
+ /* connect the Wnd nodes to the unblock node */
+ RF_ASSERT(unblockNode->numAntecedents == nWndNodes + 1);
+ for (i = 0; i < nWndNodes; i++) {
+ RF_ASSERT(wndNodes->numSuccedents == 1);
+ wndNodes[i].succedents[0] = unblockNode;
+ unblockNode->antecedents[i] = &wndNodes[i];
+ unblockNode->antType[i] = rf_control;
+ }
+
+ /* connect the Lpo node to the unblock node */
+ RF_ASSERT(lpoNode->numSuccedents == 1);
+ lpoNode->succedents[0] = unblockNode;
+ unblockNode->antecedents[nWndNodes] = lpoNode;
+ unblockNode->antType[nWndNodes] = rf_control;
+
+ /* connect unblock node to terminator */
+ RF_ASSERT(unblockNode->numSuccedents == 1);
+ RF_ASSERT(termNode->numAntecedents == 1);
+ RF_ASSERT(termNode->numSuccedents == 0);
+ unblockNode->succedents[0] = termNode;
+ termNode->antecedents[0] = unblockNode;
+ termNode->antType[0] = rf_control;
+}
+
+
+
+
+/******************************************************************************
+ *
+ * creates a DAG to perform a small-write operation (either raid 5 or pq), which is as follows:
+ *
+ * Header
+ * |
+ * Block
+ * / | ... \ \
+ * / | \ \
+ * Rod Rod Rod Rop
+ * | \ /| \ / | \/ |
+ * | | | /\ |
+ * Wnd Wnd Wnd X
+ * | \ / |
+ * | \ / |
+ * \ \ / Lpo
+ * \ \ / /
+ * +-> Unblock <-+
+ * |
+ * T
+ *
+ *
+ * R = Read, W = Write, X = Xor, o = old, n = new, d = data, p = parity.
+ * When the access spans a stripe unit boundary and is less than one SU in size, there will
+ * be two Rop -- X -- Wnp branches. I call this the "double-XOR" case.
+ * The second output from each Rod node goes to the X node. In the double-XOR
+ * case, there are exactly 2 Rod nodes, and each sends one output to one X node.
+ * There is one Rod -- Wnd -- T branch for each stripe unit being updated.
+ *
+ * The block and unblock nodes are unused. See comment above CreateFaultFreeReadDAG.
+ *
+ * Note: this DAG ignores all the optimizations related to making the RMWs atomic.
+ * it also has the nasty property that none of the buffers allocated for reading
+ * old data & parity can be freed until the XOR node fires. Need to fix this.
+ *
+ * A null qfuncs indicates single fault tolerant
+ *****************************************************************************/
+
+/* builds the small-write (read-modify-write) DAG sketched in the
+ * banner above.  Reads old data and old parity, writes new data,
+ * XORs old data/old parity/new data into new parity, and appends the
+ * new parity to the parity log (Lpu) instead of writing it in place.
+ * NB: the bp and flags parameters are part of the common DAG-creation
+ * signature but are not referenced in this routine. */
+void
+rf_CommonCreateParityLoggingSmallWriteDAG(
+ RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList,
+ RF_RedFuncs_t * pfuncs,
+ RF_RedFuncs_t * qfuncs)
+{
+ RF_DagNode_t *xorNodes, *blockNode, *unblockNode, *nodes;
+ RF_DagNode_t *readDataNodes, *readParityNodes;
+ RF_DagNode_t *writeDataNodes, *lpuNodes;
+ RF_DagNode_t *unlockDataNodes = NULL, *termNode;
+ RF_PhysDiskAddr_t *pda = asmap->physInfo;
+ int numDataNodes = asmap->numStripeUnitsAccessed;
+ int numParityNodes = (asmap->parityInfo->next) ? 2 : 1;
+ int i, j, nNodes, totalNumNodes;
+ RF_ReconUnitNum_t which_ru;
+ int (*func) (RF_DagNode_t * node), (*undoFunc) (RF_DagNode_t * node);
+ int (*qfunc) (RF_DagNode_t * node);
+ char *name, *qname;
+ RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru);
+ long nfaults = qfuncs ? 2 : 1;
+ int lu_flag = (rf_enableAtomicRMW) ? 1 : 0; /* lock/unlock flag */
+
+ if (rf_dagDebug)
+ printf("[Creating parity-logging small-write DAG]\n");
+ RF_ASSERT(numDataNodes > 0);
+ RF_ASSERT(nfaults == 1);
+ dag_h->creator = "ParityLoggingSmallWriteDAG";
+
+ /* DAG creation occurs in three steps: 1. count the number of nodes in
+ * the DAG 2. create the nodes 3. initialize the nodes 4. connect the
+ * nodes */
+
+ /* Step 1. compute number of nodes in the graph */
+
+ /* number of nodes: a read and write for each data unit a redundancy
+ * computation node for each parity node a read and Lpu for each
+ * parity unit a block and unblock node (2) a terminator node if
+ * atomic RMW an unlock node for each data unit, redundancy unit */
+ totalNumNodes = (2 * numDataNodes) + numParityNodes + (2 * numParityNodes) + 3;
+ if (lu_flag)
+ totalNumNodes += numDataNodes;
+
+ nNodes = numDataNodes + numParityNodes;
+
+ dag_h->numCommitNodes = numDataNodes + numParityNodes;
+ dag_h->numCommits = 0;
+ dag_h->numSuccedents = 1;
+
+ /* Step 2. create the nodes */
+ /* NOTE: the ordering below matters -- the single-xor case later
+ * relies on readParityNodes immediately following readDataNodes in
+ * this array (see the comment at the xor-parameter loop) */
+ RF_CallocAndAdd(nodes, totalNumNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
+ i = 0;
+ blockNode = &nodes[i];
+ i += 1;
+ unblockNode = &nodes[i];
+ i += 1;
+ readDataNodes = &nodes[i];
+ i += numDataNodes;
+ readParityNodes = &nodes[i];
+ i += numParityNodes;
+ writeDataNodes = &nodes[i];
+ i += numDataNodes;
+ lpuNodes = &nodes[i];
+ i += numParityNodes;
+ xorNodes = &nodes[i];
+ i += numParityNodes;
+ termNode = &nodes[i];
+ i += 1;
+ if (lu_flag) {
+ unlockDataNodes = &nodes[i];
+ i += numDataNodes;
+ }
+ RF_ASSERT(i == totalNumNodes);
+
+ /* Step 3. initialize the nodes */
+ /* initialize block node (Nil) */
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nNodes, 0, 0, 0, dag_h, "Nil", allocList);
+
+ /* initialize unblock node (Nil) */
+ rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nNodes, 0, 0, dag_h, "Nil", allocList);
+
+ /* initialize terminatory node (Trm) */
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
+
+ /* initialize nodes which read old data (Rod) */
+ for (i = 0; i < numDataNodes; i++) {
+ rf_InitNode(&readDataNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, nNodes, 1, 4, 0, dag_h, "Rod", allocList);
+ RF_ASSERT(pda != NULL);
+ readDataNodes[i].params[0].p = pda; /* physical disk addr
+ * desc */
+ readDataNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda, allocList); /* buffer to hold old
+ * data */
+ readDataNodes[i].params[2].v = parityStripeID;
+ readDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, lu_flag, 0, which_ru);
+ pda = pda->next;
+ readDataNodes[i].propList[0] = NULL;
+ readDataNodes[i].propList[1] = NULL;
+ }
+
+ /* initialize nodes which read old parity (Rop) */
+ pda = asmap->parityInfo;
+ i = 0;
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(pda != NULL);
+ rf_InitNode(&readParityNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, nNodes, 1, 4, 0, dag_h, "Rop", allocList);
+ readParityNodes[i].params[0].p = pda;
+ readParityNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda, allocList); /* buffer to hold old
+ * parity */
+ readParityNodes[i].params[2].v = parityStripeID;
+ readParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ readParityNodes[i].propList[0] = NULL;
+ pda = pda->next;
+ }
+
+ /* initialize nodes which write new data (Wnd) */
+ pda = asmap->physInfo;
+ for (i = 0; i < numDataNodes; i++) {
+ RF_ASSERT(pda != NULL);
+ rf_InitNode(&writeDataNodes[i], rf_wait, RF_TRUE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, nNodes, 4, 0, dag_h, "Wnd", allocList);
+ writeDataNodes[i].params[0].p = pda; /* physical disk addr
+ * desc */
+ writeDataNodes[i].params[1].p = pda->bufPtr; /* buffer holding new
+ * data to be written */
+ writeDataNodes[i].params[2].v = parityStripeID;
+ writeDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+
+ if (lu_flag) {
+ /* initialize node to unlock the disk queue */
+ rf_InitNode(&unlockDataNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc, rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Und", allocList);
+ unlockDataNodes[i].params[0].p = pda; /* physical disk addr
+ * desc */
+ unlockDataNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, lu_flag, which_ru);
+ }
+ pda = pda->next;
+ }
+
+
+ /* initialize nodes which compute new parity */
+ /* we use the simple XOR func in the double-XOR case, and when we're
+ * accessing only a portion of one stripe unit. the distinction
+ * between the two is that the regular XOR func assumes that the
+ * targbuf is a full SU in size, and examines the pda associated with
+ * the buffer to decide where within the buffer to XOR the data,
+ * whereas the simple XOR func just XORs the data into the start of
+ * the buffer. */
+ if ((numParityNodes == 2) || ((numDataNodes == 1) && (asmap->totalSectorsAccessed < raidPtr->Layout.sectorsPerStripeUnit))) {
+ func = pfuncs->simple;
+ undoFunc = rf_NullNodeUndoFunc;
+ name = pfuncs->SimpleName;
+ if (qfuncs) {
+ qfunc = qfuncs->simple;
+ qname = qfuncs->SimpleName;
+ }
+ } else {
+ func = pfuncs->regular;
+ undoFunc = rf_NullNodeUndoFunc;
+ name = pfuncs->RegularName;
+ if (qfuncs) {
+ qfunc = qfuncs->regular;
+ qname = qfuncs->RegularName;
+ }
+ }
+ /* note: qfunc/qname are assigned above but never used in this
+ * routine -- the nfaults == 1 assertion restricts it to single
+ * fault tolerance */
+ /* initialize the xor nodes: params are {pda,buf} from {Rod,Wnd,Rop}
+ * nodes, and raidPtr */
+ if (numParityNodes == 2) { /* double-xor case */
+ for (i = 0; i < numParityNodes; i++) {
+ rf_InitNode(&xorNodes[i], rf_wait, RF_TRUE, func, undoFunc, NULL, 1, nNodes, 7, 1, dag_h, name, allocList); /* no wakeup func for
+ * xor */
+ xorNodes[i].flags |= RF_DAGNODE_FLAG_YIELD;
+ xorNodes[i].params[0] = readDataNodes[i].params[0];
+ xorNodes[i].params[1] = readDataNodes[i].params[1];
+ xorNodes[i].params[2] = readParityNodes[i].params[0];
+ xorNodes[i].params[3] = readParityNodes[i].params[1];
+ xorNodes[i].params[4] = writeDataNodes[i].params[0];
+ xorNodes[i].params[5] = writeDataNodes[i].params[1];
+ xorNodes[i].params[6].p = raidPtr;
+ xorNodes[i].results[0] = readParityNodes[i].params[1].p; /* use old parity buf as
+ * target buf */
+ }
+ } else {
+ /* there is only one xor node in this case */
+ rf_InitNode(&xorNodes[0], rf_wait, RF_TRUE, func, undoFunc, NULL, 1, nNodes, (2 * (numDataNodes + numDataNodes + 1) + 1), 1, dag_h, name, allocList);
+ xorNodes[0].flags |= RF_DAGNODE_FLAG_YIELD;
+ /* NOTE(review): the loop below runs i up to numDataNodes,
+ * so readDataNodes[numDataNodes] reads one past that array
+ * -- this lands on readParityNodes[0] only because the two
+ * arrays are contiguous in the nodes allocation above;
+ * fragile but apparently intentional */
+ for (i = 0; i < numDataNodes + 1; i++) {
+ /* set up params related to Rod and Rop nodes */
+ xorNodes[0].params[2 * i + 0] = readDataNodes[i].params[0]; /* pda */
+ xorNodes[0].params[2 * i + 1] = readDataNodes[i].params[1]; /* buffer pointer */
+ }
+ for (i = 0; i < numDataNodes; i++) {
+ /* set up params related to Wnd and Wnp nodes */
+ xorNodes[0].params[2 * (numDataNodes + 1 + i) + 0] = writeDataNodes[i].params[0]; /* pda */
+ xorNodes[0].params[2 * (numDataNodes + 1 + i) + 1] = writeDataNodes[i].params[1]; /* buffer pointer */
+ }
+ xorNodes[0].params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr; /* xor node needs to get
+ * at RAID information */
+ xorNodes[0].results[0] = readParityNodes[0].params[1].p;
+ }
+
+ /* initialize the log node(s) */
+ pda = asmap->parityInfo;
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(pda);
+ rf_InitNode(&lpuNodes[i], rf_wait, RF_FALSE, rf_ParityLogUpdateFunc, rf_ParityLogUpdateUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Lpu", allocList);
+ lpuNodes[i].params[0].p = pda; /* PhysDiskAddr of parity */
+ lpuNodes[i].params[1].p = xorNodes[i].results[0]; /* buffer pointer to
+ * parity */
+ pda = pda->next;
+ }
+
+
+ /* Step 4. connect the nodes */
+
+ /* connect header to block node */
+ RF_ASSERT(dag_h->numSuccedents == 1);
+ RF_ASSERT(blockNode->numAntecedents == 0);
+ dag_h->succedents[0] = blockNode;
+
+ /* connect block node to read old data nodes */
+ RF_ASSERT(blockNode->numSuccedents == (numDataNodes + numParityNodes));
+ for (i = 0; i < numDataNodes; i++) {
+ blockNode->succedents[i] = &readDataNodes[i];
+ RF_ASSERT(readDataNodes[i].numAntecedents == 1);
+ readDataNodes[i].antecedents[0] = blockNode;
+ readDataNodes[i].antType[0] = rf_control;
+ }
+
+ /* connect block node to read old parity nodes */
+ for (i = 0; i < numParityNodes; i++) {
+ blockNode->succedents[numDataNodes + i] = &readParityNodes[i];
+ RF_ASSERT(readParityNodes[i].numAntecedents == 1);
+ readParityNodes[i].antecedents[0] = blockNode;
+ readParityNodes[i].antType[0] = rf_control;
+ }
+
+ /* connect read old data nodes to write new data nodes */
+ for (i = 0; i < numDataNodes; i++) {
+ RF_ASSERT(readDataNodes[i].numSuccedents == numDataNodes + numParityNodes);
+ for (j = 0; j < numDataNodes; j++) {
+ RF_ASSERT(writeDataNodes[j].numAntecedents == numDataNodes + numParityNodes);
+ readDataNodes[i].succedents[j] = &writeDataNodes[j];
+ writeDataNodes[j].antecedents[i] = &readDataNodes[i];
+ if (i == j)
+ writeDataNodes[j].antType[i] = rf_antiData;
+ else
+ writeDataNodes[j].antType[i] = rf_control;
+ }
+ }
+
+ /* connect read old data nodes to xor nodes */
+ for (i = 0; i < numDataNodes; i++)
+ for (j = 0; j < numParityNodes; j++) {
+ RF_ASSERT(xorNodes[j].numAntecedents == numDataNodes + numParityNodes);
+ readDataNodes[i].succedents[numDataNodes + j] = &xorNodes[j];
+ xorNodes[j].antecedents[i] = &readDataNodes[i];
+ xorNodes[j].antType[i] = rf_trueData;
+ }
+
+ /* connect read old parity nodes to write new data nodes */
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(readParityNodes[i].numSuccedents == numDataNodes + numParityNodes);
+ for (j = 0; j < numDataNodes; j++) {
+ readParityNodes[i].succedents[j] = &writeDataNodes[j];
+ writeDataNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i];
+ writeDataNodes[j].antType[numDataNodes + i] = rf_control;
+ }
+ }
+
+ /* connect read old parity nodes to xor nodes */
+ for (i = 0; i < numParityNodes; i++)
+ for (j = 0; j < numParityNodes; j++) {
+ readParityNodes[i].succedents[numDataNodes + j] = &xorNodes[j];
+ xorNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i];
+ xorNodes[j].antType[numDataNodes + i] = rf_trueData;
+ }
+
+ /* connect xor nodes to write new parity nodes */
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(xorNodes[i].numSuccedents == 1);
+ RF_ASSERT(lpuNodes[i].numAntecedents == 1);
+ xorNodes[i].succedents[0] = &lpuNodes[i];
+ lpuNodes[i].antecedents[0] = &xorNodes[i];
+ lpuNodes[i].antType[0] = rf_trueData;
+ }
+
+ for (i = 0; i < numDataNodes; i++) {
+ if (lu_flag) {
+ /* connect write new data nodes to unlock nodes */
+ RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
+ RF_ASSERT(unlockDataNodes[i].numAntecedents == 1);
+ writeDataNodes[i].succedents[0] = &unlockDataNodes[i];
+ unlockDataNodes[i].antecedents[0] = &writeDataNodes[i];
+ unlockDataNodes[i].antType[0] = rf_control;
+
+ /* connect unlock nodes to unblock node */
+ RF_ASSERT(unlockDataNodes[i].numSuccedents == 1);
+ RF_ASSERT(unblockNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
+ unlockDataNodes[i].succedents[0] = unblockNode;
+ unblockNode->antecedents[i] = &unlockDataNodes[i];
+ unblockNode->antType[i] = rf_control;
+ } else {
+ /* connect write new data nodes to unblock node */
+ RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
+ RF_ASSERT(unblockNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
+ writeDataNodes[i].succedents[0] = unblockNode;
+ unblockNode->antecedents[i] = &writeDataNodes[i];
+ unblockNode->antType[i] = rf_control;
+ }
+ }
+
+ /* connect write new parity nodes to unblock node */
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(lpuNodes[i].numSuccedents == 1);
+ lpuNodes[i].succedents[0] = unblockNode;
+ unblockNode->antecedents[numDataNodes + i] = &lpuNodes[i];
+ unblockNode->antType[numDataNodes + i] = rf_control;
+ }
+
+ /* connect unblock node to terminator */
+ RF_ASSERT(unblockNode->numSuccedents == 1);
+ RF_ASSERT(termNode->numAntecedents == 1);
+ RF_ASSERT(termNode->numSuccedents == 0);
+ unblockNode->succedents[0] = termNode;
+ termNode->antecedents[0] = unblockNode;
+ termNode->antType[0] = rf_control;
+}
+
+
+/* public wrapper: create a parity-logging small-write DAG.
+ * Note that the pfuncs/qfuncs arguments are ignored; plain XOR
+ * (rf_xorFuncs) and a NULL q function are always passed through,
+ * since this architecture is single fault tolerant. */
+void
+rf_CreateParityLoggingSmallWriteDAG(
+ RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList,
+ RF_RedFuncs_t * pfuncs,
+ RF_RedFuncs_t * qfuncs)
+{
+ dag_h->creator = "ParityLoggingSmallWriteDAG";
+ rf_CommonCreateParityLoggingSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_xorFuncs, NULL);
+}
+
+
+/* public wrapper: create a parity-logging large-write DAG.
+ * Note that the nfaults/redFunc arguments are ignored; 1 and
+ * rf_RegularXorFunc are always passed through, since this
+ * architecture is single fault tolerant. */
+void
+rf_CreateParityLoggingLargeWriteDAG(
+ RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap,
+ RF_DagHeader_t * dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t * allocList,
+ int nfaults,
+ int (*redFunc) (RF_DagNode_t *))
+{
+ /* fix copy-paste error: the creator tag must identify the
+ * large-write DAG, not the small-write one, so debug output
+ * names the correct graph (the common routine sets the same
+ * string; the small-write wrapper sets its own) */
+ dag_h->creator = "ParityLoggingLargeWriteDAG";
+ rf_CommonCreateParityLoggingLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 1, rf_RegularXorFunc);
+}
+#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
diff --git a/sys/dev/raidframe/rf_parityloggingdags.h b/sys/dev/raidframe/rf_parityloggingdags.h
new file mode 100644
index 0000000..dc0fc9b
--- /dev/null
+++ b/sys/dev/raidframe/rf_parityloggingdags.h
@@ -0,0 +1,59 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_parityloggingdags.h,v 1.3 1999/02/05 00:06:14 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/****************************************************************************
+ * *
+ * rf_parityloggingdags.h -- header file for parity logging dags *
+ * *
+ ****************************************************************************/
+
+#ifndef _RF__RF_PARITYLOGGINGDAGS_H_
+#define _RF__RF_PARITYLOGGINGDAGS_H_
+
+/* routines that create DAGs */
+void
+rf_CommonCreateParityLoggingLargeWriteDAG(RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h,
+ void *bp, RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList,
+ int nfaults, int (*redFunc) (RF_DagNode_t *));
+ void rf_CommonCreateParityLoggingSmallWriteDAG(RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h,
+ void *bp, RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList,
+ RF_RedFuncs_t * pfuncs, RF_RedFuncs_t * qfuncs);
+
+ void rf_CreateParityLoggingLargeWriteDAG(RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h,
+ void *bp, RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList,
+ int nfaults, int (*redFunc) (RF_DagNode_t *));
+ void rf_CreateParityLoggingSmallWriteDAG(RF_Raid_t * raidPtr,
+ RF_AccessStripeMap_t * asmap, RF_DagHeader_t * dag_h,
+ void *bp, RF_RaidAccessFlags_t flags, RF_AllocListElem_t * allocList,
+ RF_RedFuncs_t * pfuncs, RF_RedFuncs_t * qfuncs);
+
+#endif /* !_RF__RF_PARITYLOGGINGDAGS_H_ */
diff --git a/sys/dev/raidframe/rf_parityscan.c b/sys/dev/raidframe/rf_parityscan.c
new file mode 100644
index 0000000..bcdf506
--- /dev/null
+++ b/sys/dev/raidframe/rf_parityscan.c
@@ -0,0 +1,443 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_parityscan.c,v 1.9 2000/05/28 03:00:31 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************************************
+ *
+ * rf_parityscan.c -- misc utilities related to parity verification
+ *
+ *****************************************************************************/
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_dagfuncs.h>
+#include <dev/raidframe/rf_dagutils.h>
+#include <dev/raidframe/rf_mcpair.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_engine.h>
+#include <dev/raidframe/rf_parityscan.h>
+#include <dev/raidframe/rf_map.h>
+#include <dev/raidframe/rf_kintf.h>
+
+/*****************************************************************************************
+ *
+ * walk through the entire array and write new parity.
+ * This works by creating two DAGs, one to read a stripe of data and one to
+ * write new parity. The first is executed, the data is xored together, and
+ * then the second is executed. To avoid constantly building and tearing down
+ * the DAGs, we create them a priori and fill them in with the mapping
+ * information as we go along.
+ *
+ * there should never be more than one thread running this.
+ *
+ ****************************************************************************************/
+
+/*
+ * Walk the whole array one stripe at a time, calling rf_VerifyParity() with
+ * correct_it set so that bad parity is rewritten as it is found.
+ *
+ * Returns 0 on success, 1 if any stripe's parity could not be verified or
+ * corrected, or if a shutdown was requested mid-scan.  Only one thread should
+ * ever run this (see the header comment above).
+ */
+int
+rf_RewriteParity(raidPtr)
+	RF_Raid_t *raidPtr;
+{
+	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+	RF_AccessStripeMapHeader_t *asm_h;
+	int ret_val;
+	int rc;
+	/* NOTE(review): pda is initialized below but never read afterwards —
+	 * appears to be dead code inherited from upstream. */
+	RF_PhysDiskAddr_t pda;
+	RF_SectorNum_t i;
+
+	if (raidPtr->Layout.map->faultsTolerated == 0) {
+		/* There isn't any parity. Call it "okay." */
+		return (RF_PARITY_OKAY);
+	}
+	/* NOTE(review): only row 0's status is checked — presumably fine for
+	 * single-row sets; verify for multi-row configurations. */
+	if (raidPtr->status[0] != rf_rs_optimal) {
+		/*
+		 * We're in degraded mode. Don't try to verify parity now!
+		 * XXX: this should be a "we don't want to", not a
+		 * "we can't" error.
+		 */
+		return (RF_PARITY_COULD_NOT_VERIFY);
+	}
+
+	ret_val = 0;
+
+	pda.startSector = 0;
+	pda.numSector = raidPtr->Layout.sectorsPerStripeUnit;
+	rc = RF_PARITY_OKAY;
+
+	/* Loop exits early once rc exceeds RF_PARITY_CORRECTED, i.e. on the
+	 * first stripe that could not be verified/corrected (see ordering of
+	 * the RF_PARITY_* codes in rf_parityscan.h). */
+	for (i = 0; i < raidPtr->totalSectors &&
+		     rc <= RF_PARITY_CORRECTED;
+	     i += layoutPtr->dataSectorsPerStripe) {
+		if (raidPtr->waitShutdown) {
+			/* Someone is pulling the plug on this set...
+			   abort the re-write */
+			return (1);
+		}
+		asm_h = rf_MapAccess(raidPtr, i,
+				     layoutPtr->dataSectorsPerStripe,
+				     NULL, RF_DONT_REMAP);
+		/* progress indicator, read by userland via raidctl */
+		raidPtr->parity_rewrite_stripes_done =
+			i / layoutPtr->dataSectorsPerStripe ;
+		rc = rf_VerifyParity(raidPtr, asm_h->stripeMap, 1, 0);
+
+		switch (rc) {
+		case RF_PARITY_OKAY:
+		case RF_PARITY_CORRECTED:
+			break;
+		case RF_PARITY_BAD:
+			printf("Parity bad during correction\n");
+			ret_val = 1;
+			break;
+		case RF_PARITY_COULD_NOT_CORRECT:
+			printf("Could not correct bad parity\n");
+			ret_val = 1;
+			break;
+		case RF_PARITY_COULD_NOT_VERIFY:
+			printf("Could not verify parity\n");
+			ret_val = 1;
+			break;
+		default:
+			printf("Bad rc=%d from VerifyParity in RewriteParity\n", rc);
+			ret_val = 1;
+		}
+		rf_FreeAccessStripeMap(asm_h);
+	}
+	return (ret_val);
+}
+/*****************************************************************************************
+ *
+ * verify that the parity in a particular stripe is correct.
+ * we validate only the range of parity defined by parityPDA, since
+ * this is all we have locked. The way we do this is to create an asm
+ * that maps the whole stripe and then range-restrict it to the parity
+ * region defined by the parityPDA.
+ *
+ ****************************************************************************************/
+/*
+ * Verify the parity ranges covered by the given access stripe map (chain).
+ * Dispatches to the layout's VerifyParity method for each parity PDA and
+ * returns the worst (numerically largest) RF_PARITY_* result seen — the
+ * RF_PARITY_* codes are ordered by severity for exactly this purpose.
+ * Layouts without a VerifyParity method yield RF_PARITY_COULD_NOT_VERIFY.
+ */
+int
+rf_VerifyParity(raidPtr, aasm, correct_it, flags)
+	RF_Raid_t *raidPtr;
+	RF_AccessStripeMap_t *aasm;
+	int correct_it;
+	RF_RaidAccessFlags_t flags;
+{
+	RF_PhysDiskAddr_t *parityPDA;
+	RF_AccessStripeMap_t *doasm;
+	RF_LayoutSW_t *lp;
+	int lrc, rc;
+
+	lp = raidPtr->Layout.map;
+	if (lp->faultsTolerated == 0) {
+		/*
+		 * There isn't any parity. Call it "okay."
+		 */
+		return (RF_PARITY_OKAY);
+	}
+	rc = RF_PARITY_OKAY;
+	if (lp->VerifyParity) {
+		for (doasm = aasm; doasm; doasm = doasm->next) {
+			for (parityPDA = doasm->parityInfo; parityPDA;
+			     parityPDA = parityPDA->next) {
+				lrc = lp->VerifyParity(raidPtr,
+						       doasm->raidAddress,
+						       parityPDA,
+						       correct_it, flags);
+				if (lrc > rc) {
+					/* see rf_parityscan.h for why this
+					 * works */
+					rc = lrc;
+				}
+			}
+		}
+	} else {
+		rc = RF_PARITY_COULD_NOT_VERIFY;
+	}
+	return (rc);
+}
+
+/*
+ * Verify (and optionally correct) the parity range described by parityPDA
+ * for the stripe containing raidAddr.  Builds a simple read DAG covering the
+ * whole stripe (data + parity), executes it synchronously via an MCPair,
+ * XORs the data columns into a zeroed buffer, and compares the result with
+ * the parity that was read.  If correct_it is set and a mismatch was found,
+ * a second one-node write DAG rewrites the parity.
+ *
+ * Returns one of the RF_PARITY_* codes from rf_parityscan.h.  A dead or
+ * reconstructing-but-unredirectable disk makes verification impossible; in
+ * that case we bail out "okay" via the goto (see comment at the redirect).
+ */
+int
+rf_VerifyParityBasic(raidPtr, raidAddr, parityPDA, correct_it, flags)
+	RF_Raid_t *raidPtr;
+	RF_RaidAddr_t raidAddr;
+	RF_PhysDiskAddr_t *parityPDA;
+	int correct_it;
+	RF_RaidAccessFlags_t flags;
+{
+	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+	RF_RaidAddr_t startAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
+	    raidAddr);
+	RF_SectorCount_t numsector = parityPDA->numSector;
+	int numbytes = rf_RaidAddressToByte(raidPtr, numsector);
+	int bytesPerStripe = numbytes * layoutPtr->numDataCol;
+	RF_DagHeader_t *rd_dag_h, *wr_dag_h;	/* read, write dag */
+	RF_DagNode_t *blockNode, *unblockNode, *wrBlock, *wrUnblock;
+	RF_AccessStripeMapHeader_t *asm_h;
+	RF_AccessStripeMap_t *asmap;
+	RF_AllocListElem_t *alloclist;
+	RF_PhysDiskAddr_t *pda;
+	char *pbuf, *buf, *end_p, *p;
+	int i, retcode;
+	RF_ReconUnitNum_t which_ru;
+	RF_StripeNum_t psID = rf_RaidAddressToParityStripeID(layoutPtr,
+	    raidAddr,
+	    &which_ru);
+	int stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol;
+	RF_AccTraceEntry_t tracerec;
+	RF_MCPair_t *mcpair;
+
+	retcode = RF_PARITY_OKAY;
+
+	mcpair = rf_AllocMCPair();
+	rf_MakeAllocList(alloclist);
+	/* buf holds the whole stripe: data columns followed by parity */
+	RF_MallocAndAdd(buf, numbytes * (layoutPtr->numDataCol + layoutPtr->numParityCol), (char *), alloclist);
+	RF_CallocAndAdd(pbuf, 1, numbytes, (char *), alloclist);	/* use calloc to make
+									 * sure buffer is zeroed */
+	end_p = buf + bytesPerStripe;
+
+	rd_dag_h = rf_MakeSimpleDAG(raidPtr, stripeWidth, numbytes, buf, rf_DiskReadFunc, rf_DiskReadUndoFunc,
+	    "Rod", alloclist, flags, RF_IO_NORMAL_PRIORITY);
+	blockNode = rd_dag_h->succedents[0];
+	unblockNode = blockNode->succedents[0]->succedents[0];
+
+	/* map the stripe and fill in the PDAs in the dag */
+	asm_h = rf_MapAccess(raidPtr, startAddr, layoutPtr->dataSectorsPerStripe, buf, RF_DONT_REMAP);
+	asmap = asm_h->stripeMap;
+
+	/* one read node per data column; restrict each PDA to the range we
+	 * hold locked, and redirect it if reconstruction has moved it */
+	for (pda = asmap->physInfo, i = 0; i < layoutPtr->numDataCol; i++, pda = pda->next) {
+		RF_ASSERT(pda);
+		rf_RangeRestrictPDA(raidPtr, parityPDA, pda, 0, 1);
+		RF_ASSERT(pda->numSector != 0);
+		if (rf_TryToRedirectPDA(raidPtr, pda, 0))
+			goto out;	/* no way to verify parity if disk is
+					 * dead. return w/ good status */
+		blockNode->succedents[i]->params[0].p = pda;
+		blockNode->succedents[i]->params[2].v = psID;
+		blockNode->succedents[i]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+	}
+
+	RF_ASSERT(!asmap->parityInfo->next);
+	rf_RangeRestrictPDA(raidPtr, parityPDA, asmap->parityInfo, 0, 1);
+	RF_ASSERT(asmap->parityInfo->numSector != 0);
+	if (rf_TryToRedirectPDA(raidPtr, asmap->parityInfo, 1))
+		goto out;
+	blockNode->succedents[layoutPtr->numDataCol]->params[0].p = asmap->parityInfo;
+
+	/* fire off the DAG */
+	bzero((char *) &tracerec, sizeof(tracerec));
+	rd_dag_h->tracerec = &tracerec;
+
+	if (rf_verifyParityDebug) {
+		printf("Parity verify read dag:\n");
+		rf_PrintDAGList(rd_dag_h);
+	}
+	/* synchronous dispatch: wait on the MCPair until the DAG completes */
+	RF_LOCK_MUTEX(mcpair->mutex);
+	mcpair->flag = 0;
+	rf_DispatchDAG(rd_dag_h, (void (*) (void *)) rf_MCPairWakeupFunc,
+	    (void *) mcpair);
+	while (!mcpair->flag)
+		RF_WAIT_COND(mcpair->cond, mcpair->mutex);
+	RF_UNLOCK_MUTEX(mcpair->mutex);
+	if (rd_dag_h->status != rf_enable) {
+		RF_ERRORMSG("Unable to verify parity:  can't read the stripe\n");
+		retcode = RF_PARITY_COULD_NOT_VERIFY;
+		goto out;
+	}
+	/* XOR all data columns into pbuf (pbuf started zeroed) */
+	for (p = buf; p < end_p; p += numbytes) {
+		rf_bxor(p, pbuf, numbytes, NULL);
+	}
+	/* compare computed parity (pbuf) against on-disk parity (after the
+	 * data columns in buf) */
+	for (i = 0; i < numbytes; i++) {
+#if 0
+		if (pbuf[i] != 0 || buf[bytesPerStripe + i] != 0) {
+			printf("Bytes: %d %d %d\n", i, pbuf[i], buf[bytesPerStripe + i]);
+		}
+#endif
+		if (pbuf[i] != buf[bytesPerStripe + i]) {
+			if (!correct_it)
+				RF_ERRORMSG3("Parity verify error: byte %d of parity is 0x%x should be 0x%x\n",
+				    i, (u_char) buf[bytesPerStripe + i], (u_char) pbuf[i]);
+			retcode = RF_PARITY_BAD;
+			break;
+		}
+	}
+
+	if (retcode && correct_it) {
+		/* build and run a one-node write DAG to rewrite the parity */
+		wr_dag_h = rf_MakeSimpleDAG(raidPtr, 1, numbytes, pbuf, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
+		    "Wnp", alloclist, flags, RF_IO_NORMAL_PRIORITY);
+		wrBlock = wr_dag_h->succedents[0];
+		wrUnblock = wrBlock->succedents[0]->succedents[0];
+		wrBlock->succedents[0]->params[0].p = asmap->parityInfo;
+		wrBlock->succedents[0]->params[2].v = psID;
+		wrBlock->succedents[0]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+		bzero((char *) &tracerec, sizeof(tracerec));
+		wr_dag_h->tracerec = &tracerec;
+		if (rf_verifyParityDebug) {
+			printf("Parity verify write dag:\n");
+			rf_PrintDAGList(wr_dag_h);
+		}
+		RF_LOCK_MUTEX(mcpair->mutex);
+		mcpair->flag = 0;
+		rf_DispatchDAG(wr_dag_h, (void (*) (void *)) rf_MCPairWakeupFunc,
+		    (void *) mcpair);
+		while (!mcpair->flag)
+			RF_WAIT_COND(mcpair->cond, mcpair->mutex);
+		RF_UNLOCK_MUTEX(mcpair->mutex);
+		if (wr_dag_h->status != rf_enable) {
+			RF_ERRORMSG("Unable to correct parity in VerifyParity:  can't write the stripe\n");
+			retcode = RF_PARITY_COULD_NOT_CORRECT;
+		}
+		rf_FreeDAG(wr_dag_h);
+		if (retcode == RF_PARITY_BAD)
+			retcode = RF_PARITY_CORRECTED;
+	}
+out:
+	rf_FreeAccessStripeMap(asm_h);
+	rf_FreeAllocList(alloclist);
+	rf_FreeDAG(rd_dag_h);
+	rf_FreeMCPair(mcpair);
+	return (retcode);
+}
+
+/*
+ * If the disk named by the PDA is being reconstructed and the target
+ * reconstruction unit has already been rebuilt, redirect the PDA to where
+ * the data now lives: via the layout's remapping functions for distributed
+ * sparing, or to the dedicated spare row/column otherwise.  `parity`
+ * selects MapParity vs MapSector for the distributed-spare case.
+ *
+ * Returns 1 if the (possibly redirected) disk is dead — i.e. the access
+ * cannot be performed — and 0 otherwise.
+ */
+int
+rf_TryToRedirectPDA(raidPtr, pda, parity)
+	RF_Raid_t *raidPtr;
+	RF_PhysDiskAddr_t *pda;
+	int parity;
+{
+	if (raidPtr->Disks[pda->row][pda->col].status == rf_ds_reconstructing) {
+		if (rf_CheckRUReconstructed(raidPtr->reconControl[pda->row]->reconMap, pda->startSector)) {
+			if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
+				/* remember original location for the debug printout */
+				RF_RowCol_t or = pda->row, oc = pda->col;
+				RF_SectorNum_t os = pda->startSector;
+				if (parity) {
+					(raidPtr->Layout.map->MapParity) (raidPtr, pda->raidAddress, &pda->row, &pda->col, &pda->startSector, RF_REMAP);
+					if (rf_verifyParityDebug)
+						printf("VerifyParity: Redir P  r %d c %d sect %ld -> r %d c %d sect %ld\n",
+						    or, oc, (long) os, pda->row, pda->col, (long) pda->startSector);
+				} else {
+					(raidPtr->Layout.map->MapSector) (raidPtr, pda->raidAddress, &pda->row, &pda->col, &pda->startSector, RF_REMAP);
+					if (rf_verifyParityDebug)
+						printf("VerifyParity: Redir D  r %d c %d sect %ld -> r %d c %d sect %ld\n",
+						    or, oc, (long) os, pda->row, pda->col, (long) pda->startSector);
+				}
+			} else {
+				RF_RowCol_t spRow = raidPtr->Disks[pda->row][pda->col].spareRow;
+				RF_RowCol_t spCol = raidPtr->Disks[pda->row][pda->col].spareCol;
+				pda->row = spRow;
+				pda->col = spCol;
+			}
+		}
+	}
+	if (RF_DEAD_DISK(raidPtr->Disks[pda->row][pda->col].status))
+		return (1);
+	return (0);
+}
+/*****************************************************************************************
+ *
+ * currently a stub.
+ *
+ * takes as input an ASM describing a write operation and containing one failure, and
+ * verifies that the parity was correctly updated to reflect the write.
+ *
+ * if it's a data unit that's failed, we read the other data units in the stripe and
+ * the parity unit, XOR them together, and verify that we get the data intended for
+ * the failed disk. Since it's easy, we also validate that the right data got written
+ * to the surviving data disks.
+ *
+ * If it's the parity that failed, there's really no validation we can do except the
+ * above verification that the right data got written to all disks. This is because
+ * the new data intended for the failed disk is supplied in the ASM, but this is of
+ * course not the case for the new parity.
+ *
+ ****************************************************************************************/
+/* Stub: always reports success without validating anything (see the block
+ * comment above for what a real implementation would check). */
+int
+rf_VerifyDegrModeWrite(raidPtr, asmh)
+	RF_Raid_t *raidPtr;
+	RF_AccessStripeMapHeader_t *asmh;
+{
+	return (0);
+}
+/* creates a simple DAG with a header, a block-recon node at level 1,
+ * nNodes nodes at level 2, an unblock-recon node at level 3, and
+ * a terminator node at level 4. The stripe address field in
+ * the block and unblock nodes are not touched, nor are the pda
+ * fields in the second-level nodes, so they must be filled in later.
+ *
+ * commit point is established at unblock node - this means that any
+ * failure during dag execution causes the dag to fail
+ */
+/*
+ * Build the four-level DAG described in the comment above: header ->
+ * block -> nNodes worker nodes (doFunc/undoFunc) -> unblock -> terminator.
+ * Each worker node gets its data buffer (params[1]) pointed at successive
+ * bytesPerSU-sized slices of databuf; params[0]/[2]/[3] (PDA, parity stripe
+ * ID, priority word) are left for the caller to fill in, as are the stripe
+ * address fields of the block/unblock nodes.
+ *
+ * All nodes come from one allocation charged to `alloclist`; the caller
+ * frees them via rf_FreeAllocList and frees the header via rf_FreeDAG.
+ */
+RF_DagHeader_t *
+rf_MakeSimpleDAG(raidPtr, nNodes, bytesPerSU, databuf, doFunc, undoFunc, name, alloclist, flags, priority)
+	RF_Raid_t *raidPtr;
+	int nNodes;
+	int bytesPerSU;
+	char *databuf;
+	int (*doFunc) (RF_DagNode_t * node);
+	int (*undoFunc) (RF_DagNode_t * node);
+	char *name;		/* node names at the second level */
+	RF_AllocListElem_t *alloclist;
+	RF_RaidAccessFlags_t flags;
+	int priority;
+{
+	RF_DagHeader_t *dag_h;
+	RF_DagNode_t *nodes, *termNode, *blockNode, *unblockNode;
+	int i;
+
+	/* create the nodes, the block & unblock nodes, and the terminator
+	 * node */
+	RF_CallocAndAdd(nodes, nNodes + 3, sizeof(RF_DagNode_t), (RF_DagNode_t *), alloclist);
+	blockNode = &nodes[nNodes];
+	unblockNode = blockNode + 1;
+	termNode = unblockNode + 1;
+
+	dag_h = rf_AllocDAGHeader();
+	dag_h->raidPtr = (void *) raidPtr;
+	dag_h->allocList = NULL;/* we won't use this alloc list */
+	dag_h->status = rf_enable;
+	dag_h->numSuccedents = 1;
+	dag_h->creator = "SimpleDAG";
+
+	/* this dag can not commit until the unblock node is reached errors
+	 * prior to the commit point imply the dag has failed */
+	dag_h->numCommitNodes = 1;
+	dag_h->numCommits = 0;
+
+	dag_h->succedents[0] = blockNode;
+	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nNodes, 0, 0, 0, dag_h, "Nil", alloclist);
+	/* unblock is the commit node (RF_TRUE) */
+	rf_InitNode(unblockNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nNodes, 0, 0, dag_h, "Nil", alloclist);
+	unblockNode->succedents[0] = termNode;
+	for (i = 0; i < nNodes; i++) {
+		blockNode->succedents[i] = unblockNode->antecedents[i] = &nodes[i];
+		unblockNode->antType[i] = rf_control;
+		rf_InitNode(&nodes[i], rf_wait, RF_FALSE, doFunc, undoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, alloclist);
+		nodes[i].succedents[0] = unblockNode;
+		nodes[i].antecedents[0] = blockNode;
+		nodes[i].antType[0] = rf_control;
+		/* each worker works on its own slice of the caller's buffer */
+		nodes[i].params[1].p = (databuf + (i * bytesPerSU));
+	}
+	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", alloclist);
+	termNode->antecedents[0] = unblockNode;
+	termNode->antType[0] = rf_control;
+	return (dag_h);
+}
diff --git a/sys/dev/raidframe/rf_parityscan.h b/sys/dev/raidframe/rf_parityscan.h
new file mode 100644
index 0000000..babca41
--- /dev/null
+++ b/sys/dev/raidframe/rf_parityscan.h
@@ -0,0 +1,67 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_parityscan.h,v 1.3 1999/02/05 00:06:14 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#ifndef _RF__RF_PARITYSCAN_H_
+#define _RF__RF_PARITYSCAN_H_
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_alloclist.h>
+
+int rf_RewriteParity(RF_Raid_t * raidPtr);
+int
+rf_VerifyParityBasic(RF_Raid_t * raidPtr, RF_RaidAddr_t raidAddr,
+ RF_PhysDiskAddr_t * parityPDA, int correct_it, RF_RaidAccessFlags_t flags);
+int
+rf_VerifyParity(RF_Raid_t * raidPtr, RF_AccessStripeMap_t * stripeMap,
+ int correct_it, RF_RaidAccessFlags_t flags);
+int rf_TryToRedirectPDA(RF_Raid_t * raidPtr, RF_PhysDiskAddr_t * pda, int parity);
+int rf_VerifyDegrModeWrite(RF_Raid_t * raidPtr, RF_AccessStripeMapHeader_t * asmh);
+RF_DagHeader_t *
+rf_MakeSimpleDAG(RF_Raid_t * raidPtr, int nNodes,
+ int bytesPerSU, char *databuf,
+ int (*doFunc) (RF_DagNode_t *),
+ int (*undoFunc) (RF_DagNode_t *),
+ char *name, RF_AllocListElem_t * alloclist,
+ RF_RaidAccessFlags_t flags, int priority);
+
+#define RF_DO_CORRECT_PARITY 1
+#define RF_DONT_CORRECT_PARITY 0
+
+/*
+ * Return vals for VerifyParity operation
+ *
+ * Ordering is important here.
+ */
+#define RF_PARITY_OKAY 0 /* or no parity information */
+#define RF_PARITY_CORRECTED 1
+#define RF_PARITY_BAD 2
+#define RF_PARITY_COULD_NOT_CORRECT 3
+#define RF_PARITY_COULD_NOT_VERIFY 4
+
+#endif /* !_RF__RF_PARITYSCAN_H_ */
diff --git a/sys/dev/raidframe/rf_pq.c b/sys/dev/raidframe/rf_pq.c
new file mode 100644
index 0000000..b96729e
--- /dev/null
+++ b/sys/dev/raidframe/rf_pq.c
@@ -0,0 +1,926 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_pq.c,v 1.7 2000/01/07 03:41:02 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Daniel Stodolsky
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * Code for RAID level 6 (P + Q) disk array architecture.
+ */
+
+#include <dev/raidframe/rf_archs.h>
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_dagffrd.h>
+#include <dev/raidframe/rf_dagffwr.h>
+#include <dev/raidframe/rf_dagdegrd.h>
+#include <dev/raidframe/rf_dagdegwr.h>
+#include <dev/raidframe/rf_dagutils.h>
+#include <dev/raidframe/rf_dagfuncs.h>
+#include <dev/raidframe/rf_etimer.h>
+#include <dev/raidframe/rf_pqdeg.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_map.h>
+#include <dev/raidframe/rf_pq.h>
+
+RF_RedFuncs_t rf_pFuncs = {rf_RegularONPFunc, "Regular Old-New P", rf_SimpleONPFunc, "Simple Old-New P"};
+RF_RedFuncs_t rf_pRecoveryFuncs = {rf_RecoveryPFunc, "Recovery P Func", rf_RecoveryPFunc, "Recovery P Func"};
+
+/* Regular old-new P update: P is plain XOR parity, so delegate. */
+int
+rf_RegularONPFunc(node)
+	RF_DagNode_t *node;
+{
+	return (rf_RegularXorFunc(node));
+}
+/*
+   same as simpleONQ func, but the coefficient is always 1
+*/
+
+/* Simple old-new P update: coefficient-1 case of the Q update, i.e. XOR. */
+int
+rf_SimpleONPFunc(node)
+	RF_DagNode_t *node;
+{
+	return (rf_SimpleXorFunc(node));
+}
+
+/* Recovery-path P computation: again just XOR of the surviving units. */
+int
+rf_RecoveryPFunc(node)
+	RF_DagNode_t *node;
+{
+	return (rf_RecoveryXorFunc(node));
+}
+
+/* Regular P computation from new data: XOR of the data units. */
+int
+rf_RegularPFunc(node)
+	RF_DagNode_t *node;
+{
+	return (rf_RegularXorFunc(node));
+}
+#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
+
+static void
+QDelta(char *dest, char *obuf, char *nbuf, unsigned length,
+ unsigned char coeff);
+static void
+rf_InvertQ(unsigned long *qbuf, unsigned long *abuf,
+ unsigned length, unsigned coeff);
+
+RF_RedFuncs_t rf_qFuncs = {rf_RegularONQFunc, "Regular Old-New Q", rf_SimpleONQFunc, "Simple Old-New Q"};
+RF_RedFuncs_t rf_qRecoveryFuncs = {rf_RecoveryQFunc, "Recovery Q Func", rf_RecoveryQFunc, "Recovery Q Func"};
+RF_RedFuncs_t rf_pqRecoveryFuncs = {rf_RecoveryPQFunc, "Recovery PQ Func", rf_RecoveryPQFunc, "Recovery PQ Func"};
+
+/*
+ * DAG-selection routine for the P+Q (RAID-6) architecture: given the access
+ * type and the failure pattern recorded in the access stripe map, pick the
+ * DAG-creation function and return it through *createFunc.  *createFunc is
+ * set to NULL when the access cannot be performed (more than two failures).
+ *
+ * The rf_PQ_xyz naming encodes which units are dead: x = #data, y = P,
+ * z = Q (e.g. rf_PQ_101 = one data unit and Q failed).
+ */
+void
+rf_PQDagSelect(
+    RF_Raid_t * raidPtr,
+    RF_IoType_t type,
+    RF_AccessStripeMap_t * asmap,
+    RF_VoidFuncPtr * createFunc)
+{
+	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+	unsigned ndfail = asmap->numDataFailed;
+	unsigned npfail = asmap->numParityFailed;
+	unsigned ntfail = npfail + ndfail;
+
+	RF_ASSERT(RF_IO_IS_R_OR_W(type));
+	if (ntfail > 2) {
+		RF_ERRORMSG("more than two disks failed in a single group!  Aborting I/O operation.\n");
+		/* *infoFunc = */ *createFunc = NULL;
+		return;
+	}
+	/* ok, we can do this I/O */
+	if (type == RF_IO_TYPE_READ) {
+		switch (ndfail) {
+		case 0:
+			/* fault free read */
+			*createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG;	/* same as raid 5 */
+			break;
+		case 1:
+			/* lost a single data unit */
+			/* two cases: (1) parity is not lost. do a normal raid
+			 * 5 reconstruct read. (2) parity is lost. do a
+			 * reconstruct read using "q". */
+			if (ntfail == 2) {	/* also lost redundancy */
+				if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY)
+					*createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateReadDAG;
+				else
+					*createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateReadDAG;
+			} else {
+				/* P and Q are ok. But is there a failure in
+				 * some unaccessed data unit? */
+				if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
+					*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
+				else
+					*createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateReadDAG;
+			}
+			break;
+		case 2:
+			/* lost two data units */
+			/* *infoFunc = PQOneTwo; */
+			*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
+			break;
+		}
+		return;
+	}
+	/* a write */
+	switch (ntfail) {
+	case 0:		/* fault free */
+		/* small write when locks/large writes are suppressed, the
+		 * access touches at most half the data columns, the parity
+		 * or q ranges are discontiguous, or the stripe has failures
+		 * elsewhere; otherwise large (reconstruct) write */
+		if (rf_suppressLocksAndLargeWrites ||
+		    (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) ||
+			(asmap->parityInfo->next != NULL) || (asmap->qInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) {
+
+			*createFunc = (RF_VoidFuncPtr) rf_PQCreateSmallWriteDAG;
+		} else {
+			*createFunc = (RF_VoidFuncPtr) rf_PQCreateLargeWriteDAG;
+		}
+		break;
+
+	case 1:		/* single disk fault */
+		if (npfail == 1) {
+			RF_ASSERT((asmap->failedPDAs[0]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q));
+			if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q) {	/* q died, treat like
+										 * normal mode raid5
+										 * write. */
+				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
+				    || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
+					*createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateSmallWriteDAG;
+				else
+					*createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateLargeWriteDAG;
+			} else {/* parity died, small write only updating Q */
+				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
+				    || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
+					*createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateSmallWriteDAG;
+				else
+					*createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateLargeWriteDAG;
+			}
+		} else {	/* data missing. Do a P reconstruct write if
+				 * only a single data unit is lost in the
+				 * stripe, otherwise a PQ reconstruct write. */
+			if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
+				*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
+			else
+				*createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateWriteDAG;
+		}
+		break;
+
+	case 2:		/* two disk faults */
+		switch (npfail) {
+		case 2:	/* both p and q dead */
+			*createFunc = (RF_VoidFuncPtr) rf_PQ_011_CreateWriteDAG;
+			break;
+		case 1:	/* either p or q and dead data */
+			RF_ASSERT(asmap->failedPDAs[0]->type == RF_PDA_TYPE_DATA);
+			RF_ASSERT((asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q));
+			if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q)
+				*createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateWriteDAG;
+			else
+				*createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateWriteDAG;
+			break;
+		case 0:	/* double data loss */
+			*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
+			break;
+		}
+		break;
+
+	default:		/* more than 2 disk faults */
+		*createFunc = NULL;
+		RF_PANIC();
+	}
+	return;
+}
+/*
+ Used as a stop gap info function
+*/
+#if 0
+/* NOTE(review): dead code — stop-gap DAG-shape "info" functions kept under
+ * #if 0 from upstream (referenced only in a commented-out assignment in
+ * rf_PQDagSelect above). */
+static void
+PQOne(raidPtr, nSucc, nAnte, asmap)
+	RF_Raid_t *raidPtr;
+	int *nSucc;
+	int *nAnte;
+	RF_AccessStripeMap_t *asmap;
+{
+	*nSucc = *nAnte = 1;
+}
+
+static void
+PQOneTwo(raidPtr, nSucc, nAnte, asmap)
+	RF_Raid_t *raidPtr;
+	int *nSucc;
+	int *nAnte;
+	RF_AccessStripeMap_t *asmap;
+{
+	*nSucc = 1;
+	*nAnte = 2;
+}
+#endif
+
+/* Fault-free P+Q large (reconstruct) write: two redundancy units per
+ * stripe, computed with the combined PQ function. */
+RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG)
+{
+	rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2,
+	    rf_RegularPQFunc, RF_FALSE);
+}
+
+/*
+ * Regular old-new Q update node: for each of the d data units being
+ * written, fold Q(new) - Q(old) into the old Q buffer via QDelta(), using
+ * the unit's position in the stripe as the Galois-field coefficient.
+ * Unlike the "simple" variant below, this one shifts into the Q buffer by
+ * each PDA's sector offset within the stripe unit, so partially-aligned
+ * accesses update the right bytes.  Parameter layout is the same 4d+3
+ * scheme documented above rf_SimpleONQFunc.
+ */
+int
+rf_RegularONQFunc(node)
+	RF_DagNode_t *node;
+{
+	int np = node->numParams;
+	int d;
+	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
+	int i;
+	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+	RF_Etimer_t timer;
+	char *qbuf, *qpbuf;
+	char *obuf, *nbuf;
+	RF_PhysDiskAddr_t *old, *new;
+	unsigned long coeff;
+	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
+
+	RF_ETIMER_START(timer);
+
+	d = (np - 3) / 4;
+	RF_ASSERT(4 * d + 3 == np);
+	qbuf = (char *) node->params[2 * d + 1].p;	/* q buffer */
+	for (i = 0; i < d; i++) {
+		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
+		obuf = (char *) node->params[2 * i + 1].p;
+		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
+		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
+		RF_ASSERT(new->numSector == old->numSector);
+		RF_ASSERT(new->raidAddress == old->raidAddress);
+		/* the stripe unit within the stripe tells us the coefficient
+		 * to use for the multiply. */
+		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
+		/* compute the data unit offset within the column, then add
+		 * one */
+		coeff = (coeff % raidPtr->Layout.numDataCol);
+		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
+		QDelta(qpbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
+	}
+
+	RF_ETIMER_STOP(timer);
+	RF_ETIMER_EVAL(timer);
+	tracerec->q_us += RF_ETIMER_VAL_US(timer);
+	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
+					 * I/O in this node */
+	return (0);
+}
+/*
+ See the SimpleXORFunc for the difference between a simple and regular func.
+ These Q functions should be used for
+
+ new q = Q(data,old data,old q)
+
+ style updates and not for
+
+ q = ( new data, new data, .... )
+
+ computations.
+
+ The simple q takes 2(2d+1)+1 params, where d is the number
+ of stripes written. The order of params is
+ old data pda_0, old data buffer_0, old data pda_1, old data buffer_1, ... old data pda_d, old data buffer_d
+ [2d] old q pda_0, old q buffer
+ [2d_2] new data pda_0, new data buffer_0, ... new data pda_d, new data buffer_d
+ raidPtr
+*/
+
+/*
+ * Simple old-new Q update node (parameter layout documented in the comment
+ * above): identical to rf_RegularONQFunc except that QDelta() is applied at
+ * the start of the Q buffer with no per-PDA byte offset — i.e. it assumes
+ * the buffers are aligned with the stripe unit.
+ */
+int
+rf_SimpleONQFunc(node)
+	RF_DagNode_t *node;
+{
+	int np = node->numParams;
+	int d;
+	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
+	int i;
+	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+	RF_Etimer_t timer;
+	char *qbuf;
+	char *obuf, *nbuf;
+	RF_PhysDiskAddr_t *old, *new;
+	unsigned long coeff;
+
+	RF_ETIMER_START(timer);
+
+	d = (np - 3) / 4;
+	RF_ASSERT(4 * d + 3 == np);
+	qbuf = (char *) node->params[2 * d + 1].p;	/* q buffer */
+	for (i = 0; i < d; i++) {
+		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
+		obuf = (char *) node->params[2 * i + 1].p;
+		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
+		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
+		RF_ASSERT(new->numSector == old->numSector);
+		RF_ASSERT(new->raidAddress == old->raidAddress);
+		/* the stripe unit within the stripe tells us the coefficient
+		 * to use for the multiply. */
+		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
+		/* compute the data unit offset within the column, then add
+		 * one */
+		coeff = (coeff % raidPtr->Layout.numDataCol);
+		QDelta(qbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
+	}
+
+	RF_ETIMER_STOP(timer);
+	RF_ETIMER_EVAL(timer);
+	tracerec->q_us += RF_ETIMER_VAL_US(timer);
+	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
+					 * I/O in this node */
+	return (0);
+}
+RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG)
+{
+ /* small (read-modify-write) P+Q write: update both P and Q via the
+ * common small-write DAG with the P and Q redundancy function sets */
+ rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, &rf_qFuncs);
+}
+
+static void RegularQSubr(RF_DagNode_t *node, char *qbuf);
+
+/*
+ Large-write style Q computation: params are d (pda, buffer) pairs plus
+ raidPtr (2d+1 total).  Each buffer is Q-encoded with its data-column
+ coefficient and xored into qbuf at its byte offset within the stripe unit.
+*/
+static void
+RegularQSubr(node, qbuf)
+ RF_DagNode_t *node;
+ char *qbuf;
+{
+ int np = node->numParams;
+ int d;
+ RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
+ unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
+ int i;
+ RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+ RF_Etimer_t timer;
+ char *obuf, *qpbuf;
+ RF_PhysDiskAddr_t *old;
+ unsigned long coeff;
+
+ RF_ETIMER_START(timer);
+
+ /* d (pda, buffer) pairs precede the trailing raidPtr param */
+ d = (np - 1) / 2;
+ RF_ASSERT(2 * d + 1 == np);
+ for (i = 0; i < d; i++) {
+ old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
+ obuf = (char *) node->params[2 * i + 1].p;
+ coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
+ /* compute the data unit offset within the column, then add
+ * one */
+ coeff = (coeff % raidPtr->Layout.numDataCol);
+ /* the input buffers may not all be aligned with the start of
+ * the stripe. so shift by their sector offset within the
+ * stripe unit */
+ qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
+ rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
+ }
+
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ tracerec->q_us += RF_ETIMER_VAL_US(timer);
+}
+/*
+ used in degraded writes.
+*/
+
+static void DegrQSubr(RF_DagNode_t *node);
+
+/*
+ Degraded-write Q computation.  Params are d (pda, buffer) pairs followed
+ by failedPDA and raidPtr (2d+2 total).  The Q target is results[1], and
+ each contribution is placed relative to the failed PDA's sector offset
+ within the stripe unit (fail_start) rather than the stripe-unit start.
+*/
+static void
+DegrQSubr(node)
+ RF_DagNode_t *node;
+{
+ int np = node->numParams;
+ int d;
+ RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
+ unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
+ int i;
+ RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+ RF_Etimer_t timer;
+ char *qbuf = node->results[1];
+ char *obuf, *qpbuf;
+ RF_PhysDiskAddr_t *old;
+ unsigned long coeff;
+ unsigned fail_start;
+ int j;
+
+ /* params[np - 2] is the failed PDA; its offset within the stripe
+ * unit anchors where surviving data lands in the q buffer */
+ old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
+ fail_start = old->startSector % secPerSU;
+
+ RF_ETIMER_START(timer);
+
+ d = (np - 2) / 2;
+ RF_ASSERT(2 * d + 2 == np);
+ for (i = 0; i < d; i++) {
+ old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
+ obuf = (char *) node->params[2 * i + 1].p;
+ coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
+ /* compute the data unit offset within the column, then add
+ * one */
+ coeff = (coeff % raidPtr->Layout.numDataCol);
+ /* the input buffers may not all be aligned with the start of
+ * the stripe. so shift by their sector offset within the
+ * stripe unit */
+ j = old->startSector % secPerSU;
+ RF_ASSERT(j >= fail_start);
+ qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
+ rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
+ }
+
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ tracerec->q_us += RF_ETIMER_VAL_US(timer);
+}
+/*
+ Called by large write code to compute the new parity and the new q.
+
+ structure of the params:
+
+ pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d (d = numDataCol)
+ raidPtr
+
+ for a total of 2d+1 arguments.
+ The result buffers results[0], results[1] are the buffers for the p and q,
+ respectively.
+
+ We compute Q first, then compute P. The P calculation may try to reuse
+ one of the input buffers for its output, so if we computed P first, we would
+ corrupt the input for the q calculation.
+*/
+
+int
+rf_RegularPQFunc(node)
+ RF_DagNode_t *node;
+{
+ /* Q first into results[1]; P second may reuse an input buffer for
+ * its output (see block comment above), so this order matters */
+ RegularQSubr(node, node->results[1]);
+ return (rf_RegularXorFunc(node)); /* does the wakeup */
+}
+
+int
+rf_RegularQFunc(node)
+ RF_DagNode_t *node;
+{
+ /* Almost ... adjust Qsubr args */
+ /* Q-only variant: the Q target is results[0] since no P is computed */
+ RegularQSubr(node, node->results[0]);
+ rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no
+ * I/O in this node */
+ return (0);
+}
+/*
+ Called by singly degraded write code to compute the new parity and the new q.
+
+ structure of the params:
+
+ pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d
+ failedPDA raidPtr
+
+ for a total of 2d+2 arguments.
+ The result buffers results[0], results[1] are the buffers for the parity and q,
+ respectively.
+
+ We compute Q first, then compute parity. The parity calculation may try to reuse
+ one of the input buffers for its output, so if we computed parity first, we would
+ corrupt the input for the q calculation.
+
+ We treat this identically to the regularPQ case, ignoring the failedPDA extra argument.
+*/
+
+void
+rf_Degraded_100_PQFunc(node)
+ RF_DagNode_t *node;
+{
+ int np = node->numParams;
+
+ RF_ASSERT(np >= 2);
+ /* Q first (results[1] via DegrQSubr), then parity: the parity xor may
+ * reuse an input buffer for its output (see block comment above) */
+ DegrQSubr(node);
+ rf_RecoveryXorFunc(node);
+}
+
+
+/*
+ The two below are used when reading a stripe with a single lost data unit.
+ The parameters are
+
+ pda_0, buffer_0, .... pda_n, buffer_n, P pda, P buffer, failedPDA, raidPtr
+
+ and results[0] contains the data buffer. Which is originally zero-filled.
+
+*/
+
+/* this Q func is used by the degraded-mode dag functions to recover lost data.
+ * the second-to-last parameter is the PDA for the failed portion of the access.
+ * the code here looks at this PDA and assumes that the xor target buffer is
+ * equal in size to the number of sectors in the failed PDA. It then uses
+ * the other PDAs in the parameter list to determine where within the target
+ * buffer the corresponding data should be xored.
+ *
+ * Recall the basic equation is
+ *
+ * Q = ( data_1 + 2 * data_2 ... + k * data_k ) mod 256
+ *
+ * so to recover data_j we need
+ *
+ * J data_j = (Q - data_1 - 2 data_2 ....- k* data_k) mod 256
+ *
+ * So the coefficient for each buffer is (255 - data_col), and j should be initialized by
+ * copying Q into it. Then we need to do a table lookup to convert to solve
+ * data_j /= J
+ *
+ *
+ */
+int
+rf_RecoveryQFunc(node)
+ RF_DagNode_t *node;
+{
+ RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
+ RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
+ RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
+ int i;
+ RF_PhysDiskAddr_t *pda;
+ RF_RaidAddr_t suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
+ char *srcbuf, *destbuf;
+ RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+ RF_Etimer_t timer;
+ unsigned long coeff;
+
+ RF_ETIMER_START(timer);
+ /* start by copying Q into the buffer */
+ /* params[numParams - 3] is the Q buffer (tail params are: Q pda,
+ * Q buffer, failedPDA, raidPtr) */
+ bcopy(node->params[node->numParams - 3].p, node->results[0],
+ rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
+ /* i < numParams - 4 walks only the surviving data (pda, buf) pairs */
+ for (i = 0; i < node->numParams - 4; i += 2) {
+ RF_ASSERT(node->params[i + 1].p != node->results[0]);
+ pda = (RF_PhysDiskAddr_t *) node->params[i].p;
+ srcbuf = (char *) node->params[i + 1].p;
+ suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
+ destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
+ coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
+ /* compute the data unit offset within the column */
+ coeff = (coeff % raidPtr->Layout.numDataCol);
+ rf_IncQ((unsigned long *) destbuf, (unsigned long *) srcbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
+ }
+ /* Do the nasty inversion now */
+ /* NOTE(review): the length below is pda->numSector from the last loop
+ * iteration, not failedPDA->numSector; these agree only when all PDAs
+ * cover the same number of sectors -- verify before relying on this */
+ coeff = (rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), failedPDA->startSector) % raidPtr->Layout.numDataCol);
+ rf_InvertQ(node->results[0], node->results[0], rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ tracerec->q_us += RF_ETIMER_VAL_US(timer);
+ rf_GenericWakeupFunc(node, 0);
+ return (0);
+}
+
+int
+rf_RecoveryPQFunc(node)
+ RF_DagNode_t *node;
+{
+ /* double-fault data recovery from P and Q is unimplemented; log and
+ * fail (nonzero return) */
+ RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
+ printf("raid%d: Recovery from PQ not implemented.\n",raidPtr->raidid);
+ return (1);
+}
+/*
+ Degraded write Q subroutine.
+ Used when P is dead.
+ Large-write style Q computation.
+ Parameters
+
+ (pda,buf),(pda,buf),.....,(failedPDA,bufPtr),failedPDA,raidPtr.
+
+ We ignore failedPDA.
+
+ This is a "simple style" recovery func.
+*/
+
+void
+rf_PQ_DegradedWriteQFunc(node)
+ RF_DagNode_t *node;
+{
+ int np = node->numParams;
+ int d;
+ RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
+ unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
+ int i;
+ RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+ RF_Etimer_t timer;
+ char *qbuf = node->results[0];
+ char *obuf, *qpbuf;
+ RF_PhysDiskAddr_t *old;
+ unsigned long coeff;
+ int fail_start, j;
+
+ /* params[np - 2] is the (ignored for data, used for alignment)
+ * failed PDA; see block comment above */
+ old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
+ fail_start = old->startSector % secPerSU;
+
+ RF_ETIMER_START(timer);
+
+ d = (np - 2) / 2;
+ RF_ASSERT(2 * d + 2 == np);
+
+ for (i = 0; i < d; i++) {
+ old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
+ obuf = (char *) node->params[2 * i + 1].p;
+ coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
+ /* compute the data unit offset within the column, then add
+ * one */
+ coeff = (coeff % raidPtr->Layout.numDataCol);
+ /* place each contribution relative to the failed PDA's offset
+ * within the stripe unit */
+ j = old->startSector % secPerSU;
+ RF_ASSERT(j >= fail_start);
+ qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
+ rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
+ }
+
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ tracerec->q_us += RF_ETIMER_VAL_US(timer);
+ rf_GenericWakeupFunc(node, 0);
+}
+
+
+
+
+/* Q computations */
+
+/*
+ coeff - column;
+
+ compute dest ^= qfor[28-coeff][rn[coeff+1] a]
+
+ on 5-bit basis;
+ length in bytes;
+*/
+
+/*
+ dest ^= Q-encoding of buf: each 64-bit word is split into 5-bit quants,
+ each quant is xored with rf_rn[coeff+1] and mapped through
+ rf_qfor[28-coeff].  length is in bytes and is assumed to be a multiple
+ of 8 (the "length /= 8" below silently drops any remainder).
+*/
+void
+rf_IncQ(dest, buf, length, coeff)
+ unsigned long *dest;
+ unsigned long *buf;
+ unsigned length;
+ unsigned coeff;
+{
+ unsigned long a, d, new;
+ unsigned long a1, a2;
+ unsigned int *q = &(rf_qfor[28 - coeff][0]);
+ unsigned r = rf_rn[coeff + 1];
+
+#define EXTRACT(a,i) ((a >> (5L*i)) & 0x1f)
+#define INSERT(a,i) (a << (5L*i))
+
+ length /= 8;
+ /* 13 5 bit quants in a 64 bit word */
+ while (length) {
+ a = *buf++;
+ d = *dest;
+ /* NOTE(review): quants 0 and 1 are never mapped through q[]
+ * here, unlike every later pair and unlike QDelta()'s loop --
+ * verify against the original encoding before trusting this */
+ a1 = EXTRACT(a, 0) ^ r;
+ a2 = EXTRACT(a, 1) ^ r;
+ new = INSERT(a2, 1) | a1;
+ a1 = EXTRACT(a, 2) ^ r;
+ a2 = EXTRACT(a, 3) ^ r;
+ a1 = q[a1];
+ a2 = q[a2];
+ new = new | INSERT(a1, 2) | INSERT(a2, 3);
+ a1 = EXTRACT(a, 4) ^ r;
+ a2 = EXTRACT(a, 5) ^ r;
+ a1 = q[a1];
+ a2 = q[a2];
+ new = new | INSERT(a1, 4) | INSERT(a2, 5);
+ /* NOTE(review): this pair re-encodes quant 5 (already done just
+ * above) and then quant 6; looks like a copy/paste slip for the
+ * (6,7) pair -- confirm against a known-good rf_qfor table */
+ a1 = EXTRACT(a, 5) ^ r;
+ a2 = EXTRACT(a, 6) ^ r;
+ a1 = q[a1];
+ a2 = q[a2];
+ new = new | INSERT(a1, 5) | INSERT(a2, 6);
+#if RF_LONGSHIFT > 2
+ a1 = EXTRACT(a, 7) ^ r;
+ a2 = EXTRACT(a, 8) ^ r;
+ a1 = q[a1];
+ a2 = q[a2];
+ new = new | INSERT(a1, 7) | INSERT(a2, 8);
+ a1 = EXTRACT(a, 9) ^ r;
+ a2 = EXTRACT(a, 10) ^ r;
+ a1 = q[a1];
+ a2 = q[a2];
+ new = new | INSERT(a1, 9) | INSERT(a2, 10);
+ a1 = EXTRACT(a, 11) ^ r;
+ a2 = EXTRACT(a, 12) ^ r;
+ a1 = q[a1];
+ a2 = q[a2];
+ new = new | INSERT(a1, 11) | INSERT(a2, 12);
+#endif /* RF_LONGSHIFT > 2 */
+ d ^= new;
+ *dest++ = d;
+ length--;
+ }
+}
+/*
+ compute
+
+ dest ^= rf_qfor[28-coeff][rf_rn[coeff+1] (old^new) ]
+
+ on a five bit basis.
+ optimization: compute old ^ new on 64 bit basis.
+
+ length in bytes.
+*/
+
+/*
+ dest ^= rf_qfor[28-coeff][ rf_rn[coeff+1] ^ (old ^ new) ] on a 5-bit
+ basis; length in bytes (must be a multiple of 8).  In the kernel the
+ encoding tables are not linked in, so the kernel path merely zeroes
+ dest.  The userland path is self-described as probably broken; note
+ also that q/r are stubbed out below, so that path would dereference a
+ NULL q[] -- do not enable it without restoring the table pointers.
+*/
+static void
+QDelta(
+ char *dest,
+ char *obuf,
+ char *nbuf,
+ unsigned length,
+ unsigned char coeff)
+{
+ unsigned long a, d, new;
+ unsigned long a1, a2;
+ unsigned int *q = &(rf_qfor[28 - coeff][0]);
+ unsigned int r = rf_rn[coeff + 1];
+
+ r = a1 = a2 = new = d = a = 0; /* XXX for now... */
+ q = NULL; /* XXX for now */
+
+#ifdef _KERNEL
+ /* PQ in kernel currently not supported because the encoding/decoding
+ * table is not present */
+ bzero(dest, length);
+#else /* KERNEL */
+ /* this code probably doesn't work and should be rewritten -wvcii */
+ /* 13 5 bit quants in a 64 bit word */
+ length /= 8;
+ while (length) {
+ a = *obuf++; /* XXX need to reorg to avoid cache conflicts */
+ a ^= *nbuf++;
+ d = *dest;
+ a1 = EXTRACT(a, 0) ^ r;
+ a2 = EXTRACT(a, 1) ^ r;
+ a1 = q[a1];
+ a2 = q[a2];
+ new = INSERT(a2, 1) | a1;
+ a1 = EXTRACT(a, 2) ^ r;
+ a2 = EXTRACT(a, 3) ^ r;
+ a1 = q[a1];
+ a2 = q[a2];
+ new = new | INSERT(a1, 2) | INSERT(a2, 3);
+ a1 = EXTRACT(a, 4) ^ r;
+ a2 = EXTRACT(a, 5) ^ r;
+ a1 = q[a1];
+ a2 = q[a2];
+ new = new | INSERT(a1, 4) | INSERT(a2, 5);
+ /* NOTE(review): quant 5 is encoded twice (above and here); the
+ * pair below looks like it was meant to be (6,7) */
+ a1 = EXTRACT(a, 5) ^ r;
+ a2 = EXTRACT(a, 6) ^ r;
+ a1 = q[a1];
+ a2 = q[a2];
+ new = new | INSERT(a1, 5) | INSERT(a2, 6);
+#if RF_LONGSHIFT > 2
+ a1 = EXTRACT(a, 7) ^ r;
+ a2 = EXTRACT(a, 8) ^ r;
+ a1 = q[a1];
+ a2 = q[a2];
+ new = new | INSERT(a1, 7) | INSERT(a2, 8);
+ a1 = EXTRACT(a, 9) ^ r;
+ a2 = EXTRACT(a, 10) ^ r;
+ a1 = q[a1];
+ a2 = q[a2];
+ new = new | INSERT(a1, 9) | INSERT(a2, 10);
+ a1 = EXTRACT(a, 11) ^ r;
+ a2 = EXTRACT(a, 12) ^ r;
+ a1 = q[a1];
+ a2 = q[a2];
+ new = new | INSERT(a1, 11) | INSERT(a2, 12);
+#endif /* RF_LONGSHIFT > 2 */
+ d ^= new;
+ *dest++ = d;
+ length--;
+ }
+#endif /* _KERNEL */
+}
+/*
+ recover columns a and b from the given p and q into
+ bufs abuf and bbuf. All bufs are word aligned.
+ Length is in bytes.
+*/
+
+
+/*
+ * XXX
+ *
+ * Everything about this seems wrong.
+ */
+void
+rf_PQ_recover(pbuf, qbuf, abuf, bbuf, length, coeff_a, coeff_b)
+ unsigned long *pbuf;
+ unsigned long *qbuf;
+ unsigned long *abuf;
+ unsigned long *bbuf;
+ unsigned length;
+ unsigned coeff_a;
+ unsigned coeff_b;
+{
+ unsigned long p, q, a, a0, a1;
+ /* rf_qinv is indexed by the (coeff_a, coeff_b) column pair, 29
+ * columns per row */
+ int col = (29 * coeff_a) + coeff_b;
+ unsigned char *q0 = &(rf_qinv[col][0]);
+
+ /* length in bytes; assumed to be a multiple of 8 */
+ length /= 8;
+ while (length) {
+ p = *pbuf++;
+ q = *qbuf++;
+ /* each 10-bit (p,q) quant pair indexes the inverse table to
+ * yield column a's 5-bit quant */
+ a0 = EXTRACT(p, 0);
+ a1 = EXTRACT(q, 0);
+ a = q0[a0 << 5 | a1];
+#define MF(i) \
+ a0 = EXTRACT(p,i); \
+ a1 = EXTRACT(q,i); \
+ a = a | INSERT(q0[a0<<5 | a1],i)
+
+ MF(1);
+ MF(2);
+ MF(3);
+ MF(4);
+ MF(5);
+ MF(6);
+ /* NOTE(review): quants 7-12 of each 64-bit word are disabled
+ * below, so only the low 35 bits are recovered -- consistent
+ * with the "Everything about this seems wrong" warning above */
+#if 0
+ MF(7);
+ MF(8);
+ MF(9);
+ MF(10);
+ MF(11);
+ MF(12);
+#endif /* 0 */
+ /* column b follows from parity: b = a ^ p */
+ *abuf++ = a;
+ *bbuf++ = a ^ p;
+ length--;
+ }
+}
+/*
+ Lost parity and a data column. Recover that data column.
+ Assume col coeff is lost. Let q the contents of Q after
+ all surviving data columns have been q-xored out of it.
+ Then we have the equation
+
+ q[28-coeff][a_i ^ r_i+1] = q
+
+ but q is cyclic with period 31.
+ So q[3+coeff][q[28-coeff][a_i ^ r_{i+1}]] =
+ q[31][a_i ^ r_{i+1}] = a_i ^ r_{i+1} .
+
+ so a_i = r_{coeff+1} ^ q[3+coeff][q]
+
+ The routine is passed q buffer and the buffer
+ the data is to be recovered into. They can be the same.
+*/
+
+
+
+static void
+rf_InvertQ(
+ unsigned long *qbuf,
+ unsigned long *abuf,
+ unsigned length,
+ unsigned coeff)
+{
+ unsigned long a, new;
+ unsigned long a1, a2;
+ /* q[3+coeff] composed with q[28-coeff] is the identity (period 31);
+ * see the derivation in the block comment above */
+ unsigned int *q = &(rf_qfor[3 + coeff][0]);
+ unsigned r = rf_rn[coeff + 1];
+
+ /* 13 5 bit quants in a 64 bit word */
+ /* length in bytes; assumed to be a multiple of 8 */
+ length /= 8;
+ while (length) {
+ a = *qbuf++;
+ a1 = EXTRACT(a, 0);
+ a2 = EXTRACT(a, 1);
+ a1 = r ^ q[a1];
+ a2 = r ^ q[a2];
+ new = INSERT(a2, 1) | a1;
+#define M(i,j) \
+ a1 = EXTRACT(a,i); \
+ a2 = EXTRACT(a,j); \
+ a1 = r ^ q[a1]; \
+ a2 = r ^ q[a2]; \
+ new = new | INSERT(a1,i) | INSERT(a2,j)
+
+ M(2, 3);
+ /* NOTE(review): M(4,5) then M(5,6) inverts quant 5 twice and
+ * never touches quant 7 -- mirrors the suspect pairing in
+ * rf_IncQ/QDelta; verify before trusting this path */
+ M(4, 5);
+ M(5, 6);
+#if RF_LONGSHIFT > 2
+ M(7, 8);
+ M(9, 10);
+ M(11, 12);
+#endif /* RF_LONGSHIFT > 2 */
+ *abuf++ = new;
+ length--;
+ }
+}
+#endif /* (RF_INCLUDE_DECL_PQ > 0) ||
+ * (RF_INCLUDE_RAID6 > 0) */
diff --git a/sys/dev/raidframe/rf_pq.h b/sys/dev/raidframe/rf_pq.h
new file mode 100644
index 0000000..9a2ce23
--- /dev/null
+++ b/sys/dev/raidframe/rf_pq.h
@@ -0,0 +1,75 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_pq.h,v 1.3 1999/02/05 00:06:15 oster Exp $ */
+/*
+ * rf_pq.h
+ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Daniel Stodolsky
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#ifndef _RF__RF_PQ_H_
+#define _RF__RF_PQ_H_
+
+#include <dev/raidframe/rf_archs.h>
+
+extern RF_RedFuncs_t rf_pFuncs;
+extern RF_RedFuncs_t rf_pRecoveryFuncs;
+
+int rf_RegularONPFunc(RF_DagNode_t * node);
+int rf_SimpleONPFunc(RF_DagNode_t * node);
+int rf_RecoveryPFunc(RF_DagNode_t * node);
+int rf_RegularPFunc(RF_DagNode_t * node);
+
+#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
+
+extern RF_RedFuncs_t rf_qFuncs;
+extern RF_RedFuncs_t rf_qRecoveryFuncs;
+extern RF_RedFuncs_t rf_pqRecoveryFuncs;
+
+void
+rf_PQDagSelect(RF_Raid_t * raidPtr, RF_IoType_t type,
+ RF_AccessStripeMap_t * asmap, RF_VoidFuncPtr * createFunc);
+RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG);
+int rf_RegularONQFunc(RF_DagNode_t * node);
+int rf_SimpleONQFunc(RF_DagNode_t * node);
+RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG);
+int rf_RegularPQFunc(RF_DagNode_t * node);
+int rf_RegularQFunc(RF_DagNode_t * node);
+void rf_Degraded_100_PQFunc(RF_DagNode_t * node);
+int rf_RecoveryQFunc(RF_DagNode_t * node);
+int rf_RecoveryPQFunc(RF_DagNode_t * node);
+void rf_PQ_DegradedWriteQFunc(RF_DagNode_t * node);
+void
+rf_IncQ(unsigned long *dest, unsigned long *buf, unsigned length,
+ unsigned coeff);
+void
+rf_PQ_recover(unsigned long *pbuf, unsigned long *qbuf, unsigned long *abuf,
+ unsigned long *bbuf, unsigned length, unsigned coeff_a, unsigned coeff_b);
+
+#endif /* (RF_INCLUDE_DECL_PQ > 0) ||
+ * (RF_INCLUDE_RAID6 > 0) */
+
+#endif /* !_RF__RF_PQ_H_ */
diff --git a/sys/dev/raidframe/rf_pqdeg.c b/sys/dev/raidframe/rf_pqdeg.c
new file mode 100644
index 0000000..e76ccdf
--- /dev/null
+++ b/sys/dev/raidframe/rf_pqdeg.c
@@ -0,0 +1,217 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_pqdeg.c,v 1.5 2000/01/07 03:41:04 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Daniel Stodolsky
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#include <dev/raidframe/rf_archs.h>
+
+#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_dagutils.h>
+#include <dev/raidframe/rf_dagfuncs.h>
+#include <dev/raidframe/rf_dagffrd.h>
+#include <dev/raidframe/rf_dagffwr.h>
+#include <dev/raidframe/rf_dagdegrd.h>
+#include <dev/raidframe/rf_dagdegwr.h>
+#include <dev/raidframe/rf_etimer.h>
+#include <dev/raidframe/rf_pqdeg.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_pqdegdags.h>
+#include <dev/raidframe/rf_pq.h>
+
+/*
+ Degraded mode dag functions for P+Q calculations.
+
+ The following nomenclature is used.
+
+ PQ_<D><P><Q>_Create{Large,Small}<Write|Read>DAG
+
+ where <D><P><Q> are single digits representing the number of failed
+ data units <D> (0,1,2), parity units <P> (0,1), and Q units <Q>, affecting
+ the I/O. The reads have only PQ_<D><P><Q>_CreateReadDAG variants, while
+ the single fault writes have both large and small write versions. (Single fault
+ PQ is equivalent to normal mode raid 5 in many aspects.)
+
+ Some versions degenerate into the same case, and are grouped together below.
+*/
+
+/* Reads, single failure
+
+ we have parity, so we can do a raid 5
+ reconstruct read.
+*/
+
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_100_CreateReadDAG)
+{
+ /* one data unit dead, P intact: RAID-5 style reconstruct read */
+ rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pRecoveryFuncs);
+}
+/* Reads double failure */
+
+/*
+ Q is lost, but not parity
+ so we can do a raid 5 reconstruct read.
+*/
+
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_101_CreateReadDAG)
+{
+ /* data unit + Q dead, P intact: same RAID-5 style reconstruct read */
+ rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pRecoveryFuncs);
+}
+/*
+ parity is lost, so we need to
+ do a reconstruct read and recompute
+ the data with Q.
+*/
+
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_110_CreateReadDAG)
+{
+ RF_PhysDiskAddr_t *temp;
+ /* swap P and Q pointers to fake out the DegradedReadDAG code */
+ /* NOTE(review): the swap mutates asmap and is never undone here --
+ * presumably asmap is per-access and discarded afterwards; verify */
+ temp = asmap->parityInfo;
+ asmap->parityInfo = asmap->qInfo;
+ asmap->qInfo = temp;
+ rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_qRecoveryFuncs);
+}
+/*
+ Two data units are dead in this stripe, so we will need read
+ both P and Q to reconstruct the data. Note that only
+ one data unit we are reading may actually be missing.
+*/
+RF_CREATE_DAG_FUNC_DECL(rf_CreateDoubleDegradedReadDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_CreateDoubleDegradedReadDAG)
+{
+ /* two data units dead: read both P and Q and reconstruct */
+ rf_PQ_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList);
+}
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateReadDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateReadDAG)
+{
+ /* alias for the double-degraded read case */
+ rf_CreateDoubleDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList);
+}
+/* Writes, single failure */
+
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_100_CreateWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_100_CreateWriteDAG)
+{
+ /* panic only when the access spans several stripe units AND the
+ * failed unit is written partially -- the unsupported "nasty case" */
+ if (asmap->numStripeUnitsAccessed != 1 &&
+ asmap->failedPDAs[0]->numSector !=
+ raidPtr->Layout.sectorsPerStripeUnit)
+ RF_PANIC();
+ /* degraded write updating both P and Q (2 redundancy results) */
+ rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp,
+ flags, allocList, 2,
+ (int (*) (RF_DagNode_t *)) rf_Degraded_100_PQFunc,
+ RF_FALSE);
+}
+/* Dead P - act like a RAID 5 small write with parity = Q */
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_010_CreateSmallWriteDAG)
+{
+ RF_PhysDiskAddr_t *temp;
+ /* swap P and Q pointers to fake out the DegradedReadDAG code */
+ temp = asmap->parityInfo;
+ asmap->parityInfo = asmap->qInfo;
+ asmap->qInfo = temp;
+ /* small write maintaining only Q (passed in P's slot) */
+ rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags,
+ allocList, &rf_qFuncs, NULL);
+}
+/* Dead Q - act like a RAID 5 small write */
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_001_CreateSmallWriteDAG)
+{
+ /* Q is dead: ordinary RAID-5 small write maintaining only P */
+ rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags,
+ allocList, &rf_pFuncs, NULL);
+}
+/* Dead P - act like a RAID 5 large write but for Q */
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_010_CreateLargeWriteDAG)
+{
+ RF_PhysDiskAddr_t *temp;
+ /* swap P and Q pointers to fake out the code */
+ temp = asmap->parityInfo;
+ asmap->parityInfo = asmap->qInfo;
+ asmap->qInfo = temp;
+ /* large write computing one redundancy unit: Q via rf_RegularQFunc */
+ rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags,
+ allocList, 1, rf_RegularQFunc, RF_FALSE);
+}
+/* Dead Q - act like a RAID 5 large write */
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_001_CreateLargeWriteDAG)
+{
+ /* Q is dead: ordinary RAID-5 large write computing only P */
+ rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags,
+ allocList, 1, rf_RegularPFunc, RF_FALSE);
+}
+
+
+/*
+ * writes, double failure
+ */
+
+/*
+ * Lost P & Q - do a nonredundant write
+ */
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_011_CreateWriteDAG)
+{
+ /* both P and Q dead: no redundancy left to maintain */
+ rf_CreateNonRedundantWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
+ RF_IO_TYPE_WRITE);
+}
+/*
+ In the two cases below,
+ A nasty case arises when the write covers a (strict) portion of a failed stripe unit
+ and parts of another su. For now, we do not support this.
+*/
+
+/*
+ Lost Data and P - do a Q write.
+*/
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_110_CreateWriteDAG)
+{
+ RF_PhysDiskAddr_t *temp;
+
+ /* unsupported: multi-SU access with a partially-written failed unit */
+ if (asmap->numStripeUnitsAccessed != 1 &&
+ asmap->failedPDAs[0]->numSector != raidPtr->Layout.sectorsPerStripeUnit) {
+ RF_PANIC();
+ }
+ /* swap P and Q to fake out parity code */
+ temp = asmap->parityInfo;
+ asmap->parityInfo = asmap->qInfo;
+ asmap->qInfo = temp;
+ /* single redundancy result: Q, computed degraded-write style */
+ rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags,
+ allocList, 1,
+ (int (*) (RF_DagNode_t *)) rf_PQ_DegradedWriteQFunc,
+ RF_FALSE);
+ /* is the regular Q func the right one to call? */
+}
+/*
+ Lost Data and Q - do degraded mode P write
+*/
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_101_CreateWriteDAG)
+{
+ /* unsupported: multi-SU access with a partially-written failed unit */
+ if (asmap->numStripeUnitsAccessed != 1 &&
+ asmap->failedPDAs[0]->numSector != raidPtr->Layout.sectorsPerStripeUnit)
+ RF_PANIC();
+ /* Q is dead: degraded-mode P write via the recovery xor */
+ rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags,
+ allocList, 1, rf_RecoveryXorFunc, RF_FALSE);
+}
+#endif /* (RF_INCLUDE_DECL_PQ > 0) ||
+ * (RF_INCLUDE_RAID6 > 0) */
diff --git a/sys/dev/raidframe/rf_pqdeg.h b/sys/dev/raidframe/rf_pqdeg.h
new file mode 100644
index 0000000..83371e6
--- /dev/null
+++ b/sys/dev/raidframe/rf_pqdeg.h
@@ -0,0 +1,75 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_pqdeg.h,v 1.3 1999/02/05 00:06:15 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Daniel Stodolsky
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#ifndef _RF__RF_PQDEG_H_
+#define _RF__RF_PQDEG_H_
+
+#include <dev/raidframe/rf_types.h>
+
+#if RF_UTILITY == 0
+#include <dev/raidframe/rf_dag.h>
+
+/* extern decl's of the failure mode PQ functions.
+ * See rf_pqdeg.c for nomenclature discussion.
+ */
+
+/* reads, single failure */
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_100_CreateReadDAG);
+/* reads, two failure */
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_110_CreateReadDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_101_CreateReadDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateReadDAG);
+
+/* writes, single failure */
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_100_CreateWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_010_CreateSmallWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_010_CreateLargeWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_001_CreateSmallWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_001_CreateLargeWriteDAG);
+
+/* writes, double failure */
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_011_CreateWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_110_CreateWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_101_CreateWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG);
+#endif /* RF_UTILITY == 0 */
+
+typedef RF_uint32 RF_ua32_t[32];
+typedef RF_uint8 RF_ua1024_t[1024];
+
+extern RF_ua32_t rf_rn;
+extern RF_ua32_t rf_qfor[32];
+#ifndef _KERNEL /* we don't support PQ in the kernel yet, so
+ * don't link in this monster table */
+extern RF_ua1024_t rf_qinv[29 * 29];
+#else /* !_KERNEL */
+extern RF_ua1024_t rf_qinv[1];
+#endif /* !_KERNEL */
+
+#endif /* !_RF__RF_PQDEG_H_ */
diff --git a/sys/dev/raidframe/rf_pqdegdags.c b/sys/dev/raidframe/rf_pqdegdags.c
new file mode 100644
index 0000000..e0d97ed
--- /dev/null
+++ b/sys/dev/raidframe/rf_pqdegdags.c
@@ -0,0 +1,430 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_pqdegdags.c,v 1.5 1999/08/15 02:36:40 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Daniel Stodolsky
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * rf_pqdegdags.c
+ * Degraded mode dags for double fault cases.
+*/
+
+
+#include <dev/raidframe/rf_archs.h>
+
+#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_dagdegrd.h>
+#include <dev/raidframe/rf_dagdegwr.h>
+#include <dev/raidframe/rf_dagfuncs.h>
+#include <dev/raidframe/rf_dagutils.h>
+#include <dev/raidframe/rf_etimer.h>
+#include <dev/raidframe/rf_acctrace.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_pqdegdags.h>
+#include <dev/raidframe/rf_pq.h>
+
+static void
+applyPDA(RF_Raid_t * raidPtr, RF_PhysDiskAddr_t * pda, RF_PhysDiskAddr_t * ppda,
+ RF_PhysDiskAddr_t * qpda, void *bp);
+
+/*
+ Two data drives have failed, and we are doing a read that covers one of them.
+ We may also be reading some of the surviving drives.
+
+
+ *****************************************************************************************
+ *
+ * creates a DAG to perform a degraded-mode read of data within one stripe.
+ * This DAG is as follows:
+ *
+ * Hdr
+ * |
+ * Block
+ * / / \ \ \ \
+ * Rud ... Rud Rrd ... Rrd Rp Rq
+ * | \ | \ | \ | \ | \ | \
+ *
+ * | |
+ * Unblock X
+ * \ /
+ * ------ T ------
+ *
+ * Each R node is a successor of the L node
+ * One successor arc from each R node goes to U, and the other to X
+ * There is one Rud for each chunk of surviving user data requested by the user,
+ * and one Rrd for each chunk of surviving user data _not_ being read by the user
+ * R = read, ud = user data, rd = recovery (surviving) data, p = P data, q = Qdata
+ * X = pq recovery node, T = terminate
+ *
+ * The block & unblock nodes are leftovers from a previous version. They
+ * do nothing, but I haven't deleted them because it would be a tremendous
+ * effort to put them back in.
+ *
+ * Note: The target buffer for the XOR node is set to the actual user buffer where the
+ * failed data is supposed to end up. This buffer is zero'd by the code here. Thus,
+ * if you create a degraded read dag, use it, and then re-use, you have to be sure to
+ * zero the target buffer prior to the re-use.
+ *
+ * Every buffer read is passed to the pq recovery node, whose job it is to sort out what's
+ * needed and what's not.
+ ****************************************************************************************/
+/* init a disk node with 2 successors and one predecessor */
+#define INIT_DISK_NODE(node,name) \
+rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \
+(node)->succedents[0] = unblockNode; \
+(node)->succedents[1] = recoveryNode; \
+(node)->antecedents[0] = blockNode; \
+(node)->antType[0] = rf_control
+
+#define DISK_NODE_PARAMS(_node_,_p_) \
+ (_node_).params[0].p = _p_ ; \
+ (_node_).params[1].p = (_p_)->bufPtr; \
+ (_node_).params[2].v = parityStripeID; \
+ (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru)
+
+#define DISK_NODE_PDA(node) ((node)->params[0].p)
+
+/*
+ * Build the DAG for a double-degraded read: delegates to the generic
+ * rf_DoubleDegRead(), supplying the PQ-specific recovery execute
+ * function and node labels.
+ */
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead)
+{
+	rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
+	    "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc);
+}
+
+/*
+ * Fold one surviving-data pda into the P and Q recovery buffers.
+ *
+ * If the stripe-unit region described by pda overlaps the recovery
+ * region described by ppda/qpda, the overlapping bytes are XORed into
+ * the parity buffer (rf_bxor) and folded into the Q buffer with the
+ * column's coefficient (rf_IncQ).  Non-overlapping pdas are ignored.
+ */
+static void
+applyPDA(raidPtr, pda, ppda, qpda, bp)
+	RF_Raid_t *raidPtr;
+	RF_PhysDiskAddr_t *pda;
+	RF_PhysDiskAddr_t *ppda;
+	RF_PhysDiskAddr_t *qpda;
+	void *bp;
+{
+	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+	RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
+	RF_SectorCount_t s0len = ppda->numSector, len;
+	RF_SectorNum_t suoffset;
+	unsigned coeff;
+	char *pbuf = ppda->bufPtr;
+	char *qbuf = qpda->bufPtr;
+	char *buf;
+	int delta;
+
+	suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
+	len = pda->numSector;
+	/* see if pda intersects a recovery pda */
+	if ((suoffset < s0off + s0len) && (suoffset + len > s0off)) {
+		buf = pda->bufPtr;
+		/* coeff = this pda's data column within the stripe */
+		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
+		coeff = (coeff % raidPtr->Layout.numDataCol);
+
+		if (suoffset < s0off) {
+			delta = s0off - suoffset;
+			/* NOTE(review): advancing a byte pointer by
+			 * rf_RaidAddressToStripeUnitID(delta) looks like it
+			 * should be a byte offset (rf_RaidAddressToByte) --
+			 * confirm against upstream NetBSD history. */
+			buf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
+			suoffset = s0off;
+			len -= delta;
+		}
+		if (suoffset > s0off) {
+			delta = suoffset - s0off;
+			/* NOTE(review): same unit question as above for the
+			 * P/Q pointer adjustment. */
+			pbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
+			qbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
+		}
+		if ((suoffset + len) > (s0len + s0off))
+			len = s0len + s0off - suoffset;
+
+		/* src, dest, len */
+		rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len), bp);
+
+		/* dest, src, len, coeff */
+		rf_IncQ((unsigned long *) qbuf, (unsigned long *) buf, rf_RaidAddressToByte(raidPtr, len), coeff);
+	}
+}
+/*
+ Recover data in the case of a double failure. There can be two
+ result buffers, one for each chunk of data trying to be recovered.
+ The params are pda's that have not been range restricted or otherwise
+ politely massaged - this should be done here. The last params are the
+ pdas of P and Q, followed by the raidPtr. The list can look like
+
+ pda, pda, ... , p pda, q pda, raidptr, asm
+
+ or
+
+ pda, pda, ... , p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm
+
+ depending on whether two chunks of recovery data were required.
+
+ The second condition only arises if there are two failed buffers
+ whose lengths do not add up to a stripe unit.
+*/
+
+
+/*
+ * Recovery node execute function for a double-degraded read.
+ *
+ * node->params layout (see block comment above):
+ *   pda, ..., p pda, q pda, raidPtr, asmap               (one chunk)
+ *   pda, ..., p1 pda, p2 pda, q1 pda, q2 pda, raidPtr, asmap (two chunks)
+ *
+ * Applies every surviving-data pda to P and Q, then regenerates the
+ * missing stripe unit with rf_PQ_recover().  The two-chunk case and the
+ * two-failed-unit case are not supported (RF_ASSERT(0) / RF_PANIC()).
+ * Always returns 0 after waking the dag via rf_GenericWakeupFunc().
+ */
+int
+rf_PQDoubleRecoveryFunc(node)
+	RF_DagNode_t *node;
+{
+	int np = node->numParams;
+	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
+	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
+	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
+	int d, i;
+	unsigned coeff;
+	RF_RaidAddr_t sosAddr, suoffset;
+	RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit;
+	int two = 0;
+	RF_PhysDiskAddr_t *ppda, *ppda2, *qpda, *qpda2, *pda, npda;
+	char *buf;
+	int numDataCol = layoutPtr->numDataCol;
+	RF_Etimer_t timer;
+	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+
+	RF_ETIMER_START(timer);
+
+	if (asmap->failedPDAs[1] &&
+	    (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) {
+		/* two separate recovery chunks -- not supported yet, hence
+		 * the assertion below */
+		RF_ASSERT(0);
+		ppda = node->params[np - 6].p;
+		ppda2 = node->params[np - 5].p;
+		qpda = node->params[np - 4].p;
+		qpda2 = node->params[np - 3].p;
+		d = (np - 6);
+		two = 1;
+	} else {
+		ppda = node->params[np - 4].p;
+		qpda = node->params[np - 3].p;
+		d = (np - 4);
+	}
+
+	/* fold each surviving-data buffer into the recovery buffers */
+	for (i = 0; i < d; i++) {
+		pda = node->params[i].p;
+		buf = pda->bufPtr;
+		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
+		len = pda->numSector;
+		coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
+		/* compute the data unit offset within the column
+		 * (coeff is recomputed inside applyPDA; kept here for
+		 * symmetry with the original code) */
+		coeff = (coeff % raidPtr->Layout.numDataCol);
+		/* see if pda intersects a recovery pda */
+		applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
+		if (two)
+			/* bugfix: was applyPDA(..., ppda, qpda, ...), which
+			 * applied the same chunk twice and left ppda2/qpda2
+			 * unused; the second chunk must use its own pdas */
+			applyPDA(raidPtr, pda, ppda2, qpda2, node->dagHdr->bp);
+	}
+
+	/* ok, we got the parity back to the point where we can recover. We
+	 * now need to determine the coeff of the columns that need to be
+	 * recovered. We can also only need to recover a single stripe unit. */
+
+	if (asmap->failedPDAs[1] == NULL) {	/* only a single stripe unit
+						 * to recover. */
+		pda = asmap->failedPDAs[0];
+		sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
+		/* need to determine the column of the other failed disk */
+		coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
+		/* compute the data unit offset within the column */
+		coeff = (coeff % raidPtr->Layout.numDataCol);
+		for (i = 0; i < numDataCol; i++) {
+			npda.raidAddress = sosAddr + (i * secPerSU);
+			(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
+			/* skip over dead disks */
+			if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
+				if (i != coeff)
+					break;
+		}
+		RF_ASSERT(i < numDataCol);
+		RF_ASSERT(two == 0);
+		/* recover the data. Since we need only want to recover one
+		 * column, we overwrite the parity with the other one. */
+		if (coeff < i)	/* recovering 'a' */
+			rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) pda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
+		else		/* recovering 'b' */
+			rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) pda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
+	} else
+		RF_PANIC();
+
+	RF_ETIMER_STOP(timer);
+	RF_ETIMER_EVAL(timer);
+	if (tracerec)
+		tracerec->q_us += RF_ETIMER_VAL_US(timer);
+	rf_GenericWakeupFunc(node, 0);
+	return (0);
+}
+
+/*
+ * Recovery node execute function for a double-degraded small write
+ * (strategy described in the block comment below).  Always returns 0
+ * after waking the dag via rf_GenericWakeupFunc().
+ */
+int
+rf_PQWriteDoubleRecoveryFunc(node)
+	RF_DagNode_t *node;
+{
+	/* The situation:
+	 *
+	 * We are doing a write that hits only one failed data unit. The other
+	 * failed data unit is not being overwritten, so we need to generate
+	 * it.
+	 *
+	 * For the moment, we assume all the nonfailed data being written is in
+	 * the shadow of the failed data unit. (i.e., either a single data
+	 * unit write or the entire failed stripe unit is being overwritten. )
+	 *
+	 * Recovery strategy: apply the recovery data to the parity and q. Use P
+	 * & Q to recover the second failed data unit in P. Zero fill Q, then
+	 * apply the recovered data to p. Then apply the data being written to
+	 * the failed drive. Then walk through the surviving drives, applying
+	 * new data when it exists, otherwise the recovery data. Quite a mess.
+	 *
+	 *
+	 * The params
+	 *
+	 * read pda0, read pda1, ... read pda (numDataCol-3), write pda0, ... ,
+	 * write pda (numStripeUnitAccess - numDataFailed), failed pda,
+	 * raidPtr, asmap */
+
+	int np = node->numParams;
+	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
+	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
+	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
+	int i;
+	RF_RaidAddr_t sosAddr;
+	unsigned coeff;
+	RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
+	RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda;
+	int numDataCol = layoutPtr->numDataCol;
+	RF_Etimer_t timer;
+	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+
+	RF_ASSERT(node->numResults == 2);
+	RF_ASSERT(asmap->failedPDAs[1] == NULL);
+	RF_ETIMER_START(timer);
+	/* results[0]/results[1] are the P and Q pdas set up at DAG creation */
+	ppda = node->results[0];
+	qpda = node->results[1];
+	/* apply the recovery data */
+	for (i = 0; i < numDataCol - 2; i++)
+		applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);
+
+	/* determine the other failed data unit */
+	pda = asmap->failedPDAs[0];
+	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
+	/* need to determine the column of the other failed disk */
+	coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
+	/* compute the data unit offset within the column */
+	coeff = (coeff % raidPtr->Layout.numDataCol);
+	for (i = 0; i < numDataCol; i++) {
+		npda.raidAddress = sosAddr + (i * secPerSU);
+		(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
+		/* skip over dead disks */
+		if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
+			if (i != coeff)
+				break;
+	}
+	RF_ASSERT(i < numDataCol);
+	/* recover the data. The column we want to recover we write over the
+	 * parity. The column we don't care about we dump in q. */
+	if (coeff < i)		/* recovering 'a' */
+		rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
+	else			/* recovering 'b' */
+		rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
+
+	/* OK. The valid data is in P. Zero fill Q, then inc it into it. */
+	bzero(qpda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector));
+	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), i);
+
+	/* now apply all the write data to the buffer */
+	/* single stripe unit write case: the failed data is only thing we are
+	 * writing. */
+	RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
+	/* dest, src, len, coeff */
+	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) asmap->failedPDAs[0]->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), coeff);
+	rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr, rf_RaidAddressToByte(raidPtr, ppda->numSector), node->dagHdr->bp);
+
+	/* now apply all the recovery data */
+	for (i = 0; i < numDataCol - 2; i++)
+		applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);
+
+	RF_ETIMER_STOP(timer);
+	RF_ETIMER_EVAL(timer);
+	if (tracerec)
+		tracerec->q_us += RF_ETIMER_VAL_US(timer);
+
+	rf_GenericWakeupFunc(node, 0);
+	return (0);
+}
+/*
+ * Large-write DAG for the double-degraded case: not implemented;
+ * panics if reached (rf_PQ_200_CreateWriteDAG routes here when the
+ * write exactly covers both failed stripe units).
+ */
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite)
+{
+	RF_PANIC();
+}
+/*
+ Two lost data unit write case.
+
+ There are really two cases here:
+
+ (1) The write completely covers the two lost data units.
+ In that case, a reconstruct write that doesn't write the
+ failed data units will do the correct thing. So in this case,
+ the dag looks like
+
+       full stripe read of surviving data units (not being overwritten)
+ write new data (ignoring failed units) compute P&Q
+ write P&Q
+
+
+ (2) The write does not completely cover both failed data units
+ (but touches at least one of them). Then we need to do the
+ equivalent of a reconstruct read to recover the missing data
+ unit from the other stripe.
+
+ For any data we are writing that is not in the "shadow"
+ of the failed units, we need to do a four cycle update.
+ PANIC on this case. for now
+
+*/
+
+/*
+ * Two-lost-data-unit write dispatcher (see block comment above):
+ * selects the large-write DAG when the write exactly covers both
+ * failed units, the simple small-write DAG when all written data is
+ * in the shadow of the failed units, and panics on the unimplemented
+ * general case.
+ */
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG)
+{
+	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+	RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
+	int sum;
+	int nf = asmap->numDataFailed;
+
+	/* sum = total failed sectors touched by this access */
+	sum = asmap->failedPDAs[0]->numSector;
+	if (nf == 2)
+		sum += asmap->failedPDAs[1]->numSector;
+
+	if ((nf == 2) && (sum == (2 * sectorsPerSU))) {
+		/* large write case */
+		rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
+		return;
+	}
+	if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU)) {
+		/* small write case, no user data not in shadow */
+		rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
+		return;
+	}
+	RF_PANIC();
+}
+/*
+ * Simple small-write DAG for the double-degraded case: delegates to
+ * the generic rf_DoubleDegSmallWrite() with the PQ-specific recovery
+ * function and node labels.
+ */
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite)
+{
+	rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc);
+}
+#endif /* (RF_INCLUDE_DECL_PQ > 0) ||
+ * (RF_INCLUDE_RAID6 > 0) */
diff --git a/sys/dev/raidframe/rf_pqdegdags.h b/sys/dev/raidframe/rf_pqdegdags.h
new file mode 100644
index 0000000..11ce820
--- /dev/null
+++ b/sys/dev/raidframe/rf_pqdegdags.h
@@ -0,0 +1,49 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_pqdegdags.h,v 1.3 1999/02/05 00:06:15 oster Exp $ */
+/*
+ * rf_pqdegdags.h
+ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Daniel Stodolsky
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+/*
+ * rf_pqdegdags.c
+ * Degraded mode dags for double fault cases.
+ */
+
+#ifndef _RF__RF_PQDEGDAGS_H_
+#define _RF__RF_PQDEGDAGS_H_
+
+#include <dev/raidframe/rf_dag.h>
+
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead);
+int rf_PQDoubleRecoveryFunc(RF_DagNode_t * node);
+int rf_PQWriteDoubleRecoveryFunc(RF_DagNode_t * node);
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite);
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite);
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG);
+
+#endif /* !_RF__RF_PQDEGDAGS_H_ */
diff --git a/sys/dev/raidframe/rf_psstatus.c b/sys/dev/raidframe/rf_psstatus.c
new file mode 100644
index 0000000..31c2be7
--- /dev/null
+++ b/sys/dev/raidframe/rf_psstatus.c
@@ -0,0 +1,376 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_psstatus.c,v 1.5 2000/01/08 22:57:31 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************************************
+ *
+ * psstatus.c
+ *
+ * The reconstruction code maintains a bunch of status related to the parity
+ * stripes that are currently under reconstruction. This header file defines
+ * the status structures.
+ *
+ *****************************************************************************/
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_debugprint.h>
+#include <dev/raidframe/rf_freelist.h>
+#include <dev/raidframe/rf_psstatus.h>
+#include <dev/raidframe/rf_shutdown.h>
+
+#define Dprintf1(s,a) if (rf_pssDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
+#define Dprintf2(s,a,b) if (rf_pssDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
+#define Dprintf3(s,a,b,c) if (rf_pssDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
+
+static void
+RealPrintPSStatusTable(RF_Raid_t * raidPtr,
+ RF_PSStatusHeader_t * pssTable);
+
+#define RF_MAX_FREE_PSS 32
+#define RF_PSS_INC 8
+#define RF_PSS_INITIAL 4
+
+static int init_pss(RF_ReconParityStripeStatus_t *, RF_Raid_t *);
+static void clean_pss(RF_ReconParityStripeStatus_t *, RF_Raid_t *);
+static void rf_ShutdownPSStatus(void *);
+
+/*
+ * Freelist init hook: allocate the per-column "issued" byte array of a
+ * status descriptor.  Returns 0 on success, ENOMEM on allocation
+ * failure.
+ */
+static int
+init_pss(p, raidPtr)
+	RF_ReconParityStripeStatus_t *p;
+	RF_Raid_t *raidPtr;
+{
+	RF_Calloc(p->issued, raidPtr->numCol, sizeof(char), (char *));
+	if (p->issued == NULL)
+		return (ENOMEM);
+	return (0);
+}
+
+/*
+ * Freelist clean hook: release the per-column "issued" array allocated
+ * by init_pss().
+ */
+static void
+clean_pss(p, raidPtr)
+	RF_ReconParityStripeStatus_t *p;
+	RF_Raid_t *raidPtr;
+{
+	RF_Free(p->issued, raidPtr->numCol * sizeof(char));
+}
+
+/*
+ * Shutdown hook: destroy the pss descriptor freelist, running
+ * clean_pss() on each descriptor.  arg is the RF_Raid_t registered
+ * by rf_ConfigurePSStatus().
+ */
+static void
+rf_ShutdownPSStatus(arg)
+	void *arg;
+{
+	RF_Raid_t *raidPtr = (RF_Raid_t *) arg;
+
+	RF_FREELIST_DESTROY_CLEAN_ARG(raidPtr->pss_freelist, next, (RF_ReconParityStripeStatus_t *), clean_pss, raidPtr);
+}
+
+/*
+ * Configure the parity stripe status subsystem for this array: set the
+ * hash-table size, create and prime the descriptor freelist, and
+ * register the shutdown hook.  Returns 0 on success, ENOMEM if the
+ * freelist cannot be created, or the rf_ShutdownCreate() error code
+ * (after tearing the freelist back down).
+ */
+int
+rf_ConfigurePSStatus(
+    RF_ShutdownList_t ** listp,
+    RF_Raid_t * raidPtr,
+    RF_Config_t * cfgPtr)
+{
+	int rc;
+
+	raidPtr->pssTableSize = RF_PSS_DEFAULT_TABLESIZE;
+	RF_FREELIST_CREATE(raidPtr->pss_freelist, RF_MAX_FREE_PSS,
+	    RF_PSS_INC, sizeof(RF_ReconParityStripeStatus_t));
+	if (raidPtr->pss_freelist == NULL)
+		return (ENOMEM);
+	rc = rf_ShutdownCreate(listp, rf_ShutdownPSStatus, raidPtr);
+	if (rc) {
+		RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
+		    __FILE__, __LINE__, rc);
+		rf_ShutdownPSStatus(raidPtr);
+		return (rc);
+	}
+	RF_FREELIST_PRIME_INIT_ARG(raidPtr->pss_freelist, RF_PSS_INITIAL, next,
+	    (RF_ReconParityStripeStatus_t *), init_pss, raidPtr);
+	return (0);
+}
+/*****************************************************************************************
+ * sets up the pss table
+ * We pre-allocate a bunch of entries to avoid as much as possible having to
+ * malloc up hash chain entries.
+ ****************************************************************************************/
+/*
+ * Allocate and initialize the parity stripe status hash table
+ * (raidPtr->pssTableSize headers, each with its own chain mutex).
+ * On mutex-init failure, destroys the mutexes initialized so far,
+ * frees the table, and returns NULL.
+ */
+RF_PSStatusHeader_t *
+rf_MakeParityStripeStatusTable(raidPtr)
+	RF_Raid_t *raidPtr;
+{
+	RF_PSStatusHeader_t *pssTable;
+	int i, j, rc;
+
+	RF_Calloc(pssTable, raidPtr->pssTableSize, sizeof(RF_PSStatusHeader_t), (RF_PSStatusHeader_t *));
+	for (i = 0; i < raidPtr->pssTableSize; i++) {
+		rc = rf_mutex_init(&pssTable[i].mutex, __FUNCTION__);
+		if (rc) {
+			RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+			    __LINE__, rc);
+			/* fail and deallocate */
+			for (j = 0; j < i; j++) {
+				/* bugfix: was &pssTable[i].mutex -- destroy
+				 * each previously initialized mutex, not the
+				 * failed slot i times */
+				rf_mutex_destroy(&pssTable[j].mutex);
+			}
+			RF_Free(pssTable, raidPtr->pssTableSize * sizeof(RF_PSStatusHeader_t));
+			return (NULL);
+		}
+	}
+	return (pssTable);
+}
+
+/*
+ * Free a table made by rf_MakeParityStripeStatusTable(): destroy each
+ * chain mutex and release the array.  Complains (printf) if any hash
+ * chain is still populated; dumps the table first when pss debugging
+ * is enabled.
+ */
+void
+rf_FreeParityStripeStatusTable(raidPtr, pssTable)
+	RF_Raid_t *raidPtr;
+	RF_PSStatusHeader_t *pssTable;
+{
+	int i;
+
+	if (rf_pssDebug)
+		RealPrintPSStatusTable(raidPtr, pssTable);
+	for (i = 0; i < raidPtr->pssTableSize; i++) {
+		if (pssTable[i].chain) {
+			printf("ERROR: pss hash chain not null at recon shutdown\n");
+		}
+		rf_mutex_destroy(&pssTable[i].mutex);
+	}
+	RF_Free(pssTable, raidPtr->pssTableSize * sizeof(RF_PSStatusHeader_t));
+}
+
+
+/* looks up the status structure for a parity stripe.
+ * if the create_flag is on, creates and returns the status structure if it doesn't exist
+ * otherwise returns NULL if the status structure does not exist
+ *
+ * ASSUMES THE PSS DESCRIPTOR IS LOCKED UPON ENTRY
+ */
+RF_ReconParityStripeStatus_t *
+rf_LookupRUStatus(
+    RF_Raid_t * raidPtr,
+    RF_PSStatusHeader_t * pssTable,
+    RF_StripeNum_t psID,
+    RF_ReconUnitNum_t which_ru,
+    RF_PSSFlags_t flags,	/* whether or not to create it if it doesn't
+				 * exist + what flags to set initially */
+    int *created)
+{
+	RF_PSStatusHeader_t *hdr = &pssTable[RF_HASH_PSID(raidPtr, psID)];
+	RF_ReconParityStripeStatus_t *p, *pssPtr = hdr->chain;
+
+	*created = 0;
+	/* walk the hash chain looking for an existing (psID, which_ru) entry */
+	for (p = pssPtr; p; p = p->next) {
+		if (p->parityStripeID == psID && p->which_ru == which_ru)
+			break;
+	}
+
+	if (!p && (flags & RF_PSS_CREATE)) {
+		Dprintf2("PSS: creating pss for psid %ld ru %d\n", psID, which_ru);
+		/* NOTE(review): rf_AllocPSStatus() may return NULL; the
+		 * dereferences below do not check for that -- confirm the
+		 * freelist cannot fail here. */
+		p = rf_AllocPSStatus(raidPtr);
+		p->next = hdr->chain;
+		hdr->chain = p;
+
+		p->parityStripeID = psID;
+		p->which_ru = which_ru;
+		p->flags = flags;
+		p->rbuf = NULL;
+		p->writeRbuf = NULL;
+		p->blockCount = 0;
+		p->procWaitList = NULL;
+		p->blockWaitList = NULL;
+		p->bufWaitList = NULL;
+		*created = 1;
+	} else
+		if (p) {	/* we didn't create, but we want to specify
+				 * some new status */
+			p->flags |= flags;	/* add in whatever flags we're
+						 * specifying */
+		}
+	if (p && (flags & RF_PSS_RECON_BLOCKED)) {
+		p->blockCount++;/* if we're asking to block recon, bump the
+				 * count */
+		Dprintf3("raid%d: Blocked recon on psid %ld. count now %d\n",
+		    raidPtr->raidid, psID, p->blockCount);
+	}
+	return (p);
+}
+/* deletes an entry from the parity stripe status table. typically used
+ * when an entry has been allocated solely to block reconstruction, and
+ * no recon was requested while recon was blocked. Assumes the hash
+ * chain is ALREADY LOCKED.
+ */
+/*
+ * Unlink pssPtr from its hash chain and return it to the freelist.
+ * Asserts if the entry is not found on the chain.  Caller must hold
+ * the chain lock (see comment above).
+ */
+void
+rf_PSStatusDelete(raidPtr, pssTable, pssPtr)
+	RF_Raid_t *raidPtr;
+	RF_PSStatusHeader_t *pssTable;
+	RF_ReconParityStripeStatus_t *pssPtr;
+{
+	RF_PSStatusHeader_t *hdr = &(pssTable[RF_HASH_PSID(raidPtr, pssPtr->parityStripeID)]);
+	RF_ReconParityStripeStatus_t *p = hdr->chain, *pt = NULL;
+
+	/* pt trails p so we can splice the entry out of the singly linked
+	 * chain */
+	while (p) {
+		if (p == pssPtr) {
+			if (pt)
+				pt->next = p->next;
+			else
+				hdr->chain = p->next;
+			p->next = NULL;
+			rf_FreePSStatus(raidPtr, p);
+			return;
+		}
+		pt = p;
+		p = p->next;
+	}
+	RF_ASSERT(0);		/* we must find it here */
+}
+/* deletes an entry from the ps status table after reconstruction has completed */
+/*
+ * Remove the (psid, which_ru) entry from row's active recon table
+ * after reconstruction of that unit completes.  Takes and drops the
+ * chain lock internally, then (unlocked) runs every callback on the
+ * entry's procWaitList and frees the descriptor.  Asserts if the
+ * entry is not present.
+ */
+void
+rf_RemoveFromActiveReconTable(raidPtr, row, psid, which_ru)
+	RF_Raid_t *raidPtr;
+	RF_RowCol_t row;
+	RF_ReconUnitNum_t which_ru;
+	RF_StripeNum_t psid;
+{
+	RF_PSStatusHeader_t *hdr = &(raidPtr->reconControl[row]->pssTable[RF_HASH_PSID(raidPtr, psid)]);
+	RF_ReconParityStripeStatus_t *p, *pt;
+	RF_CallbackDesc_t *cb, *cb1;
+
+	RF_LOCK_MUTEX(hdr->mutex);
+	for (pt = NULL, p = hdr->chain; p; pt = p, p = p->next) {
+		if ((p->parityStripeID == psid) && (p->which_ru == which_ru))
+			break;
+	}
+	if (p == NULL) {
+		rf_PrintPSStatusTable(raidPtr, row);
+	}
+	RF_ASSERT(p);		/* it must be there */
+
+	Dprintf2("PSS: deleting pss for psid %ld ru %d\n", psid, which_ru);
+
+	/* delete this entry from the hash chain */
+	if (pt)
+		pt->next = p->next;
+	else
+		hdr->chain = p->next;
+	p->next = NULL;
+
+	RF_UNLOCK_MUTEX(hdr->mutex);
+
+	/* wake up anyone waiting on the parity stripe ID */
+	cb = p->procWaitList;
+	p->procWaitList = NULL;
+	while (cb) {
+		Dprintf1("Waking up access waiting on parity stripe ID %ld\n", p->parityStripeID);
+		cb1 = cb->next;
+		(cb->callbackFunc) (cb->callbackArg);
+
+		/* THIS IS WHAT THE ORIGINAL CODE HAD... the extra 0 is bogus,
+		 * IMHO */
+		/* (cb->callbackFunc)(cb->callbackArg, 0); */
+		rf_FreeCallbackDesc(cb);
+		cb = cb1;
+	}
+
+	rf_FreePSStatus(raidPtr, p);
+}
+
+/*
+ * Allocate a parity stripe status descriptor from the freelist and
+ * zero its per-disk "issued" map.  May return NULL if the freelist is
+ * exhausted; remaining field initialization is done by the caller
+ * (rf_LookupRUStatus).
+ */
+RF_ReconParityStripeStatus_t *
+rf_AllocPSStatus(raidPtr)
+	RF_Raid_t *raidPtr;
+{
+	RF_ReconParityStripeStatus_t *p;
+
+	RF_FREELIST_GET_INIT_ARG(raidPtr->pss_freelist, p, next, (RF_ReconParityStripeStatus_t *), init_pss, raidPtr);
+	if (p) {
+		bzero(p->issued, raidPtr->numCol);
+		/* bugfix: was unconditional -- don't dereference p when the
+		 * freelist came back empty */
+		p->next = NULL;
+	}
+	/* no need to initialize here b/c the only place we're called from is
+	 * the above Lookup */
+	return (p);
+}
+
+/*
+ * Return a status descriptor to the freelist, running clean_pss() on
+ * it.  All wait lists must already be empty (asserted).
+ */
+void
+rf_FreePSStatus(raidPtr, p)
+	RF_Raid_t *raidPtr;
+	RF_ReconParityStripeStatus_t *p;
+{
+	RF_ASSERT(p->procWaitList == NULL);
+	RF_ASSERT(p->blockWaitList == NULL);
+	RF_ASSERT(p->bufWaitList == NULL);
+
+	RF_FREELIST_FREE_CLEAN_ARG(raidPtr->pss_freelist, p, next, clean_pss, raidPtr);
+}
+
+/*
+ * Dump every entry of a pss table to the console: per-entry waiter
+ * counts, the per-disk issued map ('0'/'1' per column), and the flag
+ * bits.  Debugging aid only; assumes the caller has quiesced or
+ * locked the table.
+ */
+static void
+RealPrintPSStatusTable(raidPtr, pssTable)
+	RF_Raid_t *raidPtr;
+	RF_PSStatusHeader_t *pssTable;
+{
+	int i, j, procsWaiting, blocksWaiting, bufsWaiting;
+	RF_ReconParityStripeStatus_t *p;
+	RF_CallbackDesc_t *cb;
+
+	printf("\nParity Stripe Status Table\n");
+	for (i = 0; i < raidPtr->pssTableSize; i++) {
+		for (p = pssTable[i].chain; p; p = p->next) {
+			/* count the waiters on each of the three lists */
+			procsWaiting = blocksWaiting = bufsWaiting = 0;
+			for (cb = p->procWaitList; cb; cb = cb->next)
+				procsWaiting++;
+			for (cb = p->blockWaitList; cb; cb = cb->next)
+				blocksWaiting++;
+			for (cb = p->bufWaitList; cb; cb = cb->next)
+				bufsWaiting++;
+			printf("PSID %ld RU %d : blockCount %d %d/%d/%d proc/block/buf waiting, issued ",
+			    (long) p->parityStripeID, p->which_ru, p->blockCount, procsWaiting, blocksWaiting, bufsWaiting);
+			for (j = 0; j < raidPtr->numCol; j++)
+				printf("%c", (p->issued[j]) ? '1' : '0');
+			if (!p->flags)
+				printf(" flags: (none)");
+			else {
+				if (p->flags & RF_PSS_UNDER_RECON)
+					printf(" under-recon");
+				if (p->flags & RF_PSS_FORCED_ON_WRITE)
+					printf(" forced-w");
+				if (p->flags & RF_PSS_FORCED_ON_READ)
+					printf(" forced-r");
+				if (p->flags & RF_PSS_RECON_BLOCKED)
+					printf(" blocked");
+				if (p->flags & RF_PSS_BUFFERWAIT)
+					printf(" bufwait");
+			}
+			printf("\n");
+		}
+	}
+}
+
+/*
+ * Print the pss table of one array row (debugging aid; see
+ * RealPrintPSStatusTable).
+ */
+void
+rf_PrintPSStatusTable(raidPtr, row)
+	RF_Raid_t *raidPtr;
+	RF_RowCol_t row;
+{
+	RF_PSStatusHeader_t *pssTable = raidPtr->reconControl[row]->pssTable;
+	RealPrintPSStatusTable(raidPtr, pssTable);
+}
diff --git a/sys/dev/raidframe/rf_psstatus.h b/sys/dev/raidframe/rf_psstatus.h
new file mode 100644
index 0000000..c836d49
--- /dev/null
+++ b/sys/dev/raidframe/rf_psstatus.h
@@ -0,0 +1,132 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_psstatus.h,v 1.3 1999/02/05 00:06:15 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************************************
+ *
+ * psstatus.h
+ *
+ * The reconstruction code maintains a bunch of status related to the parity
+ * stripes that are currently under reconstruction. This header file defines
+ * the status structures.
+ *
+ *****************************************************************************/
+
+#ifndef _RF__RF_PSSTATUS_H_
+#define _RF__RF_PSSTATUS_H_
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_threadstuff.h>
+#include <dev/raidframe/rf_callback.h>
+
+#define RF_PS_MAX_BUFS 10 /* max number of bufs we'll accumulate before
+ * we do an XOR */
+
+#define RF_PSS_DEFAULT_TABLESIZE 200
+
+/*
+ * Macros to acquire/release the mutex lock on a parity stripe status
+ * descriptor. Note that we use just one lock for the whole hash chain.
+ */
+#define RF_HASH_PSID(_raid_,_psid_) ( (_psid_) % ((_raid_)->pssTableSize) ) /* simple hash function */
+#define RF_LOCK_PSS_MUTEX(_raidPtr, _row, _psid) \
+ RF_LOCK_MUTEX((_raidPtr)->reconControl[_row]->pssTable[ RF_HASH_PSID(_raidPtr,_psid) ].mutex)
+#define RF_UNLOCK_PSS_MUTEX(_raidPtr, _row, _psid) \
+ RF_UNLOCK_MUTEX((_raidPtr)->reconControl[_row]->pssTable[ RF_HASH_PSID(_raidPtr,_psid) ].mutex)
+
+/*
+ * Per-parity-stripe reconstruction status descriptor. One of these is
+ * hashed into the PSS table (see RF_PSStatusHeader_s) for each parity
+ * stripe with recon activity; "next" chains descriptors in one bucket.
+ */
+struct RF_ReconParityStripeStatus_s {
+ RF_StripeNum_t parityStripeID; /* the parity stripe ID */
+ RF_ReconUnitNum_t which_ru; /* which reconstruction unit with the
+ * indicated parity stripe */
+ RF_PSSFlags_t flags; /* flags indicating various conditions */
+ void *rbuf; /* this is the accumulating xor sum */
+ void *writeRbuf; /* DEBUG ONLY: a pointer to the rbuf after it
+ * has filled & been sent to disk */
+ void *rbufsForXor[RF_PS_MAX_BUFS]; /* these are buffers still to
+ * be xored into the
+ * accumulating sum */
+ int xorBufCount; /* num buffers waiting to be xored */
+ int blockCount; /* count of # proc that have blocked recon on
+ * this parity stripe */
+ char *issued; /* issued[i]==1 <=> column i has already
+ * issued a read request for the indicated RU */
+ RF_CallbackDesc_t *procWaitList; /* list of user procs waiting
+ * for recon to be done */
+ RF_CallbackDesc_t *blockWaitList; /* list of disks blocked
+ * waiting for user write to
+ * complete */
+ RF_CallbackDesc_t *bufWaitList; /* list of disks blocked waiting to
+ * acquire a buffer for this RU */
+ RF_ReconParityStripeStatus_t *next; /* hash-chain link */
+};
+
+/*
+ * One bucket of the parity stripe status hash table: a single mutex
+ * covers the whole chain (see RF_LOCK_PSS_MUTEX above).
+ */
+struct RF_PSStatusHeader_s {
+ RF_DECLARE_MUTEX(mutex) /* mutex for this hash chain */
+ RF_ReconParityStripeStatus_t *chain; /* the hash chain */
+};
+/* masks for the "flags" field above */
+#define RF_PSS_NONE 0x00000000 /* no flags */
+#define RF_PSS_UNDER_RECON 0x00000001 /* this parity stripe is
+ * currently under
+ * reconstruction */
+#define RF_PSS_FORCED_ON_WRITE 0x00000002 /* indicates a recon was
+ * forced due to a user-write
+ * operation */
+#define RF_PSS_FORCED_ON_READ 0x00000004 /* ditto for read, but not
+ * currently implemented */
+#define RF_PSS_RECON_BLOCKED 0x00000008 /* reconstruction is currently
+ * blocked due to a pending
+ * user I/O */
+#define RF_PSS_CREATE 0x00000010 /* tells LookupRUStatus to
+ * create the entry */
+#define RF_PSS_BUFFERWAIT 0x00000020 /* someone is waiting for a
+ * buffer for this RU */
+
+int
+rf_ConfigurePSStatus(RF_ShutdownList_t ** listp, RF_Raid_t * raidPtr,
+ RF_Config_t * cfgPtr);
+
+RF_PSStatusHeader_t *rf_MakeParityStripeStatusTable(RF_Raid_t * raidPtr);
+void
+rf_FreeParityStripeStatusTable(RF_Raid_t * raidPtr,
+ RF_PSStatusHeader_t * pssTable);
+RF_ReconParityStripeStatus_t *
+rf_LookupRUStatus(RF_Raid_t * raidPtr,
+ RF_PSStatusHeader_t * pssTable, RF_StripeNum_t psID,
+ RF_ReconUnitNum_t which_ru, RF_PSSFlags_t flags, int *created);
+void
+rf_PSStatusDelete(RF_Raid_t * raidPtr, RF_PSStatusHeader_t * pssTable,
+ RF_ReconParityStripeStatus_t * pssPtr);
+void
+rf_RemoveFromActiveReconTable(RF_Raid_t * raidPtr, RF_RowCol_t row,
+ RF_StripeNum_t psid, RF_ReconUnitNum_t which_ru);
+RF_ReconParityStripeStatus_t *rf_AllocPSStatus(RF_Raid_t * raidPtr);
+void rf_FreePSStatus(RF_Raid_t * raidPtr, RF_ReconParityStripeStatus_t * p);
+void rf_PrintPSStatusTable(RF_Raid_t * raidPtr, RF_RowCol_t row);
+
+#endif /* !_RF__RF_PSSTATUS_H_ */
diff --git a/sys/dev/raidframe/rf_raid.h b/sys/dev/raidframe/rf_raid.h
new file mode 100644
index 0000000..e91a2ae
--- /dev/null
+++ b/sys/dev/raidframe/rf_raid.h
@@ -0,0 +1,299 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_raid.h,v 1.12 2000/02/24 17:12:10 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/**********************************************
+ * rf_raid.h -- main header file for RAID driver
+ **********************************************/
+
+
+#ifndef _RF__RF_RAID_H_
+#define _RF__RF_RAID_H_
+
+#include <dev/raidframe/rf_archs.h>
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_threadstuff.h>
+
+#include <dev/raidframe/rf_bsd.h>
+
+#include <sys/disklabel.h>
+#include <sys/types.h>
+
+#include <dev/raidframe/rf_alloclist.h>
+#include <dev/raidframe/rf_stripelocks.h>
+#include <dev/raidframe/rf_layout.h>
+#include <dev/raidframe/rf_disks.h>
+#include <dev/raidframe/rf_debugMem.h>
+#include <dev/raidframe/rf_diskqueue.h>
+#include <dev/raidframe/rf_reconstruct.h>
+#include <dev/raidframe/rf_acctrace.h>
+
+#if RF_INCLUDE_PARITYLOGGING > 0
+#include <dev/raidframe/rf_paritylog.h>
+#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
+
+#define RF_MAX_DISKS 128 /* max disks per array */
+#if defined(__NetBSD__)
+#define RF_DEV2RAIDID(_dev) (DISKUNIT(_dev))
+#endif
+
+#define RF_COMPONENT_LABEL_VERSION_1 1
+#define RF_COMPONENT_LABEL_VERSION 2
+#define RF_RAID_DIRTY 0
+#define RF_RAID_CLEAN 1
+
+
+/*
+ * Each row in the array is a distinct parity group, so
+ * each has it's own status, which is one of the following.
+ */
+typedef enum RF_RowStatus_e {
+ rf_rs_optimal, /* no outstanding failures in this row */
+ rf_rs_degraded, /* a failure is present, no recon active
+ * (presumed from name -- verify) */
+ rf_rs_reconstructing, /* reconstruction in progress; see DAG
+ * select code, which consults reconControl
+ * in this state */
+ rf_rs_reconfigured /* reconstruction complete; accesses are
+ * redirected to spare space */
+} RF_RowStatus_t;
+
+/* Cumulative user-I/O statistics, accumulated between an explicit
+ * start and stop timestamp. */
+struct RF_CumulativeStats_s {
+ struct timeval start; /* the time when the stats were last started */
+ struct timeval stop; /* the time when the stats were last stopped */
+ long sum_io_us; /* sum of all user response times (us) */
+ long num_ios; /* total number of I/Os serviced */
+ long num_sect_moved; /* total number of sectors read or written */
+};
+
+/* Throughput accounting: the timer runs only while at least one
+ * request is outstanding (started on 0->1, stopped on 1->0). */
+struct RF_ThroughputStats_s {
+ RF_DECLARE_MUTEX(mutex) /* a mutex used to lock the configuration
+ * stuff */
+ struct timeval start; /* timer started when numOutstandingRequests
+ * moves from 0 to 1 */
+ struct timeval stop; /* timer stopped when numOutstandingRequests
+ * moves from 1 to 0 */
+ RF_uint64 sum_io_us; /* total time timer is enabled */
+ RF_uint64 num_ios; /* total number of ios processed by RAIDframe */
+ long num_out_ios; /* number of outstanding ios */
+};
+
+/*
+ * RF_Raid_s: the central per-array state for a RAIDframe device.
+ * The first group of fields is fixed at configuration time and may be
+ * read without locking; everything after "mutex" is mutable and must
+ * be accessed under the appropriate lock (see the inline comments).
+ */
+struct RF_Raid_s {
+ /* This portion never changes, and can be accessed without locking */
+ /* an exception is Disks[][].status, which requires locking when it is
+ * changed. XXX this is no longer true. numSpare and friends can
+ * change now.
+ */
+ u_int numRow; /* number of rows of disks, typically == # of
+ * ranks */
+ u_int numCol; /* number of columns of disks, typically == #
+ * of disks/rank */
+ u_int numSpare; /* number of spare disks */
+ int maxQueueDepth; /* max disk queue depth */
+ RF_SectorCount_t totalSectors; /* total number of sectors in the
+ * array */
+ RF_SectorCount_t sectorsPerDisk; /* number of sectors on each
+ * disk */
+ u_int logBytesPerSector; /* base-2 log of the number of bytes
+ * in a sector */
+ u_int bytesPerSector; /* bytes in a sector */
+ RF_int32 sectorMask; /* mask of bytes-per-sector */
+
+ RF_RaidLayout_t Layout; /* all information related to layout */
+ RF_RaidDisk_t **Disks; /* all information related to physical disks */
+ RF_DiskQueue_t **Queues;/* all information related to disk queues */
+ RF_DiskQueueSW_t *qType;/* pointer to the DiskQueueSW used for the
+ component queues. */
+ /* NOTE: This is an anchor point via which the queues can be
+ * accessed, but the enqueue/dequeue routines in diskqueue.c use a
+ * local copy of this pointer for the actual accesses. */
+ /* The remainder of the structure can change, and therefore requires
+ * locking on reads and updates */
+ RF_DECLARE_MUTEX(mutex) /* mutex used to serialize access to
+ * the fields below */
+ RF_RowStatus_t *status; /* the status of each row in the array */
+ int valid; /* indicates successful configuration */
+ RF_LockTableEntry_t *lockTable; /* stripe-lock table */
+ RF_LockTableEntry_t *quiesceLock; /* quiesnce table */
+ int numFailures; /* total number of failures in the array */
+ int numNewFailures; /* number of *new* failures (that havn't
+ caused a mod_counter update */
+
+ int parity_good; /* !0 if parity is known to be correct */
+ int serial_number; /* a "serial number" for this set */
+ int mod_counter; /* modification counter for component labels */
+ int clean; /* the clean bit for this array. */
+
+ int openings; /* Number of IO's which can be scheduled
+ simultaneously (high-level - not a
+ per-component limit)*/
+
+ int maxOutstanding; /* maxOutstanding requests (per-component) */
+ int autoconfigure; /* automatically configure this RAID set.
+ 0 == no, 1 == yes */
+ int root_partition; /* Use this set as /
+ 0 == no, 1 == yes*/
+ int last_unit; /* last unit number (e.g. 0 for /dev/raid0)
+ of this component. Used for autoconfigure
+ only. */
+ int config_order; /* 0 .. n. The order in which the component
+ should be auto-configured. E.g. 0 is will
+ done first, (and would become raid0).
+ This may be in conflict with last_unit!!?! */
+ /* Not currently used. */
+
+ /*
+ * Cleanup stuff
+ */
+ RF_ShutdownList_t *shutdownList; /* shutdown activities */
+ RF_AllocListElem_t *cleanupList; /* memory to be freed at
+ * shutdown time */
+
+ /*
+ * Recon stuff
+ */
+ RF_HeadSepLimit_t headSepLimit;
+ int numFloatingReconBufs;
+ int reconInProgress;
+ RF_DECLARE_COND(waitForReconCond)
+ RF_RaidReconDesc_t *reconDesc; /* reconstruction descriptor */
+ RF_ReconCtrl_t **reconControl; /* reconstruction control structure
+ * pointers for each row in the array */
+
+ /*
+ * Array-quiescence stuff
+ */
+ RF_DECLARE_MUTEX(access_suspend_mutex)
+ RF_DECLARE_COND(quiescent_cond)
+ RF_IoCount_t accesses_suspended;
+ RF_IoCount_t accs_in_flight;
+ int access_suspend_release;
+ int waiting_for_quiescence;
+ RF_CallbackDesc_t *quiesce_wait_list;
+
+ /*
+ * Statistics
+ */
+#if !defined(_KERNEL) && !defined(SIMULATE)
+ RF_ThroughputStats_t throughputstats;
+#endif /* !KERNEL && !SIMULATE */
+ RF_CumulativeStats_t userstats;
+ int parity_rewrite_stripes_done;
+ int recon_stripes_done;
+ int copyback_stripes_done;
+
+ int recon_in_progress;
+ int parity_rewrite_in_progress;
+ int copyback_in_progress;
+
+ /*
+ * Engine thread control
+ */
+ RF_DECLARE_MUTEX(node_queue_mutex)
+ RF_DECLARE_COND(node_queue_cond)
+ RF_DagNode_t *node_queue;
+ RF_Thread_t parity_rewrite_thread;
+ RF_Thread_t copyback_thread;
+ RF_Thread_t engine_thread;
+ RF_Thread_t recon_thread;
+ RF_ThreadGroup_t engine_tg;
+ int shutdown_engine;
+ int dags_in_flight; /* debug */
+
+ /*
+ * PSS (Parity Stripe Status) stuff
+ */
+ RF_FreeList_t *pss_freelist;
+ long pssTableSize;
+
+ /*
+ * Reconstruction stuff
+ */
+ int procsInBufWait;
+ int numFullReconBuffers;
+ RF_AccTraceEntry_t *recon_tracerecs;
+ unsigned long accumXorTimeUs;
+ RF_ReconDoneProc_t *recon_done_procs;
+ RF_DECLARE_MUTEX(recon_done_proc_mutex)
+ /*
+ * nAccOutstanding, waitShutdown protected by desc freelist lock
+ * (This may seem strange, since that's a central serialization point
+ * for a per-array piece of data, but otherwise, it'd be an extra
+ * per-array lock, and that'd only be less efficient...)
+ */
+ RF_DECLARE_COND(outstandingCond)
+ int waitShutdown;
+ int nAccOutstanding;
+
+ RF_DiskId_t **diskids;
+ RF_DiskId_t *sparediskids;
+
+ int raidid;
+ RF_AccTotals_t acc_totals;
+ int keep_acc_totals;
+
+ struct raidcinfo **raid_cinfo; /* array of component info */
+
+ int terminate_disk_queues;
+
+ /*
+ * XXX
+ *
+ * config-specific information should be moved
+ * somewhere else, or at least hung off this
+ * in some generic way
+ */
+
+ /* used by rf_compute_workload_shift */
+ RF_RowCol_t hist_diskreq[RF_MAXROW][RF_MAXCOL];
+
+ /* used by declustering */
+ int noRotate;
+
+#if RF_INCLUDE_PARITYLOGGING > 0
+ /* used by parity logging */
+ RF_SectorCount_t regionLogCapacity;
+ RF_ParityLogQueue_t parityLogPool; /* pool of unused parity logs */
+ RF_RegionInfo_t *regionInfo; /* array of region state */
+ int numParityLogs;
+ int numSectorsPerLog;
+ int regionParityRange;
+ int logsInUse; /* debugging */
+ RF_ParityLogDiskQueue_t parityLogDiskQueue; /* state of parity
+ * logging disk work */
+ RF_RegionBufferQueue_t regionBufferPool; /* buffers for holding
+ * region log */
+ RF_RegionBufferQueue_t parityBufferPool; /* buffers for holding
+ * parity */
+ caddr_t parityLogBufferHeap; /* pool of unused parity logs */
+ RF_Thread_t pLogDiskThreadHandle;
+
+#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
+ /* Point back to the softc for this device. This is needed to rid
+ * ourselves of the ugly static device arrays.
+ * XXX Will this affect compatibility with NetBSD?
+ */
+ void *sc;
+};
+#endif /* !_RF__RF_RAID_H_ */
diff --git a/sys/dev/raidframe/rf_raid0.c b/sys/dev/raidframe/rf_raid0.c
new file mode 100644
index 0000000..a9418d3
--- /dev/null
+++ b/sys/dev/raidframe/rf_raid0.c
@@ -0,0 +1,161 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_raid0.c,v 1.4 2000/01/07 03:41:02 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/***************************************
+ *
+ * rf_raid0.c -- implements RAID Level 0
+ *
+ ***************************************/
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_raid0.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_dagffrd.h>
+#include <dev/raidframe/rf_dagffwr.h>
+#include <dev/raidframe/rf_dagutils.h>
+#include <dev/raidframe/rf_dagfuncs.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_configure.h>
+#include <dev/raidframe/rf_parityscan.h>
+
+typedef struct RF_Raid0ConfigInfo_s {
+ RF_RowCol_t *stripeIdentifier;
+} RF_Raid0ConfigInfo_t;
+
+/*
+ * Configure a RAID level 0 layout: build the layout-specific info
+ * (a flat stripe-identifier array, one entry per column) and fill in
+ * the generic layout fields. All numCol columns hold data; there is
+ * no parity column. Returns 0 on success or ENOMEM.
+ * Memory is hung off raidPtr->cleanupList, so partial failures are
+ * reclaimed at shutdown rather than freed here.
+ */
+int
+rf_ConfigureRAID0(
+ RF_ShutdownList_t ** listp,
+ RF_Raid_t * raidPtr,
+ RF_Config_t * cfgPtr)
+{
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_Raid0ConfigInfo_t *info;
+ RF_RowCol_t i;
+
+ /* create a RAID level 0 configuration structure */
+ RF_MallocAndAdd(info, sizeof(RF_Raid0ConfigInfo_t), (RF_Raid0ConfigInfo_t *), raidPtr->cleanupList);
+ if (info == NULL)
+ return (ENOMEM);
+ layoutPtr->layoutSpecificInfo = (void *) info;
+
+ /* identity map: stripe unit i lives on column i */
+ RF_MallocAndAdd(info->stripeIdentifier, raidPtr->numCol * sizeof(RF_RowCol_t), (RF_RowCol_t *), raidPtr->cleanupList);
+ if (info->stripeIdentifier == NULL)
+ return (ENOMEM);
+ for (i = 0; i < raidPtr->numCol; i++)
+ info->stripeIdentifier[i] = i;
+
+ /* RAID 0 here is single-row only */
+ RF_ASSERT(raidPtr->numRow == 1);
+ raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * raidPtr->numCol * layoutPtr->sectorsPerStripeUnit;
+ layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk;
+ layoutPtr->dataSectorsPerStripe = raidPtr->numCol * layoutPtr->sectorsPerStripeUnit;
+ layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
+ layoutPtr->numDataCol = raidPtr->numCol;
+ layoutPtr->numParityCol = 0;
+ return (0);
+}
+
+/*
+ * Map a raid address to its physical location (row/col/sector) under
+ * RAID 0: stripe units rotate round-robin across the columns of the
+ * single row. "remap" is unused -- there is no spare space to remap to.
+ */
+void
+rf_MapSectorRAID0(
+ RF_Raid_t * raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t * row,
+ RF_RowCol_t * col,
+ RF_SectorNum_t * diskSector,
+ int remap)
+{
+ RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
+ *row = 0;
+ *col = SUID % raidPtr->numCol;
+ /* stripe-unit base on that disk, plus the offset within the unit */
+ *diskSector = (SUID / raidPtr->numCol) * raidPtr->Layout.sectorsPerStripeUnit +
+ (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+}
+
+/*
+ * RAID 0 has no parity; this is a stub that yields location 0/0/0 so
+ * the generic mapping interface is satisfied.
+ */
+void
+rf_MapParityRAID0(
+ RF_Raid_t * raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t * row,
+ RF_RowCol_t * col,
+ RF_SectorNum_t * diskSector,
+ int remap)
+{
+ *row = *col = 0;
+ *diskSector = 0;
+}
+
+/*
+ * Return the list of disk columns making up the stripe containing
+ * "addr". For RAID 0 every stripe spans all columns, so this is the
+ * shared identity array built in rf_ConfigureRAID0.
+ */
+void
+rf_IdentifyStripeRAID0(
+ RF_Raid_t * raidPtr,
+ RF_RaidAddr_t addr,
+ RF_RowCol_t ** diskids,
+ RF_RowCol_t * outRow)
+{
+ RF_Raid0ConfigInfo_t *info;
+
+ info = raidPtr->Layout.layoutSpecificInfo;
+ *diskids = info->stripeIdentifier;
+ *outRow = 0;
+}
+
+/*
+ * Stripe ID -> parity stripe ID mapping. With no parity the mapping
+ * is the identity, and the reconstruction unit is always 0.
+ */
+void
+rf_MapSIDToPSIDRAID0(
+ RF_RaidLayout_t * layoutPtr,
+ RF_StripeNum_t stripeID,
+ RF_StripeNum_t * psID,
+ RF_ReconUnitNum_t * which_ru)
+{
+ *which_ru = 0;
+ *psID = stripeID;
+}
+
+/*
+ * Select the DAG creation function for an access: fault-free read DAG
+ * for reads, RAID 0 write DAG for writes. There are no degraded modes
+ * to consider, since RAID 0 cannot survive a failure.
+ */
+void
+rf_RAID0DagSelect(
+ RF_Raid_t * raidPtr,
+ RF_IoType_t type,
+ RF_AccessStripeMap_t * asmap,
+ RF_VoidFuncPtr * createFunc)
+{
+ *createFunc = ((type == RF_IO_TYPE_READ) ?
+ (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG : (RF_VoidFuncPtr) rf_CreateRAID0WriteDAG);
+}
+
+/*
+ * Parity verification for RAID 0: trivially succeeds, since there is
+ * no parity to check (or to correct).
+ */
+int
+rf_VerifyParityRAID0(
+ RF_Raid_t * raidPtr,
+ RF_RaidAddr_t raidAddr,
+ RF_PhysDiskAddr_t * parityPDA,
+ int correct_it,
+ RF_RaidAccessFlags_t flags)
+{
+ /*
+ * No parity is always okay.
+ */
+ return (RF_PARITY_OKAY);
+}
diff --git a/sys/dev/raidframe/rf_raid0.h b/sys/dev/raidframe/rf_raid0.h
new file mode 100644
index 0000000..36aae81
--- /dev/null
+++ b/sys/dev/raidframe/rf_raid0.h
@@ -0,0 +1,58 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_raid0.h,v 1.3 1999/02/05 00:06:15 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_raid0.h - header file for RAID Level 0 */
+
+#ifndef _RF__RF_RAID0_H_
+#define _RF__RF_RAID0_H_
+
+int
+rf_ConfigureRAID0(RF_ShutdownList_t ** listp, RF_Raid_t * raidPtr,
+ RF_Config_t * cfgPtr);
+void
+rf_MapSectorRAID0(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
+void
+rf_MapParityRAID0(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
+void
+rf_IdentifyStripeRAID0(RF_Raid_t * raidPtr, RF_RaidAddr_t addr,
+ RF_RowCol_t ** diskids, RF_RowCol_t * outRow);
+void
+rf_MapSIDToPSIDRAID0(RF_RaidLayout_t * layoutPtr,
+ RF_StripeNum_t stripeID, RF_StripeNum_t * psID,
+ RF_ReconUnitNum_t * which_ru);
+void
+rf_RAID0DagSelect(RF_Raid_t * raidPtr, RF_IoType_t type,
+ RF_AccessStripeMap_t * asmap, RF_VoidFuncPtr * createFunc);
+int
+rf_VerifyParityRAID0(RF_Raid_t * raidPtr, RF_RaidAddr_t raidAddr,
+ RF_PhysDiskAddr_t * parityPDA, int correct_it, RF_RaidAccessFlags_t flags);
+
+#endif /* !_RF__RF_RAID0_H_ */
diff --git a/sys/dev/raidframe/rf_raid1.c b/sys/dev/raidframe/rf_raid1.c
new file mode 100644
index 0000000..5831dfe
--- /dev/null
+++ b/sys/dev/raidframe/rf_raid1.c
@@ -0,0 +1,689 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_raid1.c,v 1.5 2000/01/08 22:57:30 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************************************
+ *
+ * rf_raid1.c -- implements RAID Level 1
+ *
+ *****************************************************************************/
+
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_raid1.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_dagffrd.h>
+#include <dev/raidframe/rf_dagffwr.h>
+#include <dev/raidframe/rf_dagdegrd.h>
+#include <dev/raidframe/rf_dagutils.h>
+#include <dev/raidframe/rf_dagfuncs.h>
+#include <dev/raidframe/rf_diskqueue.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_utils.h>
+#include <dev/raidframe/rf_parityscan.h>
+#include <dev/raidframe/rf_mcpair.h>
+#include <dev/raidframe/rf_layout.h>
+#include <dev/raidframe/rf_map.h>
+#include <dev/raidframe/rf_engine.h>
+#include <dev/raidframe/rf_reconbuffer.h>
+#include <dev/raidframe/rf_kintf.h>
+
+typedef struct RF_Raid1ConfigInfo_s {
+ RF_RowCol_t **stripeIdentifier;
+} RF_Raid1ConfigInfo_t;
+/* start of day code specific to RAID level 1 */
+/*
+ * Configure a RAID level 1 layout. Disks are paired column-wise:
+ * mirror pair i is columns (2i, 2i+1), giving numCol/2 independent
+ * stripes per row, each with one data unit and one mirror ("parity")
+ * unit. Returns 0 on success or ENOMEM; allocations are hung off
+ * raidPtr->cleanupList for reclamation at shutdown.
+ */
+int
+rf_ConfigureRAID1(
+ RF_ShutdownList_t ** listp,
+ RF_Raid_t * raidPtr,
+ RF_Config_t * cfgPtr)
+{
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_Raid1ConfigInfo_t *info;
+ RF_RowCol_t i;
+
+ /* create a RAID level 1 configuration structure */
+ RF_MallocAndAdd(info, sizeof(RF_Raid1ConfigInfo_t), (RF_Raid1ConfigInfo_t *), raidPtr->cleanupList);
+ if (info == NULL)
+ return (ENOMEM);
+ layoutPtr->layoutSpecificInfo = (void *) info;
+
+ /* ... and fill it in: one [primary, mirror] column pair per stripe */
+ info->stripeIdentifier = rf_make_2d_array(raidPtr->numCol / 2, 2, raidPtr->cleanupList);
+ if (info->stripeIdentifier == NULL)
+ return (ENOMEM);
+ for (i = 0; i < (raidPtr->numCol / 2); i++) {
+ info->stripeIdentifier[i][0] = (2 * i);
+ info->stripeIdentifier[i][1] = (2 * i) + 1;
+ }
+
+ RF_ASSERT(raidPtr->numRow == 1);
+
+ /* this implementation of RAID level 1 uses one row of numCol disks
+ * and allows multiple (numCol / 2) stripes per row. A stripe
+ * consists of a single data unit and a single parity (mirror) unit.
+ * stripe id = raidAddr / stripeUnitSize */
+ raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * (raidPtr->numCol / 2) * layoutPtr->sectorsPerStripeUnit;
+ layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk * (raidPtr->numCol / 2);
+ layoutPtr->dataSectorsPerStripe = layoutPtr->sectorsPerStripeUnit;
+ layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
+ layoutPtr->numDataCol = 1;
+ layoutPtr->numParityCol = 1;
+ return (0);
+}
+
+
+/* returns the physical disk location of the primary copy in the mirror pair */
+/*
+ * Map a raid address to the PRIMARY copy of a mirror pair: the
+ * even-numbered column of pair (SUID mod numCol/2). "remap" is
+ * unused here; spare redirection is handled in the DAG select code.
+ */
+void
+rf_MapSectorRAID1(
+ RF_Raid_t * raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t * row,
+ RF_RowCol_t * col,
+ RF_SectorNum_t * diskSector,
+ int remap)
+{
+ RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
+ RF_RowCol_t mirrorPair = SUID % (raidPtr->numCol / 2);
+
+ *row = 0;
+ *col = 2 * mirrorPair;
+ /* stripe-unit base on the disk plus in-unit offset */
+ *diskSector = ((SUID / (raidPtr->numCol / 2)) * raidPtr->Layout.sectorsPerStripeUnit) + (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+}
+
+
+/* Map Parity
+ *
+ * returns the physical disk location of the secondary copy in the mirror
+ * pair
+ */
+/*
+ * Map a raid address to the SECONDARY (mirror) copy: the odd-numbered
+ * column of the pair. Same sector arithmetic as rf_MapSectorRAID1.
+ */
+void
+rf_MapParityRAID1(
+ RF_Raid_t * raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t * row,
+ RF_RowCol_t * col,
+ RF_SectorNum_t * diskSector,
+ int remap)
+{
+ RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
+ RF_RowCol_t mirrorPair = SUID % (raidPtr->numCol / 2);
+
+ *row = 0;
+ *col = (2 * mirrorPair) + 1;
+
+ *diskSector = ((SUID / (raidPtr->numCol / 2)) * raidPtr->Layout.sectorsPerStripeUnit) + (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+}
+
+
+/* IdentifyStripeRAID1
+ *
+ * returns a list of disks for a given redundancy group
+ */
+/*
+ * Return the two-column disk list (primary, mirror) for the stripe
+ * containing "addr". The list is shared, prebuilt per mirror pair in
+ * rf_ConfigureRAID1; callers must not modify it.
+ */
+void
+rf_IdentifyStripeRAID1(
+ RF_Raid_t * raidPtr,
+ RF_RaidAddr_t addr,
+ RF_RowCol_t ** diskids,
+ RF_RowCol_t * outRow)
+{
+ RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout, addr);
+ RF_Raid1ConfigInfo_t *info = raidPtr->Layout.layoutSpecificInfo;
+ RF_ASSERT(stripeID >= 0);
+ RF_ASSERT(addr >= 0);
+ *outRow = 0;
+ *diskids = info->stripeIdentifier[stripeID % (raidPtr->numCol / 2)];
+ RF_ASSERT(*diskids);
+}
+
+
+/* MapSIDToPSIDRAID1
+ *
+ * maps a logical stripe to a stripe in the redundant array
+ */
+/*
+ * Stripe ID -> parity stripe ID for RAID 1: the identity mapping,
+ * with reconstruction unit fixed at 0.
+ */
+void
+rf_MapSIDToPSIDRAID1(
+ RF_RaidLayout_t * layoutPtr,
+ RF_StripeNum_t stripeID,
+ RF_StripeNum_t * psID,
+ RF_ReconUnitNum_t * which_ru)
+{
+ *which_ru = 0;
+ *psID = stripeID;
+}
+
+
+
+/******************************************************************************
+ * select a graph to perform a single-stripe access
+ *
+ * Parameters: raidPtr - description of the physical array
+ * type - type of operation (read or write) requested
+ * asmap - logical & physical addresses for this access
+ * createFunc - name of function to use to create the graph
+ *****************************************************************************/
+
+/*
+ * Choose the DAG creation function for a RAID 1 access. More than one
+ * failure in the group is fatal (createFunc set to NULL). On a single
+ * failure whose data has already been reconstructed to spare space,
+ * the failed PDA (and its mirror partner, if present) is redirected to
+ * the spare location before selection. Reads then use the idle-disk
+ * mirror read DAG (or the degraded read DAG if data is still failed);
+ * writes always use the mirrored write DAG.
+ */
+void
+rf_RAID1DagSelect(
+ RF_Raid_t * raidPtr,
+ RF_IoType_t type,
+ RF_AccessStripeMap_t * asmap,
+ RF_VoidFuncPtr * createFunc)
+{
+ RF_RowCol_t frow, fcol, or, oc;
+ RF_PhysDiskAddr_t *failedPDA;
+ int prior_recon;
+ RF_RowStatus_t rstat;
+ RF_SectorNum_t oo;
+
+
+ RF_ASSERT(RF_IO_IS_R_OR_W(type));
+
+ if (asmap->numDataFailed + asmap->numParityFailed > 1) {
+ RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n");
+ *createFunc = NULL;
+ return;
+ }
+ if (asmap->numDataFailed + asmap->numParityFailed) {
+ /*
+ * We've got a fault. Re-map to spare space, iff applicable.
+ * Shouldn't the arch-independent code do this for us?
+ * Anyway, it turns out if we don't do this here, then when
+ * we're reconstructing, writes go only to the surviving
+ * original disk, and aren't reflected on the reconstructed
+ * spare. Oops. --jimz
+ */
+ failedPDA = asmap->failedPDAs[0];
+ frow = failedPDA->row;
+ fcol = failedPDA->col;
+ rstat = raidPtr->status[frow];
+ /* prior recon: row fully reconfigured, or this RU already
+ * reconstructed by an in-progress recon */
+ prior_recon = (rstat == rf_rs_reconfigured) || (
+ (rstat == rf_rs_reconstructing) ?
+ rf_CheckRUReconstructed(raidPtr->reconControl[frow]->reconMap, failedPDA->startSector) : 0
+ );
+ if (prior_recon) {
+ /* remember the original location for the debug printf */
+ or = frow;
+ oc = fcol;
+ oo = failedPDA->startSector;
+ /*
+ * If we did distributed sparing, we'd monkey with that here.
+ * But we don't, so we'll
+ */
+ failedPDA->row = raidPtr->Disks[frow][fcol].spareRow;
+ failedPDA->col = raidPtr->Disks[frow][fcol].spareCol;
+ /*
+ * Redirect other components, iff necessary. This looks
+ * pretty suspicious to me, but it's what the raid5
+ * DAG select does.
+ */
+ if (asmap->parityInfo->next) {
+ if (failedPDA == asmap->parityInfo) {
+ failedPDA->next->row = failedPDA->row;
+ failedPDA->next->col = failedPDA->col;
+ } else {
+ if (failedPDA == asmap->parityInfo->next) {
+ asmap->parityInfo->row = failedPDA->row;
+ asmap->parityInfo->col = failedPDA->col;
+ }
+ }
+ }
+ if (rf_dagDebug || rf_mapDebug) {
+ printf("raid%d: Redirected type '%c' r %d c %d o %ld -> r %d c %d o %ld\n",
+ raidPtr->raidid, type, or, oc,
+ (long) oo, failedPDA->row,
+ failedPDA->col,
+ (long) failedPDA->startSector);
+ }
+ /* access is now fault-free as far as DAG selection goes */
+ asmap->numDataFailed = asmap->numParityFailed = 0;
+ }
+ }
+ if (type == RF_IO_TYPE_READ) {
+ if (asmap->numDataFailed == 0)
+ *createFunc = (RF_VoidFuncPtr) rf_CreateMirrorIdleReadDAG;
+ else
+ *createFunc = (RF_VoidFuncPtr) rf_CreateRaidOneDegradedReadDAG;
+ } else {
+ *createFunc = (RF_VoidFuncPtr) rf_CreateRaidOneWriteDAG;
+ }
+}
+
+/*
+ * Verify the mirror-copy "parity" of the RAID 1 stripe containing
+ * raidAddr: read the whole stripe and compare each data unit against
+ * its mirror copy.  If correct_it is set, rewrite any miscompared
+ * mirror unit from the corresponding data unit (we cannot tell which
+ * copy was the good one, so the data copy wins).  Returns one of the
+ * RF_PARITY_* status codes.
+ */
+int
+rf_VerifyParityRAID1(
+	RF_Raid_t * raidPtr,
+	RF_RaidAddr_t raidAddr,
+	RF_PhysDiskAddr_t * parityPDA,
+	int correct_it,
+	RF_RaidAccessFlags_t flags)
+{
+	int nbytes, bcount, stripeWidth, ret, i, j, nbad, *bbufs;
+	RF_DagNode_t *blockNode, *unblockNode, *wrBlock;
+	RF_DagHeader_t *rd_dag_h, *wr_dag_h;
+	RF_AccessStripeMapHeader_t *asm_h;
+	RF_AllocListElem_t *allocList;
+	RF_AccTraceEntry_t tracerec;
+	RF_ReconUnitNum_t which_ru;
+	RF_RaidLayout_t *layoutPtr;
+	RF_AccessStripeMap_t *aasm;
+	RF_SectorCount_t nsector;
+	RF_RaidAddr_t startAddr;
+	char *buf, *buf1, *buf2;
+	RF_PhysDiskAddr_t *pda;
+	RF_StripeNum_t psID;
+	RF_MCPair_t *mcpair;
+
+	layoutPtr = &raidPtr->Layout;
+	startAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, raidAddr);
+	nsector = parityPDA->numSector;
+	nbytes = rf_RaidAddressToByte(raidPtr, nsector);
+	psID = rf_RaidAddressToParityStripeID(layoutPtr, raidAddr, &which_ru);
+
+	asm_h = NULL;
+	rd_dag_h = wr_dag_h = NULL;
+	mcpair = NULL;
+
+	ret = RF_PARITY_COULD_NOT_VERIFY;
+
+	rf_MakeAllocList(allocList);
+	if (allocList == NULL)
+		return (RF_PARITY_COULD_NOT_VERIFY);
+	mcpair = rf_AllocMCPair();
+	if (mcpair == NULL)
+		goto done;
+	/* mirroring: one "parity" (mirror) column per data column */
+	RF_ASSERT(layoutPtr->numDataCol == layoutPtr->numParityCol);
+	stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol;
+	bcount = nbytes * (layoutPtr->numDataCol + layoutPtr->numParityCol);
+	RF_MallocAndAdd(buf, bcount, (char *), allocList);
+	if (buf == NULL)
+		goto done;
+	if (rf_verifyParityDebug) {
+		printf("raid%d: RAID1 parity verify: buf=%lx bcount=%d (%lx - %lx)\n",
+		    raidPtr->raidid, (long) buf, bcount, (long) buf,
+		    (long) buf + bcount);
+	}
+	/*
+	 * Generate a DAG which will read the entire stripe- then we can
+	 * just compare data chunks versus "parity" chunks.
+	 */
+
+	rd_dag_h = rf_MakeSimpleDAG(raidPtr, stripeWidth, nbytes, buf,
+	    rf_DiskReadFunc, rf_DiskReadUndoFunc, "Rod", allocList, flags,
+	    RF_IO_NORMAL_PRIORITY);
+	if (rd_dag_h == NULL)
+		goto done;
+	blockNode = rd_dag_h->succedents[0];
+	unblockNode = blockNode->succedents[0]->succedents[0];
+
+	/*
+	 * Map the access to physical disk addresses (PDAs)- this will
+	 * get us both a list of data addresses, and "parity" addresses
+	 * (which are really mirror copies).
+	 */
+	asm_h = rf_MapAccess(raidPtr, startAddr, layoutPtr->dataSectorsPerStripe,
+	    buf, RF_DONT_REMAP);
+	aasm = asm_h->stripeMap;
+
+	buf1 = buf;
+	/*
+	 * Loop through the data blocks, setting up read nodes for each.
+	 */
+	for (pda = aasm->physInfo, i = 0; i < layoutPtr->numDataCol; i++, pda = pda->next) {
+		RF_ASSERT(pda);
+
+		rf_RangeRestrictPDA(raidPtr, parityPDA, pda, 0, 1);
+
+		RF_ASSERT(pda->numSector != 0);
+		if (rf_TryToRedirectPDA(raidPtr, pda, 0)) {
+			/* cannot verify parity with dead disk */
+			goto done;
+		}
+		pda->bufPtr = buf1;
+		blockNode->succedents[i]->params[0].p = pda;
+		blockNode->succedents[i]->params[1].p = buf1;
+		blockNode->succedents[i]->params[2].v = psID;
+		blockNode->succedents[i]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+		buf1 += nbytes;
+	}
+	RF_ASSERT(pda == NULL);
+	/*
+	 * keep i, buf1 running
+	 *
+	 * Loop through parity blocks, setting up read nodes for each.
+	 */
+	for (pda = aasm->parityInfo; i < layoutPtr->numDataCol + layoutPtr->numParityCol; i++, pda = pda->next) {
+		RF_ASSERT(pda);
+		rf_RangeRestrictPDA(raidPtr, parityPDA, pda, 0, 1);
+		RF_ASSERT(pda->numSector != 0);
+		if (rf_TryToRedirectPDA(raidPtr, pda, 0)) {
+			/* cannot verify parity with dead disk */
+			goto done;
+		}
+		pda->bufPtr = buf1;
+		blockNode->succedents[i]->params[0].p = pda;
+		blockNode->succedents[i]->params[1].p = buf1;
+		blockNode->succedents[i]->params[2].v = psID;
+		blockNode->succedents[i]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+		buf1 += nbytes;
+	}
+	RF_ASSERT(pda == NULL);
+
+	bzero((char *) &tracerec, sizeof(tracerec));
+	rd_dag_h->tracerec = &tracerec;
+
+	if (rf_verifyParityDebug > 1) {
+		printf("raid%d: RAID1 parity verify read dag:\n",
+		    raidPtr->raidid);
+		rf_PrintDAGList(rd_dag_h);
+	}
+	RF_LOCK_MUTEX(mcpair->mutex);
+	mcpair->flag = 0;
+	rf_DispatchDAG(rd_dag_h, (void (*) (void *)) rf_MCPairWakeupFunc,
+	    (void *) mcpair);
+	while (mcpair->flag == 0) {
+		RF_WAIT_MCPAIR(mcpair);
+	}
+	RF_UNLOCK_MUTEX(mcpair->mutex);
+
+	if (rd_dag_h->status != rf_enable) {
+		RF_ERRORMSG("Unable to verify raid1 parity: can't read stripe\n");
+		ret = RF_PARITY_COULD_NOT_VERIFY;
+		goto done;
+	}
+	/*
+	 * buf1 is the beginning of the data blocks chunk
+	 * buf2 is the beginning of the parity blocks chunk
+	 */
+	buf1 = buf;
+	buf2 = buf + (nbytes * layoutPtr->numDataCol);
+	ret = RF_PARITY_OKAY;
+	/*
+	 * bbufs is "bad bufs"- an array whose entries are the data
+	 * column numbers where we had miscompares. (That is, column 0
+	 * and column 1 of the array are mirror copies, and are considered
+	 * "data column 0" for this purpose).
+	 */
+	RF_MallocAndAdd(bbufs, layoutPtr->numParityCol * sizeof(int), (int *),
+	    allocList);
+	nbad = 0;
+	/*
+	 * Check data vs "parity" (mirror copy).
+	 */
+	for (i = 0; i < layoutPtr->numDataCol; i++) {
+		if (rf_verifyParityDebug) {
+			printf("raid%d: RAID1 parity verify %d bytes: i=%d buf1=%lx buf2=%lx buf=%lx\n",
+			    raidPtr->raidid, nbytes, i, (long) buf1,
+			    (long) buf2, (long) buf);
+		}
+		/*
+		 * Fix: do not assign the bcmp() result to ret.  Doing so
+		 * reset ret to RF_PARITY_OKAY whenever a later column
+		 * compared equal, masking an earlier miscompare and
+		 * skipping the correction pass below.
+		 */
+		if (bcmp(buf1, buf2, nbytes)) {
+			if (rf_verifyParityDebug > 1) {
+				for (j = 0; j < nbytes; j++) {
+					if (buf1[j] != buf2[j])
+						break;
+				}
+				printf("psid=%ld j=%d\n", (long) psID, j);
+				printf("buf1 %02x %02x %02x %02x %02x\n", buf1[0] & 0xff,
+				    buf1[1] & 0xff, buf1[2] & 0xff, buf1[3] & 0xff, buf1[4] & 0xff);
+				printf("buf2 %02x %02x %02x %02x %02x\n", buf2[0] & 0xff,
+				    buf2[1] & 0xff, buf2[2] & 0xff, buf2[3] & 0xff, buf2[4] & 0xff);
+			}
+			if (rf_verifyParityDebug) {
+				printf("raid%d: RAID1: found bad parity, i=%d\n", raidPtr->raidid, i);
+			}
+			/*
+			 * Parity is bad. Keep track of which columns were bad.
+			 */
+			if (bbufs)
+				bbufs[nbad] = i;
+			nbad++;
+			ret = RF_PARITY_BAD;
+		}
+		buf1 += nbytes;
+		buf2 += nbytes;
+	}
+
+	if ((ret != RF_PARITY_OKAY) && correct_it) {
+		ret = RF_PARITY_COULD_NOT_CORRECT;
+		if (rf_verifyParityDebug) {
+			printf("raid%d: RAID1 parity verify: parity not correct\n", raidPtr->raidid);
+		}
+		if (bbufs == NULL)
+			goto done;
+		/*
+		 * Make a DAG with one write node for each bad unit. We'll simply
+		 * write the contents of the data unit onto the parity unit for
+		 * correction. (It's possible that the mirror copy was the correct
+		 * copy, and that we're spooging good data by writing bad over it,
+		 * but there's no way we can know that.
+		 */
+		wr_dag_h = rf_MakeSimpleDAG(raidPtr, nbad, nbytes, buf,
+		    rf_DiskWriteFunc, rf_DiskWriteUndoFunc, "Wnp", allocList, flags,
+		    RF_IO_NORMAL_PRIORITY);
+		if (wr_dag_h == NULL)
+			goto done;
+		wrBlock = wr_dag_h->succedents[0];
+		/*
+		 * Fill in a write node for each bad compare.
+		 */
+		for (i = 0; i < nbad; i++) {
+			j = i + layoutPtr->numDataCol;
+			pda = blockNode->succedents[j]->params[0].p;
+			pda->bufPtr = blockNode->succedents[i]->params[1].p;
+			wrBlock->succedents[i]->params[0].p = pda;
+			wrBlock->succedents[i]->params[1].p = pda->bufPtr;
+			wrBlock->succedents[i]->params[2].v = psID;
+			/*
+			 * Fix: set params[3] on node i, not node 0; the
+			 * original left every node but the first with an
+			 * uninitialized priority/RU parameter.
+			 */
+			wrBlock->succedents[i]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+		}
+		bzero((char *) &tracerec, sizeof(tracerec));
+		wr_dag_h->tracerec = &tracerec;
+		if (rf_verifyParityDebug > 1) {
+			printf("Parity verify write dag:\n");
+			rf_PrintDAGList(wr_dag_h);
+		}
+		RF_LOCK_MUTEX(mcpair->mutex);
+		mcpair->flag = 0;
+		/* fire off the write DAG */
+		rf_DispatchDAG(wr_dag_h, (void (*) (void *)) rf_MCPairWakeupFunc,
+		    (void *) mcpair);
+		/*
+		 * NOTE(review): the read wait above uses RF_WAIT_MCPAIR;
+		 * this one spells it out -- presumably equivalent, confirm.
+		 */
+		while (!mcpair->flag) {
+			RF_WAIT_COND(mcpair->cond, mcpair->mutex);
+		}
+		RF_UNLOCK_MUTEX(mcpair->mutex);
+		if (wr_dag_h->status != rf_enable) {
+			RF_ERRORMSG("Unable to correct RAID1 parity in VerifyParity\n");
+			goto done;
+		}
+		ret = RF_PARITY_CORRECTED;
+	}
+done:
+	/*
+	 * All done. We might've gotten here without doing part of the function,
+	 * so cleanup what we have to and return our running status.
+	 */
+	if (asm_h)
+		rf_FreeAccessStripeMap(asm_h);
+	if (rd_dag_h)
+		rf_FreeDAG(rd_dag_h);
+	if (wr_dag_h)
+		rf_FreeDAG(wr_dag_h);
+	if (mcpair)
+		rf_FreeMCPair(mcpair);
+	rf_FreeAllocList(allocList);
+	if (rf_verifyParityDebug) {
+		printf("raid%d: RAID1 parity verify, returning %d\n",
+		    raidPtr->raidid, ret);
+	}
+	return (ret);
+}
+
+/*
+ * Submit a reconstruction buffer for a RAID 1 stripe.  Because this is
+ * simple mirroring, the first submission for a stripe is also the last,
+ * so the buffer (or a swapped-in substitute) is immediately marked full.
+ * Returns 0 on success, 1 if the caller must wait for a buffer.
+ */
+int
+rf_SubmitReconBufferRAID1(rbuf, keep_it, use_committed)
+	RF_ReconBuffer_t *rbuf;	/* the recon buffer to submit */
+	int keep_it;		/* whether we can keep this buffer or we have
+				 * to return it */
+	int use_committed;	/* whether to use a committed or an available
+				 * recon buffer */
+{
+	RF_ReconParityStripeStatus_t *pssPtr;
+	RF_ReconCtrl_t *reconCtrlPtr;
+	int retcode, created;
+	RF_CallbackDesc_t *cb, *p;
+	RF_ReconBuffer_t *t;
+	RF_Raid_t *raidPtr;
+	caddr_t ta;
+
+	/* Fix: assert before the first dereference, not after. */
+	RF_ASSERT(rbuf);
+
+	retcode = 0;
+	created = 0;
+
+	raidPtr = rbuf->raidPtr;
+	reconCtrlPtr = raidPtr->reconControl[rbuf->row];
+
+	RF_ASSERT(rbuf->col != reconCtrlPtr->fcol);
+
+	if (rf_reconbufferDebug) {
+		printf("raid%d: RAID1 reconbuffer submission r%d c%d psid %ld ru%d (failed offset %ld)\n",
+		    raidPtr->raidid, rbuf->row, rbuf->col,
+		    (long) rbuf->parityStripeID, rbuf->which_ru,
+		    (long) rbuf->failedDiskSectorOffset);
+	}
+	if (rf_reconDebug) {
+		printf("RAID1 reconbuffer submit psid %ld buf %lx\n",
+		    (long) rbuf->parityStripeID, (long) rbuf->buffer);
+		printf("RAID1 psid %ld  %02x %02x %02x %02x %02x\n",
+		    (long) rbuf->parityStripeID,
+		    rbuf->buffer[0], rbuf->buffer[1], rbuf->buffer[2], rbuf->buffer[3],
+		    rbuf->buffer[4]);
+	}
+	RF_LOCK_PSS_MUTEX(raidPtr, rbuf->row, rbuf->parityStripeID);
+
+	RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
+
+	pssPtr = rf_LookupRUStatus(raidPtr, reconCtrlPtr->pssTable,
+	    rbuf->parityStripeID, rbuf->which_ru, RF_PSS_NONE, &created);
+	RF_ASSERT(pssPtr);	/* if it didn't exist, we wouldn't have gotten
+				 * an rbuf for it */
+
+	/*
+	 * Since this is simple mirroring, the first submission for a stripe is also
+	 * treated as the last.
+	 */
+
+	/* Pick the target buffer: the caller's own, a committed one, or a
+	 * floating one, in that order of preference. */
+	t = NULL;
+	if (keep_it) {
+		if (rf_reconbufferDebug) {
+			printf("raid%d: RAID1 rbuf submission: keeping rbuf\n",
+			    raidPtr->raidid);
+		}
+		t = rbuf;
+	} else {
+		if (use_committed) {
+			if (rf_reconbufferDebug) {
+				printf("raid%d: RAID1 rbuf submission: using committed rbuf\n", raidPtr->raidid);
+			}
+			t = reconCtrlPtr->committedRbufs;
+			RF_ASSERT(t);
+			reconCtrlPtr->committedRbufs = t->next;
+			t->next = NULL;
+		} else
+			if (reconCtrlPtr->floatingRbufs) {
+				if (rf_reconbufferDebug) {
+					printf("raid%d: RAID1 rbuf submission: using floating rbuf\n", raidPtr->raidid);
+				}
+				t = reconCtrlPtr->floatingRbufs;
+				reconCtrlPtr->floatingRbufs = t->next;
+				t->next = NULL;
+			}
+	}
+	if (t == NULL) {
+		/* No buffer available: queue a callback and tell the caller
+		 * to wait. */
+		if (rf_reconbufferDebug) {
+			printf("raid%d: RAID1 rbuf submission: waiting for rbuf\n", raidPtr->raidid);
+		}
+		RF_ASSERT((keep_it == 0) && (use_committed == 0));
+		raidPtr->procsInBufWait++;
+		if ((raidPtr->procsInBufWait == (raidPtr->numCol - 1))
+		    && (raidPtr->numFullReconBuffers == 0)) {
+			/* ruh-ro */
+			RF_ERRORMSG("Buffer wait deadlock\n");
+			rf_PrintPSStatusTable(raidPtr, rbuf->row);
+			RF_PANIC();
+		}
+		pssPtr->flags |= RF_PSS_BUFFERWAIT;
+		cb = rf_AllocCallbackDesc();
+		cb->row = rbuf->row;
+		cb->col = rbuf->col;
+		cb->callbackArg.v = rbuf->parityStripeID;
+		cb->callbackArg2.v = rbuf->which_ru;
+		cb->next = NULL;
+		if (reconCtrlPtr->bufferWaitList == NULL) {
+			/* we are the wait list- lucky us */
+			reconCtrlPtr->bufferWaitList = cb;
+		} else {
+			/* append to wait list */
+			for (p = reconCtrlPtr->bufferWaitList; p->next; p = p->next);
+			p->next = cb;
+		}
+		retcode = 1;
+		goto out;
+	}
+	if (t != rbuf) {
+		t->row = rbuf->row;
+		t->col = reconCtrlPtr->fcol;
+		t->parityStripeID = rbuf->parityStripeID;
+		t->which_ru = rbuf->which_ru;
+		t->failedDiskSectorOffset = rbuf->failedDiskSectorOffset;
+		t->spRow = rbuf->spRow;
+		t->spCol = rbuf->spCol;
+		t->spOffset = rbuf->spOffset;
+		/* Swap buffers. DANCE! */
+		ta = t->buffer;
+		t->buffer = rbuf->buffer;
+		rbuf->buffer = ta;
+	}
+	/*
+	 * Use the rbuf we've been given as the target.
+	 */
+	RF_ASSERT(pssPtr->rbuf == NULL);
+	pssPtr->rbuf = t;
+
+	t->count = 1;
+	/*
+	 * Below, we use 1 for numDataCol (which is equal to the count in the
+	 * previous line), so we'll always be done.
+	 */
+	rf_CheckForFullRbuf(raidPtr, reconCtrlPtr, pssPtr, 1);
+
+out:
+	RF_UNLOCK_PSS_MUTEX(raidPtr, rbuf->row, rbuf->parityStripeID);
+	RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
+	if (rf_reconbufferDebug) {
+		printf("raid%d: RAID1 rbuf submission: returning %d\n",
+		    raidPtr->raidid, retcode);
+	}
+	return (retcode);
+}
diff --git a/sys/dev/raidframe/rf_raid1.h b/sys/dev/raidframe/rf_raid1.h
new file mode 100644
index 0000000..484cbcf
--- /dev/null
+++ b/sys/dev/raidframe/rf_raid1.h
@@ -0,0 +1,63 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_raid1.h,v 1.3 1999/02/05 00:06:16 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* header file for RAID Level 1 */
+
+#ifndef _RF__RF_RAID1_H_
+#define _RF__RF_RAID1_H_
+
+#include <dev/raidframe/rf_types.h>
+
+int
+rf_ConfigureRAID1(RF_ShutdownList_t ** listp, RF_Raid_t * raidPtr,
+    RF_Config_t * cfgPtr);
+void
+rf_MapSectorRAID1(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
+    RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
+void
+rf_MapParityRAID1(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
+    RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
+void
+rf_IdentifyStripeRAID1(RF_Raid_t * raidPtr, RF_RaidAddr_t addr,
+    RF_RowCol_t ** diskids, RF_RowCol_t * outRow);
+void
+rf_MapSIDToPSIDRAID1(RF_RaidLayout_t * layoutPtr,
+    RF_StripeNum_t stripeID, RF_StripeNum_t * psID,
+    RF_ReconUnitNum_t * which_ru);
+void
+rf_RAID1DagSelect(RF_Raid_t * raidPtr, RF_IoType_t type,
+    RF_AccessStripeMap_t * asmap, RF_VoidFuncPtr * createFunc);
+int
+rf_VerifyParityRAID1(RF_Raid_t * raidPtr, RF_RaidAddr_t raidAddr,
+    RF_PhysDiskAddr_t * parityPDA, int correct_it, RF_RaidAccessFlags_t flags);
+/* Fix: parameter name was misspelled "keep_int"; the definition uses keep_it. */
+int
+rf_SubmitReconBufferRAID1(RF_ReconBuffer_t * rbuf, int keep_it,
+    int use_committed);
+
+#endif				/* !_RF__RF_RAID1_H_ */
diff --git a/sys/dev/raidframe/rf_raid4.c b/sys/dev/raidframe/rf_raid4.c
new file mode 100644
index 0000000..3f6c398
--- /dev/null
+++ b/sys/dev/raidframe/rf_raid4.c
@@ -0,0 +1,157 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_raid4.c,v 1.4 2000/01/07 03:41:02 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Rachad Youssef
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/***************************************
+ *
+ * rf_raid4.c -- implements RAID Level 4
+ *
+ ***************************************/
+
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_dagutils.h>
+#include <dev/raidframe/rf_dagfuncs.h>
+#include <dev/raidframe/rf_dagffrd.h>
+#include <dev/raidframe/rf_dagffwr.h>
+#include <dev/raidframe/rf_dagdegrd.h>
+#include <dev/raidframe/rf_dagdegwr.h>
+#include <dev/raidframe/rf_raid4.h>
+#include <dev/raidframe/rf_general.h>
+
+/* Layout-specific state for RAID 4, hung off the RF_RaidLayout_t. */
+typedef struct RF_Raid4ConfigInfo_s {
+	RF_RowCol_t *stripeIdentifier;	/* filled in at config time & used by
+					 * IdentifyStripe */
+} RF_Raid4ConfigInfo_t;
+
+
+
+/*
+ * Configure a RAID 4 array: allocate the layout-specific info and
+ * derive the remaining layout parameters.  Returns 0 or ENOMEM.
+ */
+int
+rf_ConfigureRAID4(
+	RF_ShutdownList_t ** listp,
+	RF_Raid_t * raidPtr,
+	RF_Config_t * cfgPtr)
+{
+	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+	RF_Raid4ConfigInfo_t *info;
+	int col;
+
+	/* Allocate the RAID 4 configuration structure ... */
+	RF_MallocAndAdd(info, sizeof(RF_Raid4ConfigInfo_t), (RF_Raid4ConfigInfo_t *), raidPtr->cleanupList);
+	if (info == NULL)
+		return (ENOMEM);
+	layoutPtr->layoutSpecificInfo = (void *) info;
+
+	/*
+	 * ... and fill it in.  RAID 4 never rotates parity, so every
+	 * stripe touches the disks in the same identity order.
+	 */
+	RF_MallocAndAdd(info->stripeIdentifier, raidPtr->numCol * sizeof(RF_RowCol_t), (RF_RowCol_t *), raidPtr->cleanupList);
+	if (info->stripeIdentifier == NULL)
+		return (ENOMEM);
+	for (col = 0; col < raidPtr->numCol; col++)
+		info->stripeIdentifier[col] = col;
+
+	RF_ASSERT(raidPtr->numRow == 1);
+
+	/* Derive the remaining layout parameters. */
+	layoutPtr->numDataCol = raidPtr->numCol - 1;
+	layoutPtr->numParityCol = 1;
+	layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk;
+	layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
+	layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
+	raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
+
+	return (0);
+}
+
+/* Default number of floating reconstruction buffers for RAID 4. */
+int
+rf_GetDefaultNumFloatingReconBuffersRAID4(RF_Raid_t * raidPtr)
+{
+	return (20);
+}
+
+/* Default head-separation limit used during RAID 4 reconstruction. */
+RF_HeadSepLimit_t
+rf_GetDefaultHeadSepLimitRAID4(RF_Raid_t * raidPtr)
+{
+	return (20);
+}
+
+/* Map a RAID address to the (row, col, sector) of its data unit. */
+void
+rf_MapSectorRAID4(
+	RF_Raid_t * raidPtr,
+	RF_RaidAddr_t raidSector,
+	RF_RowCol_t * row,
+	RF_RowCol_t * col,
+	RF_SectorNum_t * diskSector,
+	int remap)
+{
+	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+	RF_StripeNum_t su = raidSector / layoutPtr->sectorsPerStripeUnit;
+
+	/* Single-row array; data rotates across the data columns only. */
+	*row = 0;
+	*col = su % layoutPtr->numDataCol;
+	*diskSector = (su / layoutPtr->numDataCol) * layoutPtr->sectorsPerStripeUnit +
+	    raidSector % layoutPtr->sectorsPerStripeUnit;
+}
+
+/* Map a RAID address to the (row, col, sector) of its parity unit. */
+void
+rf_MapParityRAID4(
+	RF_Raid_t * raidPtr,
+	RF_RaidAddr_t raidSector,
+	RF_RowCol_t * row,
+	RF_RowCol_t * col,
+	RF_SectorNum_t * diskSector,
+	int remap)
+{
+	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+	RF_StripeNum_t su = raidSector / layoutPtr->sectorsPerStripeUnit;
+
+	*row = 0;
+	/* RAID 4: parity always lives on the last column. */
+	*col = layoutPtr->numDataCol;
+	*diskSector = (su / layoutPtr->numDataCol) * layoutPtr->sectorsPerStripeUnit +
+	    raidSector % layoutPtr->sectorsPerStripeUnit;
+}
+
+/* Report the disks touched by the stripe containing addr, in order. */
+void
+rf_IdentifyStripeRAID4(
+	RF_Raid_t * raidPtr,
+	RF_RaidAddr_t addr,
+	RF_RowCol_t ** diskids,
+	RF_RowCol_t * outRow)
+{
+	RF_Raid4ConfigInfo_t *info;
+
+	info = (RF_Raid4ConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+	*outRow = 0;
+	/* Every RAID 4 stripe uses the same identity ordering. */
+	*diskids = info->stripeIdentifier;
+}
+
+/* Stripes and parity stripes coincide 1:1 in RAID 4. */
+void
+rf_MapSIDToPSIDRAID4(
+	RF_RaidLayout_t * layoutPtr,
+	RF_StripeNum_t stripeID,
+	RF_StripeNum_t * psID,
+	RF_ReconUnitNum_t * which_ru)
+{
+	*psID = stripeID;
+	*which_ru = 0;
+}
diff --git a/sys/dev/raidframe/rf_raid4.h b/sys/dev/raidframe/rf_raid4.h
new file mode 100644
index 0000000..56df05a
--- /dev/null
+++ b/sys/dev/raidframe/rf_raid4.h
@@ -0,0 +1,57 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_raid4.h,v 1.3 1999/02/05 00:06:16 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Rachad Youssef
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_raid4.h header file for RAID Level 4 */
+
+#ifndef _RF__RF_RAID4_H_
+#define _RF__RF_RAID4_H_
+
+int
+rf_ConfigureRAID4(RF_ShutdownList_t ** listp, RF_Raid_t * raidPtr,
+ RF_Config_t * cfgPtr);
+int rf_GetDefaultNumFloatingReconBuffersRAID4(RF_Raid_t * raidPtr);
+RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitRAID4(RF_Raid_t * raidPtr);
+void
+rf_MapSectorRAID4(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
+void
+rf_MapParityRAID4(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
+void
+rf_IdentifyStripeRAID4(RF_Raid_t * raidPtr, RF_RaidAddr_t addr,
+ RF_RowCol_t ** diskids, RF_RowCol_t * outRow);
+void
+rf_MapSIDToPSIDRAID4(RF_RaidLayout_t * layoutPtr,
+ RF_StripeNum_t stripeID, RF_StripeNum_t * psID,
+ RF_ReconUnitNum_t * which_ru);
+void
+rf_RAID4DagSelect(RF_Raid_t * raidPtr, RF_IoType_t type,
+ RF_AccessStripeMap_t * asmap, RF_VoidFuncPtr * createFunc);
+
+#endif /* !_RF__RF_RAID4_H_ */
diff --git a/sys/dev/raidframe/rf_raid5.c b/sys/dev/raidframe/rf_raid5.c
new file mode 100644
index 0000000..c1261ad
--- /dev/null
+++ b/sys/dev/raidframe/rf_raid5.c
@@ -0,0 +1,320 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_raid5.c,v 1.4 2000/01/08 22:57:30 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/******************************************************************************
+ *
+ * rf_raid5.c -- implements RAID Level 5
+ *
+ *****************************************************************************/
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_raid5.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_dagffrd.h>
+#include <dev/raidframe/rf_dagffwr.h>
+#include <dev/raidframe/rf_dagdegrd.h>
+#include <dev/raidframe/rf_dagdegwr.h>
+#include <dev/raidframe/rf_dagutils.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_map.h>
+#include <dev/raidframe/rf_utils.h>
+
+/* Layout-specific state for RAID 5, hung off the RF_RaidLayout_t. */
+typedef struct RF_Raid5ConfigInfo_s {
+	RF_RowCol_t **stripeIdentifier;	/* filled in at config time and used
+					 * by IdentifyStripe */
+} RF_Raid5ConfigInfo_t;
+
+/*
+ * Configure a RAID 5 array: build the per-stripe disk-ordering table
+ * and derive the remaining layout parameters.  Returns 0 or ENOMEM.
+ */
+int
+rf_ConfigureRAID5(
+	RF_ShutdownList_t ** listp,
+	RF_Raid_t * raidPtr,
+	RF_Config_t * cfgPtr)
+{
+	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+	RF_Raid5ConfigInfo_t *info;
+	RF_RowCol_t stripe, d, startcol;
+
+	/* Allocate the RAID 5 configuration structure. */
+	RF_MallocAndAdd(info, sizeof(RF_Raid5ConfigInfo_t), (RF_Raid5ConfigInfo_t *), raidPtr->cleanupList);
+	if (info == NULL)
+		return (ENOMEM);
+	layoutPtr->layoutSpecificInfo = (void *) info;
+
+	RF_ASSERT(raidPtr->numRow == 1);
+
+	/*
+	 * The stripe identifier lists the disks of each stripe IN THE
+	 * ORDER THAT THEY APPEAR IN THE STRIPE.  The starting disk
+	 * rotates backwards by one for each successive stripe.
+	 */
+	info->stripeIdentifier = rf_make_2d_array(raidPtr->numCol, raidPtr->numCol, raidPtr->cleanupList);
+	if (info->stripeIdentifier == NULL)
+		return (ENOMEM);
+	startcol = 0;
+	for (stripe = 0; stripe < raidPtr->numCol; stripe++) {
+		for (d = 0; d < raidPtr->numCol; d++)
+			info->stripeIdentifier[stripe][d] = (startcol + d) % raidPtr->numCol;
+		startcol = (startcol == 0) ? raidPtr->numCol - 1 : startcol - 1;
+	}
+
+	/* Derive the remaining layout parameters. */
+	layoutPtr->numDataCol = raidPtr->numCol - 1;
+	layoutPtr->numParityCol = 1;
+	layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk;
+	layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
+	layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
+	layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk;
+
+	raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
+
+	return (0);
+}
+
+/* Default number of floating reconstruction buffers for RAID 5. */
+int
+rf_GetDefaultNumFloatingReconBuffersRAID5(RF_Raid_t * raidPtr)
+{
+	return (20);
+}
+
+/* Default head-separation limit used during RAID 5 reconstruction. */
+RF_HeadSepLimit_t
+rf_GetDefaultHeadSepLimitRAID5(RF_Raid_t * raidPtr)
+{
+	return (10);
+}
+#if !defined(__NetBSD__) && !defined(__FreeBSD__) && !defined(_KERNEL)
+/* not currently used */
+/* Stub: nothing to tear down; compiled only for non-kernel builds. */
+int
+rf_ShutdownRAID5(RF_Raid_t * raidPtr)
+{
+	return (0);
+}
+#endif
+
+/* Map a RAID address to the (row, col, sector) of its data unit. */
+void
+rf_MapSectorRAID5(
+	RF_Raid_t * raidPtr,
+	RF_RaidAddr_t raidSector,
+	RF_RowCol_t * row,
+	RF_RowCol_t * col,
+	RF_SectorNum_t * diskSector,
+	int remap)
+{
+	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+	RF_StripeNum_t su = raidSector / layoutPtr->sectorsPerStripeUnit;
+
+	*row = 0;
+	*col = su % raidPtr->numCol;
+	*diskSector = (su / layoutPtr->numDataCol) * layoutPtr->sectorsPerStripeUnit +
+	    raidSector % layoutPtr->sectorsPerStripeUnit;
+}
+
+/* Map a RAID address to the (row, col, sector) of its rotated parity unit. */
+void
+rf_MapParityRAID5(
+	RF_Raid_t * raidPtr,
+	RF_RaidAddr_t raidSector,
+	RF_RowCol_t * row,
+	RF_RowCol_t * col,
+	RF_SectorNum_t * diskSector,
+	int remap)
+{
+	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+	RF_StripeNum_t su = raidSector / layoutPtr->sectorsPerStripeUnit;
+
+	*row = 0;
+	/* Parity rotates backwards by one column per stripe. */
+	*col = layoutPtr->numDataCol - (su / layoutPtr->numDataCol) % raidPtr->numCol;
+	*diskSector = (su / layoutPtr->numDataCol) * layoutPtr->sectorsPerStripeUnit +
+	    raidSector % layoutPtr->sectorsPerStripeUnit;
+}
+
+/* Report the disks touched by the stripe containing addr, in order. */
+void
+rf_IdentifyStripeRAID5(
+	RF_Raid_t * raidPtr,
+	RF_RaidAddr_t addr,
+	RF_RowCol_t ** diskids,
+	RF_RowCol_t * outRow)
+{
+	RF_Raid5ConfigInfo_t *info;
+	RF_StripeNum_t sid;
+
+	info = (RF_Raid5ConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+	sid = rf_RaidAddressToStripeID(&raidPtr->Layout, addr);
+	*outRow = 0;
+	*diskids = info->stripeIdentifier[sid % raidPtr->numCol];
+}
+
+/* Stripes and parity stripes coincide 1:1 in RAID 5. */
+void
+rf_MapSIDToPSIDRAID5(
+	RF_RaidLayout_t * layoutPtr,
+	RF_StripeNum_t stripeID,
+	RF_StripeNum_t * psID,
+	RF_ReconUnitNum_t * which_ru)
+{
+	*psID = stripeID;
+	*which_ru = 0;
+}
+/* select an algorithm for performing an access. Returns two pointers,
+ * one to a function that will return information about the DAG, and
+ * another to a function that will create the dag.
+ */
+void
+rf_RaidFiveDagSelect(
+	RF_Raid_t * raidPtr,
+	RF_IoType_t type,
+	RF_AccessStripeMap_t * asmap,
+	RF_VoidFuncPtr * createFunc)
+{
+	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+	RF_PhysDiskAddr_t *failedPDA = NULL;
+	RF_RowCol_t frow, fcol;
+	RF_RowStatus_t rstat;
+	int prior_recon;
+
+	RF_ASSERT(RF_IO_IS_R_OR_W(type));
+
+	if (asmap->numDataFailed + asmap->numParityFailed > 1) {
+		RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n");
+		/* *infoFunc = */ *createFunc = NULL;
+		return;
+	} else
+		if (asmap->numDataFailed + asmap->numParityFailed == 1) {
+
+			/* if under recon & already reconstructed, redirect
+			 * the access to the spare drive and eliminate the
+			 * failure indication */
+			failedPDA = asmap->failedPDAs[0];
+			frow = failedPDA->row;
+			fcol = failedPDA->col;
+			rstat = raidPtr->status[failedPDA->row];
+			/* prior_recon: the failed unit has already been
+			 * rebuilt (or the row reconfigured), so the spare
+			 * can be used in its place */
+			prior_recon = (rstat == rf_rs_reconfigured) || (
+			    (rstat == rf_rs_reconstructing) ?
+			    rf_CheckRUReconstructed(raidPtr->reconControl[frow]->reconMap, failedPDA->startSector) : 0
+			    );
+			if (prior_recon) {
+				/* remember the original location for the
+				 * debug printf below */
+				RF_RowCol_t or = failedPDA->row, oc = failedPDA->col;
+				RF_SectorNum_t oo = failedPDA->startSector;
+
+				if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {	/* redirect to dist
+										 * spare space */
+
+					if (failedPDA == asmap->parityInfo) {
+
+						/* parity has failed */
+						(layoutPtr->map->MapParity) (raidPtr, failedPDA->raidAddress, &failedPDA->row,
+						    &failedPDA->col, &failedPDA->startSector, RF_REMAP);
+
+						if (asmap->parityInfo->next) {	/* redir 2nd component,
+										 * if any */
+							RF_PhysDiskAddr_t *p = asmap->parityInfo->next;
+							RF_SectorNum_t SUoffs = p->startSector % layoutPtr->sectorsPerStripeUnit;
+							p->row = failedPDA->row;
+							p->col = failedPDA->col;
+							p->startSector = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, failedPDA->startSector) +
+							    SUoffs;	/* cheating:
+									 * startSector is not
+									 * really a RAID address */
+						}
+					} else
+						if (asmap->parityInfo->next && failedPDA == asmap->parityInfo->next) {
+							RF_ASSERT(0);	/* should not ever
+									 * happen */
+						} else {
+
+							/* data has failed */
+							(layoutPtr->map->MapSector) (raidPtr, failedPDA->raidAddress, &failedPDA->row,
+							    &failedPDA->col, &failedPDA->startSector, RF_REMAP);
+
+						}
+
+				} else {	/* redirect to dedicated spare
+						 * space */
+
+					failedPDA->row = raidPtr->Disks[frow][fcol].spareRow;
+					failedPDA->col = raidPtr->Disks[frow][fcol].spareCol;
+
+					/* the parity may have two distinct
+					 * components, both of which may need
+					 * to be redirected */
+					if (asmap->parityInfo->next) {
+						if (failedPDA == asmap->parityInfo) {
+							failedPDA->next->row = failedPDA->row;
+							failedPDA->next->col = failedPDA->col;
+						} else
+							if (failedPDA == asmap->parityInfo->next) {	/* paranoid: should
+													 * never occur */
+								asmap->parityInfo->row = failedPDA->row;
+								asmap->parityInfo->col = failedPDA->col;
+							}
+					}
+				}
+
+				RF_ASSERT(failedPDA->col != -1);
+
+				if (rf_dagDebug || rf_mapDebug) {
+					printf("raid%d: Redirected type '%c' r %d c %d o %ld -> r %d c %d o %ld\n",
+					    raidPtr->raidid, type, or, oc,
+					    (long) oo, failedPDA->row,
+					    failedPDA->col,
+					    (long) failedPDA->startSector);
+				}
+				asmap->numDataFailed = asmap->numParityFailed = 0;
+			}
+		}
+	/* all dags begin/end with block/unblock node therefore, hdrSucc &
+	 * termAnt counts should always be 1 also, these counts should not be
+	 * visible outside dag creation routines - manipulating the counts
+	 * here should be removed */
+	if (type == RF_IO_TYPE_READ) {
+		if (asmap->numDataFailed == 0)
+			*createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG;
+		else
+			*createFunc = (RF_VoidFuncPtr) rf_CreateRaidFiveDegradedReadDAG;
+	} else {
+
+
+		/* if mirroring, always use large writes.  If the access
+		 * requires two distinct parity updates, always do a small
+		 * write.  If the stripe contains a failure but the access
+		 * does not, do a small write.  The first conditional
+		 * (numStripeUnitsAccessed <= numDataCol/2) uses a
+		 * less-than-or-equal rather than just a less-than because
+		 * when G is 3 or 4, numDataCol/2 is 1, and I want
+		 * single-stripe-unit updates to use just one disk. */
+		if ((asmap->numDataFailed + asmap->numParityFailed) == 0) {
+			if (rf_suppressLocksAndLargeWrites ||
+			    (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) ||
+				(asmap->parityInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) {
+				*createFunc = (RF_VoidFuncPtr) rf_CreateSmallWriteDAG;
+			} else
+				*createFunc = (RF_VoidFuncPtr) rf_CreateLargeWriteDAG;
+		} else {
+			if (asmap->numParityFailed == 1)
+				*createFunc = (RF_VoidFuncPtr) rf_CreateNonRedundantWriteDAG;
+			else
+				if (asmap->numStripeUnitsAccessed != 1 && failedPDA->numSector != layoutPtr->sectorsPerStripeUnit)
+					*createFunc = NULL;
+				else
+					*createFunc = (RF_VoidFuncPtr) rf_CreateDegradedWriteDAG;
+		}
+	}
+}
diff --git a/sys/dev/raidframe/rf_raid5.h b/sys/dev/raidframe/rf_raid5.h
new file mode 100644
index 0000000..17549fe
--- /dev/null
+++ b/sys/dev/raidframe/rf_raid5.h
@@ -0,0 +1,57 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_raid5.h,v 1.3 1999/02/05 00:06:16 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_raid5.h - header file for RAID Level 5 */
+
+#ifndef _RF__RF_RAID5_H_
+#define _RF__RF_RAID5_H_
+
+int
+rf_ConfigureRAID5(RF_ShutdownList_t ** listp, RF_Raid_t * raidPtr,
+ RF_Config_t * cfgPtr);
+int rf_GetDefaultNumFloatingReconBuffersRAID5(RF_Raid_t * raidPtr);
+RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitRAID5(RF_Raid_t * raidPtr);
+void
+rf_MapSectorRAID5(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
+void
+rf_MapParityRAID5(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
+void
+rf_IdentifyStripeRAID5(RF_Raid_t * raidPtr, RF_RaidAddr_t addr,
+ RF_RowCol_t ** diskids, RF_RowCol_t * outRow);
+void
+rf_MapSIDToPSIDRAID5(RF_RaidLayout_t * layoutPtr,
+ RF_StripeNum_t stripeID, RF_StripeNum_t * psID,
+ RF_ReconUnitNum_t * which_ru);
+void
+rf_RaidFiveDagSelect(RF_Raid_t * raidPtr, RF_IoType_t type,
+ RF_AccessStripeMap_t * asmap, RF_VoidFuncPtr * createFunc);
+
+#endif /* !_RF__RF_RAID5_H_ */
diff --git a/sys/dev/raidframe/rf_raid5_rotatedspare.c b/sys/dev/raidframe/rf_raid5_rotatedspare.c
new file mode 100644
index 0000000..5c17b43
--- /dev/null
+++ b/sys/dev/raidframe/rf_raid5_rotatedspare.c
@@ -0,0 +1,175 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_raid5_rotatedspare.c,v 1.5 2001/01/26 05:16:58 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Khalil Amiri
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/**************************************************************************
+ *
+ * rf_raid5_rotated_spare.c -- implements RAID Level 5 with rotated sparing
+ *
+ **************************************************************************/
+
+#include <dev/raidframe/rf_archs.h>
+
+#if RF_INCLUDE_RAID5_RS > 0
+
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_raid5.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_dagutils.h>
+#include <dev/raidframe/rf_dagfuncs.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_utils.h>
+#include <dev/raidframe/rf_raid5_rotatedspare.h>
+
+/* layout-specific state for the rotated-spare RAID5 layout; allocated at
+ * configuration time and hung off layoutPtr->layoutSpecificInfo */
+typedef struct RF_Raid5RSConfigInfo_s {
+ RF_RowCol_t **stripeIdentifier; /* filled in at config time & used by
+ * IdentifyStripe */
+} RF_Raid5RSConfigInfo_t;
+
+/*
+ * rf_ConfigureRAID5_RS -- configure a RAID level 5 array that uses
+ * rotated (distributed) sparing.
+ *
+ * Allocates the layout-specific info structure, precomputes the
+ * per-stripe disk-ordering table used by IdentifyStripe, and fills in
+ * the remaining layout parameters.  Of the numCol columns, numCol - 2
+ * hold data in each stripe; the other two stripe units hold the
+ * parity and the rotated spare unit.
+ *
+ * Returns 0 on success or ENOMEM if an allocation fails (allocations
+ * are charged to raidPtr->cleanupList, so partial allocations are
+ * reclaimed at shutdown).
+ */
+int
+rf_ConfigureRAID5_RS(
+ RF_ShutdownList_t ** listp,
+ RF_Raid_t * raidPtr,
+ RF_Config_t * cfgPtr)
+{
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_Raid5RSConfigInfo_t *info;
+ RF_RowCol_t i, j, startdisk;
+
+ /* create a RAID level 5 configuration structure */
+ RF_MallocAndAdd(info, sizeof(RF_Raid5RSConfigInfo_t), (RF_Raid5RSConfigInfo_t *), raidPtr->cleanupList);
+ if (info == NULL)
+ return (ENOMEM);
+ layoutPtr->layoutSpecificInfo = (void *) info;
+
+ /* single-row layout; needs at least data + parity + spare columns */
+ RF_ASSERT(raidPtr->numRow == 1);
+ RF_ASSERT(raidPtr->numCol >= 3);
+
+ /* the stripe identifier must identify the disks in each stripe, IN
+ * THE ORDER THAT THEY APPEAR IN THE STRIPE. */
+ info->stripeIdentifier = rf_make_2d_array(raidPtr->numCol, raidPtr->numCol, raidPtr->cleanupList);
+ if (info->stripeIdentifier == NULL)
+ return (ENOMEM);
+ startdisk = 0;
+ for (i = 0; i < raidPtr->numCol; i++) {
+ for (j = 0; j < raidPtr->numCol; j++) {
+ info->stripeIdentifier[i][j] = (startdisk + j) % raidPtr->numCol;
+ }
+ /* each successive stripe starts one disk "earlier",
+ * rotating the layout across the whole array */
+ if ((--startdisk) < 0)
+ startdisk = raidPtr->numCol - 1;
+ }
+
+ /* fill in the remaining layout parameters */
+ layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk;
+ layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
+ layoutPtr->numDataCol = raidPtr->numCol - 2;
+ layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
+ layoutPtr->numParityCol = 1;
+ layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk;
+ raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit;
+
+ raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
+
+ return (0);
+}
+
+/*
+ * Return the number of spare reconstruction units per disk: one
+ * stripe unit in every numCol is reserved as the rotated spare.
+ */
+RF_ReconUnitCount_t
+rf_GetNumSpareRUsRAID5_RS(raidPtr)
+ RF_Raid_t *raidPtr;
+{
+ return (raidPtr->Layout.stripeUnitsPerDisk / raidPtr->numCol);
+}
+
+/*
+ * Map a sector of the raid address space to its physical location.
+ * row is always 0 (single-row layout); col rotates with the stripe so
+ * that data, parity, and spare units are distributed over all disks.
+ * If "remap" is set, the sector is redirected to the stripe's rotated
+ * spare unit (one column past the parity unit) instead of its data
+ * unit.
+ */
+void
+rf_MapSectorRAID5_RS(
+ RF_Raid_t * raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t * row,
+ RF_RowCol_t * col,
+ RF_SectorNum_t * diskSector,
+ int remap)
+{
+ RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
+
+ *row = 0;
+ if (remap) {
+ *col = raidPtr->numCol - 1 - (1 + SUID / raidPtr->Layout.numDataCol) % raidPtr->numCol;
+ *col = (*col + 1) % raidPtr->numCol; /* spare unit is rotated
+ * with parity; line
+ * above maps to parity */
+ } else {
+ *col = (SUID + (SUID / raidPtr->Layout.numDataCol)) % raidPtr->numCol;
+ }
+ /* offset within the disk: stripe number on this disk times the
+ * stripe unit size, plus the offset within the stripe unit */
+ *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit +
+ (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+}
+
+/*
+ * Map a raid address to the physical location of its parity unit.
+ * With "remap" set the access is redirected to the rotated spare
+ * unit, which lives one column past the parity unit.
+ */
+void
+rf_MapParityRAID5_RS(
+ RF_Raid_t * raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t * row,
+ RF_RowCol_t * col,
+ RF_SectorNum_t * diskSector,
+ int remap)
+{
+ RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
+
+ *row = 0;
+ *col = raidPtr->numCol - 1 - (1 + SUID / raidPtr->Layout.numDataCol) % raidPtr->numCol;
+ *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit +
+ (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+ if (remap)
+ *col = (*col + 1) % raidPtr->numCol; /* spare rotates with parity */
+}
+
+/*
+ * Return (via diskids) the list of disks touched by the stripe
+ * containing "addr", in stripe order, from the table precomputed at
+ * configuration time; outRow is always 0 for this layout.
+ */
+void
+rf_IdentifyStripeRAID5_RS(
+ RF_Raid_t * raidPtr,
+ RF_RaidAddr_t addr,
+ RF_RowCol_t ** diskids,
+ RF_RowCol_t * outRow)
+{
+ RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout, addr);
+ RF_Raid5RSConfigInfo_t *info = (RF_Raid5RSConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+ *outRow = 0;
+ *diskids = info->stripeIdentifier[stripeID % raidPtr->numCol];
+
+}
+
+/*
+ * Stripe -> parity-stripe mapping: the identity for this layout
+ * (one stripe per parity stripe, always reconstruction unit 0).
+ */
+void
+rf_MapSIDToPSIDRAID5_RS(
+ RF_RaidLayout_t * layoutPtr,
+ RF_StripeNum_t stripeID,
+ RF_StripeNum_t * psID,
+ RF_ReconUnitNum_t * which_ru)
+{
+ *which_ru = 0;
+ *psID = stripeID;
+}
+#endif /* RF_INCLUDE_RAID5_RS > 0 */
diff --git a/sys/dev/raidframe/rf_raid5_rotatedspare.h b/sys/dev/raidframe/rf_raid5_rotatedspare.h
new file mode 100644
index 0000000..779150f
--- /dev/null
+++ b/sys/dev/raidframe/rf_raid5_rotatedspare.h
@@ -0,0 +1,53 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_raid5_rotatedspare.h,v 1.3 1999/02/05 00:06:16 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Khalil Amiri
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_raid5_rotatedspare.h - header file for RAID Level 5 with rotated sparing */
+
+#ifndef _RF__RF_RAID5_ROTATEDSPARE_H_
+#define _RF__RF_RAID5_ROTATEDSPARE_H_
+
+int
+rf_ConfigureRAID5_RS(RF_ShutdownList_t ** listp, RF_Raid_t * raidPtr,
+ RF_Config_t * cfgPtr);
+RF_ReconUnitCount_t rf_GetNumSpareRUsRAID5_RS(RF_Raid_t * raidPtr);
+void
+rf_MapSectorRAID5_RS(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
+void
+rf_MapParityRAID5_RS(RF_Raid_t * raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t * row, RF_RowCol_t * col, RF_SectorNum_t * diskSector, int remap);
+void
+rf_IdentifyStripeRAID5_RS(RF_Raid_t * raidPtr, RF_RaidAddr_t addr,
+ RF_RowCol_t ** diskids, RF_RowCol_t * outRow);
+void
+rf_MapSIDToPSIDRAID5_RS(RF_RaidLayout_t * layoutPtr,
+ RF_StripeNum_t stripeID, RF_StripeNum_t * psID,
+ RF_ReconUnitNum_t * which_ru);
+
+#endif /* !_RF__RF_RAID5_ROTATEDSPARE_H_ */
diff --git a/sys/dev/raidframe/rf_raidframe.h b/sys/dev/raidframe/rf_raidframe.h
new file mode 100644
index 0000000..fd711bd
--- /dev/null
+++ b/sys/dev/raidframe/rf_raidframe.h
@@ -0,0 +1,162 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_raidframe.h,v 1.11 2000/05/28 00:48:31 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************
+ *
+ * rf_raidframe.h
+ *
+ * main header file for using raidframe in the kernel.
+ *
+ *****************************************************/
+
+
+#ifndef _RF__RF_RAIDFRAME_H_
+#define _RF__RF_RAIDFRAME_H_
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_configure.h>
+#include <dev/raidframe/rf_disks.h>
+#include <dev/raidframe/rf_raid.h>
+
+typedef RF_uint32 RF_ReconReqFlags_t;
+
+struct rf_recon_req { /* used to tell the kernel to fail a disk */
+ RF_RowCol_t row, col; /* coordinates of the disk to fail */
+ RF_ReconReqFlags_t flags; /* RF_FDFLAGS_* flags, defined below */
+ void *raidPtr; /* used internally; need not be set at ioctl
+ * time */
+ struct rf_recon_req *next; /* used internally; need not be set at
+ * ioctl time */
+};
+
+/* exchanged with the user-land sparemap daemon via the
+ * RAIDFRAME_SPARET_WAIT / RAIDFRAME_SEND_SPARET ioctls defined below */
+struct RF_SparetWait_s {
+ int C, G, fcol; /* C = # disks in row, G = # units in stripe,
+ * fcol = which disk has failed */
+
+ RF_StripeCount_t SUsPerPU; /* this stuff is the info required to
+ * create a spare table */
+ int TablesPerSpareRegion;
+ int BlocksPerTable;
+ RF_StripeCount_t TableDepthInPUs;
+ RF_StripeCount_t SpareSpaceDepthPerRegionInSUs;
+
+ RF_SparetWait_t *next; /* used internally; need not be set at ioctl
+ * time */
+};
+
+/* snapshot of the array configuration returned by RAIDFRAME_GET_INFO */
+typedef struct RF_DeviceConfig_s {
+ u_int rows; /* # of rows in the array */
+ u_int cols; /* # of columns (disks per row) */
+ u_int maxqdepth; /* NOTE(review): presumably max per-disk queue
+ * depth -- confirm against the driver */
+ int ndevs; /* # of valid entries in devs[] */
+ RF_RaidDisk_t devs[RF_MAX_DISKS];
+ int nspares; /* # of valid entries in spares[] */
+ RF_RaidDisk_t spares[RF_MAX_DISKS];
+} RF_DeviceConfig_t;
+
+/* progress report returned by the *_STATUS_EXT ioctls defined below */
+typedef struct RF_ProgressInfo_s {
+ RF_uint64 remaining;
+ RF_uint64 completed;
+ RF_uint64 total;
+} RF_ProgressInfo_t;
+
+/* flags that can be put in the rf_recon_req structure */
+#define RF_FDFLAGS_NONE 0x0 /* just fail the disk */
+#define RF_FDFLAGS_RECON 0x1 /* fail and initiate recon */
+
+#define RAIDFRAME_CONFIGURE _IOW ('r', 1, void *) /* config an array */
+#if defined(__NetBSD__)
+#define RAIDFRAME_SHUTDOWN _IO ('r', 2) /* shutdown the array */
+#elif defined(__FreeBSD__)
+#define RAIDFRAME_SHUTDOWN _IOW ('r', 2, int) /* shutdown the array */
+#endif
+#define RAIDFRAME_TUR _IOW ('r', 3, dev_t) /* debug only: test
+ * ready */
+#define RAIDFRAME_TEST_ACC _IOWR('r', 4, struct rf_test_acc)
+ /* run a test access */
+#define RAIDFRAME_FAIL_DISK _IOW ('r', 5, struct rf_recon_req)
+ /* fail a disk &
+ * optionally start
+ * recon */
+#define RAIDFRAME_CHECK_RECON_STATUS _IOR('r', 6, int) /* get reconstruction %
+ * complete on indicated
+ * row */
+#define RAIDFRAME_REWRITEPARITY _IO ('r', 7) /* rewrite (initialize)
+ * all parity */
+#define RAIDFRAME_COPYBACK _IO ('r', 8) /* copy reconstructed
+ * data back to replaced
+ * disk */
+#define RAIDFRAME_SPARET_WAIT _IOR ('r', 9, RF_SparetWait_t)
+ /* does not return until
+ * kernel needs a spare
+ * table */
+#define RAIDFRAME_SEND_SPARET _IOW ('r', 10, void *) /* used to send a spare
+ * table down into the
+ * kernel */
+#define RAIDFRAME_ABORT_SPARET_WAIT _IO ('r', 11) /* used to wake up the
+ * sparemap daemon &
+ * tell it to exit */
+#define RAIDFRAME_START_ATRACE _IO ('r', 12) /* start tracing
+ * accesses */
+#define RAIDFRAME_STOP_ATRACE _IO ('r', 13) /* stop tracing
+ * accesses */
+#define RAIDFRAME_GET_SIZE _IOR ('r', 14, int) /* get size (# sectors)
+ * in raid device */
+#define RAIDFRAME_GET_INFO _IOWR ('r', 15, RF_DeviceConfig_t *)
+ /* get configuration */
+#define RAIDFRAME_RESET_ACCTOTALS _IO ('r', 16) /* reset AccTotals for
+ * device */
+#define RAIDFRAME_GET_ACCTOTALS _IOR ('r', 17, RF_AccTotals_t)
+ /* retrieve AccTotals
+ * for device */
+#define RAIDFRAME_KEEP_ACCTOTALS _IOW ('r', 18, int) /* turn AccTotals on or
+ * off for device */
+#define RAIDFRAME_GET_COMPONENT_LABEL _IOWR ('r', 19, RF_ComponentLabel_t)
+#define RAIDFRAME_SET_COMPONENT_LABEL _IOW ('r', 20, RF_ComponentLabel_t)
+
+#define RAIDFRAME_INIT_LABELS _IOW ('r', 21, RF_ComponentLabel_t)
+#define RAIDFRAME_ADD_HOT_SPARE _IOW ('r', 22, RF_SingleComponent_t)
+#define RAIDFRAME_REMOVE_HOT_SPARE _IOW ('r', 23, RF_SingleComponent_t)
+#define RAIDFRAME_REBUILD_IN_PLACE _IOW ('r', 24, RF_SingleComponent_t)
+#define RAIDFRAME_CHECK_PARITY _IOWR ('r', 25, int)
+#define RAIDFRAME_CHECK_PARITYREWRITE_STATUS _IOR ('r', 26, int)
+#define RAIDFRAME_CHECK_COPYBACK_STATUS _IOR ('r', 27, int)
+#define RAIDFRAME_SET_AUTOCONFIG _IOWR ('r', 28, int)
+#define RAIDFRAME_SET_ROOT _IOWR ('r', 29, int)
+#define RAIDFRAME_DELETE_COMPONENT _IOW ('r', 30, RF_SingleComponent_t)
+#define RAIDFRAME_INCORPORATE_HOT_SPARE _IOW ('r', 31, RF_SingleComponent_t)
+
+/* 'Extended' status versions */
+#define RAIDFRAME_CHECK_RECON_STATUS_EXT _IOR('r', 32, RF_ProgressInfo_t)
+#define RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT _IOR ('r', 33, \
+ RF_ProgressInfo_t)
+#define RAIDFRAME_CHECK_COPYBACK_STATUS_EXT _IOR ('r', 34, RF_ProgressInfo_t)
+#define RAIDFRAME_GET_UNIT _IOWR ('r', 35, int)
+
+#endif /* !_RF__RF_RAIDFRAME_H_ */
diff --git a/sys/dev/raidframe/rf_reconbuffer.c b/sys/dev/raidframe/rf_reconbuffer.c
new file mode 100644
index 0000000..1f38a82
--- /dev/null
+++ b/sys/dev/raidframe/rf_reconbuffer.c
@@ -0,0 +1,466 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_reconbuffer.c,v 1.5 2001/01/27 20:10:49 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/***************************************************
+ *
+ * rf_reconbuffer.c -- reconstruction buffer manager
+ *
+ ***************************************************/
+
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_reconbuffer.h>
+#include <dev/raidframe/rf_acctrace.h>
+#include <dev/raidframe/rf_etimer.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_debugprint.h>
+#include <dev/raidframe/rf_revent.h>
+#include <dev/raidframe/rf_reconutil.h>
+#include <dev/raidframe/rf_nwayxor.h>
+
+#define Dprintf1(s,a) if (rf_reconbufferDebug) printf(s,a)
+#define Dprintf2(s,a,b) if (rf_reconbufferDebug) printf(s,a,b)
+#define Dprintf3(s,a,b,c) if (rf_reconbufferDebug) printf(s,a,b,c)
+#define Dprintf4(s,a,b,c,d) if (rf_reconbufferDebug) printf(s,a,b,c,d)
+#define Dprintf5(s,a,b,c,d,e) if (rf_reconbufferDebug) printf(s,a,b,c,d,e)
+
+/*****************************************************************************
+ *
+ * Submit a reconstruction buffer to the manager for XOR. We can only
+ * submit a buffer if (1) we can xor into an existing buffer, which
+ * means we don't have to acquire a new one, (2) we can acquire a
+ * floating recon buffer, or (3) the caller has indicated that we are
+ * allowed to keep the submitted buffer.
+ *
+ * Returns non-zero if and only if we were not able to submit.
+ * In this case, we append the current disk ID to the wait list on the
+ * indicated RU, so that it will be re-enabled when we acquire a buffer
+ * for this RU.
+ *
+ ****************************************************************************/
+
+/*
+ * nWayXorFuncs[i] is a pointer to a function that will xor "i"
+ * bufs into the accumulating sum.
+ */
+static RF_VoidFuncPtr nWayXorFuncs[] = {
+ NULL, /* slot unused: a 0-way XOR has no sources */
+ (RF_VoidFuncPtr) rf_nWayXor1,
+ (RF_VoidFuncPtr) rf_nWayXor2,
+ (RF_VoidFuncPtr) rf_nWayXor3,
+ (RF_VoidFuncPtr) rf_nWayXor4,
+ (RF_VoidFuncPtr) rf_nWayXor5,
+ (RF_VoidFuncPtr) rf_nWayXor6,
+ (RF_VoidFuncPtr) rf_nWayXor7,
+ (RF_VoidFuncPtr) rf_nWayXor8,
+ (RF_VoidFuncPtr) rf_nWayXor9
+ /* table tops out at 9 sources; rf_MultiWayReconXor indexes it
+ * directly (see the XXX there about more than 9 bufs) */
+};
+
+/*
+ * rf_SubmitReconBuffer -- entry point for submitting a recon buffer
+ * for XOR.  Dispatches to the layout-specific SubmitReconBuffer
+ * method (e.g. rf_SubmitReconBufferBasic) and returns its status:
+ * non-zero if and only if the buffer could not be submitted.
+ */
+int
+rf_SubmitReconBuffer(rbuf, keep_it, use_committed)
+ RF_ReconBuffer_t *rbuf; /* the recon buffer to submit */
+ int keep_it; /* whether we can keep this buffer or we have
+ * to return it */
+ int use_committed; /* whether to use a committed or an available
+ * recon buffer */
+{
+ RF_LayoutSW_t *lp;
+ int rc;
+
+ lp = rbuf->raidPtr->Layout.map;
+ rc = lp->SubmitReconBuffer(rbuf, keep_it, use_committed);
+ return (rc);
+}
+
+/*
+ * rf_SubmitReconBufferBasic -- the generic buffer-submission path.
+ *
+ * If a destination buffer already exists for this parity stripe and
+ * enough source buffers have accumulated (or this is the stripe's
+ * last submission), XOR immediately.  Otherwise acquire an rbuf to
+ * hold the data: keep the caller's (keep_it), take a committed one
+ * (use_committed), or take a floating one.  If none can be had, the
+ * caller is queued on the recon ctrl buffer-wait list and 1 is
+ * returned; on success the buffer is installed for a later XOR and 0
+ * is returned.
+ *
+ * Takes and releases both the PSS mutex and the rb_mutex.
+ */
+int
+rf_SubmitReconBufferBasic(rbuf, keep_it, use_committed)
+ RF_ReconBuffer_t *rbuf; /* the recon buffer to submit */
+ int keep_it; /* whether we can keep this buffer or we have
+ * to return it */
+ int use_committed; /* whether to use a committed or an available
+ * recon buffer */
+{
+ RF_Raid_t *raidPtr = rbuf->raidPtr;
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl[rbuf->row];
+ RF_ReconParityStripeStatus_t *pssPtr;
+ RF_ReconBuffer_t *targetRbuf, *t = NULL; /* temporary rbuf
+ * pointers */
+ caddr_t ta; /* temporary data buffer pointer */
+ RF_CallbackDesc_t *cb, *p;
+ int retcode = 0, created = 0;
+
+ RF_Etimer_t timer;
+
+ /* makes no sense to have a submission from the failed disk */
+ RF_ASSERT(rbuf);
+ RF_ASSERT(rbuf->col != reconCtrlPtr->fcol);
+
+ Dprintf5("RECON: submission by row %d col %d for psid %ld ru %d (failed offset %ld)\n",
+ rbuf->row, rbuf->col, (long) rbuf->parityStripeID, rbuf->which_ru, (long) rbuf->failedDiskSectorOffset);
+
+ RF_LOCK_PSS_MUTEX(raidPtr, rbuf->row, rbuf->parityStripeID);
+
+ RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
+
+ pssPtr = rf_LookupRUStatus(raidPtr, reconCtrlPtr->pssTable, rbuf->parityStripeID, rbuf->which_ru, RF_PSS_NONE, &created);
+ RF_ASSERT(pssPtr); /* if it didn't exist, we wouldn't have gotten
+ * an rbuf for it */
+
+ /* check to see if enough buffers have accumulated to do an XOR. If
+ * so, there's no need to acquire a floating rbuf. Before we can do
+ * any XORing, we must have acquired a destination buffer. If we
+ * have, then we can go ahead and do the XOR if (1) including this
+ * buffer, enough bufs have accumulated, or (2) this is the last
+ * submission for this stripe. Otherwise, we have to go acquire a
+ * floating rbuf. */
+
+ targetRbuf = (RF_ReconBuffer_t *) pssPtr->rbuf;
+ if ((targetRbuf != NULL) &&
+ ((pssPtr->xorBufCount == rf_numBufsToAccumulate - 1) || (targetRbuf->count + pssPtr->xorBufCount + 1 == layoutPtr->numDataCol))) {
+ pssPtr->rbufsForXor[pssPtr->xorBufCount++] = rbuf; /* install this buffer */
+ Dprintf3("RECON: row %d col %d invoking a %d-way XOR\n", rbuf->row, rbuf->col, pssPtr->xorBufCount);
+ RF_ETIMER_START(timer);
+ rf_MultiWayReconXor(raidPtr, pssPtr);
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ raidPtr->accumXorTimeUs += RF_ETIMER_VAL_US(timer);
+ if (!keep_it) {
+ /* caller's buffer goes back to the caller; log the
+ * trace record for this disk's submission now */
+ raidPtr->recon_tracerecs[rbuf->col].xor_us = RF_ETIMER_VAL_US(timer);
+ RF_ETIMER_STOP(raidPtr->recon_tracerecs[rbuf->col].recon_timer);
+ RF_ETIMER_EVAL(raidPtr->recon_tracerecs[rbuf->col].recon_timer);
+ raidPtr->recon_tracerecs[rbuf->col].specific.recon.recon_return_to_submit_us +=
+ RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[rbuf->col].recon_timer);
+ RF_ETIMER_START(raidPtr->recon_tracerecs[rbuf->col].recon_timer);
+
+ rf_LogTraceRec(raidPtr, &raidPtr->recon_tracerecs[rbuf->col]);
+ }
+ rf_CheckForFullRbuf(raidPtr, reconCtrlPtr, pssPtr, layoutPtr->numDataCol);
+
+ /* if use_committed is on, we _must_ consume a buffer off the
+ * committed list. */
+ if (use_committed) {
+ t = reconCtrlPtr->committedRbufs;
+ RF_ASSERT(t);
+ reconCtrlPtr->committedRbufs = t->next;
+ rf_ReleaseFloatingReconBuffer(raidPtr, rbuf->row, t);
+ }
+ if (keep_it) {
+ RF_UNLOCK_PSS_MUTEX(raidPtr, rbuf->row, rbuf->parityStripeID);
+ RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
+ rf_FreeReconBuffer(rbuf);
+ return (retcode);
+ }
+ goto out;
+ }
+ /* set the value of "t", which we'll use as the rbuf from here on */
+ if (keep_it) {
+ t = rbuf;
+ } else {
+ if (use_committed) { /* if a buffer has been committed to
+ * us, use it */
+ t = reconCtrlPtr->committedRbufs;
+ RF_ASSERT(t);
+ reconCtrlPtr->committedRbufs = t->next;
+ t->next = NULL;
+ } else
+ if (reconCtrlPtr->floatingRbufs) {
+ t = reconCtrlPtr->floatingRbufs;
+ reconCtrlPtr->floatingRbufs = t->next;
+ t->next = NULL;
+ }
+ }
+
+ /* If we weren't able to acquire a buffer, append to the end of the
+ * buf list in the recon ctrl struct. */
+ if (!t) {
+ RF_ASSERT(!keep_it && !use_committed);
+ Dprintf2("RECON: row %d col %d failed to acquire floating rbuf\n", rbuf->row, rbuf->col);
+
+ raidPtr->procsInBufWait++;
+ /* if every surviving disk is waiting and no full buffer
+ * exists to drain, no progress is possible */
+ if ((raidPtr->procsInBufWait == raidPtr->numCol - 1) && (raidPtr->numFullReconBuffers == 0)) {
+ printf("Buffer wait deadlock detected. Exiting.\n");
+ rf_PrintPSStatusTable(raidPtr, rbuf->row);
+ RF_PANIC();
+ }
+ pssPtr->flags |= RF_PSS_BUFFERWAIT;
+ cb = rf_AllocCallbackDesc(); /* append to buf wait list in
+ * recon ctrl structure */
+ cb->row = rbuf->row;
+ cb->col = rbuf->col;
+ cb->callbackArg.v = rbuf->parityStripeID;
+ cb->callbackArg2.v = rbuf->which_ru;
+ cb->next = NULL;
+ if (!reconCtrlPtr->bufferWaitList)
+ reconCtrlPtr->bufferWaitList = cb;
+ else { /* might want to maintain head/tail pointers
+ * here rather than search for end of list */
+ for (p = reconCtrlPtr->bufferWaitList; p->next; p = p->next);
+ p->next = cb;
+ }
+ retcode = 1;
+ goto out;
+ }
+ Dprintf2("RECON: row %d col %d acquired rbuf\n", rbuf->row, rbuf->col);
+ RF_ETIMER_STOP(raidPtr->recon_tracerecs[rbuf->col].recon_timer);
+ RF_ETIMER_EVAL(raidPtr->recon_tracerecs[rbuf->col].recon_timer);
+ raidPtr->recon_tracerecs[rbuf->col].specific.recon.recon_return_to_submit_us +=
+ RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[rbuf->col].recon_timer);
+ RF_ETIMER_START(raidPtr->recon_tracerecs[rbuf->col].recon_timer);
+
+ rf_LogTraceRec(raidPtr, &raidPtr->recon_tracerecs[rbuf->col]);
+
+ /* initialize the buffer */
+ if (t != rbuf) {
+ t->row = rbuf->row;
+ t->col = reconCtrlPtr->fcol;
+ t->parityStripeID = rbuf->parityStripeID;
+ t->which_ru = rbuf->which_ru;
+ t->failedDiskSectorOffset = rbuf->failedDiskSectorOffset;
+ t->spRow = rbuf->spRow;
+ t->spCol = rbuf->spCol;
+ t->spOffset = rbuf->spOffset;
+
+ ta = t->buffer;
+ t->buffer = rbuf->buffer;
+ rbuf->buffer = ta; /* swap buffers */
+ }
+ /* the first installation always gets installed as the destination
+ * buffer. subsequent installations get stacked up to allow for
+ * multi-way XOR */
+ if (!pssPtr->rbuf) {
+ pssPtr->rbuf = t;
+ t->count = 1;
+ } else
+ pssPtr->rbufsForXor[pssPtr->xorBufCount++] = t; /* install this buffer */
+
+ rf_CheckForFullRbuf(raidPtr, reconCtrlPtr, pssPtr, layoutPtr->numDataCol); /* the buffer is full if
+ * G=2 */
+
+out:
+ RF_UNLOCK_PSS_MUTEX(raidPtr, rbuf->row, rbuf->parityStripeID);
+ RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
+ return (retcode);
+}
+
+/*
+ * XOR the xorBufCount buffers accumulated in pssPtr->rbufsForXor into
+ * the destination buffer (pssPtr->rbuf), then release every source
+ * buffer except the last, which belongs to the submitting disk.
+ * numBytes covers one full reconstruction unit.  Called with the PSS
+ * mutex held (see rf_SubmitReconBufferBasic).  Always returns 0.
+ */
+int
+rf_MultiWayReconXor(raidPtr, pssPtr)
+ RF_Raid_t *raidPtr;
+ RF_ReconParityStripeStatus_t *pssPtr; /* the pss descriptor for this
+ * parity stripe */
+{
+ int i, numBufs = pssPtr->xorBufCount;
+ int numBytes = rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU);
+ RF_ReconBuffer_t **rbufs = (RF_ReconBuffer_t **) pssPtr->rbufsForXor;
+ RF_ReconBuffer_t *targetRbuf = (RF_ReconBuffer_t *) pssPtr->rbuf;
+
+ RF_ASSERT(pssPtr->rbuf != NULL);
+ RF_ASSERT(numBufs > 0 && numBufs < RF_PS_MAX_BUFS);
+#ifdef _KERNEL
+#ifndef __NetBSD__
+#ifndef __FreeBSD__
+ thread_block(); /* yield the processor before doing a big XOR */
+#endif
+#endif
+#endif /* _KERNEL */
+ /*
+ * XXX
+ *
+ * What if more than 9 bufs?
+ */
+ nWayXorFuncs[numBufs] (pssPtr->rbufsForXor, targetRbuf, numBytes / sizeof(long));
+
+ /* release all the reconstruction buffers except the last one, which
+ * belongs to the disk whose submission caused this XOR to take place */
+ for (i = 0; i < numBufs - 1; i++) {
+ if (rbufs[i]->type == RF_RBUF_TYPE_FLOATING)
+ rf_ReleaseFloatingReconBuffer(raidPtr, rbufs[i]->row, rbufs[i]);
+ else
+ if (rbufs[i]->type == RF_RBUF_TYPE_FORCED)
+ rf_FreeReconBuffer(rbufs[i]);
+ else
+ RF_ASSERT(0);
+ }
+ targetRbuf->count += pssPtr->xorBufCount;
+ pssPtr->xorBufCount = 0;
+ return (0);
+}
+/* removes one full buffer from one of the full-buffer lists and returns it.
+ * the priority list is drained before the regular full-buffer list;
+ * returns NULL when both are empty.
+ *
+ * ASSUMES THE RB_MUTEX IS UNLOCKED AT ENTRY.
+ */
+RF_ReconBuffer_t *
+rf_GetFullReconBuffer(reconCtrlPtr)
+ RF_ReconCtrl_t *reconCtrlPtr;
+{
+ RF_ReconBuffer_t *p;
+
+ RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
+
+ if ((p = reconCtrlPtr->priorityList) != NULL) {
+ reconCtrlPtr->priorityList = p->next;
+ p->next = NULL;
+ goto out;
+ }
+ if ((p = reconCtrlPtr->fullBufferList) != NULL) {
+ reconCtrlPtr->fullBufferList = p->next;
+ p->next = NULL;
+ goto out;
+ }
+out:
+ RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
+ return (p);
+}
+
+
+/* if the reconstruction buffer is full, move it to the full list,
+ * which is maintained sorted by failed disk sector offset, and fire
+ * an RF_REVENT_BUFREADY recon event.  Always returns 0.
+ *
+ * ASSUMES THE RB_MUTEX IS LOCKED AT ENTRY. */
+int
+rf_CheckForFullRbuf(raidPtr, reconCtrl, pssPtr, numDataCol)
+ RF_Raid_t *raidPtr;
+ RF_ReconCtrl_t *reconCtrl;
+ RF_ReconParityStripeStatus_t *pssPtr;
+ int numDataCol;
+{
+ RF_ReconBuffer_t *p, *pt, *rbuf = (RF_ReconBuffer_t *) pssPtr->rbuf;
+
+ /* full == contributions from all numDataCol surviving data units */
+ if (rbuf->count == numDataCol) {
+ raidPtr->numFullReconBuffers++;
+ Dprintf2("RECON: rbuf for psid %ld ru %d has filled\n",
+ (long) rbuf->parityStripeID, rbuf->which_ru);
+ if (!reconCtrl->fullBufferList || (rbuf->failedDiskSectorOffset < reconCtrl->fullBufferList->failedDiskSectorOffset)) {
+ Dprintf2("RECON: rbuf for psid %ld ru %d is head of list\n",
+ (long) rbuf->parityStripeID, rbuf->which_ru);
+ rbuf->next = reconCtrl->fullBufferList;
+ reconCtrl->fullBufferList = rbuf;
+ } else {
+ /* sorted insertion: walk to the first entry with a
+ * larger failed-disk offset */
+ for (pt = reconCtrl->fullBufferList, p = pt->next; p && p->failedDiskSectorOffset < rbuf->failedDiskSectorOffset; pt = p, p = p->next);
+ rbuf->next = p;
+ pt->next = rbuf;
+ Dprintf2("RECON: rbuf for psid %ld ru %d is in list\n",
+ (long) rbuf->parityStripeID, rbuf->which_ru);
+ }
+#if 0
+ pssPtr->writeRbuf = pssPtr->rbuf; /* DEBUG ONLY: we like
+ * to be able to find
+ * this rbuf while it's
+ * awaiting write */
+#else
+ rbuf->pssPtr = pssPtr;
+#endif
+ pssPtr->rbuf = NULL;
+ rf_CauseReconEvent(raidPtr, rbuf->row, rbuf->col, NULL, RF_REVENT_BUFREADY);
+ }
+ return (0);
+}
+
+
+/* release a floating recon buffer for someone else to use.
+ * if anyone is waiting for a buffer, the released buffer is committed
+ * to the first waiter and that waiter is woken with an
+ * RF_REVENT_BUFCLEAR event; otherwise the buffer returns to the
+ * floating list.
+ * assumes the rb_mutex is LOCKED at entry
+ */
+void
+rf_ReleaseFloatingReconBuffer(raidPtr, row, rbuf)
+ RF_Raid_t *raidPtr;
+ RF_RowCol_t row;
+ RF_ReconBuffer_t *rbuf;
+{
+ RF_ReconCtrl_t *rcPtr = raidPtr->reconControl[row];
+ RF_CallbackDesc_t *cb;
+
+ Dprintf2("RECON: releasing rbuf for psid %ld ru %d\n",
+ (long) rbuf->parityStripeID, rbuf->which_ru);
+
+ /* if anyone is waiting on buffers, wake one of them up. They will
+ * subsequently wake up anyone else waiting on their RU */
+ if (rcPtr->bufferWaitList) {
+ rbuf->next = rcPtr->committedRbufs;
+ rcPtr->committedRbufs = rbuf;
+ cb = rcPtr->bufferWaitList;
+ rcPtr->bufferWaitList = cb->next;
+ rf_CauseReconEvent(raidPtr, cb->row, cb->col, (void *) 1, RF_REVENT_BUFCLEAR); /* arg==1 => we've
+ * committed a buffer */
+ rf_FreeCallbackDesc(cb);
+ raidPtr->procsInBufWait--;
+ } else {
+ rbuf->next = rcPtr->floatingRbufs;
+ rcPtr->floatingRbufs = rbuf;
+ }
+}
+/* release any disk that is waiting on a buffer for the indicated RU:
+ * clears RF_PSS_BUFFERWAIT and wakes every waiter on the pss's
+ * bufWaitList with an RF_REVENT_BUFCLEAR event (arg==0: no buffer
+ * was committed to them).
+ * assumes the rb_mutex is LOCKED at entry
+ */
+void
+rf_ReleaseBufferWaiters(raidPtr, pssPtr)
+ RF_Raid_t *raidPtr;
+ RF_ReconParityStripeStatus_t *pssPtr;
+{
+ RF_CallbackDesc_t *cb1, *cb = pssPtr->bufWaitList;
+
+ Dprintf2("RECON: releasing buf waiters for psid %ld ru %d\n",
+ (long) pssPtr->parityStripeID, pssPtr->which_ru);
+ pssPtr->flags &= ~RF_PSS_BUFFERWAIT;
+ while (cb) {
+ cb1 = cb->next;
+ cb->next = NULL;
+ rf_CauseReconEvent(raidPtr, cb->row, cb->col, (void *) 0, RF_REVENT_BUFCLEAR); /* arg==0 => we haven't
+ * committed a buffer */
+ rf_FreeCallbackDesc(cb);
+ cb = cb1;
+ }
+ pssPtr->bufWaitList = NULL;
+}
+/* when reconstruction is forced on an RU, there may be some disks waiting to
+ * acquire a buffer for that RU. Since we allocate a new buffer as part of
+ * the forced-reconstruction process, we no longer have to wait for any
+ * buffers, so we wakeup any waiter that we find in the bufferWaitList.
+ * only the first waiter matching rbuf's parity stripe and RU is
+ * released (the function returns as soon as one is found).
+ *
+ * assumes the rb_mutex is LOCKED at entry
+ */
+void
+rf_ReleaseBufferWaiter(rcPtr, rbuf)
+ RF_ReconCtrl_t *rcPtr;
+ RF_ReconBuffer_t *rbuf;
+{
+ RF_CallbackDesc_t *cb, *cbt;
+
+ /* cbt trails cb so the match can be unlinked in place */
+ for (cbt = NULL, cb = rcPtr->bufferWaitList; cb; cbt = cb, cb = cb->next) {
+ if ((cb->callbackArg.v == rbuf->parityStripeID) && (cb->callbackArg2.v == rbuf->which_ru)) {
+ Dprintf2("RECON: Dropping row %d col %d from buffer wait list\n", cb->row, cb->col);
+ if (cbt)
+ cbt->next = cb->next;
+ else
+ rcPtr->bufferWaitList = cb->next;
+ rf_CauseReconEvent((RF_Raid_t *) rbuf->raidPtr, cb->row, cb->col, (void *) 0, RF_REVENT_BUFREADY); /* arg==0 => no
+ * committed buffer */
+ rf_FreeCallbackDesc(cb);
+ return;
+ }
+ }
+}
diff --git a/sys/dev/raidframe/rf_reconbuffer.h b/sys/dev/raidframe/rf_reconbuffer.h
new file mode 100644
index 0000000..1a5407e
--- /dev/null
+++ b/sys/dev/raidframe/rf_reconbuffer.h
@@ -0,0 +1,63 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_reconbuffer.h,v 1.3 1999/02/05 00:06:16 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*******************************************************************
+ *
+ * rf_reconbuffer.h -- header file for reconstruction buffer manager
+ *
+ *******************************************************************/
+
+#ifndef _RF__RF_RECONBUFFER_H_
+#define _RF__RF_RECONBUFFER_H_
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_reconstruct.h>
+
+int
+rf_SubmitReconBuffer(RF_ReconBuffer_t * rbuf, int keep_int,
+ int use_committed);
+int
+rf_SubmitReconBufferBasic(RF_ReconBuffer_t * rbuf, int keep_int,
+ int use_committed);
+int
+rf_MultiWayReconXor(RF_Raid_t * raidPtr,
+ RF_ReconParityStripeStatus_t * pssPtr);
+RF_ReconBuffer_t *rf_GetFullReconBuffer(RF_ReconCtrl_t * reconCtrlPtr);
+int
+rf_CheckForFullRbuf(RF_Raid_t * raidPtr, RF_ReconCtrl_t * reconCtrl,
+ RF_ReconParityStripeStatus_t * pssPtr, int numDataCol);
+void
+rf_ReleaseFloatingReconBuffer(RF_Raid_t * raidPtr, RF_RowCol_t row,
+ RF_ReconBuffer_t * rbuf);
+void
+rf_ReleaseBufferWaiters(RF_Raid_t * raidPtr,
+ RF_ReconParityStripeStatus_t * pssPtr);
+void rf_ReleaseBufferWaiter(RF_ReconCtrl_t * rcPtr, RF_ReconBuffer_t * rbuf);
+
+#endif /* !_RF__RF_RECONBUFFER_H_ */
diff --git a/sys/dev/raidframe/rf_reconmap.c b/sys/dev/raidframe/rf_reconmap.c
new file mode 100644
index 0000000..a73c138
--- /dev/null
+++ b/sys/dev/raidframe/rf_reconmap.c
@@ -0,0 +1,394 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_reconmap.c,v 1.6 1999/08/14 21:44:24 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*************************************************************************
+ * rf_reconmap.c
+ *
+ * code to maintain a map of what sectors have/have not been reconstructed
+ *
+ *************************************************************************/
+
+#include <dev/raidframe/rf_raid.h>
+#include <sys/time.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_utils.h>
+
+/* special pointer values indicating that a reconstruction unit
+ * has been either totally reconstructed or not at all. Both
+ * are illegal pointer values, so you have to be careful not to
+ * dereference through them. RU_NOTHING must be zero, since
+ * rf_MakeReconMap uses bzero to initialize the structure. These are used
+ * only at the head of the list.
+ */
+#define RU_ALL ((RF_ReconMapListElem_t *) -1)
+#define RU_NOTHING ((RF_ReconMapListElem_t *) 0)
+
+/* used to mark the end of the list */
+#define RU_NIL ((RF_ReconMapListElem_t *) 0)
+
+
+static void
+compact_stat_entry(RF_Raid_t * raidPtr, RF_ReconMap_t * mapPtr,
+ int i);
+static void crunch_list(RF_ReconMap_t * mapPtr, RF_ReconMapListElem_t * listPtr);
+static RF_ReconMapListElem_t *
+MakeReconMapListElem(RF_SectorNum_t startSector,
+ RF_SectorNum_t stopSector, RF_ReconMapListElem_t * next);
+static void
+FreeReconMapListElem(RF_ReconMap_t * mapPtr,
+ RF_ReconMapListElem_t * p);
+static void update_size(RF_ReconMap_t * mapPtr, int size);
+static void PrintList(RF_ReconMapListElem_t * listPtr);
+
+/*-----------------------------------------------------------------------------
+ *
+ * Creates and initializes new Reconstruction map
+ *
+ *-----------------------------------------------------------------------------*/
+
+RF_ReconMap_t *
+rf_MakeReconMap(raidPtr, ru_sectors, disk_sectors, spareUnitsPerDisk)
+ RF_Raid_t *raidPtr;
+ RF_SectorCount_t ru_sectors; /* size of reconstruction unit in
+ * sectors */
+ RF_SectorCount_t disk_sectors; /* size of disk in sectors */
+ RF_ReconUnitCount_t spareUnitsPerDisk; /* zero unless distributed
+ * sparing */
+{
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_ReconUnitCount_t num_rus = layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerRU;
+ RF_ReconMap_t *p;
+ int rc;
+
+ RF_Malloc(p, sizeof(RF_ReconMap_t), (RF_ReconMap_t *));
+ p->sectorsPerReconUnit = ru_sectors;
+ p->sectorsInDisk = disk_sectors;
+
+ p->totalRUs = num_rus;
+ p->spareRUs = spareUnitsPerDisk;
+ p->unitsLeft = num_rus - spareUnitsPerDisk;
+
+ RF_Malloc(p->status, num_rus * sizeof(RF_ReconMapListElem_t *), (RF_ReconMapListElem_t **));
+ RF_ASSERT(p->status != (RF_ReconMapListElem_t **) NULL);
+
+ (void) bzero((char *) p->status, num_rus * sizeof(RF_ReconMapListElem_t *));
+
+ p->size = sizeof(RF_ReconMap_t) + num_rus * sizeof(RF_ReconMapListElem_t *);
+ p->maxSize = p->size;
+
+ rc = rf_mutex_init(&p->mutex, __FUNCTION__);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ RF_Free(p->status, num_rus * sizeof(RF_ReconMapListElem_t *));
+ RF_Free(p, sizeof(RF_ReconMap_t));
+ return (NULL);
+ }
+ return (p);
+}
+
+
+/*-----------------------------------------------------------------------------
+ *
+ * marks a new set of sectors as reconstructed. All the possible mergings get
+ * complicated. To simplify matters, the approach I take is to just dump
+ * something into the list, and then clean it up (i.e. merge elements and
+ * eliminate redundant ones) in a second pass over the list (compact_stat_entry()).
+ * Not 100% efficient, since a structure can be allocated and then immediately
+ * freed, but it keeps this code from becoming (more of) a nightmare of
+ * special cases. The only thing that compact_stat_entry() assumes is that the
+ * list is sorted by startSector, and so this is the only condition I maintain
+ * here. (MCH)
+ *
+ *-----------------------------------------------------------------------------*/
+
+void
+rf_ReconMapUpdate(raidPtr, mapPtr, startSector, stopSector)
+ RF_Raid_t *raidPtr;
+ RF_ReconMap_t *mapPtr;
+ RF_SectorNum_t startSector;
+ RF_SectorNum_t stopSector;
+{
+ RF_SectorCount_t sectorsPerReconUnit = mapPtr->sectorsPerReconUnit;
+ RF_SectorNum_t i, first_in_RU, last_in_RU;
+ RF_ReconMapListElem_t *p, *pt;
+
+ RF_LOCK_MUTEX(mapPtr->mutex);
+ RF_ASSERT(startSector >= 0 && stopSector < mapPtr->sectorsInDisk && stopSector >= startSector);
+
+ while (startSector <= stopSector) {
+ i = startSector / mapPtr->sectorsPerReconUnit;
+ first_in_RU = i * sectorsPerReconUnit;
+ last_in_RU = first_in_RU + sectorsPerReconUnit - 1;
+ p = mapPtr->status[i];
+ if (p != RU_ALL) {
+ if (p == RU_NOTHING || p->startSector > startSector) { /* insert at front of
+ * list */
+
+ mapPtr->status[i] = MakeReconMapListElem(startSector, RF_MIN(stopSector, last_in_RU), (p == RU_NOTHING) ? NULL : p);
+ update_size(mapPtr, sizeof(RF_ReconMapListElem_t));
+
+ } else {/* general case */
+ do { /* search for place to insert */
+ pt = p;
+ p = p->next;
+ } while (p && (p->startSector < startSector));
+ pt->next = MakeReconMapListElem(startSector, RF_MIN(stopSector, last_in_RU), p);
+ update_size(mapPtr, sizeof(RF_ReconMapListElem_t));
+ }
+ compact_stat_entry(raidPtr, mapPtr, i);
+ }
+ startSector = RF_MIN(stopSector, last_in_RU) + 1;
+ }
+ RF_UNLOCK_MUTEX(mapPtr->mutex);
+}
+
+
+
+/*-----------------------------------------------------------------------------
+ *
+ * performs whatever list compactions can be done, and frees any space
+ * that is no longer necessary. Assumes only that the list is sorted
+ * by startSector. crunch_list() compacts a single list as much as possible,
+ * and the second block of code deletes the entire list if possible.
+ * crunch_list() is also called from MakeReconMapAccessList().
+ *
+ * When a recon unit is detected to be fully reconstructed, we set the
+ * corresponding bit in the parity stripe map so that the head follow
+ * code will not select this parity stripe again. This is redundant (but
+ * harmless) when compact_stat_entry is called from the reconstruction code,
+ * but necessary when called from the user-write code.
+ *
+ *-----------------------------------------------------------------------------*/
+
+static void
+compact_stat_entry(raidPtr, mapPtr, i)
+ RF_Raid_t *raidPtr;
+ RF_ReconMap_t *mapPtr;
+ int i;
+{
+ RF_SectorCount_t sectorsPerReconUnit = mapPtr->sectorsPerReconUnit;
+ RF_ReconMapListElem_t *p = mapPtr->status[i];
+
+ crunch_list(mapPtr, p);
+
+ if ((p->startSector == i * sectorsPerReconUnit) &&
+ (p->stopSector == i * sectorsPerReconUnit + sectorsPerReconUnit - 1)) {
+ mapPtr->status[i] = RU_ALL;
+ mapPtr->unitsLeft--;
+ FreeReconMapListElem(mapPtr, p);
+ }
+}
+
+static void
+crunch_list(mapPtr, listPtr)
+ RF_ReconMap_t *mapPtr;
+ RF_ReconMapListElem_t *listPtr;
+{
+ RF_ReconMapListElem_t *pt, *p = listPtr;
+
+ if (!p)
+ return;
+ pt = p;
+ p = p->next;
+ while (p) {
+ if (pt->stopSector >= p->startSector - 1) {
+ pt->stopSector = RF_MAX(pt->stopSector, p->stopSector);
+ pt->next = p->next;
+ FreeReconMapListElem(mapPtr, p);
+ p = pt->next;
+ } else {
+ pt = p;
+ p = p->next;
+ }
+ }
+}
+/*-----------------------------------------------------------------------------
+ *
+ * Allocate and fill a new list element
+ *
+ *-----------------------------------------------------------------------------*/
+
+static RF_ReconMapListElem_t *
+MakeReconMapListElem(
+ RF_SectorNum_t startSector,
+ RF_SectorNum_t stopSector,
+ RF_ReconMapListElem_t * next)
+{
+ RF_ReconMapListElem_t *p;
+
+ RF_Malloc(p, sizeof(RF_ReconMapListElem_t), (RF_ReconMapListElem_t *));
+ if (p == NULL)
+ return (NULL);
+ p->startSector = startSector;
+ p->stopSector = stopSector;
+ p->next = next;
+ return (p);
+}
+/*-----------------------------------------------------------------------------
+ *
+ * Free a list element
+ *
+ *-----------------------------------------------------------------------------*/
+
+static void
+FreeReconMapListElem(mapPtr, p)
+ RF_ReconMap_t *mapPtr;
+ RF_ReconMapListElem_t *p;
+{
+ int delta;
+
+ if (mapPtr) {
+ delta = 0 - (int) sizeof(RF_ReconMapListElem_t);
+ update_size(mapPtr, delta);
+ }
+ RF_Free(p, sizeof(*p));
+}
+/*-----------------------------------------------------------------------------
+ *
+ * Free an entire status structure. Inefficient, but can be called at any time.
+ *
+ *-----------------------------------------------------------------------------*/
+void
+rf_FreeReconMap(mapPtr)
+ RF_ReconMap_t *mapPtr;
+{
+ RF_ReconMapListElem_t *p, *q;
+ RF_ReconUnitCount_t numRUs;
+ RF_ReconUnitNum_t i;
+
+ numRUs = mapPtr->sectorsInDisk / mapPtr->sectorsPerReconUnit;
+ if (mapPtr->sectorsInDisk % mapPtr->sectorsPerReconUnit)
+ numRUs++;
+
+ for (i = 0; i < numRUs; i++) {
+ p = mapPtr->status[i];
+ while (p != RU_NOTHING && p != RU_ALL) {
+ q = p;
+ p = p->next;
+ RF_Free(q, sizeof(*q));
+ }
+ }
+ rf_mutex_destroy(&mapPtr->mutex);
+ RF_Free(mapPtr->status, mapPtr->totalRUs * sizeof(RF_ReconMapListElem_t *));
+ RF_Free(mapPtr, sizeof(RF_ReconMap_t));
+}
+/*-----------------------------------------------------------------------------
+ *
+ * returns nonzero if the indicated RU has been reconstructed already
+ *
+ *---------------------------------------------------------------------------*/
+
+int
+rf_CheckRUReconstructed(mapPtr, startSector)
+ RF_ReconMap_t *mapPtr;
+ RF_SectorNum_t startSector;
+{
+ RF_ReconMapListElem_t *l; /* used for searching */
+ RF_ReconUnitNum_t i;
+
+ i = startSector / mapPtr->sectorsPerReconUnit;
+ l = mapPtr->status[i];
+ return ((l == RU_ALL) ? 1 : 0);
+}
+
+RF_ReconUnitCount_t
+rf_UnitsLeftToReconstruct(mapPtr)
+ RF_ReconMap_t *mapPtr;
+{
+ RF_ASSERT(mapPtr != NULL);
+ return (mapPtr->unitsLeft);
+}
+/* updates the size fields of a status descriptor */
+static void
+update_size(mapPtr, size)
+ RF_ReconMap_t *mapPtr;
+ int size;
+{
+ mapPtr->size += size;
+ mapPtr->maxSize = RF_MAX(mapPtr->size, mapPtr->maxSize);
+}
+
+static void
+PrintList(listPtr)
+ RF_ReconMapListElem_t *listPtr;
+{
+ while (listPtr) {
+ printf("%d,%d -> ", (int) listPtr->startSector, (int) listPtr->stopSector);
+ listPtr = listPtr->next;
+ }
+ printf("\n");
+}
+
+void
+rf_PrintReconMap(raidPtr, mapPtr, frow, fcol)
+ RF_Raid_t *raidPtr;
+ RF_ReconMap_t *mapPtr;
+ RF_RowCol_t frow;
+ RF_RowCol_t fcol;
+{
+ RF_ReconUnitCount_t numRUs;
+ RF_ReconMapListElem_t *p;
+ RF_ReconUnitNum_t i;
+
+ numRUs = mapPtr->totalRUs;
+ if (mapPtr->sectorsInDisk % mapPtr->sectorsPerReconUnit)
+ numRUs++;
+
+ for (i = 0; i < numRUs; i++) {
+ p = mapPtr->status[i];
+ if (p == RU_ALL)/* printf("[%d] ALL\n",i) */
+ ;
+ else
+ if (p == RU_NOTHING) {
+ printf("%d: Unreconstructed\n", i);
+ } else {
+ printf("%d: ", i);
+ PrintList(p);
+ }
+ }
+}
+
+void
+rf_PrintReconSchedule(mapPtr, starttime)
+ RF_ReconMap_t *mapPtr;
+ struct timeval *starttime;
+{
+ static int old_pctg = -1;
+ struct timeval tv, diff;
+ int new_pctg;
+
+ new_pctg = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs);
+ if (new_pctg != old_pctg) {
+ RF_GETTIME(tv);
+ RF_TIMEVAL_DIFF(starttime, &tv, &diff);
+ printf("%d %d.%06d\n", (int) new_pctg, (int) diff.tv_sec, (int) diff.tv_usec);
+ old_pctg = new_pctg;
+ }
+}
diff --git a/sys/dev/raidframe/rf_reconmap.h b/sys/dev/raidframe/rf_reconmap.h
new file mode 100644
index 0000000..2fee059
--- /dev/null
+++ b/sys/dev/raidframe/rf_reconmap.h
@@ -0,0 +1,86 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_reconmap.h,v 1.3 1999/02/05 00:06:16 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/******************************************************************************
+ * rf_reconmap.h -- header file describing the reconstruction status data structure
+ ******************************************************************************/
+
+#ifndef _RF__RF_RECONMAP_H_
+#define _RF__RF_RECONMAP_H_
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_threadstuff.h>
+
+/*
+ * Main reconstruction status descriptor. size and maxsize are used for
+ * monitoring only: they have no function for reconstruction.
+ */
+struct RF_ReconMap_s {
+ RF_SectorCount_t sectorsPerReconUnit; /* sectors per reconstruct
+ * unit */
+ RF_SectorCount_t sectorsInDisk; /* total sectors in disk */
+ RF_SectorCount_t unitsLeft; /* recon units left to recon */
+ RF_ReconUnitCount_t totalRUs; /* total recon units on disk */
+ RF_ReconUnitCount_t spareRUs; /* total number of spare RUs on failed
+ * disk */
+ RF_StripeCount_t totalParityStripes; /* total number of parity
+ * stripes in array */
+ u_int size; /* overall size of this structure */
+ u_int maxSize; /* maximum size so far */
+ RF_ReconMapListElem_t **status; /* array of ptrs to list elements */
+ RF_DECLARE_MUTEX(mutex)
+};
+/* a list element */
+struct RF_ReconMapListElem_s {
+ RF_SectorNum_t startSector; /* bounding sect nums on this block */
+ RF_SectorNum_t stopSector;
+ RF_ReconMapListElem_t *next; /* next element in list */
+};
+
+RF_ReconMap_t *
+rf_MakeReconMap(RF_Raid_t * raidPtr, RF_SectorCount_t ru_sectors,
+ RF_SectorCount_t disk_sectors, RF_ReconUnitCount_t spareUnitsPerDisk);
+
+void
+rf_ReconMapUpdate(RF_Raid_t * raidPtr, RF_ReconMap_t * mapPtr,
+ RF_SectorNum_t startSector, RF_SectorNum_t stopSector);
+
+void rf_FreeReconMap(RF_ReconMap_t * mapPtr);
+
+int rf_CheckRUReconstructed(RF_ReconMap_t * mapPtr, RF_SectorNum_t startSector);
+
+RF_ReconUnitCount_t rf_UnitsLeftToReconstruct(RF_ReconMap_t * mapPtr);
+
+void
+rf_PrintReconMap(RF_Raid_t * raidPtr, RF_ReconMap_t * mapPtr,
+ RF_RowCol_t frow, RF_RowCol_t fcol);
+
+void rf_PrintReconSchedule(RF_ReconMap_t * mapPtr, struct timeval * starttime);
+
+#endif /* !_RF__RF_RECONMAP_H_ */
diff --git a/sys/dev/raidframe/rf_reconstruct.c b/sys/dev/raidframe/rf_reconstruct.c
new file mode 100644
index 0000000..9f13b67
--- /dev/null
+++ b/sys/dev/raidframe/rf_reconstruct.c
@@ -0,0 +1,1680 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_reconstruct.c,v 1.27 2001/01/26 02:16:24 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/************************************************************
+ *
+ * rf_reconstruct.c -- code to perform on-line reconstruction
+ *
+ ************************************************************/
+
+#include <dev/raidframe/rf_types.h>
+#include <sys/time.h>
+#if defined(__FreeBSD__)
+#include <sys/systm.h>
+#if __FreeBSD_version > 500005
+#include <sys/bio.h>
+#endif
+#endif
+#include <sys/buf.h>
+#include <sys/errno.h>
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#if defined(__NetBSD__)
+#include <sys/ioctl.h>
+#elif defined(__FreeBSD__)
+#include <sys/ioccom.h>
+#endif
+#include <sys/fcntl.h>
+#include <sys/vnode.h>
+
+
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_reconutil.h>
+#include <dev/raidframe/rf_revent.h>
+#include <dev/raidframe/rf_reconbuffer.h>
+#include <dev/raidframe/rf_acctrace.h>
+#include <dev/raidframe/rf_etimer.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_desc.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_freelist.h>
+#include <dev/raidframe/rf_debugprint.h>
+#include <dev/raidframe/rf_driver.h>
+#include <dev/raidframe/rf_utils.h>
+#include <dev/raidframe/rf_shutdown.h>
+
+#include <dev/raidframe/rf_kintf.h>
+
+/* setting these to -1 causes them to be set to their default values if not set by debug options */
+
+#define Dprintf(s) if (rf_reconDebug) rf_debug_printf(s,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL)
+#define Dprintf1(s,a) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
+#define Dprintf2(s,a,b) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
+#define Dprintf3(s,a,b,c) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
+#define Dprintf4(s,a,b,c,d) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),NULL,NULL,NULL,NULL)
+#define Dprintf5(s,a,b,c,d,e) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),NULL,NULL,NULL)
+#define Dprintf6(s,a,b,c,d,e,f) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),NULL,NULL)
+#define Dprintf7(s,a,b,c,d,e,f,g) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),NULL)
+
+#define DDprintf1(s,a) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
+#define DDprintf2(s,a,b) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
+
+static RF_FreeList_t *rf_recond_freelist;
+#define RF_MAX_FREE_RECOND 4
+#define RF_RECOND_INC 1
+
+static RF_RaidReconDesc_t *
+AllocRaidReconDesc(RF_Raid_t * raidPtr,
+ RF_RowCol_t row, RF_RowCol_t col, RF_RaidDisk_t * spareDiskPtr,
+ int numDisksDone, RF_RowCol_t srow, RF_RowCol_t scol);
+static void FreeReconDesc(RF_RaidReconDesc_t * reconDesc);
+static int
+ProcessReconEvent(RF_Raid_t * raidPtr, RF_RowCol_t frow,
+ RF_ReconEvent_t * event);
+static int
+IssueNextReadRequest(RF_Raid_t * raidPtr, RF_RowCol_t row,
+ RF_RowCol_t col);
+static int TryToRead(RF_Raid_t * raidPtr, RF_RowCol_t row, RF_RowCol_t col);
+static int
+ComputePSDiskOffsets(RF_Raid_t * raidPtr, RF_StripeNum_t psid,
+ RF_RowCol_t row, RF_RowCol_t col, RF_SectorNum_t * outDiskOffset,
+ RF_SectorNum_t * outFailedDiskSectorOffset, RF_RowCol_t * spRow,
+ RF_RowCol_t * spCol, RF_SectorNum_t * spOffset);
+static int IssueNextWriteRequest(RF_Raid_t * raidPtr, RF_RowCol_t row);
+static int ReconReadDoneProc(void *arg, int status);
+static int ReconWriteDoneProc(void *arg, int status);
+static void
+CheckForNewMinHeadSep(RF_Raid_t * raidPtr, RF_RowCol_t row,
+ RF_HeadSepLimit_t hsCtr);
+static int
+CheckHeadSeparation(RF_Raid_t * raidPtr, RF_PerDiskReconCtrl_t * ctrl,
+ RF_RowCol_t row, RF_RowCol_t col, RF_HeadSepLimit_t hsCtr,
+ RF_ReconUnitNum_t which_ru);
+static int
+CheckForcedOrBlockedReconstruction(RF_Raid_t * raidPtr,
+ RF_ReconParityStripeStatus_t * pssPtr, RF_PerDiskReconCtrl_t * ctrl,
+ RF_RowCol_t row, RF_RowCol_t col, RF_StripeNum_t psid,
+ RF_ReconUnitNum_t which_ru);
+static void ForceReconReadDoneProc(void *arg, int status);
+
+static void rf_ShutdownReconstruction(void *);
+
+struct RF_ReconDoneProc_s {
+ void (*proc) (RF_Raid_t *, void *);
+ void *arg;
+ RF_ReconDoneProc_t *next;
+};
+
+static RF_FreeList_t *rf_rdp_freelist;
+#define RF_MAX_FREE_RDP 4
+#define RF_RDP_INC 1
+
+static void
+SignalReconDone(RF_Raid_t * raidPtr)
+{
+ RF_ReconDoneProc_t *p;
+
+ RF_LOCK_MUTEX(raidPtr->recon_done_proc_mutex);
+ for (p = raidPtr->recon_done_procs; p; p = p->next) {
+ p->proc(raidPtr, p->arg);
+ }
+ RF_UNLOCK_MUTEX(raidPtr->recon_done_proc_mutex);
+}
+
+int
+rf_RegisterReconDoneProc(
+ RF_Raid_t * raidPtr,
+ void (*proc) (RF_Raid_t *, void *),
+ void *arg,
+ RF_ReconDoneProc_t ** handlep)
+{
+ RF_ReconDoneProc_t *p;
+
+ RF_FREELIST_GET(rf_rdp_freelist, p, next, (RF_ReconDoneProc_t *));
+ if (p == NULL)
+ return (ENOMEM);
+ p->proc = proc;
+ p->arg = arg;
+ RF_LOCK_MUTEX(raidPtr->recon_done_proc_mutex);
+ p->next = raidPtr->recon_done_procs;
+ raidPtr->recon_done_procs = p;
+ RF_UNLOCK_MUTEX(raidPtr->recon_done_proc_mutex);
+ if (handlep)
+ *handlep = p;
+ return (0);
+}
+/**************************************************************************
+ *
+ * sets up the parameters that will be used by the reconstruction process
+ * currently there are none, except for those that the layout-specific
+ * configuration (e.g. rf_ConfigureDeclustered) routine sets up.
+ *
+ * in the kernel, we fire off the recon thread.
+ *
+ **************************************************************************/
+static void
+rf_ShutdownReconstruction(ignored)
+ void *ignored;
+{
+ RF_FREELIST_DESTROY(rf_recond_freelist, next, (RF_RaidReconDesc_t *));
+ RF_FREELIST_DESTROY(rf_rdp_freelist, next, (RF_ReconDoneProc_t *));
+}
+
+int
+rf_ConfigureReconstruction(listp)
+ RF_ShutdownList_t **listp;
+{
+ int rc;
+
+ RF_FREELIST_CREATE(rf_recond_freelist, RF_MAX_FREE_RECOND,
+ RF_RECOND_INC, sizeof(RF_RaidReconDesc_t));
+ if (rf_recond_freelist == NULL)
+ return (ENOMEM);
+ RF_FREELIST_CREATE(rf_rdp_freelist, RF_MAX_FREE_RDP,
+ RF_RDP_INC, sizeof(RF_ReconDoneProc_t));
+ if (rf_rdp_freelist == NULL) {
+ RF_FREELIST_DESTROY(rf_recond_freelist, next, (RF_RaidReconDesc_t *));
+ return (ENOMEM);
+ }
+ rc = rf_ShutdownCreate(listp, rf_ShutdownReconstruction, NULL);
+ if (rc) {
+ RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
+ __FILE__, __LINE__, rc);
+ rf_ShutdownReconstruction(NULL);
+ return (rc);
+ }
+ return (0);
+}
+
+static RF_RaidReconDesc_t *
+AllocRaidReconDesc(raidPtr, row, col, spareDiskPtr, numDisksDone, srow, scol)
+ RF_Raid_t *raidPtr;
+ RF_RowCol_t row;
+ RF_RowCol_t col;
+ RF_RaidDisk_t *spareDiskPtr;
+ int numDisksDone;
+ RF_RowCol_t srow;
+ RF_RowCol_t scol;
+{
+
+ RF_RaidReconDesc_t *reconDesc;
+
+ RF_FREELIST_GET(rf_recond_freelist, reconDesc, next, (RF_RaidReconDesc_t *));
+
+ reconDesc->raidPtr = raidPtr;
+ reconDesc->row = row;
+ reconDesc->col = col;
+ reconDesc->spareDiskPtr = spareDiskPtr;
+ reconDesc->numDisksDone = numDisksDone;
+ reconDesc->srow = srow;
+ reconDesc->scol = scol;
+ reconDesc->state = 0;
+ reconDesc->next = NULL;
+
+ return (reconDesc);
+}
+
+static void
+FreeReconDesc(reconDesc)
+ RF_RaidReconDesc_t *reconDesc;
+{
+#if RF_RECON_STATS > 0
+ printf("RAIDframe: %lu recon event waits, %lu recon delays\n",
+ (long) reconDesc->numReconEventWaits, (long) reconDesc->numReconExecDelays);
+#endif /* RF_RECON_STATS > 0 */
+ printf("RAIDframe: %lu max exec ticks\n",
+ (long) reconDesc->maxReconExecTicks);
+#if (RF_RECON_STATS > 0) || defined(KERNEL)
+ printf("\n");
+#endif /* (RF_RECON_STATS > 0) || KERNEL */
+ RF_FREELIST_FREE(rf_recond_freelist, reconDesc, next);
+}
+
+
+/*****************************************************************************
+ *
+ * primary routine to reconstruct a failed disk. This should be called from
+ * within its own thread. It won't return until reconstruction completes,
+ * fails, or is aborted.
+ *****************************************************************************/
+int
+rf_ReconstructFailedDisk(raidPtr, row, col)
+ RF_Raid_t *raidPtr;
+ RF_RowCol_t row;
+ RF_RowCol_t col;
+{
+ RF_LayoutSW_t *lp;
+ int rc;
+
+ lp = raidPtr->Layout.map;
+ if (lp->SubmitReconBuffer) {
+ /*
+ * The current infrastructure only supports reconstructing one
+ * disk at a time for each array.
+ */
+ RF_LOCK_MUTEX(raidPtr->mutex);
+ while (raidPtr->reconInProgress) {
+ RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex);
+ }
+ raidPtr->reconInProgress++;
+ RF_UNLOCK_MUTEX(raidPtr->mutex);
+ rc = rf_ReconstructFailedDiskBasic(raidPtr, row, col);
+ RF_LOCK_MUTEX(raidPtr->mutex);
+ raidPtr->reconInProgress--;
+ RF_UNLOCK_MUTEX(raidPtr->mutex);
+ } else {
+ RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n",
+ lp->parityConfig);
+ rc = EIO;
+ }
+ RF_SIGNAL_COND(raidPtr->waitForReconCond);
+ wakeup(&raidPtr->waitForReconCond); /* XXX Methinks this will be
+ * needed at some point... GO */
+ return (rc);
+}
+
+int
+rf_ReconstructFailedDiskBasic(raidPtr, row, col)
+ RF_Raid_t *raidPtr;
+ RF_RowCol_t row;
+ RF_RowCol_t col;
+{
+ RF_ComponentLabel_t *c_label;
+ RF_RaidDisk_t *spareDiskPtr = NULL;
+ RF_RaidReconDesc_t *reconDesc;
+ RF_RowCol_t srow, scol;
+ int numDisksDone = 0, rc;
+
+ RF_Malloc(c_label, sizeof(RF_ComponentLabel_t), (RF_ComponentLabel_t *));
+ if (c_label == NULL) {
+ printf("rf_ReconstructInPlace: Out of memory?\n");
+ return (ENOMEM);
+ }
+
+ /* first look for a spare drive onto which to reconstruct the data */
+ /* spare disk descriptors are stored in row 0. This may have to
+ * change eventually */
+
+ RF_LOCK_MUTEX(raidPtr->mutex);
+ RF_ASSERT(raidPtr->Disks[row][col].status == rf_ds_failed);
+
+ if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
+ if (raidPtr->status[row] != rf_rs_degraded) {
+ RF_ERRORMSG2("Unable to reconstruct disk at row %d col %d because status not degraded\n", row, col);
+ RF_UNLOCK_MUTEX(raidPtr->mutex);
+ RF_Free(c_label, sizeof(RF_ComponentLabel_t));
+ return (EINVAL);
+ }
+ srow = row;
+ scol = (-1);
+ } else {
+ srow = 0;
+ for (scol = raidPtr->numCol; scol < raidPtr->numCol + raidPtr->numSpare; scol++) {
+ if (raidPtr->Disks[srow][scol].status == rf_ds_spare) {
+ spareDiskPtr = &raidPtr->Disks[srow][scol];
+ spareDiskPtr->status = rf_ds_used_spare;
+ break;
+ }
+ }
+ if (!spareDiskPtr) {
+ RF_ERRORMSG2("Unable to reconstruct disk at row %d col %d because no spares are available\n", row, col);
+ RF_UNLOCK_MUTEX(raidPtr->mutex);
+ RF_Free(c_label, sizeof(RF_ComponentLabel_t));
+ return (ENOSPC);
+ }
+ printf("RECON: initiating reconstruction on row %d col %d -> spare at row %d col %d\n", row, col, srow, scol);
+ }
+ RF_UNLOCK_MUTEX(raidPtr->mutex);
+
+ reconDesc = AllocRaidReconDesc((void *) raidPtr, row, col, spareDiskPtr, numDisksDone, srow, scol);
+ raidPtr->reconDesc = (void *) reconDesc;
+#if RF_RECON_STATS > 0
+ reconDesc->hsStallCount = 0;
+ reconDesc->numReconExecDelays = 0;
+ reconDesc->numReconEventWaits = 0;
+#endif /* RF_RECON_STATS > 0 */
+ reconDesc->reconExecTimerRunning = 0;
+ reconDesc->reconExecTicks = 0;
+ reconDesc->maxReconExecTicks = 0;
+ rc = rf_ContinueReconstructFailedDisk(reconDesc);
+
+ if (!rc) {
+ /* fix up the component label */
+ /* Don't actually need the read here.. */
+ raidread_component_label(
+ raidPtr->raid_cinfo[srow][scol].ci_dev,
+ raidPtr->raid_cinfo[srow][scol].ci_vp,
+ c_label);
+
+ raid_init_component_label( raidPtr, c_label);
+ c_label->row = row;
+ c_label->column = col;
+ c_label->clean = RF_RAID_DIRTY;
+ c_label->status = rf_ds_optimal;
+ c_label->partitionSize = raidPtr->Disks[srow][scol].partitionSize;
+
+ /* We've just done a rebuild based on all the other
+ disks, so at this point the parity is known to be
+ clean, even if it wasn't before. */
+
+ /* XXX doesn't hold for RAID 6!! */
+
+ raidPtr->parity_good = RF_RAID_CLEAN;
+
+ /* XXXX MORE NEEDED HERE */
+
+ raidwrite_component_label(
+ raidPtr->raid_cinfo[srow][scol].ci_dev,
+ raidPtr->raid_cinfo[srow][scol].ci_vp,
+ c_label);
+
+ }
+ RF_Free(c_label, sizeof(RF_ComponentLabel_t));
+ return (rc);
+}
+
+/*
+
+ Allow reconstructing a disk in-place -- i.e. component /dev/sd2e goes AWOL,
+ and you don't get a spare until the next Monday. With this function
+ (and hot-swappable drives) you can now put your new disk containing
+ /dev/sd2e on the bus, scsictl it alive, and then use raidctl(8) to
+ rebuild the data "on the spot".
+
+*/
+
+/*
+ * Rebuild the failed component at (row,col) onto the very same device
+ * ("in place"): the component is marked failed if it is not already,
+ * its device is closed and re-opened, and reconstruction writes go
+ * straight back to it instead of to a hot spare.
+ *
+ * Returns 0 on success or an errno: ENOMEM (label allocation failed),
+ * EINVAL (some other component failed / recon already running /
+ * distributed-spare layout), EIO (layout has no SubmitReconBuffer
+ * method), or the error from re-opening the component.
+ */
+int
+rf_ReconstructInPlace(raidPtr, row, col)
+	RF_Raid_t *raidPtr;
+	RF_RowCol_t row;
+	RF_RowCol_t col;
+{
+	RF_RaidDisk_t *spareDiskPtr = NULL;
+	RF_RaidReconDesc_t *reconDesc;
+	RF_LayoutSW_t *lp;
+	RF_RaidDisk_t *badDisk;
+	RF_ComponentLabel_t *c_label;
+	int numDisksDone = 0, rc;
+	struct vnode *vp;
+	int retcode;
+	int ac;
+
+	/* Scratch label, freed on every exit path below. */
+	RF_Malloc(c_label, sizeof(RF_ComponentLabel_t), (RF_ComponentLabel_t *));
+	if (c_label == NULL) {
+		printf("rf_ReconstructInPlace: Out of memory?\n");
+		return (ENOMEM);
+	}
+
+	lp = raidPtr->Layout.map;
+	if (lp->SubmitReconBuffer) {
+		/*
+		 * The current infrastructure only supports reconstructing one
+		 * disk at a time for each array.
+		 */
+		RF_LOCK_MUTEX(raidPtr->mutex);
+		if ((raidPtr->Disks[row][col].status == rf_ds_optimal) &&
+		    (raidPtr->numFailures > 0)) {
+			/* XXX 0 above shouldn't be constant!!! */
+			/* some component other than this has failed.
+			   Let's not make things worse than they already
+			   are... */
+			printf("RAIDFRAME: Unable to reconstruct to disk at:\n");
+			printf("          Row: %d Col: %d   Too many failures.\n",
+			       row, col);
+			RF_UNLOCK_MUTEX(raidPtr->mutex);
+			RF_Free(c_label, sizeof(RF_ComponentLabel_t));
+			return (EINVAL);
+		}
+		if (raidPtr->Disks[row][col].status == rf_ds_reconstructing) {
+			printf("RAIDFRAME: Unable to reconstruct to disk at:\n");
+			printf("          Row: %d Col: %d   Reconstruction already occuring!\n", row, col);
+
+			RF_UNLOCK_MUTEX(raidPtr->mutex);
+			RF_Free(c_label, sizeof(RF_ComponentLabel_t));
+			return (EINVAL);
+		}
+
+
+		if (raidPtr->Disks[row][col].status != rf_ds_failed) {
+			/* "It's gone..." */
+			raidPtr->numFailures++;
+			raidPtr->Disks[row][col].status = rf_ds_failed;
+			raidPtr->status[row] = rf_rs_degraded;
+			rf_update_component_labels(raidPtr,
+						   RF_NORMAL_COMPONENT_UPDATE);
+		}
+
+		/* Serialize with any reconstruction already in flight. */
+		while (raidPtr->reconInProgress) {
+			RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex);
+		}
+
+		raidPtr->reconInProgress++;
+
+
+		/* first look for a spare drive onto which to reconstruct
+		   the data.  spare disk descriptors are stored in row 0.
+		   This may have to change eventually */
+
+		/* Actually, we don't care if it's failed or not...
+		   On a RAID set with correct parity, this function
+		   should be callable on any component without ill effects. */
+		/* RF_ASSERT(raidPtr->Disks[row][col].status == rf_ds_failed);
+		 */
+
+		if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
+			RF_ERRORMSG2("Unable to reconstruct to disk at row %d col %d: operation not supported for RF_DISTRIBUTE_SPARE\n", row, col);
+
+			/* NOTE(review): reconInProgress is dropped here
+			   without signalling waitForReconCond; a caller
+			   blocked in the wait loop above may not be woken.
+			   Confirm against the other error paths. */
+			raidPtr->reconInProgress--;
+			RF_UNLOCK_MUTEX(raidPtr->mutex);
+			RF_Free(c_label, sizeof(RF_ComponentLabel_t));
+			return (EINVAL);
+		}
+
+		/* XXX need goop here to see if the disk is alive,
+		   and, if not, make it so...  */
+
+
+
+		/* XXX badDisk is computed here but never used below. */
+		badDisk = &raidPtr->Disks[row][col];
+
+		/* This device may have been opened successfully the
+		   first time. Close it before trying to open it again.. */
+
+		if (raidPtr->raid_cinfo[row][col].ci_vp != NULL) {
+			printf("Closed the open device: %s\n",
+			       raidPtr->Disks[row][col].devname);
+			vp = raidPtr->raid_cinfo[row][col].ci_vp;
+			ac = raidPtr->Disks[row][col].auto_configured;
+			rf_close_component(raidPtr, vp, ac);
+			raidPtr->raid_cinfo[row][col].ci_vp = NULL;
+		}
+		/* note that this disk was *not* auto_configured (any longer)*/
+		raidPtr->Disks[row][col].auto_configured = 0;
+
+		printf("About to (re-)open the device for rebuilding: %s\n",
+		       raidPtr->Disks[row][col].devname);
+
+		retcode = raid_getcomponentsize(raidPtr, row, col);
+
+		if (retcode) {
+			printf("raid%d: rebuilding: raidlookup on device: %s failed: %d!\n",
+			       raidPtr->raidid, raidPtr->Disks[row][col].devname,
+			       retcode);
+
+			/* XXX the component isn't responding properly...
+			   must be still dead :-( */
+			/* NOTE(review): as above, reconInProgress dropped
+			   without a wakeup on waitForReconCond -- confirm. */
+			raidPtr->reconInProgress--;
+			RF_UNLOCK_MUTEX(raidPtr->mutex);
+			RF_Free(c_label, sizeof(RF_ComponentLabel_t));
+			return(retcode);
+
+		}
+
+		/* The "spare" for an in-place rebuild is the failed
+		   component itself. */
+		spareDiskPtr = &raidPtr->Disks[row][col];
+		spareDiskPtr->status = rf_ds_used_spare;
+
+		printf("RECON: initiating in-place reconstruction on\n");
+		printf("       row %d col %d -> spare at row %d col %d\n",
+		       row, col, row, col);
+
+		RF_UNLOCK_MUTEX(raidPtr->mutex);
+
+		reconDesc = AllocRaidReconDesc((void *) raidPtr, row, col,
+					       spareDiskPtr, numDisksDone,
+					       row, col);
+		raidPtr->reconDesc = (void *) reconDesc;
+#if RF_RECON_STATS > 0
+		reconDesc->hsStallCount = 0;
+		reconDesc->numReconExecDelays = 0;
+		reconDesc->numReconEventWaits = 0;
+#endif				/* RF_RECON_STATS > 0 */
+		reconDesc->reconExecTimerRunning = 0;
+		reconDesc->reconExecTicks = 0;
+		reconDesc->maxReconExecTicks = 0;
+		rc = rf_ContinueReconstructFailedDisk(reconDesc);
+
+		RF_LOCK_MUTEX(raidPtr->mutex);
+		raidPtr->reconInProgress--;
+		RF_UNLOCK_MUTEX(raidPtr->mutex);
+
+	} else {
+		RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n",
+			     lp->parityConfig);
+		rc = EIO;
+	}
+	RF_LOCK_MUTEX(raidPtr->mutex);
+
+	if (!rc) {
+		/* Need to set these here, as at this point it'll be claiming
+		   that the disk is in rf_ds_spared!  But we know better :-) */
+
+		raidPtr->Disks[row][col].status = rf_ds_optimal;
+		raidPtr->status[row] = rf_rs_optimal;
+
+		/* fix up the component label */
+		/* Don't actually need the read here.. */
+		raidread_component_label(raidPtr->raid_cinfo[row][col].ci_dev,
+					 raidPtr->raid_cinfo[row][col].ci_vp,
+					 c_label);
+
+		raid_init_component_label(raidPtr, c_label);
+
+		c_label->row = row;
+		c_label->column = col;
+
+		/* We've just done a rebuild based on all the other
+		   disks, so at this point the parity is known to be
+		   clean, even if it wasn't before. */
+
+		/* XXX doesn't hold for RAID 6!!*/
+
+		raidPtr->parity_good = RF_RAID_CLEAN;
+
+		raidwrite_component_label(raidPtr->raid_cinfo[row][col].ci_dev,
+					  raidPtr->raid_cinfo[row][col].ci_vp,
+					  c_label);
+
+	}
+	RF_UNLOCK_MUTEX(raidPtr->mutex);
+	RF_SIGNAL_COND(raidPtr->waitForReconCond);
+	wakeup(&raidPtr->waitForReconCond);
+	RF_Free(c_label, sizeof(RF_ComponentLabel_t));
+	return (rc);
+}
+
+
+/*
+ * State-machine body of a reconstruction.  Called first by the
+ * rf_Reconstruct* entry points (state 0), and later re-entered as the
+ * callback handed to rf_GetNextReconEvent(); on re-entry the switch
+ * resumes at reconDesc->state.  The cases deliberately fall through so
+ * a single call drives the whole sequence:
+ *   0: quiesce array, allocate per-disk trace records
+ *   1: build recon control, mark disk/row reconstructing, issue first
+ *      read on every surviving disk
+ *   2: resume user requests
+ *   3: pump recon events until all surviving disks have been read
+ *   4: drain remaining reconstruction writes
+ *   5: re-quiesce, dump user-access stats
+ *   6: mark disk spared/dist-spared, compute elapsed time
+ *   7: resume requests, print stats, free recon state
+ * Always returns 0.
+ */
+int
+rf_ContinueReconstructFailedDisk(reconDesc)
+	RF_RaidReconDesc_t *reconDesc;
+{
+	RF_Raid_t *raidPtr = reconDesc->raidPtr;
+	RF_RowCol_t row = reconDesc->row;
+	RF_RowCol_t col = reconDesc->col;
+	RF_RowCol_t srow = reconDesc->srow;
+	RF_RowCol_t scol = reconDesc->scol;
+	RF_ReconMap_t *mapPtr;
+
+	RF_ReconEvent_t *event;
+	struct timeval etime, elpsd;
+	unsigned long xor_s, xor_resid_us;
+	int retcode, i, ds;
+
+	switch (reconDesc->state) {
+
+
+	case 0:
+
+		raidPtr->accumXorTimeUs = 0;
+
+		/* create one trace record per physical disk */
+		RF_Malloc(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *));
+
+		/* quiesce the array prior to starting recon.  this is needed
+		 * to assure no nasty interactions with pending user writes.
+		 * We need to do this before we change the disk or row status. */
+		reconDesc->state = 1;
+
+		Dprintf("RECON: begin request suspend\n");
+		/* NOTE(review): retcode from the suspend is ignored here. */
+		retcode = rf_SuspendNewRequestsAndWait(raidPtr);
+		Dprintf("RECON: end request suspend\n");
+		rf_StartUserStats(raidPtr);	/* zero out the stats kept on
+						 * user accs */
+
+		/* fall through to state 1 */
+
+	case 1:
+
+		RF_LOCK_MUTEX(raidPtr->mutex);
+
+		/* create the reconstruction control pointer and install it in
+		 * the right slot */
+		raidPtr->reconControl[row] = rf_MakeReconControl(reconDesc, row, col, srow, scol);
+		mapPtr = raidPtr->reconControl[row]->reconMap;
+		raidPtr->status[row] = rf_rs_reconstructing;
+		raidPtr->Disks[row][col].status = rf_ds_reconstructing;
+		raidPtr->Disks[row][col].spareRow = srow;
+		raidPtr->Disks[row][col].spareCol = scol;
+
+		RF_UNLOCK_MUTEX(raidPtr->mutex);
+
+		RF_GETTIME(raidPtr->reconControl[row]->starttime);
+
+		/* now start up the actual reconstruction: issue a read for
+		 * each surviving disk */
+
+		reconDesc->numDisksDone = 0;
+		for (i = 0; i < raidPtr->numCol; i++) {
+			if (i != col) {
+				/* find and issue the next I/O on the
+				 * indicated disk */
+				if (IssueNextReadRequest(raidPtr, row, i)) {
+					Dprintf2("RECON: done issuing for r%d c%d\n", row, i);
+					reconDesc->numDisksDone++;
+				}
+			}
+		}
+
+		/* FALLTHROUGH */
+	case 2:
+		Dprintf("RECON: resume requests\n");
+		rf_ResumeNewRequests(raidPtr);
+
+
+		reconDesc->state = 3;
+
+		/* FALLTHROUGH */
+	case 3:
+
+		/* process reconstruction events until all disks report that
+		 * they've completed all work */
+		mapPtr = raidPtr->reconControl[row]->reconMap;
+
+
+
+		while (reconDesc->numDisksDone < raidPtr->numCol - 1) {
+
+			event = rf_GetNextReconEvent(reconDesc, row, (void (*) (void *)) rf_ContinueReconstructFailedDisk, reconDesc);
+			RF_ASSERT(event);
+
+			if (ProcessReconEvent(raidPtr, row, event))
+				reconDesc->numDisksDone++;
+			raidPtr->reconControl[row]->numRUsTotal =
+				mapPtr->totalRUs;
+			raidPtr->reconControl[row]->numRUsComplete =
+				mapPtr->totalRUs -
+				rf_UnitsLeftToReconstruct(mapPtr);
+
+			raidPtr->reconControl[row]->percentComplete =
+				(raidPtr->reconControl[row]->numRUsComplete * 100 / raidPtr->reconControl[row]->numRUsTotal);
+			if (rf_prReconSched) {
+				rf_PrintReconSchedule(raidPtr->reconControl[row]->reconMap, &(raidPtr->reconControl[row]->starttime));
+			}
+		}
+
+
+
+		reconDesc->state = 4;
+
+		/* FALLTHROUGH */
+	case 4:
+		mapPtr = raidPtr->reconControl[row]->reconMap;
+		if (rf_reconDebug) {
+			printf("RECON: all reads completed\n");
+		}
+		/* at this point all the reads have completed.  We now wait
+		 * for any pending writes to complete, and then we're done */
+
+		while (rf_UnitsLeftToReconstruct(raidPtr->reconControl[row]->reconMap) > 0) {
+
+			event = rf_GetNextReconEvent(reconDesc, row, (void (*) (void *)) rf_ContinueReconstructFailedDisk, reconDesc);
+			RF_ASSERT(event);
+
+			(void) ProcessReconEvent(raidPtr, row, event);	/* ignore return code */
+			raidPtr->reconControl[row]->percentComplete = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs);
+			if (rf_prReconSched) {
+				rf_PrintReconSchedule(raidPtr->reconControl[row]->reconMap, &(raidPtr->reconControl[row]->starttime));
+			}
+		}
+		reconDesc->state = 5;
+
+		/* FALLTHROUGH */
+	case 5:
+		/* Success:  mark the dead disk as reconstructed.  We quiesce
+		 * the array here to assure no nasty interactions with pending
+		 * user accesses when we free up the psstatus structure as
+		 * part of FreeReconControl() */
+
+		reconDesc->state = 6;
+
+		retcode = rf_SuspendNewRequestsAndWait(raidPtr);
+		rf_StopUserStats(raidPtr);
+		rf_PrintUserStats(raidPtr);	/* print out the stats on user
+						 * accs accumulated during
+						 * recon */
+
+		/* fall through to state 6 */
+	case 6:
+
+
+
+		RF_LOCK_MUTEX(raidPtr->mutex);
+		raidPtr->numFailures--;
+		ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE);
+		raidPtr->Disks[row][col].status = (ds) ? rf_ds_dist_spared : rf_ds_spared;
+		raidPtr->status[row] = (ds) ? rf_rs_reconfigured : rf_rs_optimal;
+		RF_UNLOCK_MUTEX(raidPtr->mutex);
+		RF_GETTIME(etime);
+		RF_TIMEVAL_DIFF(&(raidPtr->reconControl[row]->starttime), &etime, &elpsd);
+
+		/* XXX -- why is state 7 different from state 6 if there is no
+		 * return() here? -- XXX Note that I set elpsd above & use it
+		 * below, so if you put a return here you'll have to fix this.
+		 * (also, FreeReconControl is called below) */
+
+	case 7:
+
+		rf_ResumeNewRequests(raidPtr);
+
+		printf("Reconstruction of disk at row %d col %d completed\n",
+		       row, col);
+		xor_s = raidPtr->accumXorTimeUs / 1000000;
+		xor_resid_us = raidPtr->accumXorTimeUs % 1000000;
+		printf("Recon time was %d.%06d seconds, accumulated XOR time was %ld us (%ld.%06ld)\n",
+		       (int) elpsd.tv_sec, (int) elpsd.tv_usec, raidPtr->accumXorTimeUs, xor_s, xor_resid_us);
+		printf("  (start time %d sec %d usec, end time %d sec %d usec)\n",
+		       (int) raidPtr->reconControl[row]->starttime.tv_sec,
+		       (int) raidPtr->reconControl[row]->starttime.tv_usec,
+		       (int) etime.tv_sec, (int) etime.tv_usec);
+
+#if RF_RECON_STATS > 0
+		printf("Total head-sep stall count was %d\n",
+		       (int) reconDesc->hsStallCount);
+#endif				/* RF_RECON_STATS > 0 */
+		rf_FreeReconControl(raidPtr, row);
+		RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t));
+		FreeReconDesc(reconDesc);
+
+	}
+
+	SignalReconDone(raidPtr);
+	return (0);
+}
+/*****************************************************************************
+ * do the right thing upon each reconstruction event.
+ * returns nonzero if and only if there is nothing left unread on the
+ * indicated disk
+ *****************************************************************************/
+static int
+ProcessReconEvent(raidPtr, frow, event)
+	RF_Raid_t *raidPtr;
+	RF_RowCol_t frow;	/* row of the array the event belongs to */
+	RF_ReconEvent_t *event;	/* consumed: freed on every path below */
+{
+	int retcode = 0, submitblocked;
+	RF_ReconBuffer_t *rbuf;
+	RF_SectorCount_t sectorsPerRU;
+
+	Dprintf1("RECON: ProcessReconEvent type %d\n", event->type);
+	switch (event->type) {
+
+		/* a read I/O has completed */
+	case RF_REVENT_READDONE:
+		rbuf = raidPtr->reconControl[frow]->perDiskInfo[event->col].rbuf;
+		Dprintf3("RECON: READDONE EVENT: row %d col %d psid %ld\n",
+		    frow, event->col, rbuf->parityStripeID);
+		Dprintf7("RECON: done read  psid %ld buf %lx  %02x %02x %02x %02x %02x\n",
+		    rbuf->parityStripeID, rbuf->buffer, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff,
+		    rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff);
+		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
+		submitblocked = rf_SubmitReconBuffer(rbuf, 0, 0);
+		Dprintf1("RECON: submitblocked=%d\n", submitblocked);
+		/* only issue the next read if the submit wasn't blocked;
+		 * a blocked submit will re-trigger us via BUFCLEAR */
+		if (!submitblocked)
+			retcode = IssueNextReadRequest(raidPtr, frow, event->col);
+		break;
+
+		/* a write I/O has completed */
+	case RF_REVENT_WRITEDONE:
+		if (rf_floatingRbufDebug) {
+			rf_CheckFloatingRbufCount(raidPtr, 1);
+		}
+		sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
+		rbuf = (RF_ReconBuffer_t *) event->arg;
+		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
+		Dprintf3("RECON: WRITEDONE EVENT: psid %d ru %d (%d %% complete)\n",
+		    rbuf->parityStripeID, rbuf->which_ru, raidPtr->reconControl[frow]->percentComplete);
+		/* record the newly reconstructed RU in the recon map */
+		rf_ReconMapUpdate(raidPtr, raidPtr->reconControl[frow]->reconMap,
+		    rbuf->failedDiskSectorOffset, rbuf->failedDiskSectorOffset + sectorsPerRU - 1);
+		rf_RemoveFromActiveReconTable(raidPtr, frow, rbuf->parityStripeID, rbuf->which_ru);
+
+		/* return the buffer to wherever it came from */
+		if (rbuf->type == RF_RBUF_TYPE_FLOATING) {
+			RF_LOCK_MUTEX(raidPtr->reconControl[frow]->rb_mutex);
+			raidPtr->numFullReconBuffers--;
+			rf_ReleaseFloatingReconBuffer(raidPtr, frow, rbuf);
+			RF_UNLOCK_MUTEX(raidPtr->reconControl[frow]->rb_mutex);
+		} else
+			if (rbuf->type == RF_RBUF_TYPE_FORCED)
+				rf_FreeReconBuffer(rbuf);
+			else
+				RF_ASSERT(0);
+		break;
+
+	case RF_REVENT_BUFCLEAR:	/* A buffer-stall condition has been
+					 * cleared */
+		Dprintf2("RECON: BUFCLEAR EVENT: row %d col %d\n", frow, event->col);
+		submitblocked = rf_SubmitReconBuffer(raidPtr->reconControl[frow]->perDiskInfo[event->col].rbuf, 0, (int) (long) event->arg);
+		RF_ASSERT(!submitblocked);	/* we wouldn't have gotten the
+						 * BUFCLEAR event if we
+						 * couldn't submit */
+		retcode = IssueNextReadRequest(raidPtr, frow, event->col);
+		break;
+
+	case RF_REVENT_BLOCKCLEAR:	/* A user-write reconstruction
+					 * blockage has been cleared */
+		DDprintf2("RECON: BLOCKCLEAR EVENT: row %d col %d\n", frow, event->col);
+		retcode = TryToRead(raidPtr, frow, event->col);
+		break;
+
+	case RF_REVENT_HEADSEPCLEAR:	/* A max-head-separation
+					 * reconstruction blockage has been
+					 * cleared */
+		Dprintf2("RECON: HEADSEPCLEAR EVENT: row %d col %d\n", frow, event->col);
+		retcode = TryToRead(raidPtr, frow, event->col);
+		break;
+
+		/* a buffer has become ready to write */
+	case RF_REVENT_BUFREADY:
+		Dprintf2("RECON: BUFREADY EVENT: row %d col %d\n", frow, event->col);
+		retcode = IssueNextWriteRequest(raidPtr, frow);
+		if (rf_floatingRbufDebug) {
+			rf_CheckFloatingRbufCount(raidPtr, 1);
+		}
+		break;
+
+		/* we need to skip the current RU entirely because it got
+		 * recon'd while we were waiting for something else to happen */
+	case RF_REVENT_SKIP:
+		DDprintf2("RECON: SKIP EVENT: row %d col %d\n", frow, event->col);
+		retcode = IssueNextReadRequest(raidPtr, frow, event->col);
+		break;
+
+		/* a forced-reconstruction read access has completed.  Just
+		 * submit the buffer */
+	case RF_REVENT_FORCEDREADDONE:
+		rbuf = (RF_ReconBuffer_t *) event->arg;
+		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
+		DDprintf2("RECON: FORCEDREADDONE EVENT: row %d col %d\n", frow, event->col);
+		submitblocked = rf_SubmitReconBuffer(rbuf, 1, 0);
+		RF_ASSERT(!submitblocked);
+		break;
+
+	default:
+		RF_PANIC();
+	}
+	rf_FreeReconEventDesc(event);
+	return (retcode);
+}
+/*****************************************************************************
+ *
+ * find the next thing that's needed on the indicated disk, and issue
+ * a read request for it. We assume that the reconstruction buffer
+ * associated with this process is free to receive the data. If
+ * reconstruction is blocked on the indicated RU, we issue a
+ * blockage-release request instead of a physical disk read request.
+ * If the current disk gets too far ahead of the others, we issue a
+ * head-separation wait request and return.
+ *
+ * ctrl->{ru_count, curPSID, diskOffset} and
+ * rbuf->failedDiskSectorOffset are maintained to point to the unit
+ * we're currently accessing. Note that this deviates from the
+ * standard C idiom of having counters point to the next thing to be
+ * accessed. This allows us to easily retry when we're blocked by
+ * head separation or reconstruction-blockage events.
+ *
+ * returns nonzero if and only if there is nothing left unread on the
+ * indicated disk
+ *
+ *****************************************************************************/
+static int
+IssueNextReadRequest(raidPtr, row, col)
+	RF_Raid_t *raidPtr;
+	RF_RowCol_t row;
+	RF_RowCol_t col;	/* surviving disk to advance */
+{
+	RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl[row]->perDiskInfo[col];
+	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+	RF_ReconBuffer_t *rbuf = ctrl->rbuf;
+	RF_ReconUnitCount_t RUsPerPU = layoutPtr->SUsPerPU / layoutPtr->SUsPerRU;
+	RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
+	int do_new_check = 0, retcode = 0, status;
+
+	/* if we are currently the slowest disk, mark that we have to do a new
+	 * check */
+	if (ctrl->headSepCounter <= raidPtr->reconControl[row]->minHeadSepCounter)
+		do_new_check = 1;
+
+	/* advance ctrl->{ru_count,curPSID,diskOffset} to the next RU that
+	 * still needs reconstruction (skipping PSes this disk isn't in and
+	 * RUs already rebuilt); counters point AT the current unit, not past
+	 * it, so blocked attempts can be retried -- see header comment */
+	while (1) {
+
+		ctrl->ru_count++;
+		if (ctrl->ru_count < RUsPerPU) {
+			ctrl->diskOffset += sectorsPerRU;
+			rbuf->failedDiskSectorOffset += sectorsPerRU;
+		} else {
+			ctrl->curPSID++;
+			ctrl->ru_count = 0;
+			/* code left over from when head-sep was based on
+			 * parity stripe id */
+			if (ctrl->curPSID >= raidPtr->reconControl[row]->lastPSID) {
+				CheckForNewMinHeadSep(raidPtr, row, ++(ctrl->headSepCounter));
+				return (1);	/* finito! */
+			}
+			/* find the disk offsets of the start of the parity
+			 * stripe on both the current disk and the failed
+			 * disk. skip this entire parity stripe if either disk
+			 * does not appear in the indicated PS */
+			status = ComputePSDiskOffsets(raidPtr, ctrl->curPSID, row, col, &ctrl->diskOffset, &rbuf->failedDiskSectorOffset,
+			    &rbuf->spRow, &rbuf->spCol, &rbuf->spOffset);
+			if (status) {
+				ctrl->ru_count = RUsPerPU - 1;
+				continue;
+			}
+		}
+		rbuf->which_ru = ctrl->ru_count;
+
+		/* skip this RU if it's already been reconstructed */
+		if (rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap, rbuf->failedDiskSectorOffset)) {
+			Dprintf2("Skipping psid %ld ru %d: already reconstructed\n", ctrl->curPSID, ctrl->ru_count);
+			continue;
+		}
+		break;
+	}
+	ctrl->headSepCounter++;
+	if (do_new_check)
+		CheckForNewMinHeadSep(raidPtr, row, ctrl->headSepCounter);	/* update min if needed */
+
+
+	/* at this point, we have definitely decided what to do, and we have
+	 * only to see if we can actually do it now */
+	rbuf->parityStripeID = ctrl->curPSID;
+	rbuf->which_ru = ctrl->ru_count;
+	/* fresh trace record for this recon access */
+	bzero((char *) &raidPtr->recon_tracerecs[col], sizeof(raidPtr->recon_tracerecs[col]));
+	raidPtr->recon_tracerecs[col].reconacc = 1;
+	RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
+	retcode = TryToRead(raidPtr, row, col);
+	return (retcode);
+}
+
+/*
+ * tries to issue the next read on the indicated disk. We may be
+ * blocked by (a) the heads being too far apart, or (b) recon on the
+ * indicated RU being blocked due to a write by a user thread. In
+ * this case, we issue a head-sep or blockage wait request, which will
+ * cause this same routine to be invoked again later when the blockage
+ * has cleared.
+ */
+
+static int
+TryToRead(raidPtr, row, col)
+	RF_Raid_t *raidPtr;
+	RF_RowCol_t row;
+	RF_RowCol_t col;
+{
+	RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl[row]->perDiskInfo[col];
+	RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
+	RF_StripeNum_t psid = ctrl->curPSID;
+	RF_ReconUnitNum_t which_ru = ctrl->ru_count;
+	RF_DiskQueueData_t *req;
+	int status, created = 0;
+	RF_ReconParityStripeStatus_t *pssPtr;
+
+	/* if the current disk is too far ahead of the others, issue a
+	 * head-separation wait and return */
+	if (CheckHeadSeparation(raidPtr, ctrl, row, col, ctrl->headSepCounter, which_ru))
+		return (0);
+	/* PSS mutex held from here through the "out" label below */
+	RF_LOCK_PSS_MUTEX(raidPtr, row, psid);
+	pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl[row]->pssTable, psid, which_ru, RF_PSS_CREATE, &created);
+
+	/* if recon is blocked on the indicated parity stripe, issue a
+	 * block-wait request and return. this also must mark the indicated RU
+	 * in the stripe as under reconstruction if not blocked. */
+	status = CheckForcedOrBlockedReconstruction(raidPtr, pssPtr, ctrl, row, col, psid, which_ru);
+	if (status == RF_PSS_RECON_BLOCKED) {
+		Dprintf2("RECON: Stalling psid %ld ru %d: recon blocked\n", psid, which_ru);
+		goto out;
+	} else
+		if (status == RF_PSS_FORCED_ON_WRITE) {
+			rf_CauseReconEvent(raidPtr, row, col, NULL, RF_REVENT_SKIP);
+			goto out;
+		}
+	/* make one last check to be sure that the indicated RU didn't get
+	 * reconstructed while we were waiting for something else to happen.
+	 * This is unfortunate in that it causes us to make this check twice
+	 * in the normal case.  Might want to make some attempt to re-work
+	 * this so that we only do this check if we've definitely blocked on
+	 * one of the above checks.  When this condition is detected, we may
+	 * have just created a bogus status entry, which we need to delete. */
+	if (rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap, ctrl->rbuf->failedDiskSectorOffset)) {
+		Dprintf2("RECON: Skipping psid %ld ru %d: prior recon after stall\n", psid, which_ru);
+		if (created)
+			rf_PSStatusDelete(raidPtr, raidPtr->reconControl[row]->pssTable, pssPtr);
+		rf_CauseReconEvent(raidPtr, row, col, NULL, RF_REVENT_SKIP);
+		goto out;
+	}
+	/* found something to read.  issue the I/O */
+	Dprintf5("RECON: Read for psid %ld on row %d col %d offset %ld buf %lx\n",
+	    psid, row, col, ctrl->diskOffset, ctrl->rbuf->buffer);
+	RF_ETIMER_STOP(raidPtr->recon_tracerecs[col].recon_timer);
+	RF_ETIMER_EVAL(raidPtr->recon_tracerecs[col].recon_timer);
+	raidPtr->recon_tracerecs[col].specific.recon.recon_start_to_fetch_us =
+	    RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[col].recon_timer);
+	RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
+
+	/* should be ok to use a NULL proc pointer here, all the bufs we use
+	 * should be in kernel space */
+	req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, ctrl->diskOffset, sectorsPerRU, ctrl->rbuf->buffer, psid, which_ru,
+	    ReconReadDoneProc, (void *) ctrl, NULL, &raidPtr->recon_tracerecs[col], (void *) raidPtr, 0, NULL);
+
+	RF_ASSERT(req);		/* XXX -- fix this -- XXX */
+
+	ctrl->rbuf->arg = (void *) req;
+	rf_DiskIOEnqueue(&raidPtr->Queues[row][col], req, RF_IO_RECON_PRIORITY);
+	pssPtr->issued[col] = 1;
+
+out:
+	RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid);
+	return (0);
+}
+
+
+/*
+ * given a parity stripe ID, we want to find out whether both the
+ * current disk and the failed disk exist in that parity stripe. If
+ * not, we want to skip this whole PS. If so, we want to find the
+ * disk offset of the start of the PS on both the current disk and the
+ * failed disk.
+ *
+ * this works by getting a list of disks comprising the indicated
+ * parity stripe, and searching the list for the current and failed
+ * disks. Once we've decided they both exist in the parity stripe, we
+ * need to decide whether each is data or parity, so that we'll know
+ * which mapping function to call to get the corresponding disk
+ * offsets.
+ *
+ * this is kind of unpleasant, but doing it this way allows the
+ * reconstruction code to use parity stripe IDs rather than physical
+ * disks address to march through the failed disk, which greatly
+ * simplifies a lot of code, as well as eliminating the need for a
+ * reverse-mapping function. I also think it will execute faster,
+ * since the calls to the mapping module are kept to a minimum.
+ *
+ * ASSUMES THAT THE STRIPE IDENTIFIER IDENTIFIES THE DISKS COMPRISING
+ * THE STRIPE IN THE CORRECT ORDER */
+
+
+static int
+ComputePSDiskOffsets(
+    RF_Raid_t * raidPtr,	/* raid descriptor */
+    RF_StripeNum_t psid,	/* parity stripe identifier */
+    RF_RowCol_t row,		/* row and column of disk to find the offsets
+				 * for */
+    RF_RowCol_t col,
+    RF_SectorNum_t * outDiskOffset,
+    RF_SectorNum_t * outFailedDiskSectorOffset,
+    RF_RowCol_t * spRow,	/* OUT: row,col of spare unit for failed unit */
+    RF_RowCol_t * spCol,
+    RF_SectorNum_t * spOffset)
+{				/* OUT: offset into disk containing spare unit */
+	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+	RF_RowCol_t fcol = raidPtr->reconControl[row]->fcol;
+	RF_RaidAddr_t sosRaidAddress;	/* start-of-stripe */
+	RF_RowCol_t *diskids;
+	u_int i, j, k, i_offset, j_offset;
+	RF_RowCol_t prow, pcol;
+	int testcol, testrow;
+	RF_RowCol_t stripe;
+	RF_SectorNum_t poffset;
+	char i_is_parity = 0, j_is_parity = 0;
+	RF_RowCol_t stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol;
+
+	/* get a listing of the disks comprising that stripe */
+	sosRaidAddress = rf_ParityStripeIDToRaidAddress(layoutPtr, psid);
+	(layoutPtr->map->IdentifyStripe) (raidPtr, sosRaidAddress, &diskids, &stripe);
+	RF_ASSERT(diskids);
+
+	/* reject this entire parity stripe if it does not contain the
+	 * indicated disk or it does not contain the failed disk */
+	if (row != stripe)
+		goto skipit;
+	/* i := index of the current disk within the stripe */
+	for (i = 0; i < stripeWidth; i++) {
+		if (col == diskids[i])
+			break;
+	}
+	if (i == stripeWidth)
+		goto skipit;
+	/* j := index of the failed disk within the stripe */
+	for (j = 0; j < stripeWidth; j++) {
+		if (fcol == diskids[j])
+			break;
+	}
+	if (j == stripeWidth) {
+		goto skipit;
+	}
+	/* find out which disk the parity is on */
+	(layoutPtr->map->MapParity) (raidPtr, sosRaidAddress, &prow, &pcol, &poffset, RF_DONT_REMAP);
+
+	/* find out if either the current RU or the failed RU is parity */
+	/* also, if the parity occurs in this stripe prior to the data and/or
+	 * failed col, we need to decrement i and/or j */
+	for (k = 0; k < stripeWidth; k++)
+		if (diskids[k] == pcol)
+			break;
+	RF_ASSERT(k < stripeWidth);
+	i_offset = i;
+	j_offset = j;
+	if (k < i)
+		i_offset--;
+	else
+		if (k == i) {
+			i_is_parity = 1;
+			i_offset = 0;
+		}		/* set offsets to zero to disable multiply
+				 * below */
+	if (k < j)
+		j_offset--;
+	else
+		if (k == j) {
+			j_is_parity = 1;
+			j_offset = 0;
+		}
+	/* at this point, [ij]_is_parity tells us whether the [current,failed]
+	 * disk is parity at the start of this RU, and, if data, "[ij]_offset"
+	 * tells us how far into the stripe the [current,failed] disk is. */
+
+	/* call the mapping routine to get the offset into the current disk,
+	 * repeat for failed disk. */
+	if (i_is_parity)
+		layoutPtr->map->MapParity(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testrow, &testcol, outDiskOffset, RF_DONT_REMAP);
+	else
+		layoutPtr->map->MapSector(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testrow, &testcol, outDiskOffset, RF_DONT_REMAP);
+
+	RF_ASSERT(row == testrow && col == testcol);
+
+	if (j_is_parity)
+		layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testrow, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP);
+	else
+		layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testrow, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP);
+	RF_ASSERT(row == testrow && fcol == testcol);
+
+	/* now locate the spare unit for the failed unit */
+	if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {
+		if (j_is_parity)
+			layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spRow, spCol, spOffset, RF_REMAP);
+		else
+			layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spRow, spCol, spOffset, RF_REMAP);
+	} else {
+		*spRow = raidPtr->reconControl[row]->spareRow;
+		*spCol = raidPtr->reconControl[row]->spareCol;
+		*spOffset = *outFailedDiskSectorOffset;
+	}
+
+	return (0);
+
+skipit:
+	Dprintf3("RECON: Skipping psid %ld: nothing needed from r%d c%d\n",
+	    psid, row, col);
+	return (1);
+}
+/* this is called when a buffer has become ready to write to the replacement disk */
+static int
+IssueNextWriteRequest(raidPtr, row)
+	RF_Raid_t *raidPtr;
+	RF_RowCol_t row;
+{
+	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+	RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
+	RF_RowCol_t fcol = raidPtr->reconControl[row]->fcol;
+	RF_ReconBuffer_t *rbuf;
+	RF_DiskQueueData_t *req;
+
+	rbuf = rf_GetFullReconBuffer(raidPtr->reconControl[row]);
+	RF_ASSERT(rbuf);	/* there must be one available, or we wouldn't
+				 * have gotten the event that sent us here */
+	RF_ASSERT(rbuf->pssPtr);
+
+	/* hand the buffer over to the PS status entry for the duration of
+	 * the write; ownership returns via ReconWriteDoneProc */
+	rbuf->pssPtr->writeRbuf = rbuf;
+	rbuf->pssPtr = NULL;
+
+	Dprintf7("RECON: New write (r %d c %d offs %d) for psid %ld ru %d (failed disk offset %ld) buf %lx\n",
+	    rbuf->spRow, rbuf->spCol, rbuf->spOffset, rbuf->parityStripeID,
+	    rbuf->which_ru, rbuf->failedDiskSectorOffset, rbuf->buffer);
+	Dprintf6("RECON: new write psid %ld   %02x %02x %02x %02x %02x\n",
+	    rbuf->parityStripeID, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff,
+	    rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff);
+
+	/* should be ok to use a NULL b_proc here b/c all addrs should be in
+	 * kernel space */
+	req = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, rbuf->spOffset,
+	    sectorsPerRU, rbuf->buffer,
+	    rbuf->parityStripeID, rbuf->which_ru,
+	    ReconWriteDoneProc, (void *) rbuf, NULL,
+	    &raidPtr->recon_tracerecs[fcol],
+	    (void *) raidPtr, 0, NULL);
+
+	RF_ASSERT(req);		/* XXX -- fix this -- XXX */
+
+	rbuf->arg = (void *) req;
+	rf_DiskIOEnqueue(&raidPtr->Queues[rbuf->spRow][rbuf->spCol], req, RF_IO_RECON_PRIORITY);
+
+	return (0);
+}
+
+/*
+ * this gets called upon the completion of a reconstruction read
+ * operation the arg is a pointer to the per-disk reconstruction
+ * control structure for the process that just finished a read.
+ *
+ * called at interrupt context in the kernel, so don't do anything
+ * illegal here.
+ */
+static int
+ReconReadDoneProc(arg, status)
+	void *arg;		/* per-disk RF_PerDiskReconCtrl_t */
+	int status;		/* nonzero => the read failed */
+{
+	RF_PerDiskReconCtrl_t *ctrl = (RF_PerDiskReconCtrl_t *) arg;
+	RF_Raid_t *raidPtr = ctrl->reconCtrl->reconDesc->raidPtr;
+
+	if (status) {
+		/*
+		 * XXX no recovery path for a failed recon read yet --
+		 * we simply panic.
+		 */
+		printf("Recon read failed!\n");
+		RF_PANIC();
+	}
+	/* account fetch latency, then restart the timer for the next leg */
+	RF_ETIMER_STOP(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
+	RF_ETIMER_EVAL(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
+	raidPtr->recon_tracerecs[ctrl->col].specific.recon.recon_fetch_to_return_us =
+	    RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
+	RF_ETIMER_START(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
+
+	/* hand the completion to the recon thread as a READDONE event */
+	rf_CauseReconEvent(raidPtr, ctrl->row, ctrl->col, NULL, RF_REVENT_READDONE);
+	return (0);
+}
+/* This gets called upon the completion of a reconstruction write operation.
+ * The arg is a pointer to the rbuf that was just written.
+ * A non-zero status indicates the write failed; there is no recovery path,
+ * so we panic.
+ *
+ * called at interrupt context in the kernel, so don't do anything illegal here.
+ */
+static int
+ReconWriteDoneProc(arg, status)
+	void *arg;
+	int status;
+{
+	RF_ReconBuffer_t *rbuf = (RF_ReconBuffer_t *) arg;
+
+	Dprintf2("Reconstruction completed on psid %ld ru %d\n", rbuf->parityStripeID, rbuf->which_ru);
+	if (status) {
+		printf("Recon write failed!\n");	/* fprintf(stderr,"Recon
+							 * write failed!\n"); */
+		RF_PANIC();
+	}
+	/* hand the rbuf back to the recon thread via a WRITEDONE event */
+	rf_CauseReconEvent((RF_Raid_t *) rbuf->raidPtr, rbuf->row, rbuf->col, arg, RF_REVENT_WRITEDONE);
+	return (0);
+}
+
+
+/*
+ * computes a new minimum head sep, and wakes up anyone who needs to
+ * be woken as a result.  The minimum is taken over the headSepCounter
+ * of every surviving (non-failed) column.
+ */
+static void
+CheckForNewMinHeadSep(raidPtr, row, hsCtr)
+	RF_Raid_t *raidPtr;
+	RF_RowCol_t row;
+	RF_HeadSepLimit_t hsCtr;
+{
+	RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl[row];
+	RF_HeadSepLimit_t new_min;
+	RF_RowCol_t i;
+	RF_CallbackDesc_t *p;
+	RF_ASSERT(hsCtr >= reconCtrlPtr->minHeadSepCounter);	/* from the definition
+								 * of a minimum */
+
+
+	RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
+
+	new_min = ~(1L << (8 * sizeof(long) - 1));	/* 0x7FFF....FFF */
+	for (i = 0; i < raidPtr->numCol; i++)
+		if (i != reconCtrlPtr->fcol) {
+			if (reconCtrlPtr->perDiskInfo[i].headSepCounter < new_min)
+				new_min = reconCtrlPtr->perDiskInfo[i].headSepCounter;
+		}
+	/* set the new minimum and wake up anyone who can now run again */
+	if (new_min != reconCtrlPtr->minHeadSepCounter) {
+		reconCtrlPtr->minHeadSepCounter = new_min;
+		Dprintf1("RECON: new min head pos counter val is %ld\n", new_min);
+		while (reconCtrlPtr->headSepCBList) {
+			/* the list is sorted by callbackArg.v (the counter
+			 * value each waiter needs the minimum to reach), so
+			 * stop at the first entry that is still ahead of
+			 * new_min */
+			if (reconCtrlPtr->headSepCBList->callbackArg.v > new_min)
+				break;
+			p = reconCtrlPtr->headSepCBList;
+			reconCtrlPtr->headSepCBList = p->next;
+			p->next = NULL;
+			rf_CauseReconEvent(raidPtr, p->row, p->col, NULL, RF_REVENT_HEADSEPCLEAR);
+			rf_FreeCallbackDesc(p);
+		}
+
+	}
+	RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
+}
+
+/*
+ * checks to see that the maximum head separation will not be violated
+ * if we initiate a reconstruction I/O on the indicated disk.
+ * Limiting the maximum head separation between two disks eliminates
+ * the nasty buffer-stall conditions that occur when one disk races
+ * ahead of the others and consumes all of the floating recon buffers.
+ * This code is complex and unpleasant but it's necessary to avoid
+ * some very nasty, albeit fairly rare, reconstruction behavior.
+ *
+ * returns non-zero if and only if we have to stop working on the
+ * indicated disk due to a head-separation delay.
+ */
+static int
+CheckHeadSeparation(
+    RF_Raid_t * raidPtr,
+    RF_PerDiskReconCtrl_t * ctrl,
+    RF_RowCol_t row,
+    RF_RowCol_t col,
+    RF_HeadSepLimit_t hsCtr,
+    RF_ReconUnitNum_t which_ru)
+{
+	RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl[row];
+	RF_CallbackDesc_t *cb, *p, *pt;
+	int retval = 0;
+
+	/* if we're too far ahead of the slowest disk, stop working on this
+	 * disk until the slower ones catch up. We do this by scheduling a
+	 * wakeup callback for the time when the slowest disk has caught up.
+	 * We define "caught up" with 20% hysteresis, i.e. the head separation
+	 * must have fallen to at most 80% of the max allowable head
+	 * separation before we'll wake up.
+	 *
+	 */
+	RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
+	if ((raidPtr->headSepLimit >= 0) &&
+	    ((ctrl->headSepCounter - reconCtrlPtr->minHeadSepCounter) > raidPtr->headSepLimit)) {
+		Dprintf6("raid%d: RECON: head sep stall: row %d col %d hsCtr %ld minHSCtr %ld limit %ld\n",
+			 raidPtr->raidid, row, col, ctrl->headSepCounter,
+			 reconCtrlPtr->minHeadSepCounter,
+			 raidPtr->headSepLimit);
+		cb = rf_AllocCallbackDesc();
+		/* the minHeadSepCounter value we have to get to before we'll
+		 * wake up. build in 20% hysteresis. */
+		cb->callbackArg.v = (ctrl->headSepCounter - raidPtr->headSepLimit + raidPtr->headSepLimit / 5);
+		cb->row = row;
+		cb->col = col;
+		cb->next = NULL;
+
+		/* insert this callback descriptor into the sorted list of
+		 * pending head-sep callbacks, keeping the list ordered by
+		 * ascending callbackArg.v so that CheckForNewMinHeadSep can
+		 * release waiters by walking from the head */
+		p = reconCtrlPtr->headSepCBList;
+		if (!p)
+			reconCtrlPtr->headSepCBList = cb;
+		else
+			if (cb->callbackArg.v < p->callbackArg.v) {
+				/* new entry goes at the front */
+				cb->next = reconCtrlPtr->headSepCBList;
+				reconCtrlPtr->headSepCBList = cb;
+			} else {
+				/* walk to the insertion point; pt trails p */
+				for (pt = p, p = p->next; p && (p->callbackArg.v < cb->callbackArg.v); pt = p, p = p->next);
+				cb->next = p;
+				pt->next = cb;
+			}
+		retval = 1;
+#if RF_RECON_STATS > 0
+		ctrl->reconCtrl->reconDesc->hsStallCount++;
+#endif				/* RF_RECON_STATS > 0 */
+	}
+	RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
+
+	return (retval);
+}
+/*
+ * checks to see if reconstruction has been either forced or blocked
+ * by a user operation. if forced, we skip this RU entirely. else if
+ * blocked, put ourselves on the wait list. else return 0.
+ *
+ * returns RF_PSS_FORCED_ON_WRITE if forced, RF_PSS_RECON_BLOCKED if
+ * blocked, and 0 otherwise (in which case the RU is marked as under
+ * reconstruction).
+ *
+ * ASSUMES THE PSS MUTEX IS LOCKED UPON ENTRY
+ */
+static int
+CheckForcedOrBlockedReconstruction(
+    RF_Raid_t * raidPtr,
+    RF_ReconParityStripeStatus_t * pssPtr,
+    RF_PerDiskReconCtrl_t * ctrl,
+    RF_RowCol_t row,
+    RF_RowCol_t col,
+    RF_StripeNum_t psid,
+    RF_ReconUnitNum_t which_ru)
+{
+	RF_CallbackDesc_t *cb;
+	int retcode = 0;
+
+	/* NB: forced-on-read and forced-on-write are collapsed into a single
+	 * RF_PSS_FORCED_ON_WRITE return code here */
+	if ((pssPtr->flags & RF_PSS_FORCED_ON_READ) || (pssPtr->flags & RF_PSS_FORCED_ON_WRITE))
+		retcode = RF_PSS_FORCED_ON_WRITE;
+	else
+		if (pssPtr->flags & RF_PSS_RECON_BLOCKED) {
+			Dprintf4("RECON: row %d col %d blocked at psid %ld ru %d\n", row, col, psid, which_ru);
+			cb = rf_AllocCallbackDesc();	/* append ourselves to
+							 * the blockage-wait
+							 * list */
+			cb->row = row;
+			cb->col = col;
+			cb->next = pssPtr->blockWaitList;
+			pssPtr->blockWaitList = cb;
+			retcode = RF_PSS_RECON_BLOCKED;
+		}
+	if (!retcode)
+		pssPtr->flags |= RF_PSS_UNDER_RECON;	/* mark this RU as under
+							 * reconstruction */
+
+	return (retcode);
+}
+/*
+ * if reconstruction is currently ongoing for the indicated stripeID,
+ * reconstruction is forced to completion and we return non-zero to
+ * indicate that the caller must wait. If not, then reconstruction is
+ * blocked on the indicated stripe and the routine returns zero. If
+ * and only if we return non-zero, we'll cause the cbFunc to get
+ * invoked with the cbArg when the reconstruction has completed.
+ *
+ * The PSS mutex for (row, psid) is held for the duration of this
+ * routine.
+ */
+int
+rf_ForceOrBlockRecon(raidPtr, asmap, cbFunc, cbArg)
+	RF_Raid_t *raidPtr;
+	RF_AccessStripeMap_t *asmap;
+	void (*cbFunc) (RF_Raid_t *, void *);
+	void *cbArg;
+{
+	RF_RowCol_t row = asmap->physInfo->row;	/* which row of the array
+						 * we're working on */
+	RF_StripeNum_t stripeID = asmap->stripeID;	/* the stripe ID we're
+							 * forcing recon on */
+	RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;	/* num sects in one RU */
+	RF_ReconParityStripeStatus_t *pssPtr;	/* a pointer to the parity
+						 * stripe status structure */
+	RF_StripeNum_t psid;	/* parity stripe id */
+	RF_SectorNum_t offset, fd_offset;	/* disk offset, failed-disk
+						 * offset */
+	RF_RowCol_t *diskids;
+	RF_RowCol_t stripe;
+	RF_ReconUnitNum_t which_ru;	/* RU within parity stripe */
+	RF_RowCol_t fcol, diskno, i;
+	RF_ReconBuffer_t *new_rbuf;	/* ptr to newly allocated rbufs */
+	RF_DiskQueueData_t *req;/* disk I/O req to be enqueued */
+	RF_CallbackDesc_t *cb;
+	int created = 0, nPromoted;
+
+	psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru);
+
+	RF_LOCK_PSS_MUTEX(raidPtr, row, psid);
+
+	/* RF_PSS_RECON_BLOCKED blocks recon on this parity stripe while we
+	 * decide whether to force it */
+	pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl[row]->pssTable, psid, which_ru, RF_PSS_CREATE | RF_PSS_RECON_BLOCKED, &created);
+
+	/* if recon is not ongoing on this PS, just return */
+	if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
+		RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid);
+		return (0);
+	}
+	/* otherwise, we have to wait for reconstruction to complete on this
+	 * RU. */
+	/* In order to avoid waiting for a potentially large number of
+	 * low-priority accesses to complete, we force a normal-priority (i.e.
+	 * not low-priority) reconstruction on this RU. */
+	if (!(pssPtr->flags & RF_PSS_FORCED_ON_WRITE) && !(pssPtr->flags & RF_PSS_FORCED_ON_READ)) {
+		DDprintf1("Forcing recon on psid %ld\n", psid);
+		pssPtr->flags |= RF_PSS_FORCED_ON_WRITE;	/* mark this RU as under
+								 * forced recon */
+		pssPtr->flags &= ~RF_PSS_RECON_BLOCKED;	/* clear the blockage
+							 * that we just set */
+		fcol = raidPtr->reconControl[row]->fcol;
+
+		/* get a listing of the disks comprising the indicated stripe */
+		(raidPtr->Layout.map->IdentifyStripe) (raidPtr, asmap->raidAddress, &diskids, &stripe);
+		RF_ASSERT(row == stripe);
+
+		/* For previously issued reads, elevate them to normal
+		 * priority.  If the I/O has already completed, it won't be
+		 * found in the queue, and hence this will be a no-op. For
+		 * unissued reads, allocate buffers and issue new reads.  The
+		 * fact that we've set the FORCED bit means that the regular
+		 * recon procs will not re-issue these reqs */
+		for (i = 0; i < raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol; i++)
+			if ((diskno = diskids[i]) != fcol) {
+				if (pssPtr->issued[diskno]) {
+					nPromoted = rf_DiskIOPromote(&raidPtr->Queues[row][diskno], psid, which_ru);
+					if (rf_reconDebug && nPromoted)
+						printf("raid%d: promoted read from row %d col %d\n", raidPtr->raidid, row, diskno);
+				} else {
+					new_rbuf = rf_MakeReconBuffer(raidPtr, row, diskno, RF_RBUF_TYPE_FORCED);	/* create new buf */
+					ComputePSDiskOffsets(raidPtr, psid, row, diskno, &offset, &fd_offset,
+					    &new_rbuf->spRow, &new_rbuf->spCol, &new_rbuf->spOffset);	/* find offsets & spare
+													 * location */
+					new_rbuf->parityStripeID = psid;	/* fill in the buffer */
+					new_rbuf->which_ru = which_ru;
+					new_rbuf->failedDiskSectorOffset = fd_offset;
+					new_rbuf->priority = RF_IO_NORMAL_PRIORITY;
+
+					/* use NULL b_proc b/c all addrs
+					 * should be in kernel space */
+					req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, offset + which_ru * sectorsPerRU, sectorsPerRU, new_rbuf->buffer,
+					    psid, which_ru, (int (*) (void *, int)) ForceReconReadDoneProc, (void *) new_rbuf, NULL,
+					    NULL, (void *) raidPtr, 0, NULL);
+
+					RF_ASSERT(req);	/* XXX -- fix this --
+							 * XXX */
+
+					new_rbuf->arg = req;
+					rf_DiskIOEnqueue(&raidPtr->Queues[row][diskno], req, RF_IO_NORMAL_PRIORITY);	/* enqueue the I/O */
+					Dprintf3("raid%d: Issued new read req on row %d col %d\n", raidPtr->raidid, row, diskno);
+				}
+			}
+		/* if the write is sitting in the disk queue, elevate its
+		 * priority */
+		if (rf_DiskIOPromote(&raidPtr->Queues[row][fcol], psid, which_ru))
+			printf("raid%d: promoted write to row %d col %d\n",
+			       raidPtr->raidid, row, fcol);
+	}
+	/* install a callback descriptor to be invoked when recon completes on
+	 * this parity stripe. */
+	cb = rf_AllocCallbackDesc();
+	/* XXX the following is bogus.. These functions don't really match!!
+	 * GO */
+	cb->callbackFunc = (void (*) (RF_CBParam_t)) cbFunc;
+	cb->callbackArg.p = (void *) cbArg;
+	cb->next = pssPtr->procWaitList;
+	pssPtr->procWaitList = cb;
+	DDprintf2("raid%d: Waiting for forced recon on psid %ld\n",
+		  raidPtr->raidid, psid);
+
+	RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid);
+	return (1);
+}
+/* called upon the completion of a forced reconstruction read.
+ * all we do is schedule the FORCEDREADDONE event.
+ * called at interrupt context in the kernel, so don't do anything illegal here.
+ */
+static void
+ForceReconReadDoneProc(arg, status)
+	void *arg;
+	int status;
+{
+	RF_ReconBuffer_t *rbuf = arg;
+
+	if (status) {
+		/* no recovery path for a failed forced read */
+		printf("Forced recon read failed!\n");	/* fprintf(stderr,"Forced
+							 * recon read
+							 * failed!\n"); */
+		RF_PANIC();
+	}
+	rf_CauseReconEvent((RF_Raid_t *) rbuf->raidPtr, rbuf->row, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREADDONE);
+}
+/* releases a block on the reconstruction of the indicated stripe.
+ * When the block count drops to zero, all waiters on the stripe's
+ * block-wait list are released, and the pss descriptor is deleted if
+ * no reconstruction was requested while the stripe was blocked.
+ * Always returns 0.
+ */
+int
+rf_UnblockRecon(raidPtr, asmap)
+	RF_Raid_t *raidPtr;
+	RF_AccessStripeMap_t *asmap;
+{
+	RF_RowCol_t row = asmap->origRow;
+	RF_StripeNum_t stripeID = asmap->stripeID;
+	RF_ReconParityStripeStatus_t *pssPtr;
+	RF_ReconUnitNum_t which_ru;
+	RF_StripeNum_t psid;
+	int created = 0;
+	RF_CallbackDesc_t *cb;
+
+	psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru);
+	RF_LOCK_PSS_MUTEX(raidPtr, row, psid);
+	/* RF_PSS_NONE: look up only, do not create a descriptor */
+	pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl[row]->pssTable, psid, which_ru, RF_PSS_NONE, &created);
+
+	/* When recon is forced, the pss desc can get deleted before we get
+	 * back to unblock recon. But, this can _only_ happen when recon is
+	 * forced. It would be good to put some kind of sanity check here, but
+	 * how to decide if recon was just forced or not? */
+	if (!pssPtr) {
+		/* printf("Warning: no pss descriptor upon unblock on psid %ld
+		 * RU %d\n",psid,which_ru); */
+		if (rf_reconDebug || rf_pssDebug)
+			printf("Warning: no pss descriptor upon unblock on psid %ld RU %d\n", (long) psid, which_ru);
+		goto out;
+	}
+	pssPtr->blockCount--;
+	Dprintf3("raid%d: unblocking recon on psid %ld: blockcount is %d\n",
+		 raidPtr->raidid, psid, pssPtr->blockCount);
+	if (pssPtr->blockCount == 0) {	/* if recon blockage has been released */
+
+		/* unblock recon before calling CauseReconEvent in case
+		 * CauseReconEvent causes us to try to issue a new read before
+		 * returning here. */
+		pssPtr->flags &= ~RF_PSS_RECON_BLOCKED;
+
+
+		while (pssPtr->blockWaitList) {
+			/* spin through the block-wait list and
+			   release all the waiters */
+			cb = pssPtr->blockWaitList;
+			pssPtr->blockWaitList = cb->next;
+			cb->next = NULL;
+			rf_CauseReconEvent(raidPtr, cb->row, cb->col, NULL, RF_REVENT_BLOCKCLEAR);
+			rf_FreeCallbackDesc(cb);
+		}
+		if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
+			/* if no recon was requested while recon was blocked */
+			rf_PSStatusDelete(raidPtr, raidPtr->reconControl[row]->pssTable, pssPtr);
+		}
+	}
+out:
+	RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid);
+	return (0);
+}
diff --git a/sys/dev/raidframe/rf_reconstruct.h b/sys/dev/raidframe/rf_reconstruct.h
new file mode 100644
index 0000000..318d546
--- /dev/null
+++ b/sys/dev/raidframe/rf_reconstruct.h
@@ -0,0 +1,202 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_reconstruct.h,v 1.5 2000/05/28 00:48:30 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*********************************************************
+ * rf_reconstruct.h -- header file for reconstruction code
+ *********************************************************/
+
+#ifndef _RF__RF_RECONSTRUCT_H_
+#define _RF__RF_RECONSTRUCT_H_
+
+#include <dev/raidframe/rf_types.h>
+#include <sys/time.h>
+#include <dev/raidframe/rf_reconmap.h>
+#include <dev/raidframe/rf_psstatus.h>
+
+/* reconstruction configuration information (tunables supplied at
+ * configuration time) */
+struct RF_ReconConfig_s {
+	unsigned numFloatingReconBufs;	/* number of floating recon bufs to
+					 * use */
+	RF_HeadSepLimit_t headSepLimit;	/* how far apart the heads are allow
+					 * to become, in parity stripes */
+};
+/* a reconstruction buffer */
+struct RF_ReconBuffer_s {
+	RF_Raid_t *raidPtr;	/* (used to be void * to avoid recursive
+				 * includes) the raid this buffer belongs to */
+	caddr_t buffer;		/* points to the data */
+	RF_StripeNum_t parityStripeID;	/* the parity stripe that this data
+					 * relates to */
+	int which_ru;		/* which reconstruction unit within the PSS */
+	RF_SectorNum_t failedDiskSectorOffset;	/* the offset into the failed
+						 * disk */
+	RF_RowCol_t row, col;	/* which disk this buffer belongs to or is
+				 * targeted at */
+	RF_StripeCount_t count;	/* counts the # of SUs installed so far */
+	int priority;		/* used to force hi priority recon */
+	RF_RbufType_t type;	/* FORCED or FLOATING */
+	char *arrived;		/* [x] = 1/0 if SU from disk x has/hasn't
+				 * arrived */
+	RF_ReconBuffer_t *next;	/* used for buffer management */
+	void *arg;		/* generic field for general use */
+	RF_RowCol_t spRow, spCol;	/* spare disk to which this buf should
+					 * be written */
+	/* if dist sparing off, always identifies the replacement disk */
+	RF_SectorNum_t spOffset;/* offset into the spare disk */
+	/* if dist sparing off, identical to failedDiskSectorOffset */
+	RF_ReconParityStripeStatus_t *pssPtr;	/* debug- pss associated with
+						 * issue-pending write */
+};
+/* a reconstruction event descriptor. The event types currently are:
+ *    RF_REVENT_READDONE    -- a read operation has completed
+ *    RF_REVENT_WRITEDONE   -- a write operation has completed
+ *    RF_REVENT_BUFREADY    -- the buffer manager has produced a full buffer
+ *    RF_REVENT_BLOCKCLEAR  -- a reconstruction blockage has been cleared
+ *    RF_REVENT_BUFCLEAR    -- the buffer manager has released a process blocked on submission
+ *    RF_REVENT_HEADSEPCLEAR -- a head-separation stall has been cleared
+ *    RF_REVENT_SKIP        -- we need to skip the current RU and go on to the next one, typ. b/c we found recon forced
+ *    RF_REVENT_FORCEDREADDONE -- a forced-reconstruction read operation has completed
+ */
+typedef enum RF_Revent_e {
+	RF_REVENT_READDONE,
+	RF_REVENT_WRITEDONE,
+	RF_REVENT_BUFREADY,
+	RF_REVENT_BLOCKCLEAR,
+	RF_REVENT_BUFCLEAR,
+	RF_REVENT_HEADSEPCLEAR,
+	RF_REVENT_SKIP,
+	RF_REVENT_FORCEDREADDONE
+}       RF_Revent_t;
+
+/* one entry in the per-row reconstruction event queue */
+struct RF_ReconEvent_s {
+	RF_Revent_t type;	/* what kind of event has occurred */
+	RF_RowCol_t col;	/* row ID is implicit in the queue in which
+				 * the event is placed */
+	void *arg;		/* a generic argument */
+	RF_ReconEvent_t *next;	/* singly-linked queue */
+};
+/*
+ * Reconstruction control information maintained per-disk
+ * (for surviving disks)
+ */
+struct RF_PerDiskReconCtrl_s {
+	RF_ReconCtrl_t *reconCtrl;	/* back-pointer to the owning recon
+					 * control structure */
+	RF_RowCol_t row, col;	/* to make this structure self-identifying */
+	RF_StripeNum_t curPSID;	/* the next parity stripe ID to check on this
+				 * disk */
+	RF_HeadSepLimit_t headSepCounter;	/* counter used to control
+						 * maximum head separation */
+	RF_SectorNum_t diskOffset;	/* the offset into the indicated disk
+					 * of the current PU */
+	RF_ReconUnitNum_t ru_count;	/* this counts off the recon units
+					 * within each parity unit */
+	RF_ReconBuffer_t *rbuf;	/* the recon buffer assigned to this disk */
+};
+/* main reconstruction control structure (one per row under recon) */
+struct RF_ReconCtrl_s {
+	RF_RaidReconDesc_t *reconDesc;	/* the descriptor for this recon run */
+	RF_RowCol_t fcol;	/* which column has failed */
+	RF_PerDiskReconCtrl_t *perDiskInfo;	/* information maintained
+						 * per-disk */
+	RF_ReconMap_t *reconMap;/* map of what has/has not been reconstructed */
+	RF_RowCol_t spareRow;	/* which of the spare disks we're using */
+	RF_RowCol_t spareCol;
+	RF_StripeNum_t lastPSID;/* the ID of the last parity stripe we want
+				 * reconstructed */
+	int percentComplete;/* percentage completion of reconstruction */
+	int numRUsComplete;	/* number of Reconstruction Units done */
+	int numRUsTotal;	/* total number of Reconstruction Units */
+
+	/* reconstruction event queue */
+	RF_ReconEvent_t *eventQueue;	/* queue of pending reconstruction
+					 * events */
+	RF_DECLARE_MUTEX(eq_mutex)	/* mutex for locking event
+					 * queue */
+	RF_DECLARE_COND(eq_cond)	/* condition variable for
+					 * signalling recon events */
+	int eq_count;		/* debug only */
+
+	/* reconstruction buffer management */
+	RF_DECLARE_MUTEX(rb_mutex)	/* mutex for messing around
+					 * with recon buffers */
+	RF_ReconBuffer_t *floatingRbufs;	/* available floating
+						 * reconstruction buffers */
+	RF_ReconBuffer_t *committedRbufs;	/* recon buffers that have
+						 * been committed to some
+						 * waiting disk */
+	RF_ReconBuffer_t *fullBufferList;	/* full buffers waiting to be
+						 * written out */
+	RF_ReconBuffer_t *priorityList;	/* full buffers that have been
+					 * elevated to higher priority */
+	RF_CallbackDesc_t *bufferWaitList;	/* disks that are currently
+						 * blocked waiting for buffers */
+
+	/* parity stripe status table */
+	RF_PSStatusHeader_t *pssTable;	/* stores the reconstruction status of
+					 * active parity stripes */
+
+	/* maximum-head separation control */
+	RF_HeadSepLimit_t minHeadSepCounter;	/* the minimum hs counter over
+						 * all disks */
+	RF_CallbackDesc_t *headSepCBList;	/* list of callbacks to be
+						 * done as minPSID advances */
+
+	/* performance monitoring */
+	struct timeval starttime;	/* recon start time */
+
+	void (*continueFunc) (void *);	/* function to call when io
+					 * returns */
+	void *continueArg;	/* argument for Func */
+};
+/* the default priority for reconstruction accesses */
+#define RF_IO_RECON_PRIORITY RF_IO_LOW_PRIORITY
+
+int rf_ConfigureReconstruction(RF_ShutdownList_t ** listp);
+
+int
+rf_ReconstructFailedDisk(RF_Raid_t * raidPtr, RF_RowCol_t row,
+ RF_RowCol_t col);
+
+int
+rf_ReconstructFailedDiskBasic(RF_Raid_t * raidPtr, RF_RowCol_t row,
+ RF_RowCol_t col);
+
+int
+rf_ReconstructInPlace(RF_Raid_t * raidPtr, RF_RowCol_t row, RF_RowCol_t col);
+
+int rf_ContinueReconstructFailedDisk(RF_RaidReconDesc_t * reconDesc);
+
+int
+rf_ForceOrBlockRecon(RF_Raid_t * raidPtr, RF_AccessStripeMap_t * asmap,
+ void (*cbFunc) (RF_Raid_t *, void *), void *cbArg);
+
+ int rf_UnblockRecon(RF_Raid_t * raidPtr, RF_AccessStripeMap_t * asmap);
+
+ int rf_RegisterReconDoneProc(RF_Raid_t * raidPtr, void (*proc) (RF_Raid_t *, void *), void *arg,
+ RF_ReconDoneProc_t ** handlep);
+
+#endif /* !_RF__RF_RECONSTRUCT_H_ */
diff --git a/sys/dev/raidframe/rf_reconutil.c b/sys/dev/raidframe/rf_reconutil.c
new file mode 100644
index 0000000..51e9c07
--- /dev/null
+++ b/sys/dev/raidframe/rf_reconutil.c
@@ -0,0 +1,336 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_reconutil.c,v 1.3 1999/02/05 00:06:17 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/********************************************
+ * rf_reconutil.c -- reconstruction utilities
+ ********************************************/
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_desc.h>
+#include <dev/raidframe/rf_reconutil.h>
+#include <dev/raidframe/rf_reconbuffer.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_decluster.h>
+#include <dev/raidframe/rf_raid5_rotatedspare.h>
+#include <dev/raidframe/rf_interdecluster.h>
+#include <dev/raidframe/rf_chaindecluster.h>
+
+/*******************************************************************
+ * allocates/frees the reconstruction control information structures
+ *
+ * Returns a fully initialized RF_ReconCtrl_t, or NULL if a mutex or
+ * condition variable could not be initialized (note the XXX comments
+ * below: the partially constructed state is leaked on those paths).
+ *******************************************************************/
+RF_ReconCtrl_t *
+rf_MakeReconControl(reconDesc, frow, fcol, srow, scol)
+	RF_RaidReconDesc_t *reconDesc;
+	RF_RowCol_t frow;	/* failed row and column */
+	RF_RowCol_t fcol;
+	RF_RowCol_t srow;	/* identifies which spare we're using */
+	RF_RowCol_t scol;
+{
+	RF_Raid_t *raidPtr = reconDesc->raidPtr;
+	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+	RF_ReconUnitCount_t RUsPerPU = layoutPtr->SUsPerPU / layoutPtr->SUsPerRU;
+	RF_ReconUnitCount_t numSpareRUs;
+	RF_ReconCtrl_t *reconCtrlPtr;
+	RF_ReconBuffer_t *rbuf;
+	RF_LayoutSW_t *lp;
+	int retcode, rc;
+	RF_RowCol_t i;
+
+	lp = raidPtr->Layout.map;
+
+	/* make and zero the global reconstruction structure and the per-disk
+	 * structure */
+	RF_Calloc(reconCtrlPtr, 1, sizeof(RF_ReconCtrl_t), (RF_ReconCtrl_t *));
+	RF_Calloc(reconCtrlPtr->perDiskInfo, raidPtr->numCol, sizeof(RF_PerDiskReconCtrl_t), (RF_PerDiskReconCtrl_t *));	/* this zeros it */
+	reconCtrlPtr->reconDesc = reconDesc;
+	reconCtrlPtr->fcol = fcol;
+	reconCtrlPtr->spareRow = srow;
+	reconCtrlPtr->spareCol = scol;
+	reconCtrlPtr->lastPSID = layoutPtr->numStripe / layoutPtr->SUsPerPU;
+	reconCtrlPtr->percentComplete = 0;
+
+	/* initialize each per-disk recon information structure */
+	for (i = 0; i < raidPtr->numCol; i++) {
+		reconCtrlPtr->perDiskInfo[i].reconCtrl = reconCtrlPtr;
+		reconCtrlPtr->perDiskInfo[i].row = frow;
+		reconCtrlPtr->perDiskInfo[i].col = i;
+		reconCtrlPtr->perDiskInfo[i].curPSID = -1;	/* make it appear as if
+								 * we just finished an
+								 * RU */
+		reconCtrlPtr->perDiskInfo[i].ru_count = RUsPerPU - 1;
+	}
+
+	/* Get the number of spare units per disk and the sparemap in case
+	 * spare is distributed */
+
+	if (lp->GetNumSpareRUs) {
+		numSpareRUs = lp->GetNumSpareRUs(raidPtr);
+	} else {
+		numSpareRUs = 0;
+	}
+
+	/*
+	 * Not all distributed sparing archs need dynamic mappings
+	 */
+	if (lp->InstallSpareTable) {
+		retcode = rf_InstallSpareTable(raidPtr, frow, fcol);
+		if (retcode) {
+			RF_PANIC();	/* XXX fix this */
+		}
+	}
+	/* make the reconstruction map */
+	reconCtrlPtr->reconMap = rf_MakeReconMap(raidPtr, (int) (layoutPtr->SUsPerRU * layoutPtr->sectorsPerStripeUnit),
+	    raidPtr->sectorsPerDisk, numSpareRUs);
+
+	/* make the per-disk reconstruction buffers (the failed column gets
+	 * none) */
+	for (i = 0; i < raidPtr->numCol; i++) {
+		reconCtrlPtr->perDiskInfo[i].rbuf = (i == fcol) ? NULL : rf_MakeReconBuffer(raidPtr, frow, i, RF_RBUF_TYPE_EXCLUSIVE);
+	}
+
+	/* initialize the event queue */
+	rc = rf_mutex_init(&reconCtrlPtr->eq_mutex, __FUNCTION__);
+	if (rc) {
+		/* XXX deallocate, cleanup */
+		RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+		    __LINE__, rc);
+		return (NULL);
+	}
+	rc = rf_cond_init(&reconCtrlPtr->eq_cond);
+	if (rc) {
+		/* XXX deallocate, cleanup */
+		RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", __FILE__,
+		    __LINE__, rc);
+		return (NULL);
+	}
+	reconCtrlPtr->eventQueue = NULL;
+	reconCtrlPtr->eq_count = 0;
+
+	/* make the floating recon buffers and append them to the free list */
+	rc = rf_mutex_init(&reconCtrlPtr->rb_mutex, __FUNCTION__);
+	if (rc) {
+		/* XXX deallocate, cleanup */
+		RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+		    __LINE__, rc);
+		return (NULL);
+	}
+	reconCtrlPtr->fullBufferList = NULL;
+	reconCtrlPtr->priorityList = NULL;
+	reconCtrlPtr->floatingRbufs = NULL;
+	reconCtrlPtr->committedRbufs = NULL;
+	for (i = 0; i < raidPtr->numFloatingReconBufs; i++) {
+		rbuf = rf_MakeReconBuffer(raidPtr, frow, fcol, RF_RBUF_TYPE_FLOATING);
+		rbuf->next = reconCtrlPtr->floatingRbufs;
+		reconCtrlPtr->floatingRbufs = rbuf;
+	}
+
+	/* create the parity stripe status table */
+	reconCtrlPtr->pssTable = rf_MakeParityStripeStatusTable(raidPtr);
+
+	/* set the initial min head sep counter val */
+	reconCtrlPtr->minHeadSepCounter = 0;
+
+	return (reconCtrlPtr);
+}
+
+/* tears down the recon control structure for the indicated row,
+ * freeing per-disk rbufs, the floating rbuf pool, the recon map, the
+ * pss table, and the control structure itself */
+void
+rf_FreeReconControl(raidPtr, row)
+	RF_Raid_t *raidPtr;
+	RF_RowCol_t row;
+{
+	RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl[row];
+	RF_ReconBuffer_t *t;
+	RF_ReconUnitNum_t i;
+
+	RF_ASSERT(reconCtrlPtr);
+	for (i = 0; i < raidPtr->numCol; i++)
+		if (reconCtrlPtr->perDiskInfo[i].rbuf)
+			rf_FreeReconBuffer(reconCtrlPtr->perDiskInfo[i].rbuf);
+	/* all floating rbufs are expected to be back on the free list by now */
+	for (i = 0; i < raidPtr->numFloatingReconBufs; i++) {
+		t = reconCtrlPtr->floatingRbufs;
+		RF_ASSERT(t);
+		reconCtrlPtr->floatingRbufs = t->next;
+		rf_FreeReconBuffer(t);
+	}
+	rf_mutex_destroy(&reconCtrlPtr->rb_mutex);
+	rf_mutex_destroy(&reconCtrlPtr->eq_mutex);
+	rf_cond_destroy(&reconCtrlPtr->eq_cond);
+	rf_FreeReconMap(reconCtrlPtr->reconMap);
+	rf_FreeParityStripeStatusTable(raidPtr, reconCtrlPtr->pssTable);
+	RF_Free(reconCtrlPtr->perDiskInfo, raidPtr->numCol * sizeof(RF_PerDiskReconCtrl_t));
+	RF_Free(reconCtrlPtr, sizeof(*reconCtrlPtr));
+}
+
+
+/******************************************************************************
+ * computes the default head separation limit.
+ * Delegates to the layout's GetDefaultHeadSepLimit method; returns -1
+ * (no limit) if the layout does not define one.
+ *****************************************************************************/
+RF_HeadSepLimit_t
+rf_GetDefaultHeadSepLimit(raidPtr)
+	RF_Raid_t *raidPtr;
+{
+	RF_HeadSepLimit_t hsl;
+	RF_LayoutSW_t *lp;
+
+	lp = raidPtr->Layout.map;
+	if (lp->GetDefaultHeadSepLimit == NULL)
+		return (-1);
+	hsl = lp->GetDefaultHeadSepLimit(raidPtr);
+	return (hsl);
+}
+
+
+/******************************************************************************
+ * computes the default number of floating recon buffers.
+ * Delegates to the layout's GetDefaultNumFloatingReconBuffers method;
+ * falls back to 3 per column if the layout does not define one.
+ *****************************************************************************/
+int
+rf_GetDefaultNumFloatingReconBuffers(raidPtr)
+	RF_Raid_t *raidPtr;
+{
+	RF_LayoutSW_t *lp;
+	int nrb;
+
+	lp = raidPtr->Layout.map;
+	if (lp->GetDefaultNumFloatingReconBuffers == NULL)
+		return (3 * raidPtr->numCol);
+	nrb = lp->GetDefaultNumFloatingReconBuffers(raidPtr);
+	return (nrb);
+}
+
+
+/******************************************************************************
+ * creates and initializes a reconstruction buffer.
+ * The data buffer is sized to hold one full reconstruction unit
+ * (SUsPerRU * sectorsPerStripeUnit sectors); the per-column `arrived'
+ * array tracks which columns' data have been received.
+ *****************************************************************************/
+RF_ReconBuffer_t *
+rf_MakeReconBuffer(
+    RF_Raid_t * raidPtr,
+    RF_RowCol_t row,
+    RF_RowCol_t col,
+    RF_RbufType_t type)
+{
+	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+	RF_ReconBuffer_t *t;
+	u_int recon_buffer_size = rf_RaidAddressToByte(raidPtr, layoutPtr->SUsPerRU * layoutPtr->sectorsPerStripeUnit);
+
+	RF_Malloc(t, sizeof(RF_ReconBuffer_t), (RF_ReconBuffer_t *));
+	RF_Malloc(t->buffer, recon_buffer_size, (caddr_t));
+	RF_Malloc(t->arrived, raidPtr->numCol * sizeof(char), (char *));
+	t->raidPtr = raidPtr;
+	t->row = row;
+	t->col = col;
+	t->priority = RF_IO_RECON_PRIORITY;
+	t->type = type;
+	t->pssPtr = NULL;
+	t->next = NULL;
+	return (t);
+}
+/******************************************************************************
+ * frees a reconstruction buffer.
+ * Recomputes the buffer size from the layout, so the layout must not
+ * have changed since rf_MakeReconBuffer allocated it.
+ *****************************************************************************/
+void
+rf_FreeReconBuffer(rbuf)
+	RF_ReconBuffer_t *rbuf;
+{
+	RF_Raid_t *raidPtr = rbuf->raidPtr;
+	u_int recon_buffer_size = rf_RaidAddressToByte(raidPtr, raidPtr->Layout.SUsPerRU * raidPtr->Layout.sectorsPerStripeUnit);
+
+	RF_Free(rbuf->arrived, raidPtr->numCol * sizeof(char));
+	RF_Free(rbuf->buffer, recon_buffer_size);
+	RF_Free(rbuf, sizeof(*rbuf));
+}
+
+
+/******************************************************************************
+ * debug only: sanity check the number of floating recon bufs in use.
+ * Counts floating rbufs referenced from the pss table (rbuf, writeRbuf,
+ * and the xor list) plus those on the free/committed/full/priority
+ * lists, and asserts the total equals numFloatingReconBufs.  Uses the
+ * first row that has an active reconControl.
+ *****************************************************************************/
+void
+rf_CheckFloatingRbufCount(raidPtr, dolock)
+	RF_Raid_t *raidPtr;
+	int dolock;
+{
+	RF_ReconParityStripeStatus_t *p;
+	RF_PSStatusHeader_t *pssTable;
+	RF_ReconBuffer_t *rbuf;
+	int i, j, sum = 0;
+	RF_RowCol_t frow = 0;
+
+	/* find the first row with reconstruction in progress */
+	for (i = 0; i < raidPtr->numRow; i++)
+		if (raidPtr->reconControl[i]) {
+			frow = i;
+			break;
+		}
+	RF_ASSERT(frow >= 0);
+
+	if (dolock)
+		RF_LOCK_MUTEX(raidPtr->reconControl[frow]->rb_mutex);
+	pssTable = raidPtr->reconControl[frow]->pssTable;
+
+	for (i = 0; i < raidPtr->pssTableSize; i++) {
+		RF_LOCK_MUTEX(pssTable[i].mutex);
+		for (p = pssTable[i].chain; p; p = p->next) {
+			rbuf = (RF_ReconBuffer_t *) p->rbuf;
+			if (rbuf && rbuf->type == RF_RBUF_TYPE_FLOATING)
+				sum++;
+
+			rbuf = (RF_ReconBuffer_t *) p->writeRbuf;
+			if (rbuf && rbuf->type == RF_RBUF_TYPE_FLOATING)
+				sum++;
+
+			for (j = 0; j < p->xorBufCount; j++) {
+				rbuf = (RF_ReconBuffer_t *) p->rbufsForXor[j];
+				RF_ASSERT(rbuf);
+				if (rbuf->type == RF_RBUF_TYPE_FLOATING)
+					sum++;
+			}
+		}
+		RF_UNLOCK_MUTEX(pssTable[i].mutex);
+	}
+
+	for (rbuf = raidPtr->reconControl[frow]->floatingRbufs; rbuf; rbuf = rbuf->next) {
+		if (rbuf->type == RF_RBUF_TYPE_FLOATING)
+			sum++;
+	}
+	for (rbuf = raidPtr->reconControl[frow]->committedRbufs; rbuf; rbuf = rbuf->next) {
+		if (rbuf->type == RF_RBUF_TYPE_FLOATING)
+			sum++;
+	}
+	for (rbuf = raidPtr->reconControl[frow]->fullBufferList; rbuf; rbuf = rbuf->next) {
+		if (rbuf->type == RF_RBUF_TYPE_FLOATING)
+			sum++;
+	}
+	for (rbuf = raidPtr->reconControl[frow]->priorityList; rbuf; rbuf = rbuf->next) {
+		if (rbuf->type == RF_RBUF_TYPE_FLOATING)
+			sum++;
+	}
+
+	/* every floating rbuf must be accounted for somewhere */
+	RF_ASSERT(sum == raidPtr->numFloatingReconBufs);
+
+	if (dolock)
+		RF_UNLOCK_MUTEX(raidPtr->reconControl[frow]->rb_mutex);
+}
diff --git a/sys/dev/raidframe/rf_reconutil.h b/sys/dev/raidframe/rf_reconutil.h
new file mode 100644
index 0000000..744d7b9
--- /dev/null
+++ b/sys/dev/raidframe/rf_reconutil.h
@@ -0,0 +1,52 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_reconutil.h,v 1.3 1999/02/05 00:06:17 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/************************************************************
+ * rf_reconutil.h -- header file for reconstruction utilities
+ ************************************************************/
+
+#ifndef _RF__RF_RECONUTIL_H_
+#define _RF__RF_RECONUTIL_H_
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_reconstruct.h>
+
+RF_ReconCtrl_t *
+rf_MakeReconControl(RF_RaidReconDesc_t * reconDesc,
+ RF_RowCol_t frow, RF_RowCol_t fcol, RF_RowCol_t srow, RF_RowCol_t scol);
+void rf_FreeReconControl(RF_Raid_t * raidPtr, RF_RowCol_t row);
+RF_HeadSepLimit_t rf_GetDefaultHeadSepLimit(RF_Raid_t * raidPtr);
+int rf_GetDefaultNumFloatingReconBuffers(RF_Raid_t * raidPtr);
+RF_ReconBuffer_t *
+rf_MakeReconBuffer(RF_Raid_t * raidPtr, RF_RowCol_t row,
+ RF_RowCol_t col, RF_RbufType_t type);
+void rf_FreeReconBuffer(RF_ReconBuffer_t * rbuf);
+void rf_CheckFloatingRbufCount(RF_Raid_t * raidPtr, int dolock);
+
+#endif /* !_RF__RF_RECONUTIL_H_ */
diff --git a/sys/dev/raidframe/rf_revent.c b/sys/dev/raidframe/rf_revent.c
new file mode 100644
index 0000000..06df033
--- /dev/null
+++ b/sys/dev/raidframe/rf_revent.c
@@ -0,0 +1,228 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_revent.c,v 1.9 2000/09/21 01:45:46 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author:
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+/*
+ * revent.c -- reconstruction event handling code
+ */
+
+#include <sys/errno.h>
+
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_revent.h>
+#include <dev/raidframe/rf_etimer.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_freelist.h>
+#include <dev/raidframe/rf_desc.h>
+#include <dev/raidframe/rf_shutdown.h>
+#include <dev/raidframe/rf_kintf.h>
+
+static RF_FreeList_t *rf_revent_freelist;
+#define RF_MAX_FREE_REVENT 128
+#define RF_REVENT_INC 8
+#define RF_REVENT_INITIAL 8
+
+
+
+#include <sys/proc.h>
+#include <sys/kernel.h>
+
+#define DO_WAIT(_rc) \
+ RF_LTSLEEP(&(_rc)->eventQueue, PRIBIO, "raidframe eventq", \
+ 0, &((_rc)->eq_mutex))
+
+#define DO_SIGNAL(_rc) wakeup(&(_rc)->eventQueue)
+
+
+static void rf_ShutdownReconEvent(void *);
+
+static RF_ReconEvent_t *
+GetReconEventDesc(RF_RowCol_t row, RF_RowCol_t col,
+ void *arg, RF_Revent_t type);
+
+/* shutdown hook: tear down the recon-event descriptor freelist */
+static void rf_ShutdownReconEvent(ignored)
+ void *ignored;
+{
+ RF_FREELIST_DESTROY(rf_revent_freelist, next, (RF_ReconEvent_t *));
+}
+
+/*
+ * Create and prime the freelist of reconstruction event descriptors
+ * and register its teardown on the shutdown list.
+ * Returns 0 on success, ENOMEM if the freelist cannot be created, or
+ * the rf_ShutdownCreate() error code (freelist destroyed on that path).
+ */
+int
+rf_ConfigureReconEvent(listp)
+ RF_ShutdownList_t **listp;
+{
+ int rc;
+
+ RF_FREELIST_CREATE(rf_revent_freelist, RF_MAX_FREE_REVENT,
+ RF_REVENT_INC, sizeof(RF_ReconEvent_t));
+ if (rf_revent_freelist == NULL)
+ return (ENOMEM);
+ rc = rf_ShutdownCreate(listp, rf_ShutdownReconEvent, NULL);
+ if (rc) {
+ RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ rf_ShutdownReconEvent(NULL);
+ return (rc);
+ }
+ RF_FREELIST_PRIME(rf_revent_freelist, RF_REVENT_INITIAL, next,
+ (RF_ReconEvent_t *));
+ return (0);
+}
+
+/* returns the next reconstruction event for the given row, blocking
+ * the calling thread on the event queue until one becomes available.
+ * continueFunc/continueArg are recorded in the recon control block
+ * for use by whoever posts the next event. */
+
+RF_ReconEvent_t *
+rf_GetNextReconEvent(reconDesc, row, continueFunc, continueArg)
+ RF_RaidReconDesc_t *reconDesc;
+ RF_RowCol_t row;
+ void (*continueFunc) (void *);
+ void *continueArg;
+{
+ RF_Raid_t *raidPtr = reconDesc->raidPtr;
+ RF_ReconCtrl_t *rctrl = raidPtr->reconControl[row];
+ RF_ReconEvent_t *event;
+
+ /* NOTE(review): "row <= raidPtr->numRow" permits row == numRow,
+ * which looks one past the last valid index -- confirm whether
+ * this bound should be "<". */
+ RF_ASSERT(row >= 0 && row <= raidPtr->numRow);
+ RF_LOCK_MUTEX(rctrl->eq_mutex);
+ /* q null and count==0 must be equivalent conditions */
+ RF_ASSERT((rctrl->eventQueue == NULL) == (rctrl->eq_count == 0));
+
+ rctrl->continueFunc = continueFunc;
+ rctrl->continueArg = continueArg;
+
+
+ /* mpsleep timeout value: secs = timo_val/hz. 'ticks' here is
+ defined as cycle-counter ticks, not softclock ticks */
+
+#define MAX_RECON_EXEC_USECS (100 * 1000) /* 100 ms */
+#define RECON_DELAY_MS 25
+#define RECON_TIMO ((RECON_DELAY_MS * hz) / 1000)
+
+ /* we are not pre-emptible in the kernel, but we don't want to run
+ * forever. If we run w/o blocking for more than MAX_RECON_EXEC_USECS
+ * of accumulated execution time, delay for RECON_DELAY before
+ * continuing. this may murder us with context switches, so we may
+ * need to increase both the MAX..._USECS and the RECON_DELAY_MS. */
+ if (reconDesc->reconExecTimerRunning) {
+ int status;
+
+ /* accumulate the time spent executing since the last wait */
+ RF_ETIMER_STOP(reconDesc->recon_exec_timer);
+ RF_ETIMER_EVAL(reconDesc->recon_exec_timer);
+ reconDesc->reconExecTicks +=
+ RF_ETIMER_VAL_US(reconDesc->recon_exec_timer);
+ if (reconDesc->reconExecTicks > reconDesc->maxReconExecTicks)
+ reconDesc->maxReconExecTicks =
+ reconDesc->reconExecTicks;
+ if (reconDesc->reconExecTicks >= MAX_RECON_EXEC_USECS) {
+ /* we've been running too long. delay for
+ * RECON_DELAY_MS */
+#if RF_RECON_STATS > 0
+ reconDesc->numReconExecDelays++;
+#endif /* RF_RECON_STATS > 0 */
+
+ status = RF_LTSLEEP(&reconDesc->reconExecTicks, PRIBIO,
+ "recon delay", RECON_TIMO,
+ &rctrl->eq_mutex);
+ RF_ASSERT(status == EWOULDBLOCK);
+ reconDesc->reconExecTicks = 0;
+ }
+ }
+ /* sleep until somebody posts an event via rf_CauseReconEvent() */
+ while (!rctrl->eventQueue) {
+#if RF_RECON_STATS > 0
+ reconDesc->numReconEventWaits++;
+#endif /* RF_RECON_STATS > 0 */
+ DO_WAIT(rctrl);
+ reconDesc->reconExecTicks = 0; /* we've just waited */
+ }
+
+ reconDesc->reconExecTimerRunning = 1;
+ if (RF_ETIMER_VAL_US(reconDesc->recon_exec_timer)!=0) {
+ /* it moved!! reset the timer. */
+ RF_ETIMER_START(reconDesc->recon_exec_timer);
+ }
+ /* pop the event off the head of the queue */
+ event = rctrl->eventQueue;
+ rctrl->eventQueue = event->next;
+ event->next = NULL;
+ rctrl->eq_count--;
+
+ /* q null and count==0 must be equivalent conditions */
+ RF_ASSERT((rctrl->eventQueue == NULL) == (rctrl->eq_count == 0));
+ RF_UNLOCK_MUTEX(rctrl->eq_mutex);
+ return (event);
+}
+/* enqueues a reconstruction event on the indicated queue and wakes
+ * any thread sleeping in rf_GetNextReconEvent() */
+void
+rf_CauseReconEvent(raidPtr, row, col, arg, type)
+ RF_Raid_t *raidPtr;
+ RF_RowCol_t row;
+ RF_RowCol_t col;
+ void *arg;
+ RF_Revent_t type;
+{
+ RF_ReconCtrl_t *rctrl = raidPtr->reconControl[row];
+ /*
+ * NOTE(review): GetReconEventDesc() returns NULL when the freelist
+ * is exhausted; "event->next" below would then dereference a NULL
+ * pointer -- there is no NULL check here.
+ */
+ RF_ReconEvent_t *event = GetReconEventDesc(row, col, arg, type);
+
+ if (type == RF_REVENT_BUFCLEAR) {
+ RF_ASSERT(col != rctrl->fcol);
+ }
+ /* NOTE(review): "<=" permits row == numRow / col == numCol --
+ * confirm whether these bounds should be "<". */
+ RF_ASSERT(row >= 0 && row <= raidPtr->numRow && col >= 0 && col <= raidPtr->numCol);
+ RF_LOCK_MUTEX(rctrl->eq_mutex);
+ /* q null and count==0 must be equivalent conditions */
+ RF_ASSERT((rctrl->eventQueue == NULL) == (rctrl->eq_count == 0));
+ /* push on the head of the queue */
+ event->next = rctrl->eventQueue;
+ rctrl->eventQueue = event;
+ rctrl->eq_count++;
+ RF_UNLOCK_MUTEX(rctrl->eq_mutex);
+
+ DO_SIGNAL(rctrl);
+}
+/* allocates and initializes a recon event descriptor from the
+ * freelist; returns NULL if the freelist is exhausted */
+static RF_ReconEvent_t *
+GetReconEventDesc(row, col, arg, type)
+ RF_RowCol_t row;
+ RF_RowCol_t col;
+ void *arg;
+ RF_Revent_t type;
+{
+ RF_ReconEvent_t *t;
+
+ RF_FREELIST_GET(rf_revent_freelist, t, next, (RF_ReconEvent_t *));
+ if (t == NULL)
+ return (NULL);
+ /* NOTE(review): the "row" parameter is accepted but never stored
+ * in the descriptor -- only col/arg/type are recorded. */
+ t->col = col;
+ t->arg = arg;
+ t->type = type;
+ return (t);
+}
+
+/* returns a recon event descriptor to the freelist */
+void
+rf_FreeReconEventDesc(event)
+ RF_ReconEvent_t *event;
+{
+ RF_FREELIST_FREE(rf_revent_freelist, event, next);
+}
diff --git a/sys/dev/raidframe/rf_revent.h b/sys/dev/raidframe/rf_revent.h
new file mode 100644
index 0000000..51c3202
--- /dev/null
+++ b/sys/dev/raidframe/rf_revent.h
@@ -0,0 +1,52 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_revent.h,v 1.3 1999/02/05 00:06:17 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author:
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*******************************************************************
+ *
+ * rf_revent.h -- header file for reconstruction event handling code
+ *
+ *******************************************************************/
+
+#ifndef _RF__RF_REVENT_H_
+#define _RF__RF_REVENT_H_
+
+#include <dev/raidframe/rf_types.h>
+
+int rf_ConfigureReconEvent(RF_ShutdownList_t ** listp);
+
+RF_ReconEvent_t *
+rf_GetNextReconEvent(RF_RaidReconDesc_t * reconDesc,
+ RF_RowCol_t row, void (*continueFunc) (void *), void *continueArg);
+
+ void rf_CauseReconEvent(RF_Raid_t * raidPtr, RF_RowCol_t row, RF_RowCol_t col,
+ void *arg, RF_Revent_t type);
+
+ void rf_FreeReconEventDesc(RF_ReconEvent_t * event);
+
+#endif /* !_RF__RF_REVENT_H_ */
diff --git a/sys/dev/raidframe/rf_shutdown.c b/sys/dev/raidframe/rf_shutdown.c
new file mode 100644
index 0000000..7db93e9
--- /dev/null
+++ b/sys/dev/raidframe/rf_shutdown.c
@@ -0,0 +1,102 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_shutdown.c,v 1.6 2000/01/13 23:41:18 oster Exp $ */
+/*
+ * rf_shutdown.c
+ */
+/*
+ * Copyright (c) 1996 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+/*
+ * Maintain lists of cleanup functions. Also, mechanisms for coordinating
+ * thread startup and shutdown.
+ */
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_threadstuff.h>
+#include <dev/raidframe/rf_shutdown.h>
+#include <dev/raidframe/rf_debugMem.h>
+#include <dev/raidframe/rf_freelist.h>
+
+/* release one shutdown-list entry (allocated directly with malloc in
+ * _rf_ShutdownCreate(), so freed directly here, outside the RAIDframe
+ * allocation system) */
+static void
+rf_FreeShutdownEnt(RF_ShutdownList_t * ent)
+{
+ FREE(ent, M_RAIDFRAME);
+}
+
+/*
+ * Push a cleanup entry (function + arg) onto the head of the shutdown
+ * list.  Normally invoked through the rf_ShutdownCreate() macro, which
+ * supplies the caller's __FILE__/__LINE__ for debugging.
+ * Returns 0 on success or ENOMEM.
+ */
+int
+_rf_ShutdownCreate(
+ RF_ShutdownList_t ** listp,
+ void (*cleanup) (void *arg),
+ void *arg,
+ char *file,
+ int line)
+{
+ RF_ShutdownList_t *ent;
+
+ /*
+ * Have to directly allocate memory here, since we start up before
+ * and shutdown after RAIDframe internal allocation system.
+ */
+ /* ent = (RF_ShutdownList_t *) malloc(sizeof(RF_ShutdownList_t),
+ M_RAIDFRAME, M_WAITOK); */
+ ent = (RF_ShutdownList_t *) malloc(sizeof(RF_ShutdownList_t),
+ M_RAIDFRAME, M_NOWAIT);
+ if (ent == NULL)
+ return (ENOMEM);
+ ent->cleanup = cleanup;
+ ent->arg = arg;
+ ent->file = file;
+ ent->line = line;
+ /* push on the head: entries run in LIFO order at shutdown */
+ ent->next = *listp;
+ *listp = ent;
+ return (0);
+}
+
+/*
+ * Run and free every entry on the shutdown list.  Entries were pushed
+ * on the head by _rf_ShutdownCreate(), so cleanups execute in LIFO
+ * (most-recently-registered first) order.  The list head is reset to
+ * NULL.  Always returns 0.
+ */
+int
+rf_ShutdownList(RF_ShutdownList_t ** list)
+{
+ RF_ShutdownList_t *r, *next;
+ char *file;
+ int line;
+
+ for (r = *list; r; r = next) {
+ /* save fields before the entry is freed below */
+ next = r->next;
+ file = r->file;
+ line = r->line;
+
+ if (rf_shutdownDebug) {
+ printf("call shutdown, created %s:%d\n", file, line);
+ }
+ r->cleanup(r->arg);
+
+ if (rf_shutdownDebug) {
+ printf("completed shutdown, created %s:%d\n", file, line);
+ }
+ rf_FreeShutdownEnt(r);
+ }
+ *list = NULL;
+ return (0);
+}
diff --git a/sys/dev/raidframe/rf_shutdown.h b/sys/dev/raidframe/rf_shutdown.h
new file mode 100644
index 0000000..5abc5ba
--- /dev/null
+++ b/sys/dev/raidframe/rf_shutdown.h
@@ -0,0 +1,67 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_shutdown.h,v 1.2 1999/02/05 00:06:17 oster Exp $ */
+/*
+ * rf_shutdown.h
+ */
+/*
+ * Copyright (c) 1996 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+/*
+ * Maintain lists of cleanup functions. Also, mechanisms for coordinating
+ * thread startup and shutdown.
+ */
+
+#ifndef _RF__RF_SHUTDOWN_H_
+#define _RF__RF_SHUTDOWN_H_
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_threadstuff.h>
+
+/*
+ * Important note: the shutdown list is run like a stack, new
+ * entries pushed on top. Therefore, the most recently added
+ * entry (last started) is the first removed (stopped). This
+ * should handle system-dependencies pretty nicely- if a system
+ * is there when you start another, it'll be there when you
+ * shut down another. Hopefully, this subsystem will remove
+ * more complexity than it introduces.
+ */
+
+struct RF_ShutdownList_s {
+ void (*cleanup) (void *arg);
+ void *arg;
+ char *file;
+ int line;
+ RF_ShutdownList_t *next;
+};
+#define rf_ShutdownCreate(_listp_,_func_,_arg_) \
+ _rf_ShutdownCreate(_listp_,_func_,_arg_,__FILE__,__LINE__)
+
+int _rf_ShutdownCreate(RF_ShutdownList_t ** listp, void (*cleanup) (void *arg),
+ void *arg, char *file, int line);
+int rf_ShutdownList(RF_ShutdownList_t ** listp);
+
+#endif /* !_RF__RF_SHUTDOWN_H_ */
diff --git a/sys/dev/raidframe/rf_sstf.c b/sys/dev/raidframe/rf_sstf.c
new file mode 100644
index 0000000..3d20275
--- /dev/null
+++ b/sys/dev/raidframe/rf_sstf.c
@@ -0,0 +1,656 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_sstf.c,v 1.6 2001/01/27 20:18:55 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*******************************************************************************
+ *
+ * sstf.c -- prioritized shortest seek time first disk queueing code
+ *
+ ******************************************************************************/
+
+#include <dev/raidframe/rf_alloclist.h>
+#include <dev/raidframe/rf_stripelocks.h>
+#include <dev/raidframe/rf_layout.h>
+#include <dev/raidframe/rf_diskqueue.h>
+#include <dev/raidframe/rf_sstf.h>
+#include <dev/raidframe/rf_debugMem.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_options.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_types.h>
+
+#define DIR_LEFT 1
+#define DIR_RIGHT 2
+#define DIR_EITHER 3
+
+#define SNUM_DIFF(_a_,_b_) (((_a_)>(_b_))?((_a_)-(_b_)):((_b_)-(_a_)))
+
+#define QSUM(_sstfq_) (((_sstfq_)->lopri.qlen)+((_sstfq_)->left.qlen)+((_sstfq_)->right.qlen))
+
+
+static void
+do_sstf_ord_q(RF_DiskQueueData_t **,
+ RF_DiskQueueData_t **,
+ RF_DiskQueueData_t *);
+
+static RF_DiskQueueData_t *
+closest_to_arm(RF_SstfQ_t *,
+ RF_SectorNum_t,
+ int *,
+ int);
+static void do_dequeue(RF_SstfQ_t *, RF_DiskQueueData_t *);
+
+
+/*
+ * Insert req into the doubly-linked queue (*queuep .. *tailp), keeping
+ * the queue sorted by ascending sectorOffset.
+ */
+static void
+do_sstf_ord_q(queuep, tailp, req)
+ RF_DiskQueueData_t **queuep;
+ RF_DiskQueueData_t **tailp;
+ RF_DiskQueueData_t *req;
+{
+ RF_DiskQueueData_t *r, *s;
+
+ /* empty queue: req becomes both head and tail */
+ if (*queuep == NULL) {
+ *queuep = req;
+ *tailp = req;
+ req->next = NULL;
+ req->prev = NULL;
+ return;
+ }
+ /* belongs at (or ties with) the head */
+ if (req->sectorOffset <= (*queuep)->sectorOffset) {
+ req->next = *queuep;
+ req->prev = NULL;
+ (*queuep)->prev = req;
+ *queuep = req;
+ return;
+ }
+ if (req->sectorOffset > (*tailp)->sectorOffset) {
+ /* optimization */
+ r = NULL;
+ s = *tailp;
+ goto q_at_end;
+ }
+ /* walk the queue to find the first entry at or beyond req */
+ for (s = NULL, r = *queuep; r; s = r, r = r->next) {
+ if (r->sectorOffset >= req->sectorOffset) {
+ /* insert after s, before r */
+ RF_ASSERT(s);
+ req->next = r;
+ r->prev = req;
+ s->next = req;
+ req->prev = s;
+ return;
+ }
+ }
+q_at_end:
+ /* insert after s, at end of queue */
+ RF_ASSERT(r == NULL);
+ RF_ASSERT(s);
+ RF_ASSERT(s == (*tailp));
+ req->next = NULL;
+ req->prev = s;
+ s->next = req;
+ *tailp = req;
+}
+/* for removing from head-of-queue */
+#define DO_HEAD_DEQ(_r_,_q_) { \
+ _r_ = (_q_)->queue; \
+ RF_ASSERT((_r_) != NULL); \
+ (_q_)->queue = (_r_)->next; \
+ (_q_)->qlen--; \
+ if ((_q_)->qlen == 0) { \
+ RF_ASSERT((_r_) == (_q_)->qtail); \
+ RF_ASSERT((_q_)->queue == NULL); \
+ (_q_)->qtail = NULL; \
+ } \
+ else { \
+ RF_ASSERT((_q_)->queue->prev == (_r_)); \
+ (_q_)->queue->prev = NULL; \
+ } \
+}
+
+/* for removing from end-of-queue */
+#define DO_TAIL_DEQ(_r_,_q_) { \
+ _r_ = (_q_)->qtail; \
+ RF_ASSERT((_r_) != NULL); \
+ (_q_)->qtail = (_r_)->prev; \
+ (_q_)->qlen--; \
+ if ((_q_)->qlen == 0) { \
+ RF_ASSERT((_r_) == (_q_)->queue); \
+ RF_ASSERT((_q_)->qtail == NULL); \
+ (_q_)->queue = NULL; \
+ } \
+ else { \
+ RF_ASSERT((_q_)->qtail->next == (_r_)); \
+ (_q_)->qtail->next = NULL; \
+ } \
+}
+
+#define DO_BEST_DEQ(_l_,_r_,_q_) { \
+ if (SNUM_DIFF((_q_)->queue->sectorOffset,_l_) \
+ < SNUM_DIFF((_q_)->qtail->sectorOffset,_l_)) \
+ { \
+ DO_HEAD_DEQ(_r_,_q_); \
+ } \
+ else { \
+ DO_TAIL_DEQ(_r_,_q_); \
+ } \
+}
+
+/*
+ * Return the queued request closest to the current arm position,
+ * honoring the preferred direction *dir (DIR_LEFT/DIR_RIGHT/DIR_EITHER).
+ * May flip *dir when allow_reverse is set and nothing lies in the
+ * preferred direction.  Returns NULL only if the queue is empty.
+ */
+static RF_DiskQueueData_t *
+closest_to_arm(queue, arm_pos, dir, allow_reverse)
+ RF_SstfQ_t *queue;
+ RF_SectorNum_t arm_pos;
+ int *dir;
+ int allow_reverse;
+{
+ RF_SectorNum_t best_pos_l = 0, this_pos_l = 0, last_pos = 0;
+ RF_SectorNum_t best_pos_r = 0, this_pos_r = 0;
+ RF_DiskQueueData_t *r, *best_l, *best_r;
+
+ /*
+ * NOTE(review): when best_l (resp. best_r) is first assigned below,
+ * this_pos_l (resp. this_pos_r) has not yet been computed for that
+ * request -- it still holds its initial 0 -- so the recorded "best"
+ * distance starts at 0 and later candidates can never beat it.
+ * Looks like the distance should be computed before the
+ * first-candidate check; confirm against upstream NetBSD.
+ */
+ best_r = best_l = NULL;
+ for (r = queue->queue; r; r = r->next) {
+ if (r->sectorOffset < arm_pos) {
+ if (best_l == NULL) {
+ best_l = r;
+ last_pos = best_pos_l = this_pos_l;
+ } else {
+ this_pos_l = arm_pos - r->sectorOffset;
+ if (this_pos_l < best_pos_l) {
+ best_l = r;
+ last_pos = best_pos_l = this_pos_l;
+ } else {
+ last_pos = this_pos_l;
+ }
+ }
+ } else {
+ if (best_r == NULL) {
+ best_r = r;
+ last_pos = best_pos_r = this_pos_r;
+ } else {
+ this_pos_r = r->sectorOffset - arm_pos;
+ if (this_pos_r < best_pos_r) {
+ best_r = r;
+ last_pos = best_pos_r = this_pos_r;
+ } else {
+ last_pos = this_pos_r;
+ }
+ if (this_pos_r > last_pos) {
+ /* getting farther away */
+ break;
+ }
+ }
+ }
+ }
+ if ((best_r == NULL) && (best_l == NULL))
+ return (NULL);
+ /* honor the caller's preferred direction when possible */
+ if ((*dir == DIR_RIGHT) && best_r)
+ return (best_r);
+ if ((*dir == DIR_LEFT) && best_l)
+ return (best_l);
+ if (*dir == DIR_EITHER) {
+ if (best_l == NULL)
+ return (best_r);
+ if (best_r == NULL)
+ return (best_l);
+ if (best_pos_r < best_pos_l)
+ return (best_r);
+ else
+ return (best_l);
+ }
+ /*
+ * Nothing in the direction we want to go. Reverse or
+ * reset the arm. We know we have an I/O in the other
+ * direction.
+ */
+ if (allow_reverse) {
+ if (*dir == DIR_RIGHT) {
+ *dir = DIR_LEFT;
+ return (best_l);
+ } else {
+ *dir = DIR_RIGHT;
+ return (best_r);
+ }
+ }
+ /*
+ * Reset (beginning of queue).
+ */
+ RF_ASSERT(*dir == DIR_RIGHT);
+ return (queue->queue);
+}
+
+/* create an SSTF queueing-policy instance: either direction allowed,
+ * reversal permitted.  (sect_per_disk and listp are unused here.) */
+void *
+rf_SstfCreate(sect_per_disk, cl_list, listp)
+ RF_SectorCount_t sect_per_disk;
+ RF_AllocListElem_t *cl_list;
+ RF_ShutdownList_t **listp;
+{
+ RF_Sstf_t *sstfq;
+
+ RF_CallocAndAdd(sstfq, 1, sizeof(RF_Sstf_t), (RF_Sstf_t *), cl_list);
+ sstfq->dir = DIR_EITHER;
+ sstfq->allow_reverse = 1;
+ return ((void *) sstfq);
+}
+
+/* create a SCAN (elevator) queueing-policy instance: starts sweeping
+ * right, reversal permitted.  (sect_per_disk and listp are unused.) */
+void *
+rf_ScanCreate(sect_per_disk, cl_list, listp)
+ RF_SectorCount_t sect_per_disk;
+ RF_AllocListElem_t *cl_list;
+ RF_ShutdownList_t **listp;
+{
+ RF_Sstf_t *scanq;
+
+ RF_CallocAndAdd(scanq, 1, sizeof(RF_Sstf_t), (RF_Sstf_t *), cl_list);
+ scanq->dir = DIR_RIGHT;
+ scanq->allow_reverse = 1;
+ return ((void *) scanq);
+}
+
+/* create a CSCAN (circular scan) queueing-policy instance: always
+ * sweeps right; allow_reverse is left 0 by the calloc.
+ * (sect_per_disk and listp are unused.) */
+void *
+rf_CscanCreate(sect_per_disk, cl_list, listp)
+ RF_SectorCount_t sect_per_disk;
+ RF_AllocListElem_t *cl_list;
+ RF_ShutdownList_t **listp;
+{
+ RF_Sstf_t *cscanq;
+
+ RF_CallocAndAdd(cscanq, 1, sizeof(RF_Sstf_t), (RF_Sstf_t *), cl_list);
+ cscanq->dir = DIR_RIGHT;
+ return ((void *) cscanq);
+}
+
+/*
+ * Enqueue req.  Low-priority requests go on the sorted lopri queue;
+ * normal-priority requests go on the left or right queue depending on
+ * whether they fall below or at/above the last arm position.
+ */
+void
+rf_SstfEnqueue(qptr, req, priority)
+ void *qptr;
+ RF_DiskQueueData_t *req;
+ int priority;
+{
+ RF_Sstf_t *sstfq;
+
+ sstfq = (RF_Sstf_t *) qptr;
+
+ if (priority == RF_IO_LOW_PRIORITY) {
+ if (rf_sstfDebug || rf_scanDebug || rf_cscanDebug) {
+ RF_DiskQueue_t *dq;
+ dq = (RF_DiskQueue_t *) req->queue;
+ printf("raid%d: ENQ lopri %d,%d queues are %d,%d,%d\n",
+ req->raidPtr->raidid,
+ dq->row, dq->col,
+ sstfq->left.qlen, sstfq->right.qlen,
+ sstfq->lopri.qlen);
+ }
+ do_sstf_ord_q(&sstfq->lopri.queue, &sstfq->lopri.qtail, req);
+ sstfq->lopri.qlen++;
+ } else {
+ if (req->sectorOffset < sstfq->last_sector) {
+ do_sstf_ord_q(&sstfq->left.queue, &sstfq->left.qtail, req);
+ sstfq->left.qlen++;
+ } else {
+ do_sstf_ord_q(&sstfq->right.queue, &sstfq->right.qtail, req);
+ sstfq->right.qlen++;
+ }
+ }
+}
+
+/*
+ * Unlink req from queue, wherever it sits (head, tail, or middle),
+ * keeping qlen and the head/tail pointers consistent.
+ */
+static void
+do_dequeue(queue, req)
+ RF_SstfQ_t *queue;
+ RF_DiskQueueData_t *req;
+{
+ RF_DiskQueueData_t *req2;
+
+ if (rf_sstfDebug || rf_scanDebug || rf_cscanDebug) {
+ printf("raid%d: do_dequeue\n", req->raidPtr->raidid);
+ }
+ if (req == queue->queue) {
+ DO_HEAD_DEQ(req2, queue);
+ RF_ASSERT(req2 == req);
+ } else
+ if (req == queue->qtail) {
+ DO_TAIL_DEQ(req2, queue);
+ RF_ASSERT(req2 == req);
+ } else {
+ /* dequeue from middle of list */
+ RF_ASSERT(req->next);
+ RF_ASSERT(req->prev);
+ queue->qlen--;
+ req->next->prev = req->prev;
+ req->prev->next = req->next;
+ req->next = req->prev = NULL;
+ }
+}
+
+/* dequeue the next request under the SSTF policy; returns NULL when
+ * all three queues (left, right, lopri) are empty */
+RF_DiskQueueData_t *
+rf_SstfDequeue(qptr)
+ void *qptr;
+{
+ RF_DiskQueueData_t *req = NULL;
+ RF_Sstf_t *sstfq;
+
+ sstfq = (RF_Sstf_t *) qptr;
+
+ /*
+ * NOTE(review): req is still NULL here, so the rf_sstfDebug blocks
+ * in this function dereference NULL (req->queue, req->raidPtr)
+ * whenever rf_sstfDebug is enabled -- including the "check for
+ * close lopri" printf below, which runs before req is assigned.
+ */
+ if (rf_sstfDebug) {
+ RF_DiskQueue_t *dq;
+ dq = (RF_DiskQueue_t *) req->queue;
+ RF_ASSERT(QSUM(sstfq) == dq->queueLength);
+ printf("raid%d: sstf: Dequeue %d,%d queues are %d,%d,%d\n",
+ req->raidPtr->raidid, dq->row, dq->col,
+ sstfq->left.qlen, sstfq->right.qlen, sstfq->lopri.qlen);
+ }
+ if (sstfq->left.queue == NULL) {
+ RF_ASSERT(sstfq->left.qlen == 0);
+ if (sstfq->right.queue == NULL) {
+ RF_ASSERT(sstfq->right.qlen == 0);
+ if (sstfq->lopri.queue == NULL) {
+ RF_ASSERT(sstfq->lopri.qlen == 0);
+ return (NULL);
+ }
+ if (rf_sstfDebug) {
+ printf("raid%d: sstf: check for close lopri",
+ req->raidPtr->raidid);
+ }
+ req = closest_to_arm(&sstfq->lopri, sstfq->last_sector,
+ &sstfq->dir, sstfq->allow_reverse);
+ if (rf_sstfDebug) {
+ printf("raid%d: sstf: closest_to_arm said %lx",
+ req->raidPtr->raidid, (long) req);
+ }
+ if (req == NULL)
+ return (NULL);
+ do_dequeue(&sstfq->lopri, req);
+ } else {
+ DO_BEST_DEQ(sstfq->last_sector, req, &sstfq->right);
+ }
+ } else {
+ if (sstfq->right.queue == NULL) {
+ RF_ASSERT(sstfq->right.qlen == 0);
+ DO_BEST_DEQ(sstfq->last_sector, req, &sstfq->left);
+ } else {
+ /* both sides populated: take whichever end is nearer */
+ if (SNUM_DIFF(sstfq->last_sector, sstfq->right.queue->sectorOffset)
+ < SNUM_DIFF(sstfq->last_sector, sstfq->left.qtail->sectorOffset)) {
+ DO_HEAD_DEQ(req, &sstfq->right);
+ } else {
+ DO_TAIL_DEQ(req, &sstfq->left);
+ }
+ }
+ }
+ RF_ASSERT(req);
+ sstfq->last_sector = req->sectorOffset;
+ return (req);
+}
+
+/* dequeue the next request under the SCAN (elevator) policy; returns
+ * NULL when all three queues are empty */
+RF_DiskQueueData_t *
+rf_ScanDequeue(qptr)
+ void *qptr;
+{
+ RF_DiskQueueData_t *req = NULL;
+ RF_Sstf_t *scanq;
+
+ scanq = (RF_Sstf_t *) qptr;
+
+ /*
+ * NOTE(review): req is still NULL here, so the rf_scanDebug block
+ * below dereferences NULL (req->queue, req->raidPtr) whenever
+ * rf_scanDebug is enabled.
+ */
+ if (rf_scanDebug) {
+ RF_DiskQueue_t *dq;
+ dq = (RF_DiskQueue_t *) req->queue;
+ RF_ASSERT(QSUM(scanq) == dq->queueLength);
+ printf("raid%d: scan: Dequeue %d,%d queues are %d,%d,%d\n",
+ req->raidPtr->raidid, dq->row, dq->col,
+ scanq->left.qlen, scanq->right.qlen, scanq->lopri.qlen);
+ }
+ if (scanq->left.queue == NULL) {
+ RF_ASSERT(scanq->left.qlen == 0);
+ if (scanq->right.queue == NULL) {
+ RF_ASSERT(scanq->right.qlen == 0);
+ if (scanq->lopri.queue == NULL) {
+ RF_ASSERT(scanq->lopri.qlen == 0);
+ return (NULL);
+ }
+ req = closest_to_arm(&scanq->lopri, scanq->last_sector,
+ &scanq->dir, scanq->allow_reverse);
+ if (req == NULL)
+ return (NULL);
+ do_dequeue(&scanq->lopri, req);
+ } else {
+ /* only requests to the right: sweep right */
+ scanq->dir = DIR_RIGHT;
+ DO_HEAD_DEQ(req, &scanq->right);
+ }
+ } else
+ if (scanq->right.queue == NULL) {
+ RF_ASSERT(scanq->right.qlen == 0);
+ RF_ASSERT(scanq->left.queue);
+ /* only requests to the left: sweep left */
+ scanq->dir = DIR_LEFT;
+ DO_TAIL_DEQ(req, &scanq->left);
+ } else {
+ RF_ASSERT(scanq->right.queue);
+ RF_ASSERT(scanq->left.queue);
+ /* both sides populated: continue in the current direction */
+ if (scanq->dir == DIR_RIGHT) {
+ DO_HEAD_DEQ(req, &scanq->right);
+ } else {
+ DO_TAIL_DEQ(req, &scanq->left);
+ }
+ }
+ RF_ASSERT(req);
+ scanq->last_sector = req->sectorOffset;
+ return (req);
+}
+
+/* dequeue the next request under the CSCAN policy (always sweeps
+ * right, wrapping by swapping the left queue into the right); returns
+ * NULL when all three queues are empty */
+RF_DiskQueueData_t *
+rf_CscanDequeue(qptr)
+ void *qptr;
+{
+ RF_DiskQueueData_t *req = NULL;
+ RF_Sstf_t *cscanq;
+
+ cscanq = (RF_Sstf_t *) qptr;
+
+ RF_ASSERT(cscanq->dir == DIR_RIGHT);
+ /*
+ * NOTE(review): req is still NULL here, so the rf_cscanDebug block
+ * below dereferences NULL (req->queue, req->raidPtr) whenever
+ * rf_cscanDebug is enabled.
+ */
+ if (rf_cscanDebug) {
+ RF_DiskQueue_t *dq;
+ dq = (RF_DiskQueue_t *) req->queue;
+ RF_ASSERT(QSUM(cscanq) == dq->queueLength);
+ printf("raid%d: scan: Dequeue %d,%d queues are %d,%d,%d\n",
+ req->raidPtr->raidid, dq->row, dq->col,
+ cscanq->left.qlen, cscanq->right.qlen,
+ cscanq->lopri.qlen);
+ }
+ if (cscanq->right.queue) {
+ DO_HEAD_DEQ(req, &cscanq->right);
+ } else {
+ RF_ASSERT(cscanq->right.qlen == 0);
+ if (cscanq->left.queue == NULL) {
+ RF_ASSERT(cscanq->left.qlen == 0);
+ if (cscanq->lopri.queue == NULL) {
+ RF_ASSERT(cscanq->lopri.qlen == 0);
+ return (NULL);
+ }
+ req = closest_to_arm(&cscanq->lopri, cscanq->last_sector,
+ &cscanq->dir, cscanq->allow_reverse);
+ if (req == NULL)
+ return (NULL);
+ do_dequeue(&cscanq->lopri, req);
+ } else {
+ /*
+ * There's I/Os to the left of the arm. Swing
+ * on back (swap queues).
+ */
+ cscanq->right = cscanq->left;
+ cscanq->left.qlen = 0;
+ cscanq->left.queue = cscanq->left.qtail = NULL;
+ DO_HEAD_DEQ(req, &cscanq->right);
+ }
+ }
+ RF_ASSERT(req);
+ cscanq->last_sector = req->sectorOffset;
+ return (req);
+}
+
+/* return the request rf_SstfDequeue() would pick next, without
+ * removing it; NULL if all queues are empty */
+RF_DiskQueueData_t *
+rf_SstfPeek(qptr)
+ void *qptr;
+{
+ RF_DiskQueueData_t *req;
+ RF_Sstf_t *sstfq;
+
+ sstfq = (RF_Sstf_t *) qptr;
+
+ if ((sstfq->left.queue == NULL) && (sstfq->right.queue == NULL)) {
+ req = closest_to_arm(&sstfq->lopri, sstfq->last_sector, &sstfq->dir,
+ sstfq->allow_reverse);
+ } else {
+ if (sstfq->left.queue == NULL)
+ req = sstfq->right.queue;
+ else {
+ if (sstfq->right.queue == NULL)
+ req = sstfq->left.qtail;
+ else {
+ /* both sides populated: pick the nearer end */
+ if (SNUM_DIFF(sstfq->last_sector, sstfq->right.queue->sectorOffset)
+ < SNUM_DIFF(sstfq->last_sector, sstfq->left.qtail->sectorOffset)) {
+ req = sstfq->right.queue;
+ } else {
+ req = sstfq->left.qtail;
+ }
+ }
+ }
+ }
+ if (req == NULL) {
+ RF_ASSERT(QSUM(sstfq) == 0);
+ }
+ return (req);
+}
+
+/* return the request rf_ScanDequeue() would pick next, without
+ * removing it; uses a local copy of the direction so the queue state
+ * is not disturbed.  NULL if all queues are empty */
+RF_DiskQueueData_t *
+rf_ScanPeek(qptr)
+ void *qptr;
+{
+ RF_DiskQueueData_t *req;
+ RF_Sstf_t *scanq;
+ int dir;
+
+ scanq = (RF_Sstf_t *) qptr;
+ dir = scanq->dir;
+
+ if (scanq->left.queue == NULL) {
+ RF_ASSERT(scanq->left.qlen == 0);
+ if (scanq->right.queue == NULL) {
+ RF_ASSERT(scanq->right.qlen == 0);
+ if (scanq->lopri.queue == NULL) {
+ RF_ASSERT(scanq->lopri.qlen == 0);
+ return (NULL);
+ }
+ req = closest_to_arm(&scanq->lopri, scanq->last_sector,
+ &dir, scanq->allow_reverse);
+ } else {
+ req = scanq->right.queue;
+ }
+ } else
+ if (scanq->right.queue == NULL) {
+ RF_ASSERT(scanq->right.qlen == 0);
+ RF_ASSERT(scanq->left.queue);
+ req = scanq->left.qtail;
+ } else {
+ RF_ASSERT(scanq->right.queue);
+ RF_ASSERT(scanq->left.queue);
+ if (scanq->dir == DIR_RIGHT) {
+ req = scanq->right.queue;
+ } else {
+ req = scanq->left.qtail;
+ }
+ }
+ if (req == NULL) {
+ RF_ASSERT(QSUM(scanq) == 0);
+ }
+ return (req);
+}
+
+/* return the request rf_CscanDequeue() would pick next, without
+ * removing it; NULL if all queues are empty */
+RF_DiskQueueData_t *
+rf_CscanPeek(qptr)
+ void *qptr;
+{
+ RF_DiskQueueData_t *req;
+ RF_Sstf_t *cscanq;
+
+ cscanq = (RF_Sstf_t *) qptr;
+
+ RF_ASSERT(cscanq->dir == DIR_RIGHT);
+ if (cscanq->right.queue) {
+ req = cscanq->right.queue;
+ } else {
+ RF_ASSERT(cscanq->right.qlen == 0);
+ if (cscanq->left.queue == NULL) {
+ RF_ASSERT(cscanq->left.qlen == 0);
+ if (cscanq->lopri.queue == NULL) {
+ RF_ASSERT(cscanq->lopri.qlen == 0);
+ return (NULL);
+ }
+ req = closest_to_arm(&cscanq->lopri, cscanq->last_sector,
+ &cscanq->dir, cscanq->allow_reverse);
+ } else {
+ /*
+ * There's I/Os to the left of the arm. We'll end
+ * up swinging on back.
+ */
+ req = cscanq->left.queue;
+ }
+ }
+ if (req == NULL) {
+ RF_ASSERT(QSUM(cscanq) == 0);
+ }
+ return (req);
+}
+
+/* move every low-priority request matching (parityStripeID, which_ru)
+ * onto the normal-priority queues; returns the number promoted */
+int
+rf_SstfPromote(qptr, parityStripeID, which_ru)
+ void *qptr;
+ RF_StripeNum_t parityStripeID;
+ RF_ReconUnitNum_t which_ru;
+{
+ RF_DiskQueueData_t *r, *next;
+ RF_Sstf_t *sstfq;
+ int n;
+
+ sstfq = (RF_Sstf_t *) qptr;
+
+ n = 0;
+ for (r = sstfq->lopri.queue; r; r = next) {
+ /* save next before r may be re-queued */
+ next = r->next;
+ if (rf_sstfDebug || rf_scanDebug || rf_cscanDebug) {
+ printf("raid%d: check promote %lx\n",
+ r->raidPtr->raidid, (long) r);
+ }
+ if ((r->parityStripeID == parityStripeID)
+ && (r->which_ru == which_ru)) {
+ do_dequeue(&sstfq->lopri, r);
+ rf_SstfEnqueue(qptr, r, RF_IO_NORMAL_PRIORITY);
+ n++;
+ }
+ }
+ /*
+ * NOTE(review): r is always NULL when the loop above terminates,
+ * so this debug printf dereferences NULL (r->raidPtr) whenever
+ * any of these debug flags is set.
+ */
+ if (rf_sstfDebug || rf_scanDebug || rf_cscanDebug) {
+ printf("raid%d: promoted %d matching I/Os queues are %d,%d,%d\n",
+ r->raidPtr->raidid, n, sstfq->left.qlen,
+ sstfq->right.qlen, sstfq->lopri.qlen);
+ }
+ return (n);
+}
diff --git a/sys/dev/raidframe/rf_sstf.h b/sys/dev/raidframe/rf_sstf.h
new file mode 100644
index 0000000..2fc1c0d
--- /dev/null
+++ b/sys/dev/raidframe/rf_sstf.h
@@ -0,0 +1,70 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_sstf.h,v 1.3 1999/02/05 00:06:17 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#ifndef _RF__RF_SSTF_H_
+#define _RF__RF_SSTF_H_
+
+#include <dev/raidframe/rf_diskqueue.h>
+
+/*
+ * A simple linked list of disk-queue requests with a cached tail
+ * pointer and an element count.
+ */
+typedef struct RF_SstfQ_s {
+	RF_DiskQueueData_t *queue;	/* head of the request list */
+	RF_DiskQueueData_t *qtail;	/* tail, for O(1) append */
+	int     qlen;			/* number of queued requests */
+} RF_SstfQ_t;
+
+/*
+ * State for the SSTF/SCAN/CSCAN queueing policies: pending requests on
+ * either side of the simulated disk arm, a low-priority queue, and the
+ * arm's last position and sweep direction.
+ */
+typedef struct RF_Sstf_s {
+	RF_SstfQ_t left;		/* requests to the left of the arm */
+	RF_SstfQ_t right;		/* requests to the right of the arm */
+	RF_SstfQ_t lopri;		/* low-priority requests */
+	RF_SectorNum_t last_sector;	/* last sector the arm visited */
+	int     dir;			/* current sweep direction */
+	int     allow_reverse;		/* nonzero if the sweep may reverse */
+} RF_Sstf_t;
+
+/* constructors for the three policy variants; each returns an RF_Sstf_t */
+void *
+rf_SstfCreate(RF_SectorCount_t sect_per_disk,
+    RF_AllocListElem_t * cl_list, RF_ShutdownList_t ** listp);
+void *
+rf_ScanCreate(RF_SectorCount_t sect_per_disk,
+    RF_AllocListElem_t * cl_list, RF_ShutdownList_t ** listp);
+void *
+rf_CscanCreate(RF_SectorCount_t sect_per_disk,
+    RF_AllocListElem_t * cl_list, RF_ShutdownList_t ** listp);
+/* enqueue is shared by all three policies */
+void    rf_SstfEnqueue(void *qptr, RF_DiskQueueData_t * req, int priority);
+RF_DiskQueueData_t *rf_SstfDequeue(void *qptr);
+RF_DiskQueueData_t *rf_SstfPeek(void *qptr);
+/* move matching low-priority requests to normal priority */
+int
+rf_SstfPromote(void *qptr, RF_StripeNum_t parityStripeID,
+    RF_ReconUnitNum_t which_ru);
+RF_DiskQueueData_t *rf_ScanDequeue(void *qptr);
+RF_DiskQueueData_t *rf_ScanPeek(void *qptr);
+RF_DiskQueueData_t *rf_CscanDequeue(void *qptr);
+RF_DiskQueueData_t *rf_CscanPeek(void *qptr);
+#endif /* !_RF__RF_SSTF_H_ */
diff --git a/sys/dev/raidframe/rf_states.c b/sys/dev/raidframe/rf_states.c
new file mode 100644
index 0000000..c96ee87
--- /dev/null
+++ b/sys/dev/raidframe/rf_states.c
@@ -0,0 +1,667 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_states.c,v 1.15 2000/10/20 02:24:45 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, William V. Courtright II, Robby Findler
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#include <sys/errno.h>
+
+#include <dev/raidframe/rf_archs.h>
+#include <dev/raidframe/rf_threadstuff.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_desc.h>
+#include <dev/raidframe/rf_aselect.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_states.h>
+#include <dev/raidframe/rf_dagutils.h>
+#include <dev/raidframe/rf_driver.h>
+#include <dev/raidframe/rf_engine.h>
+#include <dev/raidframe/rf_map.h>
+#include <dev/raidframe/rf_etimer.h>
+#include <dev/raidframe/rf_kintf.h>
+
+/* prototypes for some of the available states.
+
+ States must:
+
+ - not block.
+
+ - either schedule rf_ContinueRaidAccess as a callback and return
+ RF_TRUE, or complete all of their work and return RF_FALSE.
+
+ - increment desc->state when they have finished their work.
+*/
+
+/*
+ * Map an access-state code to a printable name.  Used only for the
+ * debug output in rf_ContinueRaidAccess().
+ */
+static char *
+StateName(RF_AccessState_t state)
+{
+	char *name;
+
+	switch (state) {
+	case rf_QuiesceState:
+		name = "QuiesceState";
+		break;
+	case rf_MapState:
+		name = "MapState";
+		break;
+	case rf_LockState:
+		name = "LockState";
+		break;
+	case rf_CreateDAGState:
+		name = "CreateDAGState";
+		break;
+	case rf_ExecuteDAGState:
+		name = "ExecuteDAGState";
+		break;
+	case rf_ProcessDAGState:
+		name = "ProcessDAGState";
+		break;
+	case rf_CleanupState:
+		name = "CleanupState";
+		break;
+	case rf_LastState:
+		name = "LastState";
+		break;
+	case rf_IncrAccessesCountState:
+		name = "IncrAccessesCountState";
+		break;
+	case rf_DecrAccessesCountState:
+		name = "DecrAccessesCountState";
+		break;
+	default:
+		name = "!!! UnnamedState !!!";
+		break;
+	}
+	return (name);
+}
+
+/*
+ * Drive an access through its state machine.  Executes one state per
+ * iteration until a state suspends (it scheduled this function as a
+ * callback and returned RF_TRUE) or the final state has run.
+ *
+ * Fixes: the debug printf printed the descriptor pointer with 0x%ld
+ * (decimal digits behind a "0x" prefix) -- now 0x%lx; the initial
+ * assignments to current_state_index/current_state were dead stores,
+ * since both are recomputed at the top of the do-loop.
+ */
+void
+rf_ContinueRaidAccess(RF_RaidAccessDesc_t * desc)
+{
+	int suspended = RF_FALSE;
+	int current_state_index;
+	RF_AccessState_t current_state;
+	int unit = desc->raidPtr->raidid;
+
+	do {
+
+		current_state_index = desc->state;
+		current_state = desc->states[current_state_index];
+
+		switch (current_state) {
+
+		case rf_QuiesceState:
+			suspended = rf_State_Quiesce(desc);
+			break;
+		case rf_IncrAccessesCountState:
+			suspended = rf_State_IncrAccessCount(desc);
+			break;
+		case rf_MapState:
+			suspended = rf_State_Map(desc);
+			break;
+		case rf_LockState:
+			suspended = rf_State_Lock(desc);
+			break;
+		case rf_CreateDAGState:
+			suspended = rf_State_CreateDAG(desc);
+			break;
+		case rf_ExecuteDAGState:
+			suspended = rf_State_ExecuteDAG(desc);
+			break;
+		case rf_ProcessDAGState:
+			suspended = rf_State_ProcessDAG(desc);
+			break;
+		case rf_CleanupState:
+			suspended = rf_State_Cleanup(desc);
+			break;
+		case rf_DecrAccessesCountState:
+			suspended = rf_State_DecrAccessCount(desc);
+			break;
+		case rf_LastState:
+			suspended = rf_State_LastState(desc);
+			break;
+		}
+
+		/* after this point, we cannot dereference desc since desc may
+		 * have been freed. desc is only freed in LastState, so if we
+		 * re-enter this function or loop back up, desc should be valid. */
+
+		if (rf_printStatesDebug) {
+			printf("raid%d: State: %-24s StateIndex: %3i desc: 0x%lx %s\n",
+			    unit, StateName(current_state),
+			    current_state_index, (long) desc,
+			    suspended ? "callback scheduled" : "looping");
+		}
+	} while (!suspended && current_state != rf_LastState);
+
+	return;
+}
+
+
+/*
+ * Callback invoked when one DAG of an access completes.  Accounts the
+ * elapsed execution time in the access's trace record, locates the DAG
+ * that just finished, and -- if it rolled back -- marks the whole
+ * access as failed so that rf_State_ProcessDAG will retry it once the
+ * remaining DAGs drain.  Finally re-enters the state machine.
+ */
+void
+rf_ContinueDagAccess(RF_DagList_t * dagList)
+{
+	RF_AccTraceEntry_t *tracerec = &(dagList->desc->tracerec);
+	RF_RaidAccessDesc_t *desc;
+	RF_DagHeader_t *dag_h;
+	RF_Etimer_t timer;
+	int     i;
+
+	desc = dagList->desc;
+
+	/* charge the elapsed time to DAG execution, then restart the clock */
+	timer = tracerec->timer;
+	RF_ETIMER_STOP(timer);
+	RF_ETIMER_EVAL(timer);
+	tracerec->specific.user.exec_us = RF_ETIMER_VAL_US(timer);
+	RF_ETIMER_START(tracerec->timer);
+
+	/* skip to dag which just finished */
+	dag_h = dagList->dags;
+	for (i = 0; i < dagList->numDagsDone; i++) {
+		dag_h = dag_h->next;
+	}
+
+	/* check to see if retry is required */
+	if (dag_h->status == rf_rollBackward) {
+		/* when a dag fails, mark desc status as bad and allow all
+		 * other dags in the desc to execute to completion. then,
+		 * free all dags and start over */
+		desc->status = 1;	/* bad status */
+		{
+			printf("raid%d: DAG failure: %c addr 0x%lx (%ld) nblk 0x%x (%d) buf 0x%lx\n",
+			    desc->raidPtr->raidid, desc->type,
+			    (long) desc->raidAddress,
+			    (long) desc->raidAddress, (int) desc->numBlocks,
+			    (int) desc->numBlocks,
+			    (unsigned long) (desc->bufPtr));
+		}
+	}
+	dagList->numDagsDone++;
+	rf_ContinueRaidAccess(desc);
+}
+
+/*
+ * Final state of an access: wake any synchronous waiter, mark the
+ * logical 'disk' unbusy, open a slot for queued I/O and restart it,
+ * complete the buf, run the user callback (if any), and free the
+ * descriptor.  Never suspends.
+ *
+ * Note that desc is freed here; callers must not touch it afterwards
+ * (see the comment in rf_ContinueRaidAccess).
+ */
+int
+rf_State_LastState(RF_RaidAccessDesc_t * desc)
+{
+	void    (*callbackFunc) (RF_CBParam_t) = desc->callbackFunc;
+	RF_CBParam_t callbackArg;
+
+	callbackArg.p = desc->callbackArg;
+
+	/*
+	 * If this is not an async request, wake up the caller
+	 */
+	if (desc->async_flag == 0)
+		wakeup(desc->bp);
+
+	/*
+	 * That's all the IO for this one... unbusy the 'disk'.
+	 */
+
+	rf_disk_unbusy(desc);
+
+	/*
+	 * Wakeup any requests waiting to go.
+	 */
+
+	RF_LOCK_MUTEX(((RF_Raid_t *) desc->raidPtr)->mutex);
+	((RF_Raid_t *) desc->raidPtr)->openings++;
+	RF_UNLOCK_MUTEX(((RF_Raid_t *) desc->raidPtr)->mutex);
+
+	/* wake up any pending IO */
+	raidstart(((RF_Raid_t *) desc->raidPtr));
+
+	/* printf("Calling biodone on 0x%x\n",desc->bp); */
+	biodone(desc->bp);	/* access came through ioctl */
+
+	if (callbackFunc)
+		callbackFunc(callbackArg);
+	rf_FreeRaidAccDesc(desc);
+
+	return RF_FALSE;
+}
+
+/*
+ * Bump the count of accesses in flight -- the counter quiescence
+ * detection relies on -- under the suspend mutex, then advance to the
+ * next state.  Never suspends.
+ */
+int
+rf_State_IncrAccessCount(RF_RaidAccessDesc_t * desc)
+{
+	RF_Raid_t *raidPtr = desc->raidPtr;
+
+	/* Bummer. We have to do this to be 100% safe w.r.t. the increment
+	 * below */
+	RF_LOCK_MUTEX(raidPtr->access_suspend_mutex);
+	raidPtr->accs_in_flight++;	/* used to detect quiescence */
+	RF_UNLOCK_MUTEX(raidPtr->access_suspend_mutex);
+
+	desc->state++;
+	return (RF_FALSE);
+}
+
+/*
+ * Drop the in-flight access count and, if accesses are suspended and
+ * this was the last one in flight, signal that quiescence has been
+ * reached.  Also folds the access's elapsed time into the user stats.
+ * Never suspends.
+ */
+int
+rf_State_DecrAccessCount(RF_RaidAccessDesc_t * desc)
+{
+	RF_Raid_t *raidPtr;
+
+	raidPtr = desc->raidPtr;
+
+	RF_LOCK_MUTEX(raidPtr->access_suspend_mutex);
+	raidPtr->accs_in_flight--;
+	/* the last in-flight access completes quiescence */
+	if (raidPtr->accesses_suspended && raidPtr->accs_in_flight == 0) {
+		rf_SignalQuiescenceLock(raidPtr, raidPtr->reconDesc);
+	}
+	rf_UpdateUserStats(raidPtr, RF_ETIMER_VAL_US(desc->timer), desc->numBlocks);
+	RF_UNLOCK_MUTEX(raidPtr->access_suspend_mutex);
+
+	desc->state++;
+	return RF_FALSE;
+}
+
+/*
+ * First state: if accesses are currently suspended (quiescence in
+ * progress), park this access on the quiesce wait list -- it will be
+ * resumed via rf_ContinueRaidAccess when quiescence ends -- and
+ * suspend.  Otherwise fall straight through.  Also starts the access's
+ * overall timer and charges the time spent here as suspend overhead.
+ */
+int
+rf_State_Quiesce(RF_RaidAccessDesc_t * desc)
+{
+	RF_AccTraceEntry_t *tracerec = &desc->tracerec;
+	RF_Etimer_t timer;
+	int     suspended = RF_FALSE;
+	RF_Raid_t *raidPtr;
+
+	raidPtr = desc->raidPtr;
+
+	RF_ETIMER_START(timer);
+	RF_ETIMER_START(desc->timer);
+
+	RF_LOCK_MUTEX(raidPtr->access_suspend_mutex);
+	if (raidPtr->accesses_suspended) {
+		RF_CallbackDesc_t *cb;
+		cb = rf_AllocCallbackDesc();
+		/* XXX the following cast is quite bogus...
+		 * rf_ContinueRaidAccess takes a (RF_RaidAccessDesc_t *) as an
+		 * argument.. GO */
+		cb->callbackFunc = (void (*) (RF_CBParam_t)) rf_ContinueRaidAccess;
+		cb->callbackArg.p = (void *) desc;
+		cb->next = raidPtr->quiesce_wait_list;
+		raidPtr->quiesce_wait_list = cb;
+		suspended = RF_TRUE;
+	}
+	RF_UNLOCK_MUTEX(raidPtr->access_suspend_mutex);
+
+	RF_ETIMER_STOP(timer);
+	RF_ETIMER_EVAL(timer);
+	tracerec->specific.user.suspend_ovhd_us += RF_ETIMER_VAL_US(timer);
+
+	if (suspended && rf_quiesceDebug)
+		printf("Stalling access due to quiescence lock\n");
+
+	/* state advances even when suspended: we resume in the next state */
+	desc->state++;
+	return suspended;
+}
+
+/*
+ * Map the access's RAID address range onto physical disk addresses
+ * (stored in desc->asmap).  Panics if the mapping fails.  Charges the
+ * mapping time to the trace record.  Never suspends.
+ */
+int
+rf_State_Map(RF_RaidAccessDesc_t * desc)
+{
+	RF_Raid_t *raidPtr = desc->raidPtr;
+	RF_AccTraceEntry_t *tracerec = &desc->tracerec;
+	RF_Etimer_t timer;
+
+	RF_ETIMER_START(timer);
+
+	if (!(desc->asmap = rf_MapAccess(raidPtr, desc->raidAddress, desc->numBlocks,
+	    desc->bufPtr, RF_DONT_REMAP)))
+		RF_PANIC();
+
+	RF_ETIMER_STOP(timer);
+	RF_ETIMER_EVAL(timer);
+	tracerec->specific.user.map_us = RF_ETIMER_VAL_US(timer);
+
+	desc->state++;
+	return RF_FALSE;
+}
+
+/*
+ * Acquire the stripe locks needed by this access, in ascending stripe
+ * order (hierarchical acquisition avoids deadlock).  For writes to a
+ * row under reconstruction, also force or block reconstruction of the
+ * affected stripes.  Suspends (returns RF_TRUE) whenever a lock or a
+ * force/block operation cannot complete immediately; the access is
+ * resumed via rf_ContinueRaidAccess and re-enters this state, where
+ * the LOCK_TRIED/FORCE_TRIED flags prevent re-doing finished work.
+ */
+int
+rf_State_Lock(RF_RaidAccessDesc_t * desc)
+{
+	RF_AccTraceEntry_t *tracerec = &desc->tracerec;
+	RF_Raid_t *raidPtr = desc->raidPtr;
+	RF_AccessStripeMapHeader_t *asmh = desc->asmap;
+	RF_AccessStripeMap_t *asm_p;
+	RF_Etimer_t timer;
+	int     suspended = RF_FALSE;
+
+	RF_ETIMER_START(timer);
+	if (!(raidPtr->Layout.map->flags & RF_NO_STRIPE_LOCKS)) {
+		RF_StripeNum_t lastStripeID = -1;
+
+		/* acquire each lock that we don't already hold */
+		for (asm_p = asmh->stripeMap; asm_p; asm_p = asm_p->next) {
+			RF_ASSERT(RF_IO_IS_R_OR_W(desc->type));
+			if (!rf_suppressLocksAndLargeWrites &&
+			    asm_p->parityInfo &&
+			    !(desc->flags & RF_DAG_SUPPRESS_LOCKS) &&
+			    !(asm_p->flags & RF_ASM_FLAGS_LOCK_TRIED)) {
+				asm_p->flags |= RF_ASM_FLAGS_LOCK_TRIED;
+				RF_ASSERT(asm_p->stripeID > lastStripeID);
+
+				/* locks must be acquired hierarchically */
+
+				lastStripeID = asm_p->stripeID;
+				/* XXX the cast to (void (*)(RF_CBParam_t))
+				 * below is bogus! GO */
+				RF_INIT_LOCK_REQ_DESC(asm_p->lockReqDesc,
+				    desc->type,
+				    (void (*) (RF_Buf_t)) rf_ContinueRaidAccess,
+				    desc, asm_p,
+				    raidPtr->Layout.dataSectorsPerStripe);
+				/* nonzero return == must wait for the lock */
+				if (rf_AcquireStripeLock(raidPtr->lockTable,
+				    asm_p->stripeID, &asm_p->lockReqDesc)) {
+					suspended = RF_TRUE;
+					break;
+				}
+			}
+			if (desc->type == RF_IO_TYPE_WRITE &&
+			    raidPtr->status[asm_p->physInfo->row] == rf_rs_reconstructing) {
+				if (!(asm_p->flags & RF_ASM_FLAGS_FORCE_TRIED)) {
+					int     val;
+
+					asm_p->flags |= RF_ASM_FLAGS_FORCE_TRIED;
+					/* XXX the cast below is quite
+					 * bogus!!! XXX GO */
+					val = rf_ForceOrBlockRecon(raidPtr, asm_p,
+					    (void (*) (RF_Raid_t *, void *)) rf_ContinueRaidAccess, desc);
+					/* nonzero == we must wait for recon */
+					if (val == 0) {
+						asm_p->flags |= RF_ASM_FLAGS_RECON_BLOCKED;
+					} else {
+						suspended = RF_TRUE;
+						break;
+					}
+				} else {
+					if (rf_pssDebug) {
+						printf("raid%d: skipping force/block because already done, psid %ld\n",
+						    desc->raidPtr->raidid,
+						    (long) asm_p->stripeID);
+					}
+				}
+			} else {
+				if (rf_pssDebug) {
+					printf("raid%d: skipping force/block because not write or not under recon, psid %ld\n",
+					    desc->raidPtr->raidid,
+					    (long) asm_p->stripeID);
+				}
+			}
+		}
+
+		RF_ETIMER_STOP(timer);
+		RF_ETIMER_EVAL(timer);
+		tracerec->specific.user.lock_us += RF_ETIMER_VAL_US(timer);
+
+		if (suspended)
+			return (RF_TRUE);
+	}
+	desc->state++;
+	return (RF_FALSE);
+}
+/*
+ * the following three states create, execute, and post-process dags
+ * the error recovery unit is a single dag.
+ * by default, SelectAlgorithm creates an array of dags, one per parity stripe
+ * in some tricky cases, multiple dags per stripe are created
+ * - dags within a parity stripe are executed sequentially (arbitrary order)
+ * - dags for distinct parity stripes are executed concurrently
+ *
+ * repeat until all dags complete successfully -or- dag selection fails
+ *
+ * while !done
+ * create dag(s) (SelectAlgorithm)
+ * if dag
+ * execute dag (DispatchDAG)
+ * if dag successful
+ * done (SUCCESS)
+ * else
+ * !done (RETRY - start over with new dags)
+ * else
+ * done (FAIL)
+ */
+/*
+ * Create the DAG(s) for this access via rf_SelectAlgorithm (one or
+ * more DAGs per parity stripe), bind them to the descriptor, and
+ * advance to the execute state.  Panics if no DAG can be created
+ * (too many faults, or the DAG library is incomplete).  Never
+ * suspends.
+ */
+int
+rf_State_CreateDAG(RF_RaidAccessDesc_t * desc)
+{
+	RF_AccTraceEntry_t *tracerec = &desc->tracerec;
+	RF_Etimer_t timer;
+	RF_DagHeader_t *dag_h;
+	int     i, selectStatus;
+
+	/* generate a dag for the access, and fire it off. When the dag
+	 * completes, we'll get re-invoked in the next state. */
+	RF_ETIMER_START(timer);
+	/* SelectAlgorithm returns one or more dags */
+	selectStatus = rf_SelectAlgorithm(desc, desc->flags | RF_DAG_SUPPRESS_LOCKS);
+	if (rf_printDAGsDebug)
+		for (i = 0; i < desc->numStripes; i++)
+			rf_PrintDAGList(desc->dagArray[i].dags);
+	RF_ETIMER_STOP(timer);
+	RF_ETIMER_EVAL(timer);
+	/* update time to create all dags */
+	tracerec->specific.user.dag_create_us = RF_ETIMER_VAL_US(timer);
+
+	desc->status = 0;	/* good status */
+
+	if (selectStatus) {
+		/* failed to create a dag */
+		/* this happens when there are too many faults or incomplete
+		 * dag libraries */
+		printf("[Failed to create a DAG]\n");
+		RF_PANIC();
+	} else {
+		/* bind dags to desc */
+		for (i = 0; i < desc->numStripes; i++) {
+			dag_h = desc->dagArray[i].dags;
+			while (dag_h) {
+				dag_h->bp = (RF_Buf_t) desc->bp;
+				dag_h->tracerec = tracerec;
+				dag_h = dag_h->next;
+			}
+		}
+		desc->flags |= RF_DAG_DISPATCH_RETURNED;
+		desc->state++;	/* next state should be rf_State_ExecuteDAG */
+	}
+	return RF_FALSE;
+}
+
+
+
+/* the access has an array of dagLists, one dagList per parity stripe.
+ * fire the first dag in each parity stripe (dagList).
+ * dags within a stripe (dagList) must be executed sequentially
+ * - this preserves atomic parity update
+ * dags for independents parity groups (stripes) are fired concurrently */
+
+/*
+ * Fire the first DAG of each parity stripe (dagList) concurrently;
+ * DAGs within a stripe run sequentially to preserve atomic parity
+ * update, and the follow-on DAGs are fired by rf_State_ProcessDAG.
+ * Always suspends: each DAG completion re-enters the state machine
+ * through rf_ContinueDagAccess.
+ */
+int
+rf_State_ExecuteDAG(RF_RaidAccessDesc_t * desc)
+{
+	int     i;
+	RF_DagHeader_t *dag_h;
+	RF_DagList_t *dagArray = desc->dagArray;
+
+	/* next state is always rf_State_ProcessDAG important to do this
+	 * before firing the first dag (it may finish before we leave this
+	 * routine) */
+	desc->state++;
+
+	/* sweep dag array, a stripe at a time, firing the first dag in each
+	 * stripe */
+	for (i = 0; i < desc->numStripes; i++) {
+		RF_ASSERT(dagArray[i].numDags > 0);
+		RF_ASSERT(dagArray[i].numDagsDone == 0);
+		RF_ASSERT(dagArray[i].numDagsFired == 0);
+		RF_ETIMER_START(dagArray[i].tracerec.timer);
+		/* fire first dag in this stripe */
+		dag_h = dagArray[i].dags;
+		RF_ASSERT(dag_h);
+		dagArray[i].numDagsFired++;
+		/* XXX Yet another case where we pass in a conflicting
+		 * function pointer :-( XXX GO */
+		rf_DispatchDAG(dag_h, (void (*) (void *)) rf_ContinueDagAccess, &dagArray[i]);
+	}
+
+	/* the DAG will always call the callback, even if there was no
+	 * blocking, so we are always suspended in this state */
+	return RF_TRUE;
+}
+
+
+
+/* rf_State_ProcessDAG is entered when a dag completes.
+ * first, check to all dags in the access have completed
+ * if not, fire as many dags as possible */
+
+/*
+ * Entered each time a DAG completes.  If all DAGs of the access are
+ * done: on failure, free the DAGs, remap around the failures, and back
+ * up two states to recreate them; on success, move on to cleanup.
+ * Otherwise fire the next ready DAG in each stripe and suspend again.
+ */
+int
+rf_State_ProcessDAG(RF_RaidAccessDesc_t * desc)
+{
+	RF_AccessStripeMapHeader_t *asmh = desc->asmap;
+	RF_Raid_t *raidPtr = desc->raidPtr;
+	RF_DagHeader_t *dag_h;
+	int     i, j, done = RF_TRUE;
+	RF_DagList_t *dagArray = desc->dagArray;
+	RF_Etimer_t timer;
+
+	/* check to see if this is the last dag */
+	for (i = 0; i < desc->numStripes; i++)
+		if (dagArray[i].numDags != dagArray[i].numDagsDone)
+			done = RF_FALSE;
+
+	if (done) {
+		if (desc->status) {
+			/* a dag failed, retry */
+			RF_ETIMER_START(timer);
+			/* free all dags */
+			for (i = 0; i < desc->numStripes; i++) {
+				rf_FreeDAG(desc->dagArray[i].dags);
+			}
+			rf_MarkFailuresInASMList(raidPtr, asmh);
+			/* back up to rf_State_CreateDAG */
+			desc->state = desc->state - 2;
+			return RF_FALSE;
+		} else {
+			/* move on to rf_State_Cleanup */
+			desc->state++;
+		}
+		return RF_FALSE;
+	} else {
+		/* more dags to execute */
+		/* see if any are ready to be fired. if so, fire them */
+		/* don't fire the initial dag in a list, it's fired in
+		 * rf_State_ExecuteDAG */
+		for (i = 0; i < desc->numStripes; i++) {
+			/* fire only when the previous DAG in this stripe is
+			 * done and more remain */
+			if ((dagArray[i].numDagsDone < dagArray[i].numDags)
+			    && (dagArray[i].numDagsDone == dagArray[i].numDagsFired)
+			    && (dagArray[i].numDagsFired > 0)) {
+				RF_ETIMER_START(dagArray[i].tracerec.timer);
+				/* fire next dag in this stripe */
+				/* first, skip to next dag awaiting execution */
+				dag_h = dagArray[i].dags;
+				for (j = 0; j < dagArray[i].numDagsDone; j++)
+					dag_h = dag_h->next;
+				dagArray[i].numDagsFired++;
+				/* XXX and again we pass a different function
+				 * pointer.. GO */
+				rf_DispatchDAG(dag_h, (void (*) (void *)) rf_ContinueDagAccess,
+				    &dagArray[i]);
+			}
+		}
+		return RF_TRUE;
+	}
+}
+/* only make it this far if all dags complete successfully */
+/*
+ * Post-DAG cleanup, reached only after all DAGs complete successfully:
+ * hand the DAGs/ASM back to the caller if requested (RF_DAG_RETURN_*),
+ * otherwise free them; release stripe locks and unblock any
+ * reconstruction we blocked in rf_State_Lock; finalize the trace
+ * record timings and log them.  Never suspends.
+ */
+int
+rf_State_Cleanup(RF_RaidAccessDesc_t * desc)
+{
+	RF_AccTraceEntry_t *tracerec = &desc->tracerec;
+	RF_AccessStripeMapHeader_t *asmh = desc->asmap;
+	RF_Raid_t *raidPtr = desc->raidPtr;
+	RF_AccessStripeMap_t *asm_p;
+	RF_DagHeader_t *dag_h;
+	RF_Etimer_t timer;
+	int     i;
+
+	desc->state++;
+
+	timer = tracerec->timer;
+	RF_ETIMER_STOP(timer);
+	RF_ETIMER_EVAL(timer);
+	tracerec->specific.user.dag_retry_us = RF_ETIMER_VAL_US(timer);
+
+	/* the RAID I/O is complete. Clean up. */
+	/* NOTE(review): this zeroing discards the dag_retry_us value
+	 * computed just above -- confirm whether that measurement was
+	 * meant to be kept. */
+	tracerec->specific.user.dag_retry_us = 0;
+
+	RF_ETIMER_START(timer);
+	if (desc->flags & RF_DAG_RETURN_DAG) {
+		/* copy dags into paramDAG */
+		*(desc->paramDAG) = desc->dagArray[0].dags;
+		dag_h = *(desc->paramDAG);
+		for (i = 1; i < desc->numStripes; i++) {
+			/* concatenate dags from remaining stripes */
+			RF_ASSERT(dag_h);
+			while (dag_h->next)
+				dag_h = dag_h->next;
+			dag_h->next = desc->dagArray[i].dags;
+		}
+	} else {
+		/* free all dags */
+		for (i = 0; i < desc->numStripes; i++) {
+			rf_FreeDAG(desc->dagArray[i].dags);
+		}
+	}
+
+	RF_ETIMER_STOP(timer);
+	RF_ETIMER_EVAL(timer);
+	tracerec->specific.user.cleanup_us = RF_ETIMER_VAL_US(timer);
+
+	RF_ETIMER_START(timer);
+	if (!(raidPtr->Layout.map->flags & RF_NO_STRIPE_LOCKS)) {
+		for (asm_p = asmh->stripeMap; asm_p; asm_p = asm_p->next) {
+			/* release only the locks rf_State_Lock acquired */
+			if (!rf_suppressLocksAndLargeWrites &&
+			    asm_p->parityInfo &&
+			    !(desc->flags & RF_DAG_SUPPRESS_LOCKS)) {
+				RF_ASSERT_VALID_LOCKREQ(&asm_p->lockReqDesc);
+				rf_ReleaseStripeLock(raidPtr->lockTable,
+				    asm_p->stripeID,
+				    &asm_p->lockReqDesc);
+			}
+			if (asm_p->flags & RF_ASM_FLAGS_RECON_BLOCKED) {
+				rf_UnblockRecon(raidPtr, asm_p);
+			}
+		}
+	}
+	RF_ETIMER_STOP(timer);
+	RF_ETIMER_EVAL(timer);
+	tracerec->specific.user.lock_us += RF_ETIMER_VAL_US(timer);
+
+	RF_ETIMER_START(timer);
+	if (desc->flags & RF_DAG_RETURN_ASM)
+		*(desc->paramASM) = asmh;
+	else
+		rf_FreeAccessStripeMap(asmh);
+	RF_ETIMER_STOP(timer);
+	RF_ETIMER_EVAL(timer);
+	tracerec->specific.user.cleanup_us += RF_ETIMER_VAL_US(timer);
+
+	RF_ETIMER_STOP(desc->timer);
+	RF_ETIMER_EVAL(desc->timer);
+
+	timer = desc->tracerec.tot_timer;
+	RF_ETIMER_STOP(timer);
+	RF_ETIMER_EVAL(timer);
+	desc->tracerec.total_us = RF_ETIMER_VAL_US(timer);
+
+	rf_LogTraceRec(raidPtr, tracerec);
+
+	desc->flags |= RF_DAG_ACCESS_COMPLETE;
+
+	return RF_FALSE;
+}
diff --git a/sys/dev/raidframe/rf_states.h b/sys/dev/raidframe/rf_states.h
new file mode 100644
index 0000000..6c0aee4
--- /dev/null
+++ b/sys/dev/raidframe/rf_states.h
@@ -0,0 +1,48 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_states.h,v 1.3 1999/02/05 00:06:17 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, William V. Courtright II, Robby Findler
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#ifndef _RF__RF_STATES_H_
+#define _RF__RF_STATES_H_
+
+#include <dev/raidframe/rf_types.h>
+
+/* drive an access through its state machine until it suspends or finishes */
+void rf_ContinueRaidAccess(RF_RaidAccessDesc_t * desc);
+/* DAG-completion callback; fires follow-on DAGs and re-enters the machine */
+void rf_ContinueDagAccess(RF_DagList_t * dagList);
+/*
+ * Individual states.  Each returns RF_TRUE if it scheduled
+ * rf_ContinueRaidAccess as a callback (access suspended) and RF_FALSE
+ * if it completed all of its work; states increment desc->state when
+ * done (see the rules at the top of rf_states.c).
+ */
+int rf_State_LastState(RF_RaidAccessDesc_t * desc);
+int rf_State_IncrAccessCount(RF_RaidAccessDesc_t * desc);
+int rf_State_DecrAccessCount(RF_RaidAccessDesc_t * desc);
+int rf_State_Quiesce(RF_RaidAccessDesc_t * desc);
+int rf_State_Map(RF_RaidAccessDesc_t * desc);
+int rf_State_Lock(RF_RaidAccessDesc_t * desc);
+int rf_State_CreateDAG(RF_RaidAccessDesc_t * desc);
+int rf_State_ExecuteDAG(RF_RaidAccessDesc_t * desc);
+int rf_State_ProcessDAG(RF_RaidAccessDesc_t * desc);
+int rf_State_Cleanup(RF_RaidAccessDesc_t * desc);
+
+#endif /* !_RF__RF_STATES_H_ */
diff --git a/sys/dev/raidframe/rf_stripelocks.c b/sys/dev/raidframe/rf_stripelocks.c
new file mode 100644
index 0000000..bcee719
--- /dev/null
+++ b/sys/dev/raidframe/rf_stripelocks.c
@@ -0,0 +1,667 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_stripelocks.c,v 1.6 2000/12/04 11:35:46 fvdl Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Authors: Mark Holland, Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * stripelocks.c -- code to lock stripes for read and write access
+ *
+ * The code distinguishes between read locks and write locks. There can be
+ * as many readers of a given stripe as desired. When a write request comes
+ * in, no further readers are allowed to enter, and all subsequent requests
+ * are queued in FIFO order. When the number of readers goes to zero, the
+ * writer is given the lock. When a writer releases the lock, the list of
+ * queued requests is scanned, and all readers up to the next writer are
+ * given the lock.
+ *
+ * The lock table size must be a power of two (HASH_STRIPEID masks the
+ * stripe ID with rf_lockTableSize-1, and it is the only function that
+ * requires this; rf_ConfigureStripeLockFreeList() enforces it).
+ *
+ * The code now supports "range locks". When you ask to lock a stripe, you
+ * specify a range of addresses in that stripe that you want to lock. When
+ * you acquire the lock, you've locked only this range of addresses, and
+ * other threads can concurrently read/write any non-overlapping portions
+ * of the stripe. The "addresses" that you lock are abstract in that you
+ * can pass in anything you like. The expectation is that you'll pass in
+ * the range of physical disk offsets of the parity bits you're planning
+ * to update. The idea behind this, of course, is to allow sub-stripe
+ * locking. The implementation is perhaps not the best imaginable; in the
+ * worst case a lock release is O(n^2) in the total number of outstanding
+ * requests to a given stripe. Note that if you're striping with a
+ * stripe unit size equal to an entire disk (i.e. not striping), there will
+ * be only one stripe and you may spend some significant number of cycles
+ * searching through stripe lock descriptors.
+ */
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_stripelocks.h>
+#include <dev/raidframe/rf_alloclist.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_freelist.h>
+#include <dev/raidframe/rf_debugprint.h>
+#include <dev/raidframe/rf_driver.h>
+#include <dev/raidframe/rf_shutdown.h>
+
+#define Dprintf1(s,a) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
+#define Dprintf2(s,a,b) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
+#define Dprintf3(s,a,b,c) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
+#define Dprintf4(s,a,b,c,d) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),NULL,NULL,NULL,NULL)
+#define Dprintf5(s,a,b,c,d,e) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),NULL,NULL,NULL)
+#define Dprintf6(s,a,b,c,d,e,f) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),NULL,NULL)
+#define Dprintf7(s,a,b,c,d,e,f,g) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),NULL)
+#define Dprintf8(s,a,b,c,d,e,f,g,h) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),(void *)((unsigned long)h))
+
+#define FLUSH
+
+#define HASH_STRIPEID(_sid_) ( (_sid_) & (rf_lockTableSize-1) )
+
+static void AddToWaitersQueue(RF_LockTableEntry_t * lockTable, RF_StripeLockDesc_t * lockDesc, RF_LockReqDesc_t * lockReqDesc);
+static RF_StripeLockDesc_t *AllocStripeLockDesc(RF_StripeNum_t stripeID);
+static void FreeStripeLockDesc(RF_StripeLockDesc_t * p);
+static void PrintLockedStripes(RF_LockTableEntry_t * lockTable);
+
+/* Determines if two closed (inclusive) ranges overlap.  Always yields
+ * false if either start value is negative -- a negative start marks an
+ * unused range (cf. start2/stop2 == -1 in RF_INIT_LOCK_REQ_DESC). */
+#define SINGLE_RANGE_OVERLAP(_strt1, _stop1, _strt2, _stop2) \
+ ( (_strt1 >= 0) && (_strt2 >= 0) && (RF_MAX(_strt1, _strt2) <= RF_MIN(_stop1, _stop2)) )
+
+/* determines if any of the ranges specified in the two lock descriptors overlap each other */
+#define RANGE_OVERLAP(_cand, _pred) \
+ ( SINGLE_RANGE_OVERLAP((_cand)->start, (_cand)->stop, (_pred)->start, (_pred)->stop ) || \
+ SINGLE_RANGE_OVERLAP((_cand)->start2, (_cand)->stop2, (_pred)->start, (_pred)->stop ) || \
+ SINGLE_RANGE_OVERLAP((_cand)->start, (_cand)->stop, (_pred)->start2, (_pred)->stop2) || \
+ SINGLE_RANGE_OVERLAP((_cand)->start2, (_cand)->stop2, (_pred)->start2, (_pred)->stop2) )
+
+/* Determines if a candidate lock request conflicts with a predecessor lock req.
+ * Note that the arguments are not interchangeable.
+ * The rules are:
+ * a candidate read conflicts with a predecessor write if any ranges overlap
+ * a candidate write conflicts with a predecessor read if any ranges overlap
+ * a candidate write conflicts with a predecessor write if any ranges overlap
+ *
+ * NOTE(review): the expansion below is not parenthesized at the top
+ * level ("RANGE_OVERLAP(...) && (...)"), so this macro must only be
+ * used as a complete condition expression, as all current callers do
+ * ("if (STRIPELOCK_CONFLICT(c, p)) ...").
+ */
+#define STRIPELOCK_CONFLICT(_cand, _pred) \
+ RANGE_OVERLAP((_cand), (_pred)) && \
+ ( ( (((_cand)->type == RF_IO_TYPE_READ) && ((_pred)->type == RF_IO_TYPE_WRITE)) || \
+ (((_cand)->type == RF_IO_TYPE_WRITE) && ((_pred)->type == RF_IO_TYPE_READ)) || \
+ (((_cand)->type == RF_IO_TYPE_WRITE) && ((_pred)->type == RF_IO_TYPE_WRITE)) \
+ ) \
+ )
+
+static RF_FreeList_t *rf_stripelock_freelist;
+#define RF_MAX_FREE_STRIPELOCK 128
+#define RF_STRIPELOCK_INC 8
+#define RF_STRIPELOCK_INITIAL 32
+
+static void rf_ShutdownStripeLockFreeList(void *);
+static void rf_RaidShutdownStripeLocks(void *);
+
+/*
+ * Shutdown-list callback: destroy the module-global freelist of stripe
+ * lock descriptors.  The argument is unused; it exists only to match
+ * the rf_ShutdownCreate() callback signature.
+ */
+static void
+rf_ShutdownStripeLockFreeList(ignored)
+	void *ignored;
+{
+	RF_FREELIST_DESTROY(rf_stripelock_freelist, next, (RF_StripeLockDesc_t *));
+}
+
+/*
+ * Driver-wide configuration: create and prime the freelist of stripe
+ * lock descriptors, register its destructor on the shutdown list, and
+ * sanity-check rf_lockTableSize (it must be a power of two because
+ * HASH_STRIPEID() masks with rf_lockTableSize-1; we fall back to
+ * RF_DEFAULT_LOCK_TABLE_SIZE otherwise).  Returns 0 on success or the
+ * rf_ShutdownCreate() error code.
+ */
+int
+rf_ConfigureStripeLockFreeList(listp)
+	RF_ShutdownList_t **listp;
+{
+	unsigned mask;
+	int rc;
+
+	RF_FREELIST_CREATE(rf_stripelock_freelist, RF_MAX_FREE_STRIPELOCK,
+	    RF_STRIPELOCK_INITIAL, sizeof(RF_StripeLockDesc_t));
+	rc = rf_ShutdownCreate(listp, rf_ShutdownStripeLockFreeList, NULL);
+	if (rc) {
+		RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
+		    __FILE__, __LINE__, rc);
+		rf_ShutdownStripeLockFreeList(NULL);
+		return (rc);
+	}
+	RF_FREELIST_PRIME(rf_stripelock_freelist, RF_STRIPELOCK_INITIAL, next,
+	    (RF_StripeLockDesc_t *));
+	/* accept rf_lockTableSize only if it exactly matches some power of two */
+	for (mask = 0x1; mask; mask <<= 1)
+		if (rf_lockTableSize == mask)
+			break;
+	if (!mask) {	/* the loop ran off the top without a match */
+		printf("[WARNING: lock table size must be a power of two. Setting to %d.]\n", RF_DEFAULT_LOCK_TABLE_SIZE);
+		rf_lockTableSize = RF_DEFAULT_LOCK_TABLE_SIZE;
+	}
+	return (0);
+}
+
+/*
+ * Allocate a zeroed lock table of rf_lockTableSize hash buckets and
+ * initialize the per-bucket mutexes.  Returns the table, or NULL on
+ * allocation or mutex-init failure.
+ */
+RF_LockTableEntry_t *
+rf_MakeLockTable()
+{
+	RF_LockTableEntry_t *lockTable;
+	int i, rc;
+
+	RF_Calloc(lockTable, ((int) rf_lockTableSize), sizeof(RF_LockTableEntry_t), (RF_LockTableEntry_t *));
+	if (lockTable == NULL)
+		return (NULL);
+	for (i = 0; i < rf_lockTableSize; i++) {
+		rc = rf_mutex_init(&lockTable[i].mutex, __FUNCTION__);
+		if (rc) {
+			RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+			    __LINE__, rc);
+			/* XXX clean up other mutexes: the table and the i
+			 * mutexes initialized so far are leaked here */
+			return (NULL);
+		}
+	}
+	return (lockTable);
+}
+
+/*
+ * Free a per-array lock table: optionally dump any stripes still
+ * locked (debug aid), destroy each bucket's mutex, and release the
+ * table itself.
+ */
+void
+rf_ShutdownStripeLocks(RF_LockTableEntry_t * lockTable)
+{
+	int i;
+
+	if (rf_stripeLockDebug) {
+		PrintLockedStripes(lockTable);
+	}
+	for (i = 0; i < rf_lockTableSize; i++) {
+		rf_mutex_destroy(&lockTable[i].mutex);
+	}
+	RF_Free(lockTable, rf_lockTableSize * sizeof(RF_LockTableEntry_t));
+}
+
+/*
+ * Shutdown-list thunk: arg is the RF_Raid_t whose lock table is to be
+ * torn down (registered by rf_ConfigureStripeLocks()).
+ */
+static void
+rf_RaidShutdownStripeLocks(arg)
+	void *arg;
+{
+	RF_Raid_t *raidPtr = (RF_Raid_t *) arg;
+	rf_ShutdownStripeLocks(raidPtr->lockTable);
+}
+
+/*
+ * Per-array configuration: allocate the array's stripe lock hash table
+ * and register rf_RaidShutdownStripeLocks() to tear it down.  Returns
+ * 0 on success, ENOMEM if the table cannot be allocated, or the
+ * rf_ShutdownCreate() error code.  cfgPtr is unused here.
+ */
+int
+rf_ConfigureStripeLocks(
+    RF_ShutdownList_t ** listp,
+    RF_Raid_t * raidPtr,
+    RF_Config_t * cfgPtr)
+{
+	int rc;
+
+	raidPtr->lockTable = rf_MakeLockTable();
+	if (raidPtr->lockTable == NULL)
+		return (ENOMEM);
+	rc = rf_ShutdownCreate(listp, rf_RaidShutdownStripeLocks, raidPtr);
+	if (rc) {
+		RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
+		    __FILE__, __LINE__, rc);
+		rf_ShutdownStripeLocks(raidPtr->lockTable);
+		return (rc);
+	}
+	return (0);
+}
+/* rf_AcquireStripeLock returns 0 if you've got the lock, and non-zero
+ * if you have to wait.  If and only if you have to wait, we'll cause
+ * the cbFunc stored in lockReqDesc to be invoked with cbArg once you
+ * are granted the lock.  (There is no *releaseTag out-parameter in
+ * this version; release the lock by passing the same lockReqDesc to
+ * rf_ReleaseStripeLock().)
+ */
+int
+rf_AcquireStripeLock(
+    RF_LockTableEntry_t * lockTable,
+    RF_StripeNum_t stripeID,
+    RF_LockReqDesc_t * lockReqDesc)
+{
+	RF_StripeLockDesc_t *lockDesc;
+	RF_LockReqDesc_t *p;
+	int tid = 0, hashval = HASH_STRIPEID(stripeID);
+	int retcode = 0;	/* 0 => granted now; nonzero => caller must
+				 * wait (1: conflict with a granted request,
+				 * 2: conflict with an earlier waiter) */
+
+	RF_ASSERT(RF_IO_IS_R_OR_W(lockReqDesc->type));
+
+	if (rf_stripeLockDebug) {
+		if (stripeID == -1)
+			Dprintf1("[%d] Lock acquisition supressed (stripeID == -1)\n", tid);
+		else {
+			Dprintf8("[%d] Trying to acquire stripe lock table 0x%lx SID %ld type %c range %ld-%ld, range2 %ld-%ld hashval %d\n",
+			    tid, (unsigned long) lockTable, stripeID, lockReqDesc->type, lockReqDesc->start,
+			    lockReqDesc->stop, lockReqDesc->start2, lockReqDesc->stop2);
+			Dprintf3("[%d] lock %ld hashval %d\n", tid, stripeID, hashval);
+			FLUSH;
+		}
+	}
+	/* a stripeID of -1 means "don't actually lock anything" */
+	if (stripeID == -1)
+		return (0);
+	lockReqDesc->next = NULL;	/* just to be sure */
+
+	RF_LOCK_MUTEX(lockTable[hashval].mutex);
+	for (lockDesc = lockTable[hashval].descList; lockDesc; lockDesc = lockDesc->next) {
+		if (lockDesc->stripeID == stripeID)
+			break;
+	}
+
+	if (!lockDesc) {	/* no entry in table => no one reading or
+				 * writing */
+		/* NOTE(review): AllocStripeLockDesc() can return NULL when
+		 * the freelist/allocator is exhausted; the dereference
+		 * below does not check for that. */
+		lockDesc = AllocStripeLockDesc(stripeID);
+		lockDesc->next = lockTable[hashval].descList;
+		lockTable[hashval].descList = lockDesc;
+		if (lockReqDesc->type == RF_IO_TYPE_WRITE)
+			lockDesc->nWriters++;
+		lockDesc->granted = lockReqDesc;
+		if (rf_stripeLockDebug) {
+			Dprintf7("[%d] no one waiting: lock %ld %c %ld-%ld %ld-%ld granted\n",
+			    tid, stripeID, lockReqDesc->type, lockReqDesc->start, lockReqDesc->stop, lockReqDesc->start2, lockReqDesc->stop2);
+			FLUSH;
+		}
+	} else {
+
+		if (lockReqDesc->type == RF_IO_TYPE_WRITE)
+			lockDesc->nWriters++;
+
+		if (lockDesc->nWriters == 0) {	/* no need to search any lists
+						 * if there are no writers
+						 * anywhere */
+			lockReqDesc->next = lockDesc->granted;
+			lockDesc->granted = lockReqDesc;
+			if (rf_stripeLockDebug) {
+				Dprintf7("[%d] no writers: lock %ld %c %ld-%ld %ld-%ld granted\n",
+				    tid, stripeID, lockReqDesc->type, lockReqDesc->start, lockReqDesc->stop, lockReqDesc->start2, lockReqDesc->stop2);
+				FLUSH;
+			}
+		} else {
+
+			/* search the granted & waiting lists for a conflict.
+			 * stop searching as soon as we find one */
+			retcode = 0;
+			for (p = lockDesc->granted; p; p = p->next)
+				if (STRIPELOCK_CONFLICT(lockReqDesc, p)) {
+					retcode = 1;
+					break;
+				}
+			if (!retcode)
+				for (p = lockDesc->waitersH; p; p = p->next)
+					if (STRIPELOCK_CONFLICT(lockReqDesc, p)) {
+						retcode = 2;
+						break;
+					}
+			if (!retcode) {
+				lockReqDesc->next = lockDesc->granted;	/* no conflicts found =>
+									 * grant lock */
+				lockDesc->granted = lockReqDesc;
+				if (rf_stripeLockDebug) {
+					Dprintf7("[%d] no conflicts: lock %ld %c %ld-%ld %ld-%ld granted\n",
+					    tid, stripeID, lockReqDesc->type, lockReqDesc->start, lockReqDesc->stop,
+					    lockReqDesc->start2, lockReqDesc->stop2);
+					FLUSH;
+				}
+			} else {
+				if (rf_stripeLockDebug) {
+					Dprintf6("[%d] conflict: lock %ld %c %ld-%ld hashval=%d not granted\n",
+					    tid, stripeID, lockReqDesc->type, lockReqDesc->start, lockReqDesc->stop,
+					    hashval);
+					Dprintf3("[%d] lock %ld retcode=%d\n", tid, stripeID, retcode);
+					FLUSH;
+				}
+				AddToWaitersQueue(lockTable, lockDesc, lockReqDesc);	/* conflict => the
+									 * current access must
+									 * wait */
+			}
+		}
+	}
+
+	RF_UNLOCK_MUTEX(lockTable[hashval].mutex);
+	return (retcode);
+}
+
+/*
+ * Release the lock held via lockReqDesc on the given stripe, then scan
+ * the waiters queue and grant the lock to every waiter that no longer
+ * conflicts with anything granted or queued ahead of it.  Callbacks
+ * for newly granted requests are invoked only after the hash bucket
+ * mutex has been dropped.
+ */
+void
+rf_ReleaseStripeLock(
+    RF_LockTableEntry_t * lockTable,
+    RF_StripeNum_t stripeID,
+    RF_LockReqDesc_t * lockReqDesc)
+{
+	RF_StripeLockDesc_t *lockDesc, *ld_t;
+	RF_LockReqDesc_t *lr, *lr_t, *callbacklist, *t;
+	RF_IoType_t type = lockReqDesc->type;
+	int tid = 0, hashval = HASH_STRIPEID(stripeID);
+	int release_it, consider_it;
+	RF_LockReqDesc_t *candidate, *candidate_t, *predecessor;
+
+	RF_ASSERT(RF_IO_IS_R_OR_W(type));
+
+	if (rf_stripeLockDebug) {
+		if (stripeID == -1)
+			Dprintf1("[%d] Lock release supressed (stripeID == -1)\n", tid);
+		else {
+			Dprintf8("[%d] Releasing stripe lock on stripe ID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n",
+			    tid, stripeID, lockReqDesc->type, lockReqDesc->start, lockReqDesc->stop, lockReqDesc->start2, lockReqDesc->stop2, lockTable);
+			FLUSH;
+		}
+	}
+	/* a stripeID of -1 means the matching acquire was suppressed too */
+	if (stripeID == -1)
+		return;
+
+	RF_LOCK_MUTEX(lockTable[hashval].mutex);
+
+	/* find the stripe lock descriptor */
+	for (ld_t = NULL, lockDesc = lockTable[hashval].descList; lockDesc; ld_t = lockDesc, lockDesc = lockDesc->next) {
+		if (lockDesc->stripeID == stripeID)
+			break;
+	}
+	RF_ASSERT(lockDesc);	/* major error to release a lock that doesn't
+				 * exist */
+
+	/* find the stripe lock request descriptor & delete it from the list */
+	for (lr_t = NULL, lr = lockDesc->granted; lr; lr_t = lr, lr = lr->next)
+		if (lr == lockReqDesc)
+			break;
+
+	RF_ASSERT(lr && (lr == lockReqDesc));	/* major error to release a
+						 * lock that hasn't been
+						 * granted */
+	if (lr_t)
+		lr_t->next = lr->next;
+	else {
+		RF_ASSERT(lr == lockDesc->granted);
+		lockDesc->granted = lr->next;
+	}
+	lr->next = NULL;
+
+	if (lockReqDesc->type == RF_IO_TYPE_WRITE)
+		lockDesc->nWriters--;
+
+	/* search through the waiters list to see if anyone needs to be woken
+	 * up. for each such descriptor in the wait list, we check it against
+	 * everything granted and against everything _in front_ of it in the
+	 * waiters queue. If it conflicts with none of these, we release it.
+	 *
+	 * DON'T TOUCH THE TEMPLINK POINTER OF ANYTHING IN THE GRANTED LIST HERE.
+	 * This will roach the case where the callback tries to acquire a new
+	 * lock in the same stripe. There are some asserts to try and detect
+	 * this.
+	 *
+	 * We apply 2 performance optimizations: (1) if releasing this lock
+	 * results in no more writers to this stripe, we just release
+	 * everybody waiting, since we place no restrictions on the number of
+	 * concurrent reads. (2) we consider as candidates for wakeup only
+	 * those waiters that have a range overlap with either the descriptor
+	 * being woken up or with something in the callbacklist (i.e.
+	 * something we've just now woken up). This allows us to avoid the
+	 * long evaluation for some descriptors. */
+
+	callbacklist = NULL;
+	if (lockDesc->nWriters == 0) {	/* performance tweak (1) */
+		while (lockDesc->waitersH) {
+
+			lr = lockDesc->waitersH;	/* delete from waiters
+							 * list */
+			lockDesc->waitersH = lr->next;
+
+			RF_ASSERT(lr->type == RF_IO_TYPE_READ);
+
+			lr->next = lockDesc->granted;	/* add to granted list */
+			lockDesc->granted = lr;
+
+			RF_ASSERT(!lr->templink);
+			lr->templink = callbacklist;	/* put on callback list
+							 * so that we'll invoke
+							 * callback below */
+			callbacklist = lr;
+			if (rf_stripeLockDebug) {
+				Dprintf8("[%d] No writers: granting lock stripe ID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n",
+				    tid, stripeID, lr->type, lr->start, lr->stop, lr->start2, lr->stop2, (unsigned long) lockTable);
+				FLUSH;
+			}
+		}
+		lockDesc->waitersT = NULL;	/* we've purged the whole
+						 * waiters list */
+
+	} else
+		for (candidate_t = NULL, candidate = lockDesc->waitersH; candidate;) {
+
+			/* performance tweak (2) */
+			consider_it = 0;
+			if (RANGE_OVERLAP(lockReqDesc, candidate))
+				consider_it = 1;
+			else
+				for (t = callbacklist; t; t = t->templink)
+					if (RANGE_OVERLAP(t, candidate)) {
+						consider_it = 1;
+						break;
+					}
+			if (!consider_it) {
+				if (rf_stripeLockDebug) {
+					Dprintf8("[%d] No overlap: rejecting candidate stripeID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n",
+					    tid, stripeID, candidate->type, candidate->start, candidate->stop, candidate->start2, candidate->stop2,
+					    (unsigned long) lockTable);
+					FLUSH;
+				}
+				candidate_t = candidate;
+				candidate = candidate->next;
+				continue;
+			}
+			/* we have a candidate for release. check to make
+			 * sure it is not blocked by any granted locks */
+			release_it = 1;
+			for (predecessor = lockDesc->granted; predecessor; predecessor = predecessor->next) {
+				if (STRIPELOCK_CONFLICT(candidate, predecessor)) {
+					if (rf_stripeLockDebug) {
+						Dprintf8("[%d] Conflicts with granted lock: rejecting candidate stripeID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n",
+						    tid, stripeID, candidate->type, candidate->start, candidate->stop, candidate->start2, candidate->stop2,
+						    (unsigned long) lockTable);
+						FLUSH;
+					}
+					release_it = 0;
+					break;
+				}
+			}
+
+			/* now check to see if the candidate is blocked by any
+			 * waiters that occur before it in the wait queue */
+			if (release_it)
+				for (predecessor = lockDesc->waitersH; predecessor != candidate; predecessor = predecessor->next) {
+					if (STRIPELOCK_CONFLICT(candidate, predecessor)) {
+						if (rf_stripeLockDebug) {
+							Dprintf8("[%d] Conflicts with waiting lock: rejecting candidate stripeID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n",
+							    tid, stripeID, candidate->type, candidate->start, candidate->stop, candidate->start2, candidate->stop2,
+							    (unsigned long) lockTable);
+							FLUSH;
+						}
+						release_it = 0;
+						break;
+					}
+				}
+
+			/* release it if indicated */
+			if (release_it) {
+				if (rf_stripeLockDebug) {
+					Dprintf8("[%d] Granting lock to candidate stripeID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n",
+					    tid, stripeID, candidate->type, candidate->start, candidate->stop, candidate->start2, candidate->stop2,
+					    (unsigned long) lockTable);
+					FLUSH;
+				}
+				if (candidate_t) {
+					candidate_t->next = candidate->next;
+					if (lockDesc->waitersT == candidate)
+						lockDesc->waitersT = candidate_t;	/* cannot be waitersH
+									 * since candidate_t is
+									 * not NULL */
+				} else {
+					RF_ASSERT(candidate == lockDesc->waitersH);
+					lockDesc->waitersH = lockDesc->waitersH->next;
+					if (!lockDesc->waitersH)
+						lockDesc->waitersT = NULL;
+				}
+				candidate->next = lockDesc->granted;	/* move it to the
+									 * granted list */
+				lockDesc->granted = candidate;
+
+				RF_ASSERT(!candidate->templink);
+				candidate->templink = callbacklist;	/* put it on the list of
+									 * things to be called
+									 * after we release the
+									 * mutex */
+				callbacklist = candidate;
+
+				if (!candidate_t)
+					candidate = lockDesc->waitersH;
+				else
+					candidate = candidate_t->next;	/* continue with the
+									 * rest of the list */
+			} else {
+				candidate_t = candidate;
+				candidate = candidate->next;	/* continue with the
+								 * rest of the list */
+			}
+		}
+
+	/* delete the descriptor if no one is waiting or active */
+	if (!lockDesc->granted && !lockDesc->waitersH) {
+		RF_ASSERT(lockDesc->nWriters == 0);
+		if (rf_stripeLockDebug) {
+			Dprintf3("[%d] Last lock released (table 0x%lx): deleting desc for stripeID %ld\n", tid, (unsigned long) lockTable, stripeID);
+			FLUSH;
+		}
+		if (ld_t)
+			ld_t->next = lockDesc->next;
+		else {
+			RF_ASSERT(lockDesc == lockTable[hashval].descList);
+			lockTable[hashval].descList = lockDesc->next;
+		}
+		FreeStripeLockDesc(lockDesc);
+		lockDesc = NULL;/* only for the ASSERT below */
+	}
+	RF_UNLOCK_MUTEX(lockTable[hashval].mutex);
+
+	/* now that we've unlocked the mutex, invoke the callback on all the
+	 * descriptors in the list */
+	RF_ASSERT(!((callbacklist) && (!lockDesc)));	/* if we deleted the
+							 * descriptor, we should
+							 * have no callbacks to
+							 * do */
+	for (candidate = callbacklist; candidate;) {
+		t = candidate;
+		candidate = candidate->templink;
+		t->templink = NULL;
+		(t->cbFunc) (t->cbArg);
+	}
+}
+/* Append lockReqDesc at the tail of lockDesc's FIFO waiters queue.
+ * Must have the indicated lock table mutex upon entry. */
+static void
+AddToWaitersQueue(
+    RF_LockTableEntry_t * lockTable,
+    RF_StripeLockDesc_t * lockDesc,
+    RF_LockReqDesc_t * lockReqDesc)
+{
+#if 0				/* XXX fvdl -- uninitialized use of 'tid' */
+	int tid;
+
+	if (rf_stripeLockDebug) {
+		Dprintf3("[%d] Waiting on lock for stripe %ld table 0x%lx\n", tid, lockDesc->stripeID, (unsigned long) lockTable);
+		FLUSH;
+	}
+#endif
+	if (!lockDesc->waitersH) {
+		lockDesc->waitersH = lockDesc->waitersT = lockReqDesc;
+	} else {
+		lockDesc->waitersT->next = lockReqDesc;
+		lockDesc->waitersT = lockReqDesc;
+	}
+}
+
+/*
+ * Get a stripe lock descriptor from the freelist and stamp it with the
+ * stripe ID; may return NULL on allocation failure (NOTE(review): the
+ * caller in rf_AcquireStripeLock() does not check for NULL).
+ * NOTE(review): the remaining fields (granted, waitersH/T, nWriters,
+ * next) are assumed to come back "clean" from the freelist --
+ * descriptors are only freed once empty (see rf_ReleaseStripeLock) --
+ * confirm that RF_FREELIST_GET/PRIME hand out zeroed descriptors.
+ */
+static RF_StripeLockDesc_t *
+AllocStripeLockDesc(RF_StripeNum_t stripeID)
+{
+	RF_StripeLockDesc_t *p;
+
+	RF_FREELIST_GET(rf_stripelock_freelist, p, next, (RF_StripeLockDesc_t *));
+	if (p) {
+		p->stripeID = stripeID;
+	}
+	return (p);
+}
+
+/* Return a descriptor to the freelist; rf_ReleaseStripeLock() calls
+ * this only once both the granted and waiters lists are empty. */
+static void
+FreeStripeLockDesc(RF_StripeLockDesc_t * p)
+{
+	RF_FREELIST_FREE(rf_stripelock_freelist, p, next);
+}
+
+/*
+ * Debug dump: print every lock descriptor (granted and waiting
+ * requests) in the table.  Takes only rf_printf_mutex, NOT the
+ * per-bucket mutexes, so it is safe only when the array is quiescent
+ * (it is called from rf_ShutdownStripeLocks()).
+ */
+static void
+PrintLockedStripes(lockTable)
+	RF_LockTableEntry_t *lockTable;
+{
+	int i, j, foundone = 0, did;
+	RF_StripeLockDesc_t *p;
+	RF_LockReqDesc_t *q;
+
+	RF_LOCK_MUTEX(rf_printf_mutex);
+	printf("Locked stripes:\n");
+	for (i = 0; i < rf_lockTableSize; i++)
+		if (lockTable[i].descList) {
+			foundone = 1;
+			for (p = lockTable[i].descList; p; p = p->next) {
+				printf("Stripe ID 0x%lx (%d) nWriters %d\n",
+				    (long) p->stripeID, (int) p->stripeID, p->nWriters);
+
+				if (!(p->granted))
+					printf("Granted: (none)\n");
+				else
+					printf("Granted:\n");
+				for (did = 1, j = 0, q = p->granted; q; j++, q = q->next) {
+					printf(" %c(%ld-%ld", q->type, (long) q->start, (long) q->stop);
+					if (q->start2 != -1)
+						printf(",%ld-%ld) ", (long) q->start2,
+						    (long) q->stop2);
+					else
+						printf(") ");
+					if (j && !(j % 4)) {
+						printf("\n");
+						did = 1;
+					} else
+						did = 0;
+				}
+				if (!did)
+					printf("\n");
+
+				if (!(p->waitersH))
+					printf("Waiting: (none)\n");
+				else
+					printf("Waiting:\n");
+				for (did = 1, j = 0, q = p->waitersH; q; j++, q = q->next) {
+					printf("%c(%ld-%ld", q->type, (long) q->start, (long) q->stop);
+					if (q->start2 != -1)
+						printf(",%ld-%ld) ", (long) q->start2, (long) q->stop2);
+					else
+						printf(") ");
+					if (j && !(j % 4)) {
+						printf("\n ");
+						did = 1;
+					} else
+						did = 0;
+				}
+				if (!did)
+					printf("\n");
+			}
+		}
+	if (!foundone)
+		printf("(none)\n");
+	else
+		printf("\n");
+	RF_UNLOCK_MUTEX(rf_printf_mutex);
+}
diff --git a/sys/dev/raidframe/rf_stripelocks.h b/sys/dev/raidframe/rf_stripelocks.h
new file mode 100644
index 0000000..ab960c1
--- /dev/null
+++ b/sys/dev/raidframe/rf_stripelocks.h
@@ -0,0 +1,130 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_stripelocks.h,v 1.3 1999/02/05 00:06:18 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************************************
+ *
+ * stripelocks.h -- header file for locking stripes
+ *
+ * Note that these functions are called from the execution routines of certain
+ * DAG Nodes, and so they must be NON-BLOCKING to assure maximum parallelism
+ * in the DAG. Accordingly, when a node wants to acquire a lock, it calls
+ * AcquireStripeLock, supplying a pointer to a callback function. If the lock
+ * is free at the time of the call, 0 is returned, indicating that the lock
+ * has been acquired. If the lock is not free, 1 is returned, and a copy of
+ * the function pointer and argument are held in the lock table. When the
+ * lock becomes free, the callback function is invoked.
+ *
+ *****************************************************************************/
+
+#ifndef _RF__RF_STRIPELOCKS_H_
+#define _RF__RF_STRIPELOCKS_H_
+
+#if defined(__FreeBSD__)
+#include <sys/types.h>
+#if __FreeBSD_version > 500005
+#include <sys/bio.h>
+#endif
+#if _KERNEL
+#include <sys/systm.h>
+#endif
+#endif
+#include <sys/buf.h>
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_threadstuff.h>
+#include <dev/raidframe/rf_general.h>
+
+/* One outstanding lock request.  Range bounds are inclusive (see
+ * SINGLE_RANGE_OVERLAP in rf_stripelocks.c); start2 == -1 marks the
+ * second range unused. */
+struct RF_LockReqDesc_s {
+	RF_IoType_t type;	/* read or write */
+	RF_int64 start, stop;	/* start and end of range to be locked */
+	RF_int64 start2, stop2;	/* start and end of 2nd range to be locked
+				 * (-1 if unused) */
+	void (*cbFunc) (RF_Buf_t);	/* callback invoked when a queued
+					 * request is finally granted.
+					 * NOTE(review): declared to take an
+					 * RF_Buf_t, but rf_ReleaseStripeLock()
+					 * invokes it with cbArg (a void *) --
+					 * confirm the types agree. */
+	void *cbArg;		/* argument to callback function */
+	RF_LockReqDesc_t *next;	/* next element in chain */
+	RF_LockReqDesc_t *templink;	/* for making short-lived lists of
+					 * request descriptors */
+};
+#define RF_ASSERT_VALID_LOCKREQ(_lr_) { \
+ RF_ASSERT(RF_IO_IS_R_OR_W((_lr_)->type)); \
+}
+
+/* Per-stripe lock state; lives in the hash table only while some
+ * request is granted or waiting, and is freed when both lists empty. */
+struct RF_StripeLockDesc_s {
+	RF_StripeNum_t stripeID;/* the stripe ID */
+	RF_LockReqDesc_t *granted;	/* unordered list of granted requests */
+	RF_LockReqDesc_t *waitersH;	/* FIFO queue of all waiting reqs,
+					 * both read and write (head) */
+	RF_LockReqDesc_t *waitersT;	/* tail of the waiters queue */
+	int nWriters;		/* number of writers either granted or waiting */
+	RF_StripeLockDesc_t *next;	/* for hash table collision resolution */
+};
+
+/* One hash bucket of the stripe lock table (stripe IDs map to buckets
+ * via HASH_STRIPEID in rf_stripelocks.c). */
+struct RF_LockTableEntry_s {
+	RF_DECLARE_MUTEX(mutex)	/* mutex on this hash chain */
+	RF_StripeLockDesc_t *descList;	/* hash chain of lock descriptors */
+};
+/*
+ * Initializes a stripe lock descriptor. _defSize is the number of sectors
+ * that we lock when there is no parity information in the ASM (e.g. RAID0).
+ * Ranges are inclusive: with parity info the first range is
+ * [startSector .. startSector+numSector-1].
+ * NOTE(review): in the no-parity branch the range is [0 .. _defSize],
+ * which with inclusive bounds spans _defSize+1 sectors -- confirm
+ * whether [0 .. _defSize-1] was intended.
+ */
+
+#define RF_INIT_LOCK_REQ_DESC(_lrd, _typ, _cbf, _cba, _asm, _defSize) \
+ { \
+ (_lrd).type = _typ; \
+ (_lrd).start2 = -1; \
+ (_lrd).stop2 = -1; \
+ if ((_asm)->parityInfo) { \
+ (_lrd).start = (_asm)->parityInfo->startSector; \
+ (_lrd).stop = (_asm)->parityInfo->startSector + (_asm)->parityInfo->numSector-1; \
+ if ((_asm)->parityInfo->next) { \
+ (_lrd).start2 = (_asm)->parityInfo->next->startSector; \
+ (_lrd).stop2 = (_asm)->parityInfo->next->startSector + (_asm)->parityInfo->next->numSector-1; \
+ } \
+ } else { \
+ (_lrd).start = 0; \
+ (_lrd).stop = (_defSize); \
+ } \
+ (_lrd).templink= NULL; \
+ (_lrd).cbFunc = (_cbf); \
+ (_lrd).cbArg = (void *) (_cba); \
+ }
+
+int rf_ConfigureStripeLockFreeList(RF_ShutdownList_t ** listp);
+RF_LockTableEntry_t *rf_MakeLockTable(void);
+void rf_ShutdownStripeLocks(RF_LockTableEntry_t * lockTable);
+int
+rf_ConfigureStripeLocks(RF_ShutdownList_t ** listp, RF_Raid_t * raidPtr,
+ RF_Config_t * cfgPtr);
+int
+rf_AcquireStripeLock(RF_LockTableEntry_t * lockTable,
+ RF_StripeNum_t stripeID, RF_LockReqDesc_t * lockReqDesc);
+void
+rf_ReleaseStripeLock(RF_LockTableEntry_t * lockTable,
+ RF_StripeNum_t stripeID, RF_LockReqDesc_t * lockReqDesc);
+
+#endif /* !_RF__RF_STRIPELOCKS_H_ */
diff --git a/sys/dev/raidframe/rf_strutils.c b/sys/dev/raidframe/rf_strutils.c
new file mode 100644
index 0000000..bb8a776
--- /dev/null
+++ b/sys/dev/raidframe/rf_strutils.c
@@ -0,0 +1,56 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_strutils.c,v 1.3 1999/02/05 00:06:18 oster Exp $ */
+/*
+ * rf_strutils.c
+ *
+ * String-parsing funcs
+ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+/*
+ * rf_strutils.c -- some simple utilities for munging on strings.
+ * I put them in a file by themselves because they're needed in
+ * setconfig, in the user-level driver, and in the kernel.
+ *
+ */
+
+#include <dev/raidframe/rf_utils.h>
+
+/* Returns a pointer to the first character of p that is not a space
+ * or a tab; this may be the terminating NUL. */
+char *
+rf_find_non_white(char *p)
+{
+	for (; *p != '\0' && (*p == ' ' || *p == '\t'); p++);
+	return (p);
+}
+/* Returns a pointer to the first space or tab in p; this may be the
+ * terminating NUL if the string contains no whitespace. */
+char *
+rf_find_white(char *p)
+{
+	for (; *p != '\0' && (*p != ' ' && *p != '\t'); p++);
+	return (p);
+}
diff --git a/sys/dev/raidframe/rf_threadstuff.c b/sys/dev/raidframe/rf_threadstuff.c
new file mode 100644
index 0000000..d1ecf16
--- /dev/null
+++ b/sys/dev/raidframe/rf_threadstuff.c
@@ -0,0 +1,221 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_threadstuff.c,v 1.5 1999/12/07 02:13:28 oster Exp $ */
+/*
+ * rf_threadstuff.c
+ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_threadstuff.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_shutdown.h>
+
+static void mutex_destroyer(void *);
+static void cond_destroyer(void *);
+
+/*
+ * Shared stuff
+ */
+
+static void
+mutex_destroyer(arg)
+ void *arg;
+{
+ int rc;
+
+ rc = rf_mutex_destroy(arg);
+ if (rc) {
+ RF_ERRORMSG1("RAIDFRAME: Error %d auto-destroying mutex\n", rc);
+ }
+}
+
+static void
+cond_destroyer(arg)
+ void *arg;
+{
+ int rc;
+
+ rc = rf_cond_destroy(arg);
+ if (rc) {
+ RF_ERRORMSG1("RAIDFRAME: Error %d auto-destroying condition\n", rc);
+ }
+}
+
+int
+_rf_create_managed_mutex(listp, m, file, line)
+ RF_ShutdownList_t **listp;
+RF_DECLARE_MUTEX(*m)
+ char *file;
+ int line;
+{
+ int rc, rc1;
+
+ rc = rf_mutex_init(m, __FUNCTION__);
+ if (rc)
+ return (rc);
+ rc = _rf_ShutdownCreate(listp, mutex_destroyer, (void *) m, file, line);
+ if (rc) {
+ RF_ERRORMSG1("RAIDFRAME: Error %d adding shutdown entry\n", rc);
+ rc1 = rf_mutex_destroy(m);
+ if (rc1) {
+ RF_ERRORMSG1("RAIDFRAME: Error %d destroying mutex\n", rc1);
+ }
+ }
+ return (rc);
+}
+
+int
+_rf_create_managed_cond(listp, c, file, line)
+ RF_ShutdownList_t **listp;
+RF_DECLARE_COND(*c)
+ char *file;
+ int line;
+{
+ int rc, rc1;
+
+ rc = rf_cond_init(c);
+ if (rc)
+ return (rc);
+ rc = _rf_ShutdownCreate(listp, cond_destroyer, (void *) c, file, line);
+ if (rc) {
+ RF_ERRORMSG1("RAIDFRAME: Error %d adding shutdown entry\n", rc);
+ rc1 = rf_cond_destroy(c);
+ if (rc1) {
+ RF_ERRORMSG1("RAIDFRAME: Error %d destroying cond\n", rc1);
+ }
+ }
+ return (rc);
+}
+
+int
+_rf_init_managed_threadgroup(listp, g, file, line)
+ RF_ShutdownList_t **listp;
+ RF_ThreadGroup_t *g;
+ char *file;
+ int line;
+{
+ int rc;
+
+ rc = _rf_create_managed_mutex(listp, &g->mutex, file, line);
+ if (rc)
+ return (rc);
+ rc = _rf_create_managed_cond(listp, &g->cond, file, line);
+ if (rc)
+ return (rc);
+ g->created = g->running = g->shutdown = 0;
+ return (0);
+}
+
+int
+_rf_destroy_threadgroup(g, file, line)
+ RF_ThreadGroup_t *g;
+ char *file;
+ int line;
+{
+ int rc1, rc2;
+
+ rc1 = rf_mutex_destroy(&g->mutex);
+ rc2 = rf_cond_destroy(&g->cond);
+ if (rc1)
+ return (rc1);
+ return (rc2);
+}
+
+int
+_rf_init_threadgroup(g, file, line)
+ RF_ThreadGroup_t *g;
+ char *file;
+ int line;
+{
+ int rc;
+
+ rc = rf_mutex_init(&g->mutex, __FUNCTION__);
+ if (rc)
+ return (rc);
+ rc = rf_cond_init(&g->cond);
+ if (rc) {
+ rf_mutex_destroy(&g->mutex);
+ return (rc);
+ }
+ g->created = g->running = g->shutdown = 0;
+ return (0);
+}
+
+
+/*
+ * Kernel
+ */
+#if defined(__FreeBSD__) && __FreeBSD_version > 500005
+int
+rf_mutex_init(m, s)
+decl_simple_lock_data(, *m)
+const char *s;
+{
+ mtx_init(m, s, NULL, MTX_DEF);
+ return (0);
+}
+
+int
+rf_mutex_destroy(m)
+decl_simple_lock_data(, *m)
+{
+ mtx_destroy(m);
+ return (0);
+}
+#else
+int
+rf_mutex_init(m, s)
+decl_simple_lock_data(, *m)
+const char *s;
+{
+ simple_lock_init(m);
+ return (0);
+}
+
+int
+rf_mutex_destroy(m)
+decl_simple_lock_data(, *m)
+{
+ return (0);
+}
+#endif
+
+int
+rf_cond_init(c)
+RF_DECLARE_COND(*c)
+{
+ *c = 0; /* no reason */
+ return (0);
+}
+
+int
+rf_cond_destroy(c)
+RF_DECLARE_COND(*c)
+{
+ return (0);
+}
diff --git a/sys/dev/raidframe/rf_threadstuff.h b/sys/dev/raidframe/rf_threadstuff.h
new file mode 100644
index 0000000..f7e81ff
--- /dev/null
+++ b/sys/dev/raidframe/rf_threadstuff.h
@@ -0,0 +1,229 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_threadstuff.h,v 1.10 2001/01/27 20:42:21 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, Daniel Stodolsky, Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * threadstuff.h -- definitions for threads, locks, and synchronization
+ *
+ * The purpose of this file is to provide some illusion of portability.
+ * If the functions below can be implemented with the same semantics on
+ * some new system, then at least the synchronization and thread control
+ * part of the code should not require modification to port to a new machine.
+ * the only other place where the pthread package is explicitly used is
+ * threadid.h
+ *
+ * this file should be included above stdio.h to get some necessary defines.
+ *
+ */
+
+#ifndef _RF__RF_THREADSTUFF_H_
+#define _RF__RF_THREADSTUFF_H_
+
+#include <dev/raidframe/rf_types.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#ifdef _KERNEL
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/kthread.h>
+#endif
+
+#define rf_create_managed_mutex(a,b) _rf_create_managed_mutex(a,b,__FILE__,__LINE__)
+#define rf_create_managed_cond(a,b) _rf_create_managed_cond(a,b,__FILE__,__LINE__)
+#define rf_init_managed_threadgroup(a,b) _rf_init_managed_threadgroup(a,b,__FILE__,__LINE__)
+#define rf_init_threadgroup(a) _rf_init_threadgroup(a,__FILE__,__LINE__)
+#define rf_destroy_threadgroup(a) _rf_destroy_threadgroup(a,__FILE__,__LINE__)
+
+int _rf_init_threadgroup(RF_ThreadGroup_t * g, char *file, int line);
+int _rf_destroy_threadgroup(RF_ThreadGroup_t * g, char *file, int line);
+int
+_rf_init_managed_threadgroup(RF_ShutdownList_t ** listp,
+ RF_ThreadGroup_t * g, char *file, int line);
+
+#include <sys/lock.h>
+#if defined(__FreeBSD__ ) && __FreeBSD_version > 500005
+#include <sys/mutex.h>
+#define decl_simple_lock_data(a,b) a struct mtx b;
+#define simple_lock_addr(a) ((struct mtx *)&(a))
+
+typedef struct thread *RF_Thread_t;
+typedef void *RF_ThreadArg_t;
+
+#ifdef _KERNEL
+static __inline struct ucred *
+rf_getucred(RF_Thread_t td)
+{
+ return (((struct thread *)td)->td_proc->p_ucred);
+}
+#endif
+
+#define RF_LOCK_MUTEX(_m_) mtx_lock(&(_m_))
+#define RF_UNLOCK_MUTEX(_m_) mtx_unlock(&(_m_))
+#else
+#define decl_simple_lock_data(a,b) a struct simplelock b;
+#define simple_lock_addr(a) ((struct simplelock *)&(a))
+
+typedef struct proc *RF_Thread_t;
+typedef void *RF_ThreadArg_t;
+
+static __inline struct ucred *
+rf_getucred(RF_Thread_t td)
+{
+ return (((struct proc *)td)->p_ucred);
+}
+
+#define RF_LOCK_MUTEX(_m_) simple_lock(&(_m_))
+#define RF_UNLOCK_MUTEX(_m_) simple_unlock(&(_m_))
+#endif
+
+#define RF_DECLARE_MUTEX(_m_) decl_simple_lock_data(,(_m_))
+#define RF_DECLARE_STATIC_MUTEX(_m_) decl_simple_lock_data(static,(_m_))
+#define RF_DECLARE_EXTERN_MUTEX(_m_) decl_simple_lock_data(extern,(_m_))
+
+#define RF_DECLARE_COND(_c_) int _c_;
+#define RF_DECLARE_STATIC_COND(_c_) static int _c_;
+#define RF_DECLARE_EXTERN_COND(_c_) extern int _c_;
+
+/*
+ * In NetBSD, kernel threads are simply processes which share several
+ * substructures and never run in userspace.
+ */
+#define RF_WAIT_COND(_c_,_m_) \
+ RF_LTSLEEP(&(_c_), PRIBIO, "rfwcond", 0, &(_m_))
+#define RF_SIGNAL_COND(_c_) wakeup_one(&(_c_))
+#define RF_BROADCAST_COND(_c_) wakeup(&(_c_))
+#if defined(__NetBSD__)
+#define RF_CREATE_THREAD(_handle_, _func_, _arg_, _name_) \
+ kthread_create1((void (*)(void *))(_func_), (void *)(_arg_), \
+ (struct proc **)&(_handle_), _name_)
+#define RF_THREAD_EXIT(ret) \
+ kthread_exit(ret)
+#elif defined(__FreeBSD__)
+#if __FreeBSD_version > 500005
+#define RF_CREATE_THREAD(_handle_, _func_, _arg_, _name_) \
+ kthread_create((void (*)(void *))(_func_), (void *)(_arg_), \
+ (struct proc **)&(_handle_), 0, 4, _name_)
+#define RF_THREAD_EXIT(ret) \
+ kthread_exit(ret)
+#else
+#define RF_CREATE_THREAD(_handle_, _func_, _arg_, _name_) \
+ kthread_create((void (*)(void *))(_func_), (void *)(_arg_), \
+ (struct proc **)&(_handle_), _name_)
+#define RF_THREAD_EXIT(ret) \
+ kthread_exit(ret);
+#endif
+#endif
+
+struct RF_ThreadGroup_s {
+ int created;
+ int running;
+ int shutdown;
+ RF_DECLARE_MUTEX(mutex)
+ RF_DECLARE_COND(cond)
+};
+/*
+ * Someone has started a thread in the group
+ */
+#define RF_THREADGROUP_STARTED(_g_) { \
+ RF_LOCK_MUTEX((_g_)->mutex); \
+ (_g_)->created++; \
+ RF_UNLOCK_MUTEX((_g_)->mutex); \
+}
+
+/*
+ * Thread announcing that it is now running
+ */
+#define RF_THREADGROUP_RUNNING(_g_) { \
+ RF_LOCK_MUTEX((_g_)->mutex); \
+ (_g_)->running++; \
+ RF_UNLOCK_MUTEX((_g_)->mutex); \
+ RF_SIGNAL_COND((_g_)->cond); \
+}
+
+/*
+ * Thread announcing that it is now done
+ */
+#define RF_THREADGROUP_DONE(_g_) { \
+ RF_LOCK_MUTEX((_g_)->mutex); \
+ (_g_)->shutdown++; \
+ RF_UNLOCK_MUTEX((_g_)->mutex); \
+ RF_SIGNAL_COND((_g_)->cond); \
+}
+
+/*
+ * Wait for all threads to start running
+ */
+#define RF_THREADGROUP_WAIT_START(_g_) { \
+ RF_LOCK_MUTEX((_g_)->mutex); \
+ while((_g_)->running < (_g_)->created) { \
+ RF_WAIT_COND((_g_)->cond, (_g_)->mutex); \
+ } \
+ RF_UNLOCK_MUTEX((_g_)->mutex); \
+}
+
+/*
+ * Wait for all threads to stop running
+ */
+#ifndef __NetBSD__
+#define RF_THREADGROUP_WAIT_STOP(_g_) { \
+ RF_LOCK_MUTEX((_g_)->mutex); \
+ RF_ASSERT((_g_)->running == (_g_)->created); \
+ while((_g_)->shutdown < (_g_)->running) { \
+ RF_WAIT_COND((_g_)->cond, (_g_)->mutex); \
+ } \
+ RF_UNLOCK_MUTEX((_g_)->mutex); \
+}
+#else
+ /* XXX Note that we've removed the assert. That should get put back in once
+ * we actually get something like a kernel thread running */
+#define RF_THREADGROUP_WAIT_STOP(_g_) { \
+ RF_LOCK_MUTEX((_g_)->mutex); \
+ while((_g_)->shutdown < (_g_)->running) { \
+ RF_WAIT_COND((_g_)->cond, (_g_)->mutex); \
+ } \
+ RF_UNLOCK_MUTEX((_g_)->mutex); \
+}
+#endif
+
+#if defined(__FreeBSD__) && __FreeBSD_version > 500005
+int rf_mutex_init(struct mtx *, const char *);
+int rf_mutex_destroy(struct mtx *);
+int _rf_create_managed_mutex(RF_ShutdownList_t **, struct mtx *,
+ char *, int);
+#else
+int rf_mutex_init(struct simplelock *, const char *);
+int rf_mutex_destroy(struct simplelock *);
+int _rf_create_managed_mutex(RF_ShutdownList_t **, struct simplelock *,
+ char *, int);
+#endif
+int _rf_create_managed_cond(RF_ShutdownList_t ** listp, int *,
+ char *file, int line);
+
+int rf_cond_init(int *c);
+int rf_cond_destroy(int *c);
+#endif /* !_RF__RF_THREADSTUFF_H_ */
diff --git a/sys/dev/raidframe/rf_types.h b/sys/dev/raidframe/rf_types.h
new file mode 100644
index 0000000..37a5519
--- /dev/null
+++ b/sys/dev/raidframe/rf_types.h
@@ -0,0 +1,245 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_types.h,v 1.6 1999/09/05 03:05:55 oster Exp $ */
+/*
+ * rf_types.h
+ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+/***********************************************************
+ *
+ * rf_types.h -- standard types for RAIDframe
+ *
+ ***********************************************************/
+
+#ifndef _RF__RF_TYPES_H_
+#define _RF__RF_TYPES_H_
+
+
+#include <dev/raidframe/rf_archs.h>
+
+#include <sys/errno.h>
+#include <sys/types.h>
+
+#include <sys/uio.h>
+#include <sys/param.h>
+#include <sys/lock.h>
+
+/*
+ * First, define system-dependent types and constants.
+ *
+ * If the machine is big-endian, RF_IS_BIG_ENDIAN should be 1.
+ * Otherwise, it should be 0.
+ *
+ * The various integer types should be self-explanatory; we
+ * use these elsewhere to avoid size confusion.
+ *
+ * LONGSHIFT is lg(sizeof(long)) (that is, log base two of sizeof(long))
+ *
+ */
+
+#include <sys/types.h>
+#include <machine/endian.h>
+#include <machine/limits.h>
+
+#if BYTE_ORDER == BIG_ENDIAN
+#define RF_IS_BIG_ENDIAN 1
+#elif BYTE_ORDER == LITTLE_ENDIAN
+#define RF_IS_BIG_ENDIAN 0
+#else
+#error byte order not defined
+#endif
+typedef int8_t RF_int8;
+typedef u_int8_t RF_uint8;
+typedef int16_t RF_int16;
+typedef u_int16_t RF_uint16;
+typedef int32_t RF_int32;
+typedef u_int32_t RF_uint32;
+typedef int64_t RF_int64;
+typedef u_int64_t RF_uint64;
+#if LONG_BIT == 32
+#define RF_LONGSHIFT 2
+#elif LONG_BIT == 64
+#define RF_LONGSHIFT 3
+#elif defined(__i386__)
+#define RF_LONGSHIFT 2
+#elif defined(__alpha__)
+#define RF_LONGSHIFT 3
+#else
+#error word size not defined
+#endif
+
+/*
+ * These are just zero and non-zero. We don't use "TRUE"
+ * and "FALSE" because there's too much nonsense trying
+ * to get them defined exactly once on every platform, given
+ * the different places they may be defined in system header
+ * files.
+ */
+#define RF_TRUE 1
+#define RF_FALSE 0
+
+/*
+ * Now, some generic types
+ */
+typedef RF_uint64 RF_IoCount_t;
+typedef RF_uint64 RF_Offset_t;
+typedef RF_uint32 RF_PSSFlags_t;
+typedef RF_uint64 RF_SectorCount_t;
+typedef RF_uint64 RF_StripeCount_t;
+typedef RF_int64 RF_SectorNum_t;/* these are signed so we can set them to
+ * (-1) for "uninitialized" */
+typedef RF_int64 RF_StripeNum_t;
+typedef RF_int64 RF_RaidAddr_t;
+typedef int RF_RowCol_t; /* signed so it can be (-1) */
+typedef RF_int64 RF_HeadSepLimit_t;
+typedef RF_int64 RF_ReconUnitCount_t;
+typedef int RF_ReconUnitNum_t;
+
+typedef char RF_ParityConfig_t;
+
+typedef char RF_DiskQueueType_t[1024];
+#define RF_DISK_QUEUE_TYPE_NONE ""
+
+/* values for the 'type' field in a reconstruction buffer */
+typedef int RF_RbufType_t;
+#define RF_RBUF_TYPE_EXCLUSIVE 0 /* this buf assigned exclusively to
+ * one disk */
+#define RF_RBUF_TYPE_FLOATING 1 /* this is a floating recon buf */
+#define RF_RBUF_TYPE_FORCED 2 /* this rbuf was allocated to complete
+ * a forced recon */
+
+typedef char RF_IoType_t;
+#define RF_IO_TYPE_READ 'r'
+#define RF_IO_TYPE_WRITE 'w'
+#define RF_IO_TYPE_NOP 'n'
+#define RF_IO_IS_R_OR_W(_type_) (((_type_) == RF_IO_TYPE_READ) \
+ || ((_type_) == RF_IO_TYPE_WRITE))
+
+typedef void (*RF_VoidFuncPtr) (void *,...);
+
+typedef RF_uint32 RF_AccessStripeMapFlags_t;
+typedef RF_uint32 RF_DiskQueueDataFlags_t;
+typedef RF_uint32 RF_DiskQueueFlags_t;
+typedef RF_uint32 RF_RaidAccessFlags_t;
+
+#define RF_DISKQUEUE_DATA_FLAGS_NONE ((RF_DiskQueueDataFlags_t)0)
+
+typedef struct RF_AccessStripeMap_s RF_AccessStripeMap_t;
+typedef struct RF_AccessStripeMapHeader_s RF_AccessStripeMapHeader_t;
+typedef struct RF_AllocListElem_s RF_AllocListElem_t;
+typedef struct RF_CallbackDesc_s RF_CallbackDesc_t;
+typedef struct RF_ChunkDesc_s RF_ChunkDesc_t;
+typedef struct RF_CommonLogData_s RF_CommonLogData_t;
+typedef struct RF_Config_s RF_Config_t;
+typedef struct RF_CumulativeStats_s RF_CumulativeStats_t;
+typedef struct RF_DagHeader_s RF_DagHeader_t;
+typedef struct RF_DagList_s RF_DagList_t;
+typedef struct RF_DagNode_s RF_DagNode_t;
+typedef struct RF_DeclusteredConfigInfo_s RF_DeclusteredConfigInfo_t;
+typedef struct RF_DiskId_s RF_DiskId_t;
+typedef struct RF_DiskMap_s RF_DiskMap_t;
+typedef struct RF_DiskQueue_s RF_DiskQueue_t;
+typedef struct RF_DiskQueueData_s RF_DiskQueueData_t;
+typedef struct RF_DiskQueueSW_s RF_DiskQueueSW_t;
+typedef struct RF_Etimer_s RF_Etimer_t;
+typedef struct RF_EventCreate_s RF_EventCreate_t;
+typedef struct RF_FreeList_s RF_FreeList_t;
+typedef struct RF_LockReqDesc_s RF_LockReqDesc_t;
+typedef struct RF_LockTableEntry_s RF_LockTableEntry_t;
+typedef struct RF_MCPair_s RF_MCPair_t;
+typedef struct RF_OwnerInfo_s RF_OwnerInfo_t;
+typedef struct RF_ParityLog_s RF_ParityLog_t;
+typedef struct RF_ParityLogAppendQueue_s RF_ParityLogAppendQueue_t;
+typedef struct RF_ParityLogData_s RF_ParityLogData_t;
+typedef struct RF_ParityLogDiskQueue_s RF_ParityLogDiskQueue_t;
+typedef struct RF_ParityLogQueue_s RF_ParityLogQueue_t;
+typedef struct RF_ParityLogRecord_s RF_ParityLogRecord_t;
+typedef struct RF_PerDiskReconCtrl_s RF_PerDiskReconCtrl_t;
+typedef struct RF_PSStatusHeader_s RF_PSStatusHeader_t;
+typedef struct RF_PhysDiskAddr_s RF_PhysDiskAddr_t;
+typedef struct RF_PropHeader_s RF_PropHeader_t;
+typedef struct RF_Raid_s RF_Raid_t;
+typedef struct RF_RaidAccessDesc_s RF_RaidAccessDesc_t;
+typedef struct RF_RaidDisk_s RF_RaidDisk_t;
+typedef struct RF_RaidLayout_s RF_RaidLayout_t;
+typedef struct RF_RaidReconDesc_s RF_RaidReconDesc_t;
+typedef struct RF_ReconBuffer_s RF_ReconBuffer_t;
+typedef struct RF_ReconConfig_s RF_ReconConfig_t;
+typedef struct RF_ReconCtrl_s RF_ReconCtrl_t;
+typedef struct RF_ReconDoneProc_s RF_ReconDoneProc_t;
+typedef struct RF_ReconEvent_s RF_ReconEvent_t;
+typedef struct RF_ReconMap_s RF_ReconMap_t;
+typedef struct RF_ReconMapListElem_s RF_ReconMapListElem_t;
+typedef struct RF_ReconParityStripeStatus_s RF_ReconParityStripeStatus_t;
+typedef struct RF_RedFuncs_s RF_RedFuncs_t;
+typedef struct RF_RegionBufferQueue_s RF_RegionBufferQueue_t;
+typedef struct RF_RegionInfo_s RF_RegionInfo_t;
+typedef struct RF_ShutdownList_s RF_ShutdownList_t;
+typedef struct RF_SpareTableEntry_s RF_SpareTableEntry_t;
+typedef struct RF_SparetWait_s RF_SparetWait_t;
+typedef struct RF_StripeLockDesc_s RF_StripeLockDesc_t;
+typedef struct RF_ThreadGroup_s RF_ThreadGroup_t;
+typedef struct RF_ThroughputStats_s RF_ThroughputStats_t;
+
+/*
+ * Important assumptions regarding ordering of the states in this list
+ * have been made!!!
+ * Before disturbing this ordering, look at code in rf_states.c
+ */
+typedef enum RF_AccessState_e {
+ /* original states */
+ rf_QuiesceState, /* handles quiescence for reconstruction */
+ rf_IncrAccessesCountState, /* count accesses in flight */
+ rf_DecrAccessesCountState,
+ rf_MapState, /* map access to disk addresses */
+ rf_LockState, /* take stripe locks */
+ rf_CreateDAGState, /* create DAGs */
+ rf_ExecuteDAGState, /* execute DAGs */
+ rf_ProcessDAGState, /* DAGs are completing- check if correct, or
+ * if we need to retry */
+ rf_CleanupState, /* release stripe locks, clean up */
+ rf_LastState /* must be the last state */
+} RF_AccessState_t;
+#define RF_MAXROW 10 /* these are arbitrary and can be modified at
+ * will */
+#define RF_MAXCOL 40
+#define RF_MAXSPARE 10
+#define RF_MAXDBGV 75 /* max number of debug variables */
+
+union RF_GenericParam_u {
+ void *p;
+ RF_uint64 v;
+};
+typedef union RF_GenericParam_u RF_DagParam_t;
+typedef union RF_GenericParam_u RF_CBParam_t;
+
+#if defined(__FreeBSD__) && __FreeBSD_version > 500005
+typedef struct bio *RF_Buf_t;
+#else
+typedef struct buf *RF_Buf_t;
+#endif
+#endif /* _RF__RF_TYPES_H_ */
diff --git a/sys/dev/raidframe/rf_utils.c b/sys/dev/raidframe/rf_utils.c
new file mode 100644
index 0000000..ee226d9
--- /dev/null
+++ b/sys/dev/raidframe/rf_utils.c
@@ -0,0 +1,147 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_utils.c,v 1.5 2000/01/07 03:41:03 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/****************************************
+ *
+ * rf_utils.c -- various support routines
+ *
+ ****************************************/
+
+
+#include <dev/raidframe/rf_threadstuff.h>
+
+#include <sys/time.h>
+
+#include <dev/raidframe/rf_utils.h>
+#include <dev/raidframe/rf_debugMem.h>
+#include <dev/raidframe/rf_alloclist.h>
+#include <dev/raidframe/rf_general.h>
+
+/* creates & zeros 2-d array with b rows and k columns (MCH) */
+RF_RowCol_t **
+rf_make_2d_array(b, k, allocList)
+ int b;
+ int k;
+ RF_AllocListElem_t *allocList;
+{
+ RF_RowCol_t **retval, i;
+
+ RF_MallocAndAdd(retval, b * sizeof(RF_RowCol_t *), (RF_RowCol_t **), allocList);
+ for (i = 0; i < b; i++) {
+ RF_MallocAndAdd(retval[i], k * sizeof(RF_RowCol_t), (RF_RowCol_t *), allocList);
+ (void) bzero((char *) retval[i], k * sizeof(RF_RowCol_t));
+ }
+ return (retval);
+}
+
+void
+rf_free_2d_array(a, b, k)
+ RF_RowCol_t **a;
+ int b;
+ int k;
+{
+ RF_RowCol_t i;
+
+ for (i = 0; i < b; i++)
+ RF_Free(a[i], k * sizeof(RF_RowCol_t));
+ RF_Free(a, b * sizeof(RF_RowCol_t));
+}
+
+
+/* creates & zeros a 1-d array with c columns */
+RF_RowCol_t *
+rf_make_1d_array(c, allocList)
+ int c;
+ RF_AllocListElem_t *allocList;
+{
+ RF_RowCol_t *retval;
+
+ RF_MallocAndAdd(retval, c * sizeof(RF_RowCol_t), (RF_RowCol_t *), allocList);
+ (void) bzero((char *) retval, c * sizeof(RF_RowCol_t));
+ return (retval);
+}
+
+void
+rf_free_1d_array(a, n)
+ RF_RowCol_t *a;
+ int n;
+{
+ RF_Free(a, n * sizeof(RF_RowCol_t));
+}
+/* Euclid's algorithm: finds and returns the greatest common divisor
+ * between a and b. (MCH)
+ */
+int
+rf_gcd(m, n)
+ int m;
+ int n;
+{
+ int t;
+
+ while (m > 0) {
+ t = n % m;
+ n = m;
+ m = t;
+ }
+ return (n);
+}
+/* these convert between text and integer. Apparently the regular C macros
+ * for doing this are not available in the kernel
+ */
+
+#define ISDIGIT(x) ( (x) >= '0' && (x) <= '9' )
+#define ISHEXCHAR(x) ( ((x) >= 'a' && (x) <= 'f') || ((x) >= 'A' && (x) <= 'F') )
+#define ISHEX(x) ( ISDIGIT(x) || ISHEXCHAR(x) )
+#define HC2INT(x) ( ((x) >= 'a' && (x) <= 'f') ? (x) - 'a' + 10 : \
+ ( ((x) >= 'A' && (x) <= 'F') ? (x) - 'A' + 10 : (x - '0') ) )
+
+int
+rf_atoi(p)
+ char *p;
+{
+ int val = 0, negate = 0;
+
+ if (*p == '-') {
+ negate = 1;
+ p++;
+ }
+ for (; ISDIGIT(*p); p++)
+ val = 10 * val + (*p - '0');
+ return ((negate) ? -val : val);
+}
+
+int
+rf_htoi(p)
+ char *p;
+{
+ int val = 0;
+ for (; ISHEXCHAR(*p); p++)
+ val = 16 * val + HC2INT(*p);
+ return (val);
+}
diff --git a/sys/dev/raidframe/rf_utils.h b/sys/dev/raidframe/rf_utils.h
new file mode 100644
index 0000000..18eac84
--- /dev/null
+++ b/sys/dev/raidframe/rf_utils.h
@@ -0,0 +1,70 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_utils.h,v 1.4 1999/08/13 03:26:55 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/***************************************
+ *
+ * rf_utils.h -- header file for rf_utils.c
+ *
+ ***************************************/
+
+
+#ifndef _RF__RF_UTILS_H_
+#define _RF__RF_UTILS_H_
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_alloclist.h>
+#include <dev/raidframe/rf_threadstuff.h>
+
+char *rf_find_non_white(char *p);
+char *rf_find_white(char *p);
+RF_RowCol_t **rf_make_2d_array(int b, int k, RF_AllocListElem_t * allocList);
+RF_RowCol_t *rf_make_1d_array(int c, RF_AllocListElem_t * allocList);
+void rf_free_2d_array(RF_RowCol_t ** a, int b, int k);
+void rf_free_1d_array(RF_RowCol_t * a, int n);
+int rf_gcd(int m, int n);
+int rf_atoi(char *p);
+int rf_htoi(char *p);
+
+#define RF_USEC_PER_SEC 1000000
+#define RF_TIMEVAL_TO_US(_t_) (((_t_).tv_sec) \
+ * RF_USEC_PER_SEC + (_t_).tv_usec)
+
+#define RF_TIMEVAL_DIFF(_start_,_end_,_diff_) { \
+ if ((_end_)->tv_usec < (_start_)->tv_usec) { \
+ (_diff_)->tv_usec = ((_end_)->tv_usec + RF_USEC_PER_SEC) \
+ - (_start_)->tv_usec; \
+ (_diff_)->tv_sec = ((_end_)->tv_sec-1) - (_start_)->tv_sec; \
+ } \
+ else { \
+ (_diff_)->tv_usec = (_end_)->tv_usec - (_start_)->tv_usec; \
+ (_diff_)->tv_sec = (_end_)->tv_sec - (_start_)->tv_sec; \
+ } \
+}
+
+#endif /* !_RF__RF_UTILS_H_ */
diff --git a/sys/modules/raidframe/Makefile b/sys/modules/raidframe/Makefile
new file mode 100644
index 0000000..d4ff5dc
--- /dev/null
+++ b/sys/modules/raidframe/Makefile
@@ -0,0 +1,32 @@
+# $FreeBSD$
+KMOD= raidframe
+NOMAN=
+
+.PATH: ${.CURDIR}/../../dev/raidframe
+
+SRCS= rf_acctrace.c rf_alloclist.c rf_aselect.c rf_callback.c \
+ rf_chaindecluster.c rf_copyback.c rf_cvscan.c rf_dagdegrd.c \
+ rf_dagdegwr.c rf_dagffrd.c rf_dagffwr.c rf_dagfuncs.c rf_dagutils.c \
+ rf_debugMem.c rf_debugprint.c rf_decluster.c rf_declusterPQ.c \
+ rf_diskqueue.c rf_disks.c rf_driver.c rf_engine.c rf_evenodd.c \
+ rf_evenodd_dagfuncs.c rf_evenodd_dags.c rf_fifo.c rf_interdecluster.c \
+ rf_invertq.c rf_layout.c rf_map.c rf_mcpair.c rf_memchunk.c \
+ rf_nwayxor.c rf_options.c rf_paritylog.c rf_paritylogDiskMgr.c \
+ rf_paritylogging.c rf_parityloggingdags.c rf_parityscan.c rf_pq.c \
+ rf_pqdeg.c rf_pqdegdags.c rf_psstatus.c rf_raid0.c rf_raid1.c \
+ rf_raid4.c rf_raid5.c rf_raid5_rotatedspare.c rf_reconbuffer.c \
+ rf_reconmap.c rf_reconstruct.c rf_reconutil.c rf_revent.c \
+ rf_shutdown.c rf_sstf.c rf_states.c rf_stripelocks.c rf_strutils.c \
+ rf_threadstuff.c rf_utils.c rf_freebsdkintf.c
+
+SRCS+= opt_raid.h vnode_if.h
+RF_DEBUG?= 0
+RF_AUTOCONFIG?= 1
+
+#CFLAGS+= -g
+
+opt_raid.h:
+ @echo "#define RAID_AUTOCONFIG ${RF_AUTOCONFIG}" > opt_raid.h
+ @echo "#define RAID_DEBUG ${RF_DEBUG}" >> opt_raid.h
+
+.include <bsd.kmod.mk>
diff --git a/sys/sys/disklabel.h b/sys/sys/disklabel.h
index 039b5a6..fde605f 100644
--- a/sys/sys/disklabel.h
+++ b/sys/sys/disklabel.h
@@ -202,6 +202,7 @@ dkcksum(struct disklabel *lp)
#define DTYPE_CCD 11 /* concatenated disk */
#define DTYPE_VINUM 12 /* vinum volume */
#define DTYPE_DOC2K 13 /* Msys DiskOnChip */
+#define DTYPE_RAID 14 /* CMU RAIDFrame */
#define DTYPE_JFS2 16 /* IBM JFS 2 */
#ifdef DKTYPENAMES
@@ -220,7 +221,7 @@ static const char *dktypenames[] = {
"CCD",
"Vinum",
"DOC2K",
- "?",
+ "Raid",
"?",
"jfs",
NULL
@@ -248,6 +249,7 @@ static const char *dktypenames[] = {
#define FS_ISO9660 12 /* ISO 9660, normally CD-ROM */
#define FS_BOOT 13 /* partition contains bootstrap */
#define FS_VINUM 14 /* Vinum drive */
+#define FS_RAID 15 /* RAIDFrame drive */
#define FS_JFS2 21 /* IBM JFS2 */
#ifdef FSTYPENAMES
@@ -267,8 +269,7 @@ static const char *fstypenames[] = {
"ISO9660",
"boot",
"vinum",
- "?",
- "?",
+ "raid",
"?",
"?",
"?",
diff --git a/sys/sys/kernel.h b/sys/sys/kernel.h
index 333ff64..4089d59 100644
--- a/sys/sys/kernel.h
+++ b/sys/sys/kernel.h
@@ -154,6 +154,7 @@ enum sysinit_sub_id {
SI_SUB_ROOT_CONF = 0xb000000, /* Find root devices */
SI_SUB_DUMP_CONF = 0xb200000, /* Find dump devices */
SI_SUB_VINUM = 0xb300000, /* Configure vinum */
+ SI_SUB_RAID = 0xb380000, /* Configure RAIDframe */
SI_SUB_MOUNT_ROOT = 0xb400000, /* root mount*/
SI_SUB_SWAP = 0xc000000, /* swap*/
SI_SUB_INTRINSIC_POST = 0xd000000, /* proc 0 cleanup*/
OpenPOWER on IntegriCloud