summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--etc/defaults/rc.conf3
-rwxr-xr-xetc/rc.d/Makefile2
-rw-r--r--etc/rc.d/hastd31
-rw-r--r--sbin/Makefile2
-rw-r--r--sbin/ggate/ggatec/ggatec.c2
-rw-r--r--sbin/ggate/ggatel/ggatel.c2
-rw-r--r--sbin/hastctl/Makefile36
-rw-r--r--sbin/hastctl/hastctl.8217
-rw-r--r--sbin/hastctl/hastctl.c526
-rw-r--r--sbin/hastd/Makefile37
-rw-r--r--sbin/hastd/activemap.c691
-rw-r--r--sbin/hastd/activemap.h69
-rw-r--r--sbin/hastd/control.c426
-rw-r--r--sbin/hastd/control.h44
-rw-r--r--sbin/hastd/ebuf.c252
-rw-r--r--sbin/hastd/ebuf.h51
-rw-r--r--sbin/hastd/hast.conf.5267
-rw-r--r--sbin/hastd/hast.h190
-rw-r--r--sbin/hastd/hast_proto.c401
-rw-r--r--sbin/hastd/hast_proto.h48
-rw-r--r--sbin/hastd/hastd.8232
-rw-r--r--sbin/hastd/hastd.c522
-rw-r--r--sbin/hastd/hastd.h48
-rw-r--r--sbin/hastd/hooks.c148
-rw-r--r--sbin/hastd/hooks.h40
-rw-r--r--sbin/hastd/metadata.c222
-rw-r--r--sbin/hastd/metadata.h48
-rw-r--r--sbin/hastd/nv.c882
-rw-r--r--sbin/hastd/nv.h158
-rw-r--r--sbin/hastd/parse.y507
-rw-r--r--sbin/hastd/pjdlog.c367
-rw-r--r--sbin/hastd/pjdlog.h88
-rw-r--r--sbin/hastd/primary.c1769
-rw-r--r--sbin/hastd/proto.c261
-rw-r--r--sbin/hastd/proto.h54
-rw-r--r--sbin/hastd/proto_common.c85
-rw-r--r--sbin/hastd/proto_impl.h75
-rw-r--r--sbin/hastd/proto_socketpair.c272
-rw-r--r--sbin/hastd/proto_tcp4.c447
-rw-r--r--sbin/hastd/proto_uds.c330
-rw-r--r--sbin/hastd/rangelock.c137
-rw-r--r--sbin/hastd/rangelock.h46
-rw-r--r--sbin/hastd/secondary.c697
-rw-r--r--sbin/hastd/subr.c118
-rw-r--r--sbin/hastd/subr.h51
-rw-r--r--sbin/hastd/synch.h162
-rw-r--r--sbin/hastd/token.l66
-rw-r--r--share/examples/Makefile6
-rwxr-xr-xshare/examples/hast/ucarp.sh69
-rwxr-xr-xshare/examples/hast/ucarp_down.sh98
-rwxr-xr-xshare/examples/hast/ucarp_up.sh105
-rwxr-xr-xshare/examples/hast/vip-down.sh5
-rwxr-xr-xshare/examples/hast/vip-up.sh7
-rw-r--r--share/man/man5/rc.conf.523
-rw-r--r--sys/geom/gate/g_gate.c190
-rw-r--r--sys/geom/gate/g_gate.h18
56 files changed, 11572 insertions, 78 deletions
diff --git a/etc/defaults/rc.conf b/etc/defaults/rc.conf
index 7d0a7d2..35b3a7b 100644
--- a/etc/defaults/rc.conf
+++ b/etc/defaults/rc.conf
@@ -260,6 +260,9 @@ syslogd_flags="-s" # Flags to syslogd (if enabled).
inetd_enable="NO" # Run the network daemon dispatcher (YES/NO).
inetd_program="/usr/sbin/inetd" # path to inetd, if you want a different one.
inetd_flags="-wW -C 60" # Optional flags to inetd
+hastd_enable="NO" # Run the HAST daemon (YES/NO).
+hastd_program="/sbin/hastd" # path to hastd, if you want a different one.
+hastd_flags="" # Optional flags to hastd.
#
# named. It may be possible to run named in a sandbox, man security for
# details.
diff --git a/etc/rc.d/Makefile b/etc/rc.d/Makefile
index d0e24b3..17f7634 100755
--- a/etc/rc.d/Makefile
+++ b/etc/rc.d/Makefile
@@ -12,7 +12,7 @@ FILES= DAEMON FILESYSTEMS LOGIN NETWORKING SERVERS \
encswap \
faith fsck ftp-proxy ftpd \
gbde geli geli2 gssd \
- hcsecd \
+ hastd hcsecd \
hostapd hostid hostid_save hostname \
inetd initrandom \
ip6addrctl ipfilter ipfs ipfw ipmon \
diff --git a/etc/rc.d/hastd b/etc/rc.d/hastd
new file mode 100644
index 0000000..3014caf
--- /dev/null
+++ b/etc/rc.d/hastd
@@ -0,0 +1,31 @@
+#!/bin/sh
+#
+# $FreeBSD$
+#
+
+# PROVIDE: hastd
+# REQUIRE: NETWORKING syslogd
+# BEFORE: DAEMON
+
+. /etc/rc.subr
+
+# Variables describing the service; they are set before run_rc_command is
+# invoked at the bottom of this script.
+name="hastd"
+rcvar=`set_rcvar`
+pidfile="/var/run/${name}.pid"
+command="/sbin/${name}"
+hastctl="/sbin/hastctl"
+required_files="/etc/hast.conf"
+stop_precmd="hastd_stop_precmd"
+required_modules="geom_gate:g_gate"
+
+# NOTE(review): sockfile, evalargs and altlog_proglist are not referenced
+# anywhere else in this script and set_socketlist is not defined here; they
+# look like leftovers copied from another rc script - confirm and remove.
+sockfile="/var/run/syslogd.sockets"
+evalargs="rc_flags=\"\`set_socketlist\` \$rc_flags\""
+altlog_proglist="named"
+
+# Registered as stop_precmd above: asks hastctl to switch all resources to
+# the init role.
+hastd_stop_precmd()
+{
+ ${hastctl} role init all
+}
+
+load_rc_config $name
+run_rc_command "$1"
diff --git a/sbin/Makefile b/sbin/Makefile
index 8ece390..72f4bff 100644
--- a/sbin/Makefile
+++ b/sbin/Makefile
@@ -36,6 +36,8 @@ SUBDIR= adjkerntz \
ggate \
growfs \
gvinum \
+ hastctl \
+ hastd \
ifconfig \
init \
${_ipf} \
diff --git a/sbin/ggate/ggatec/ggatec.c b/sbin/ggate/ggatec/ggatec.c
index e421614..660bd8a 100644
--- a/sbin/ggate/ggatec/ggatec.c
+++ b/sbin/ggate/ggatec/ggatec.c
@@ -59,7 +59,7 @@ enum { UNSET, CREATE, DESTROY, LIST, RESCUE } action = UNSET;
static const char *path = NULL;
static const char *host = NULL;
-static int unit = -1;
+static int unit = G_GATE_UNIT_AUTO;
static unsigned flags = 0;
static int force = 0;
static unsigned queue_size = G_GATE_QUEUE_SIZE;
diff --git a/sbin/ggate/ggatel/ggatel.c b/sbin/ggate/ggatel/ggatel.c
index 03979c3..6a3f26e 100644
--- a/sbin/ggate/ggatel/ggatel.c
+++ b/sbin/ggate/ggatel/ggatel.c
@@ -50,7 +50,7 @@
enum { UNSET, CREATE, DESTROY, LIST, RESCUE } action = UNSET;
static const char *path = NULL;
-static int unit = -1;
+static int unit = G_GATE_UNIT_AUTO;
static unsigned flags = 0;
static int force = 0;
static unsigned queue_size = G_GATE_QUEUE_SIZE;
diff --git a/sbin/hastctl/Makefile b/sbin/hastctl/Makefile
new file mode 100644
index 0000000..43c8c20
--- /dev/null
+++ b/sbin/hastctl/Makefile
@@ -0,0 +1,36 @@
+# $FreeBSD$
+
+.include <bsd.own.mk>
+
+.PATH: ${.CURDIR}/../hastd
+
+PROG= hastctl
+SRCS= activemap.c
+SRCS+= ebuf.c
+SRCS+= hast_proto.c hastctl.c
+SRCS+= metadata.c
+SRCS+= nv.c
+SRCS+= parse.y pjdlog.c
+SRCS+= proto.c proto_common.c proto_tcp4.c proto_uds.c
+SRCS+= token.l
+SRCS+= subr.c
+SRCS+= y.tab.h
+WARNS?= 6
+MAN= hastctl.8
+
+CFLAGS+=-I${.CURDIR}/../hastd
+CFLAGS+=-DINET
+.if ${MK_INET6_SUPPORT} != "no"
+CFLAGS+=-DINET6
+.endif
+# This is needed to have WARNS > 1.
+CFLAGS+=-DYY_NO_UNPUT
+
+DPADD= ${LIBCRYPTO} ${LIBL}
+LDADD= -lcrypto -ll
+
+YFLAGS+=-v
+
+CLEANFILES=y.tab.c y.tab.h y.output
+
+.include <bsd.prog.mk>
diff --git a/sbin/hastctl/hastctl.8 b/sbin/hastctl/hastctl.8
new file mode 100644
index 0000000..bf03c2e
--- /dev/null
+++ b/sbin/hastctl/hastctl.8
@@ -0,0 +1,217 @@
+.\" Copyright (c) 2010 The FreeBSD Foundation
+.\" All rights reserved.
+.\"
+.\" This software was developed by Pawel Jakub Dawidek under sponsorship from
+.\" the FreeBSD Foundation.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd February 1, 2010
+.Dt HASTCTL 8
+.Os
+.Sh NAME
+.Nm hastctl
+.Nd "Highly Available Storage control utility"
+.Sh SYNOPSIS
+.Nm
+.Cm create
+.Op Fl d
+.Op Fl c Ar config
+.Op Fl e Ar extentsize
+.Op Fl k Ar keepdirty
+.Op Fl m Ar mediasize
+.Ar name ...
+.Nm
+.Cm role
+.Op Fl d
+.Op Fl c Ar config
+.Aq init | primary | secondary
+.Ar all | name ...
+.Nm
+.Cm status
+.Op Fl d
+.Op Fl c Ar config
+.Op Ar all | name ...
+.Nm
+.Cm dump
+.Op Fl d
+.Op Fl c Ar config
+.Op Ar all | name ...
+.Sh DESCRIPTION
+The
+.Nm
+utility is used to control the behaviour of the
+.Xr hastd 8
+daemon.
+.Pp
+This utility should be used by HA software like
+.Nm heartbeat
+or
+.Nm ucarp
+to setup HAST resources role when changing from primary mode to
+secondary or vice versa.
+Be aware that if a file system like UFS exists on HAST provider and
+primary node dies, file system has to be checked for inconsistencies
+with the
+.Xr fsck 8
+utility after switching secondary node to primary role.
+.Pp
+The first argument to
+.Nm
+indicates an action to be performed:
+.Bl -tag -width ".Cm create"
+.It Cm create
+Initialize local provider configured for the given resource.
+Additional options include:
+.Bl -tag -width ".Fl e Ar extentsize"
+.It Fl e Ar extentsize
+Size of an extent.
+Extent is a block which is used for synchronization.
+.Nm
+maintains a map of dirty extents and extent is the smallest region that
+can be marked as dirty.
+If any part of an extent is modified, entire extent will be synchronized
+when nodes connect.
+If extent size is too small, there will be too much disk activity
+related to dirty map updates, which will degrade performance of the
+given resource.
+If extent size is too large, synchronization, even in case of short
+outage, can take a long time increasing the risk of losing up-to-date
+node before synchronization process is completed.
+The default extent size is
+.Va 2MB .
+.It Fl k Ar keepdirty
+Maximum number of dirty extents to keep dirty all the time.
+Most recently used extents are kept dirty to reduce number of metadata
+updates.
+The default number of most recently used extents which will be kept
+dirty is
+.Va 64 .
+.It Fl m Ar mediasize
+Size of the smaller provider used as backend storage on both nodes.
+This option can be omitted if node providers have the same size on both
+sides.
+.El
+.It Cm role
+Change role of the given resource.
+The role can be one of:
+.Bl -tag -width ".Cm secondary"
+.It Cm init
+Resource is turned off.
+.It Cm primary
+Local
+.Xr hastd 8
+daemon will act as primary node for the given resource.
+System on which resource role is set to primary can use
+.Pa /dev/hast/<name>
+GEOM provider.
+.It Cm secondary
+Local
+.Xr hastd 8
+daemon will act as secondary node for the given resource - it will wait
+for connection from the primary node and will handle I/O requests
+received from it.
+GEOM provider
+.Pa /dev/hast/<name>
+will not be created on secondary node.
+.El
+.It Cm status
+Present status of the configured resources.
+.It Cm dump
+Dump metadata stored on local component for the configured resources.
+.El
+.Pp
+In addition, every subcommand can be followed by the following options:
+.Bl -tag -width ".Fl c Ar config"
+.It Fl c Ar config
+Specify alternative location of the configuration file.
+The default location is
+.Pa /etc/hast.conf .
+.It Fl d
+Print debugging information.
+This option can be specified multiple times to raise the verbosity
+level.
+.El
+.Sh EXIT STATUS
+Exit status is 0 on success, or one of the values described in
+.Xr sysexits 3
+on failure.
+.Sh EXAMPLES
+Initialize HAST provider, create file system on it and mount it.
+.Bd -literal -offset indent
+nodeB# hastctl create shared
+nodeB# hastd
+nodeB# hastctl role secondary shared
+
+nodeA# hastctl create shared
+nodeA# hastd
+nodeA# hastctl role primary shared
+nodeA# newfs -U /dev/hast/shared
+nodeA# mount -o noatime /dev/hast/shared /shared
+nodeA# application_start
+.Ed
+.Pp
+Switch roles for the
+.Nm shared
+HAST resource.
+.Bd -literal -offset indent
+nodeA# application_stop
+nodeA# umount -f /shared
+nodeA# hastctl role secondary shared
+
+nodeB# hastctl role primary shared
+nodeB# fsck -t ufs /dev/hast/shared
+nodeB# mount -o noatime /dev/hast/shared /shared
+nodeB# application_start
+.Ed
+.Sh FILES
+.Bl -tag -width ".Pa /var/run/hastctl" -compact
+.It Pa /etc/hast.conf
+Configuration file for
+.Nm
+and
+.Xr hastd 8 .
+.It Pa /var/run/hastctl
+Control socket used by
+.Nm
+to communicate with the
+.Xr hastd 8
+daemon.
+.El
+.Sh SEE ALSO
+.Xr sysexits 3 ,
+.Xr geom 4 ,
+.Xr hast.conf 5 ,
+.Xr fsck 8 ,
+.Xr ggatec 8 ,
+.Xr ggatel 8 ,
+.Xr hastd 8 ,
+.Xr mount 8 ,
+.Xr newfs 8 .
+.Sh AUTHORS
+The
+.Nm
+was developed by
+.An Pawel Jakub Dawidek Aq pjd@FreeBSD.org
+under sponsorship of the FreeBSD Foundation.
diff --git a/sbin/hastctl/hastctl.c b/sbin/hastctl/hastctl.c
new file mode 100644
index 0000000..8499528
--- /dev/null
+++ b/sbin/hastctl/hastctl.c
@@ -0,0 +1,526 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/disk.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/sysctl.h>
+
+#include <assert.h>
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include <activemap.h>
+
+#include "hast.h"
+#include "hast_proto.h"
+#include "metadata.h"
+#include "nv.h"
+#include "pjdlog.h"
+#include "proto.h"
+#include "subr.h"
+
+/* Path to configuration file. */
+static const char *cfgpath = HAST_CONFIG;
+/* Hastd configuration. */
+static struct hastd_config *cfg;
+/* Control connection. */
+static struct proto_conn *controlconn;
+
+/*
+ * Subcommand identifiers.  main() maps argv[1] ("create", "role", "status",
+ * "dump") to the matching CMD_* value.
+ */
+enum {
+ CMD_INVALID,
+ CMD_CREATE,
+ CMD_ROLE,
+ CMD_STATUS,
+ CMD_DUMP
+};
+
+/*
+ * Print the synopsis of every subcommand to stderr and terminate the
+ * process with EX_USAGE.  This function does not return.
+ */
+static __dead2 void
+usage(void)
+{
+	const char *prog;
+
+	prog = getprogname();
+	fprintf(stderr,
+	    "usage: %s create [-d] [-c config] [-e extentsize] [-k keepdirty]\n"
+	    "\t\t[-m mediasize] name ...\n"
+	    " %s role [-d] [-c config] <init | primary | secondary> all | name ...\n"
+	    " %s status [-d] [-c config] [all | name ...]\n"
+	    " %s dump [-d] [-c config] [all | name ...]\n",
+	    prog, prog, prog, prog);
+	exit(EX_USAGE);
+}
+
+/*
+ * Initialize the local provider of one resource: validate the requested
+ * geometry, write fresh metadata and store a zero-filled bitmap of mapsize
+ * bytes at offset METADATA_SIZE.
+ *
+ * res        - resource to initialize; its hr_local_* fields are read after
+ *              provinfo(res, true) succeeds.
+ * mediasize  - size to use in bytes; 0 selects res->hr_local_mediasize.
+ * extentsize - extent size in bytes; 0 selects HAST_EXTENTSIZE.
+ * keepdirty  - number of extents to keep dirty; 0 selects HAST_KEEPDIRTY.
+ *
+ * Returns 0 on success or a sysexits(3) code on failure.  res->hr_localfd is
+ * closed before returning whenever it is >= 0, and the pjdlog prefix is
+ * reset to an empty string.
+ */
+static int
+create_one(struct hast_resource *res, intmax_t mediasize, intmax_t extentsize,
+    intmax_t keepdirty)
+{
+	unsigned char *buf;
+	size_t mapsize;
+	int ec;
+
+	ec = 0;
+	pjdlog_prefix_set("[%s] ", res->hr_name);
+
+	if (provinfo(res, true) < 0) {
+		ec = EX_NOINPUT;
+		goto end;
+	}
+	/*
+	 * The sizes arrive as signed values parsed from the command line;
+	 * a negative one would otherwise pass the upper-bound check below
+	 * and end up in the on-disk metadata.
+	 */
+	if (mediasize < 0) {
+		pjdlog_error("Provided mediasize (%jd) is negative.",
+		    (intmax_t)mediasize);
+		ec = EX_DATAERR;
+		goto end;
+	}
+	if (keepdirty < 0) {
+		pjdlog_error("Provided keepdirty (%jd) is negative.",
+		    (intmax_t)keepdirty);
+		ec = EX_DATAERR;
+		goto end;
+	}
+	if (mediasize == 0)
+		mediasize = res->hr_local_mediasize;
+	else if (mediasize > res->hr_local_mediasize) {
+		pjdlog_error("Provided mediasize is larger than provider %s size.",
+		    res->hr_localpath);
+		ec = EX_DATAERR;
+		goto end;
+	}
+	if (!powerof2(res->hr_local_sectorsize)) {
+		pjdlog_error("Sector size of provider %s is not power of 2 (%u).",
+		    res->hr_localpath, res->hr_local_sectorsize);
+		ec = EX_DATAERR;
+		goto end;
+	}
+	if (extentsize == 0)
+		extentsize = HAST_EXTENTSIZE;
+	/* This comparison also rejects a negative extent size. */
+	if (extentsize < res->hr_local_sectorsize) {
+		pjdlog_error("Extent size (%jd) is less than sector size (%u).",
+		    (intmax_t)extentsize, res->hr_local_sectorsize);
+		ec = EX_DATAERR;
+		goto end;
+	}
+	if ((extentsize % res->hr_local_sectorsize) != 0) {
+		pjdlog_error("Extent size (%jd) is not multiple of sector size (%u).",
+		    (intmax_t)extentsize, res->hr_local_sectorsize);
+		ec = EX_DATAERR;
+		goto end;
+	}
+	mapsize = activemap_calc_ondisk_size(mediasize - METADATA_SIZE,
+	    extentsize, res->hr_local_sectorsize);
+	if (keepdirty == 0)
+		keepdirty = HAST_KEEPDIRTY;
+	res->hr_datasize = mediasize - METADATA_SIZE - mapsize;
+	res->hr_extentsize = extentsize;
+	res->hr_keepdirty = keepdirty;
+
+	/* User data starts right behind the metadata and the bitmap. */
+	res->hr_localoff = METADATA_SIZE + mapsize;
+
+	if (metadata_write(res) < 0) {
+		ec = EX_IOERR;
+		goto end;
+	}
+	/* calloc() gives an all-zero buffer to store as the initial bitmap. */
+	buf = calloc(1, mapsize);
+	if (buf == NULL) {
+		pjdlog_error("Unable to allocate %zu bytes of memory for initial bitmap.",
+		    mapsize);
+		ec = EX_TEMPFAIL;
+		goto end;
+	}
+	if (pwrite(res->hr_localfd, buf, mapsize, METADATA_SIZE) !=
+	    (ssize_t)mapsize) {
+		pjdlog_errno(LOG_ERR, "Unable to store initial bitmap on %s",
+		    res->hr_localpath);
+		free(buf);
+		ec = EX_IOERR;
+		goto end;
+	}
+	free(buf);
+end:
+	if (res->hr_localfd >= 0)
+		close(res->hr_localfd);
+	pjdlog_prefix_set("%s", "");
+	return (ec);
+}
+
+/*
+ * Handle "hastctl create": look up every name in argv[] among the configured
+ * resources and run create_one() on it.  Names that are not configured are
+ * reported and counted as EX_DATAERR.  The process exits with the first
+ * non-zero status encountered, or 0 when everything succeeded.
+ */
+static void
+control_create(int argc, char *argv[], intmax_t mediasize, intmax_t extentsize,
+    intmax_t keepdirty)
+{
+	struct hast_resource *match;
+	const char *name;
+	int idx, rc, status;
+
+	if (argc < 1)
+		usage();
+
+	status = 0;
+	for (idx = 0; idx < argc; idx++) {
+		name = argv[idx];
+		/* The iterator is NULL here when no resource matched. */
+		TAILQ_FOREACH(match, &cfg->hc_resources, hr_next) {
+			if (strcmp(name, match->hr_name) == 0)
+				break;
+		}
+		if (match != NULL) {
+			rc = create_one(match, mediasize, extentsize,
+			    keepdirty);
+		} else {
+			pjdlog_error("Unknown resource %s.", name);
+			rc = EX_DATAERR;
+		}
+		/* Only the first failure decides the exit status. */
+		if (status == 0 && rc != 0)
+			status = rc;
+	}
+	exit(status);
+}
+
+/*
+ * Read the metadata of one resource with metadata_read(res, false) and print
+ * the fields to stdout.  Returns 0 on success; otherwise nothing is printed
+ * and the non-zero value from metadata_read() is returned.
+ */
+static int
+dump_one(struct hast_resource *res)
+{
+	int error;
+
+	error = metadata_read(res, false);
+	if (error == 0) {
+		printf("resource: %s\n", res->hr_name);
+		printf(" datasize: %ju\n", (uintmax_t)res->hr_datasize);
+		printf(" extentsize: %d\n", res->hr_extentsize);
+		printf(" keepdirty: %d\n", res->hr_keepdirty);
+		printf(" localoff: %ju\n", (uintmax_t)res->hr_localoff);
+		printf(" resuid: %ju\n", (uintmax_t)res->hr_resuid);
+		printf(" localcnt: %ju\n",
+		    (uintmax_t)res->hr_primary_localcnt);
+		printf(" remotecnt: %ju\n",
+		    (uintmax_t)res->hr_primary_remotecnt);
+		printf(" prevrole: %s\n", role2str(res->hr_previous_role));
+	}
+	return (error);
+}
+
+/*
+ * Handle "hastctl dump": print the metadata of the named resources, or of
+ * every configured resource when no name or only "all" was given.  Names
+ * that are not configured are reported and counted as EX_DATAERR.  The
+ * process exits with the first non-zero status encountered, or 0.
+ */
+static void
+control_dump(int argc, char *argv[])
+{
+	struct hast_resource *res;
+	int idx, rc, status;
+
+	status = 0;
+	if (argc != 0 && (argc != 1 || strcmp(argv[0], "all") != 0)) {
+		/* An explicit list of resource names was given. */
+		for (idx = 0; idx < argc; idx++) {
+			/* The iterator is NULL here when nothing matched. */
+			TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
+				if (strcmp(argv[idx], res->hr_name) == 0)
+					break;
+			}
+			if (res != NULL) {
+				rc = dump_one(res);
+			} else {
+				pjdlog_error("Unknown resource %s.",
+				    argv[idx]);
+				rc = EX_DATAERR;
+			}
+			if (status == 0 && rc != 0)
+				status = rc;
+		}
+	} else {
+		/* No names, or just "all": walk the whole configuration. */
+		TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
+			rc = dump_one(res);
+			if (status == 0 && rc != 0)
+				status = rc;
+		}
+	}
+	exit(status);
+}
+
+/*
+ * Interpret the reply to a "set role" request.
+ *
+ * nv      - reply received from hastd; entries "resource%u", "error%u" and
+ *           "role%u" are read for consecutive indexes until "resource%u" is
+ *           missing.
+ * newrole - role name that was requested on the command line.
+ *
+ * Returns 0 when no resource reported a problem, otherwise the first
+ * non-zero per-resource error, or EX_PROTOCOL when the first problem was a
+ * reply without the previous role.
+ */
+static int
+control_set_role(struct nv *nv, const char *newrole)
+{
+	const char *res, *oldrole;
+	unsigned int ii;
+	int error, ret;
+
+	ret = 0;
+
+	for (ii = 0; ; ii++) {
+		res = nv_get_string(nv, "resource%u", ii);
+		if (res == NULL)
+			break;
+		pjdlog_prefix_set("[%s] ", res);
+		error = nv_get_int16(nv, "error%u", ii);
+		if (error != 0) {
+			if (ret == 0)
+				ret = error;
+			pjdlog_warning("Received error %d from hastd.", error);
+			continue;
+		}
+		oldrole = nv_get_string(nv, "role%u", ii);
+		if (oldrole == NULL) {
+			/*
+			 * nv_get_string() returns NULL for a missing entry
+			 * (see the loop termination above); never hand that
+			 * to strcmp().
+			 */
+			if (ret == 0)
+				ret = EX_PROTOCOL;
+			pjdlog_warning("Reply from hastd does not contain previous role.");
+			continue;
+		}
+		if (strcmp(oldrole, newrole) == 0)
+			pjdlog_debug(2, "Role unchanged (%s).", oldrole);
+		else {
+			pjdlog_debug(1, "Role changed from %s to %s.", oldrole,
+			    newrole);
+		}
+	}
+	pjdlog_prefix_set("%s", "");
+	return (ret);
+}
+
+/*
+ * Print the reply to a "status" request.  Entries are read from nv for
+ * consecutive indexes until "resource%u" is missing.  For a resource with a
+ * non-zero "error%u" only the error is printed.  Returns the first non-zero
+ * per-resource error, or 0.
+ */
+static int
+control_status(struct nv *nv)
+{
+	const char *name, *text;
+	unsigned int idx, number;
+	uintmax_t dirty;
+	int error, ret;
+
+	ret = 0;
+
+	for (idx = 0;
+	    (name = nv_get_string(nv, "resource%u", idx)) != NULL; idx++) {
+		printf("%s:\n", name);
+		error = nv_get_int16(nv, "error%u", idx);
+		if (error != 0) {
+			if (ret == 0)
+				ret = error;
+			printf(" error: %d\n", error);
+			continue;
+		}
+		text = nv_get_string(nv, "role%u", idx);
+		printf(" role: %s\n", text);
+		text = nv_get_string(nv, "provname%u", idx);
+		printf(" provname: %s\n", text);
+		text = nv_get_string(nv, "localpath%u", idx);
+		printf(" localpath: %s\n", text);
+		number = (unsigned int)nv_get_uint32(nv, "extentsize%u", idx);
+		printf(" extentsize: %u\n", number);
+		number = (unsigned int)nv_get_uint32(nv, "keepdirty%u", idx);
+		printf(" keepdirty: %u\n", number);
+		text = nv_get_string(nv, "remoteaddr%u", idx);
+		printf(" remoteaddr: %s\n", text);
+		text = nv_get_string(nv, "replication%u", idx);
+		printf(" replication: %s\n", text);
+		/* The status line is printed only when the entry exists. */
+		text = nv_get_string(nv, "status%u", idx);
+		if (text != NULL)
+			printf(" status: %s\n", text);
+		dirty = (uintmax_t)nv_get_uint64(nv, "dirty%u", idx);
+		printf(" dirty: %ju bytes\n", dirty);
+	}
+	return (ret);
+}
+
+/*
+ * Convert a complete string to an intmax_t using strtoimax(3) with base 0.
+ *
+ * str  - text to convert; it must hold a number and nothing after it.
+ * nump - receives the converted value; left untouched on failure.
+ *
+ * Returns 0 on success with the caller's errno restored.  Returns -1 on
+ * failure with errno left set (EINVAL for a string without digits or with
+ * trailing characters, otherwise the value set by strtoimax(3)), so callers
+ * can report it with err(3).
+ */
+static int
+numfromstr(const char *str, intmax_t *nump)
+{
+	intmax_t num;
+	char *suffix;
+	int rerrno;
+
+	rerrno = errno;
+	errno = 0;
+	num = strtoimax(str, &suffix, 0);
+	/*
+	 * The C standard does not require errno to be set when no digits
+	 * were consumed, so detect that case (suffix == str) explicitly
+	 * instead of accepting e.g. an empty string as 0.
+	 */
+	if (errno == 0 && (suffix == str || *suffix != '\0'))
+		errno = EINVAL;
+	if (errno != 0)
+		return (-1);
+	*nump = num;
+	errno = rerrno;
+	return (0);
+}
+
+/*
+ * Entry point: "hastctl <create|role|status|dump> [options] [args]".
+ *
+ * The subcommand in argv[1] selects the option string handed to getopt(3).
+ * "create" and "dump" work on the local configuration and terminate inside
+ * control_create()/control_dump().  "role" and "status" build an nv request,
+ * send it to hastd over the control connection and interpret the reply with
+ * control_set_role()/control_status().
+ *
+ * The exit status is 0 on success.  Failures detected here use sysexits(3)
+ * codes; a non-zero value returned by the reply handlers is passed to
+ * exit() unchanged.
+ */
+int
+main(int argc, char *argv[])
+{
+	struct nv *nv;
+	intmax_t mediasize, extentsize, keepdirty;
+	int cmd, debug, error, ii;
+	const char *optstr;
+
+	debug = 0;
+	mediasize = extentsize = keepdirty = 0;
+
+	if (argc == 1)
+		usage();
+
+	/* The subcommand decides which options getopt(3) accepts. */
+	if (strcmp(argv[1], "create") == 0) {
+		cmd = CMD_CREATE;
+		optstr = "c:de:k:m:h";
+	} else if (strcmp(argv[1], "role") == 0) {
+		cmd = CMD_ROLE;
+		optstr = "c:dh";
+	} else if (strcmp(argv[1], "status") == 0) {
+		cmd = CMD_STATUS;
+		optstr = "c:dh";
+	} else if (strcmp(argv[1], "dump") == 0) {
+		cmd = CMD_DUMP;
+		optstr = "c:dh";
+	} else
+		usage();
+
+	/*
+	 * Drop the program name: the subcommand now sits in argv[0], the
+	 * slot getopt(3) skips.
+	 */
+	argc--;
+	argv++;
+
+	for (;;) {
+		int ch;
+
+		ch = getopt(argc, argv, optstr);
+		if (ch == -1)
+			break;
+		switch (ch) {
+		case 'c':
+			cfgpath = optarg;
+			break;
+		case 'd':
+			debug++;
+			break;
+		/*
+		 * numfromstr() leaves errno set on failure, which err(3)
+		 * prints.  EX_USAGE keeps the exit status within the
+		 * sysexits(3) set.
+		 */
+		case 'e':
+			if (numfromstr(optarg, &extentsize) < 0)
+				err(EX_USAGE, "Invalid extentsize");
+			break;
+		case 'k':
+			if (numfromstr(optarg, &keepdirty) < 0)
+				err(EX_USAGE, "Invalid keepdirty");
+			break;
+		case 'm':
+			if (numfromstr(optarg, &mediasize) < 0)
+				err(EX_USAGE, "Invalid mediasize");
+			break;
+		case 'h':
+		default:
+			usage();
+		}
+	}
+	argc -= optind;
+	argv += optind;
+
+	/* These subcommands need at least one argument after the options. */
+	switch (cmd) {
+	case CMD_CREATE:
+	case CMD_ROLE:
+		if (argc == 0)
+			usage();
+		break;
+	}
+
+	pjdlog_debug_set(debug);
+
+	cfg = yy_config_parse(cfgpath);
+	assert(cfg != NULL);
+
+	switch (cmd) {
+	case CMD_CREATE:
+		control_create(argc, argv, mediasize, extentsize, keepdirty);
+		/* NOTREACHED */
+		assert(!"What are we doing here?!");
+		break;
+	case CMD_DUMP:
+		/* Dump metadata from local component of the given resource. */
+		control_dump(argc, argv);
+		/* NOTREACHED */
+		assert(!"What are we doing here?!");
+		break;
+	case CMD_ROLE:
+		/* Change role for the given resources. */
+		if (argc < 2)
+			usage();
+		nv = nv_alloc();
+		nv_add_uint8(nv, HASTCTL_CMD_SETROLE, "cmd");
+		if (strcmp(argv[0], "init") == 0)
+			nv_add_uint8(nv, HAST_ROLE_INIT, "role");
+		else if (strcmp(argv[0], "primary") == 0)
+			nv_add_uint8(nv, HAST_ROLE_PRIMARY, "role");
+		else if (strcmp(argv[0], "secondary") == 0)
+			nv_add_uint8(nv, HAST_ROLE_SECONDARY, "role");
+		else
+			usage();
+		/* argv[0] is the role; resource names start at argv[1]. */
+		for (ii = 0; ii < argc - 1; ii++)
+			nv_add_string(nv, argv[ii + 1], "resource%d", ii);
+		break;
+	case CMD_STATUS:
+		/* Obtain status of the given resources. */
+		nv = nv_alloc();
+		nv_add_uint8(nv, HASTCTL_CMD_STATUS, "cmd");
+		if (argc == 0)
+			nv_add_string(nv, "all", "resource%d", 0);
+		else {
+			for (ii = 0; ii < argc; ii++)
+				nv_add_string(nv, argv[ii], "resource%d", ii);
+		}
+		break;
+	default:
+		assert(!"Impossible command!");
+	}
+
+	/* Setup control connection... */
+	if (proto_client(cfg->hc_controladdr, &controlconn) < 0) {
+		pjdlog_exit(EX_OSERR,
+		    "Unable to setup control connection to %s",
+		    cfg->hc_controladdr);
+	}
+	/* ...and connect to hastd. */
+	if (proto_connect(controlconn) < 0) {
+		pjdlog_exit(EX_OSERR, "Unable to connect to hastd via %s",
+		    cfg->hc_controladdr);
+	}
+	/* Send the command to the server... */
+	if (hast_proto_send(NULL, controlconn, nv, NULL, 0) < 0) {
+		pjdlog_exit(EX_UNAVAILABLE,
+		    "Unable to send command to hastd via %s",
+		    cfg->hc_controladdr);
+	}
+	nv_free(nv);
+	/* ...and receive reply. */
+	if (hast_proto_recv(NULL, controlconn, &nv, NULL, 0) < 0) {
+		pjdlog_exit(EX_UNAVAILABLE,
+		    "cannot receive reply from hastd via %s",
+		    cfg->hc_controladdr);
+	}
+
+	/* A top-level "error" entry means the whole request failed. */
+	error = nv_get_int16(nv, "error");
+	if (error != 0) {
+		pjdlog_exitx(EX_SOFTWARE, "Error %d received from hastd.",
+		    error);
+	}
+	nv_set_error(nv, 0);
+
+	switch (cmd) {
+	case CMD_ROLE:
+		error = control_set_role(nv, argv[0]);
+		break;
+	case CMD_STATUS:
+		error = control_status(nv);
+		break;
+	default:
+		assert(!"Impossible command!");
+	}
+
+	exit(error);
+}
diff --git a/sbin/hastd/Makefile b/sbin/hastd/Makefile
new file mode 100644
index 0000000..16a0b8f
--- /dev/null
+++ b/sbin/hastd/Makefile
@@ -0,0 +1,37 @@
+# $FreeBSD$
+
+.include <bsd.own.mk>
+
+PROG= hastd
+SRCS= activemap.c
+SRCS+= control.c
+SRCS+= ebuf.c
+SRCS+= hast_proto.c hastd.c hooks.c
+SRCS+= metadata.c
+SRCS+= nv.c
+SRCS+= secondary.c
+SRCS+= parse.y pjdlog.c primary.c
+SRCS+= proto.c proto_common.c proto_socketpair.c proto_tcp4.c proto_uds.c
+SRCS+= rangelock.c
+SRCS+= subr.c
+SRCS+= token.l
+SRCS+= y.tab.h
+WARNS?= 6
+MAN= hastd.8 hast.conf.5
+
+CFLAGS+=-I${.CURDIR}
+CFLAGS+=-DINET
+.if ${MK_INET6_SUPPORT} != "no"
+CFLAGS+=-DINET6
+.endif
+# This is needed to have WARNS > 1.
+CFLAGS+=-DYY_NO_UNPUT
+
+DPADD= ${LIBCRYPTO} ${LIBGEOM} ${LIBL} ${LIBPTHREAD} ${LIBUTIL}
+LDADD= -lcrypto -lgeom -ll -lpthread -lutil
+
+YFLAGS+=-v
+
+CLEANFILES=y.tab.c y.tab.h y.output
+
+.include <bsd.prog.mk>
diff --git a/sbin/hastd/activemap.c b/sbin/hastd/activemap.c
new file mode 100644
index 0000000..10eb641
--- /dev/null
+++ b/sbin/hastd/activemap.c
@@ -0,0 +1,691 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h> /* powerof2() */
+#include <sys/queue.h>
+
+#include <assert.h>
+#include <bitstring.h>
+#include <errno.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <activemap.h>
+
+#define ACTIVEMAP_MAGIC 0xac71e4
+struct activemap {
+ int am_magic; /* ACTIVEMAP_MAGIC while the map is valid,
+ 0 after activemap_free(). */
+ off_t am_mediasize; /* Media size in bytes. */
+ uint32_t am_extentsize; /* Extent size in bytes,
+ must be power of 2. */
+ uint8_t am_extentshift;/* 2 ^ am_extentshift == am_extentsize */
+ int am_nextents; /* Number of extents. */
+ size_t am_mapsize; /* Bitmap size in bytes. */
+ uint16_t *am_memtab; /* An array that holds number of pending
+ writes per extent. */
+ bitstr_t *am_diskmap; /* On-disk bitmap of dirty extents. */
+ bitstr_t *am_memmap; /* In-memory bitmap of dirty extents. */
+ size_t am_diskmapsize; /* Map size rounded up to sector size. */
+ uint64_t am_ndirty; /* Number of dirty regions. */
+ bitstr_t *am_syncmap; /* Bitmap of extents to sync. */
+ off_t am_syncoff; /* Next synchronization offset;
+ -1 means start from the first extent
+ set in am_syncmap, -2 means there is
+ nothing to synchronize. */
+ TAILQ_HEAD(skeepdirty, keepdirty) am_keepdirty; /* List of extents that
+ we keep dirty to reduce bitmap
+ updates; most recently used
+ extent is at the head. */
+ int am_nkeepdirty; /* Number of am_keepdirty elements. */
+ int am_nkeepdirty_limit; /* Maximum number of am_keepdirty
+ elements. */
+};
+
+struct keepdirty {
+ int kd_extent; /* Extent kept dirty in the on-disk bitmap. */
+ TAILQ_ENTRY(keepdirty) kd_next; /* Linkage on am_keepdirty. */
+};
+
+/*
+ * Count the bits set in a 32-bit word.  Same algorithm as the helper in
+ * sys/systm.h; activemap_init() applies it to (extentsize - 1) to obtain
+ * the extent shift.
+ */
+static uint32_t
+bitcount32(uint32_t x)
+{
+	uint32_t pairs, nibbles, bytes;
+
+	/* Sum adjacent bits, then adjacent 2-bit fields. */
+	pairs = (x & 0x55555555) + ((x >> 1) & 0x55555555);
+	nibbles = (pairs & 0x33333333) + ((pairs >> 2) & 0x33333333);
+	/* Per-byte counts never exceed 8, so masking after the add is safe. */
+	bytes = (nibbles + (nibbles >> 4)) & 0x0f0f0f0f;
+	/* Fold the four byte counts into the low byte. */
+	bytes += bytes >> 8;
+	bytes += bytes >> 16;
+	return (bytes & 0x000000ff);
+}
+
+/*
+ * Map a byte offset on the media to the number of the extent containing it.
+ */
+static __inline int
+off2ext(const struct activemap *amp, off_t offset)
+{
+	int extent;
+
+	assert(offset >= 0 && offset < amp->am_mediasize);
+	/* Extent size is a power of two, so the division is a shift. */
+	extent = (int)(offset >> amp->am_extentshift);
+	assert(extent >= 0 && extent < amp->am_nextents);
+
+	return (extent);
+}
+
+/*
+ * Map an extent number to the byte offset at which that extent starts.
+ */
+static __inline off_t
+ext2off(const struct activemap *amp, int extent)
+{
+	off_t offset;
+
+	assert(extent >= 0 && extent < amp->am_nextents);
+	/* Widen before shifting so the result is computed as off_t. */
+	offset = (off_t)extent;
+	offset <<= amp->am_extentshift;
+	assert(offset >= 0 && offset < amp->am_mediasize);
+
+	return (offset);
+}
+
+/*
+ * Return how many MAXPHYS-sized requests it takes to synchronize the given
+ * extent.  Only the last extent may be shorter than am_extentsize.
+ */
+static __inline int
+ext2reqs(const struct activemap *amp, int ext)
+{
+	off_t left;
+
+	if (ext >= amp->am_nextents - 1) {
+		assert(ext == amp->am_nextents - 1);
+		/* Last extent: it covers whatever remains of the media. */
+		left = amp->am_mediasize % amp->am_extentsize;
+		if (left == 0)
+			left = amp->am_extentsize;
+	} else {
+		/* Every other extent is full-sized. */
+		left = amp->am_extentsize;
+	}
+
+	return (((left - 1) / MAXPHYS) + 1);
+}
+
+/*
+ * Allocate and initialize an activemap for a medium of the given size.
+ * extentsize and sectorsize must be non-zero powers of two and keepdirty
+ * must be non-zero.  On success the new map is stored in *ampp and 0 is
+ * returned; if any allocation fails, everything is released and -1 is
+ * returned (*ampp is left untouched).
+ */
+int
+activemap_init(struct activemap **ampp, uint64_t mediasize, uint32_t extentsize,
+    uint32_t sectorsize, uint32_t keepdirty)
+{
+	struct activemap *map;
+
+	assert(ampp != NULL);
+	assert(mediasize > 0);
+	assert(extentsize > 0);
+	assert(powerof2(extentsize));
+	assert(sectorsize > 0);
+	assert(powerof2(sectorsize));
+	assert(keepdirty > 0);
+
+	map = malloc(sizeof(*map));
+	if (map == NULL)
+		return (-1);
+
+	/* Geometry. */
+	map->am_mediasize = mediasize;
+	map->am_extentsize = extentsize;
+	map->am_extentshift = bitcount32(extentsize - 1);
+	map->am_nextents = ((mediasize - 1) / extentsize) + 1;
+	map->am_mapsize = sizeof(bitstr_t) * bitstr_size(map->am_nextents);
+	map->am_diskmapsize = roundup2(map->am_mapsize, sectorsize);
+
+	/* Nothing is dirty and nothing needs synchronization yet. */
+	map->am_ndirty = 0;
+	map->am_syncoff = -2;
+	map->am_nkeepdirty = 0;
+	map->am_nkeepdirty_limit = keepdirty;
+	TAILQ_INIT(&map->am_keepdirty);
+
+	map->am_memtab = calloc(map->am_nextents, sizeof(map->am_memtab[0]));
+	map->am_diskmap = calloc(1, map->am_diskmapsize);
+	map->am_memmap = bit_alloc(map->am_nextents);
+	map->am_syncmap = bit_alloc(map->am_nextents);
+
+	if (map->am_memtab != NULL && map->am_diskmap != NULL &&
+	    map->am_memmap != NULL && map->am_syncmap != NULL) {
+		map->am_magic = ACTIVEMAP_MAGIC;
+		*ampp = map;
+		return (0);
+	}
+
+	/* At least one allocation failed; free(NULL) is a no-op. */
+	free(map->am_memtab);
+	free(map->am_diskmap);
+	free(map->am_memmap);
+	free(map->am_syncmap);
+	map->am_magic = 0;
+	free(map);
+	errno = ENOMEM;
+
+	return (-1);
+}
+
+/*
+ * Look up the keepdirty entry for the given extent.  Returns NULL if the
+ * extent is not on the am_keepdirty list.
+ */
+static struct keepdirty *
+keepdirty_find(struct activemap *amp, int extent)
+{
+	struct keepdirty *kd;
+
+	for (kd = TAILQ_FIRST(&amp->am_keepdirty); kd != NULL;
+	    kd = TAILQ_NEXT(kd, kd_next)) {
+		if (kd->kd_extent == extent)
+			return (kd);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Record that the given extent was just written, so that it stays marked
+ * dirty in the on-disk bitmap (see keepdirty_fill()).  The list is kept in
+ * most-recently-used order and holds at most am_nkeepdirty_limit entries;
+ * when it is full, the entry at the tail is recycled for the new extent.
+ */
+static void
+keepdirty_add(struct activemap *amp, int extent)
+{
+	struct keepdirty *kd;
+
+	kd = keepdirty_find(amp, extent);
+	if (kd != NULL) {
+		/*
+		 * Already tracked: only move the element to the beginning.
+		 */
+		TAILQ_REMOVE(&amp->am_keepdirty, kd, kd_next);
+		TAILQ_INSERT_HEAD(&amp->am_keepdirty, kd, kd_next);
+		return;
+	}
+	/*
+	 * Add new element, but first remove the least recently used one if
+	 * we have too many.
+	 */
+	if (amp->am_nkeepdirty >= amp->am_nkeepdirty_limit) {
+		kd = TAILQ_LAST(&amp->am_keepdirty, skeepdirty);
+		assert(kd != NULL);
+		TAILQ_REMOVE(&amp->am_keepdirty, kd, kd_next);
+		amp->am_nkeepdirty--;
+		/*
+		 * The count legitimately drops to zero here when the limit
+		 * is 1, so only a negative count indicates corruption.
+		 */
+		assert(amp->am_nkeepdirty >= 0);
+	}
+	/* Reuse the evicted element if there is one. */
+	if (kd == NULL)
+		kd = malloc(sizeof(*kd));
+	/* We can ignore allocation failure. */
+	if (kd != NULL) {
+		kd->kd_extent = extent;
+		amp->am_nkeepdirty++;
+		TAILQ_INSERT_HEAD(&amp->am_keepdirty, kd, kd_next);
+	}
+}
+
+/*
+ * Mark every extent on the am_keepdirty list as dirty in the on-disk
+ * bitmap.
+ */
+static void
+keepdirty_fill(struct activemap *amp)
+{
+	struct keepdirty *kd;
+
+	for (kd = TAILQ_FIRST(&amp->am_keepdirty); kd != NULL;
+	    kd = TAILQ_NEXT(kd, kd_next)) {
+		bit_set(amp->am_diskmap, kd->kd_extent);
+	}
+}
+
+/*
+ * Release every element of the am_keepdirty list.
+ */
+static void
+keepdirty_free(struct activemap *amp)
+{
+	struct keepdirty *kd, *next;
+
+	for (kd = TAILQ_FIRST(&amp->am_keepdirty); kd != NULL; kd = next) {
+		/* Fetch the successor before the element goes away. */
+		next = TAILQ_NEXT(kd, kd_next);
+		TAILQ_REMOVE(&amp->am_keepdirty, kd, kd_next);
+		amp->am_nkeepdirty--;
+		free(kd);
+	}
+	assert(amp->am_nkeepdirty == 0);
+}
+
+/*
+ * Function frees all resources allocated by activemap_init(), including
+ * the activemap structure itself.  The pointer must not be used after
+ * this call.
+ */
+void
+activemap_free(struct activemap *amp)
+{
+
+	assert(amp->am_magic == ACTIVEMAP_MAGIC);
+
+	/* Invalidate the magic so that a stale pointer trips the asserts. */
+	amp->am_magic = 0;
+
+	keepdirty_free(amp);
+	free(amp->am_memtab);
+	free(amp->am_diskmap);
+	free(amp->am_memmap);
+	free(amp->am_syncmap);
+	/* The structure was malloc()ed by activemap_init(); don't leak it. */
+	free(amp);
+}
+
+/*
+ * Account for a write request that is about to be issued.  Must be called
+ * before the write is handled.  Returns true if an extent went from clean
+ * to dirty, i.e. the on-disk metadata has to be updated.
+ */
+bool
+activemap_write_start(struct activemap *amp, off_t offset, off_t length)
+{
+	bool changed;
+	int ext, last;
+
+	assert(amp->am_magic == ACTIVEMAP_MAGIC);
+	assert(length > 0);
+
+	changed = false;
+	ext = off2ext(amp, offset);
+	last = off2ext(amp, offset + length - 1);
+
+	while (ext <= last) {
+		/*
+		 * First pending write for this extent: it becomes dirty in
+		 * the in-memory bitmap, and the caller is told (by the
+		 * return value) to flush the bitmap to disk.
+		 */
+		if (amp->am_memtab[ext] == 0) {
+			assert(!bit_test(amp->am_memmap, ext));
+			bit_set(amp->am_memmap, ext);
+			amp->am_ndirty++;
+			changed = true;
+		}
+		amp->am_memtab[ext]++;
+		keepdirty_add(amp, ext);
+		ext++;
+	}
+
+	return (changed);
+}
+
+/*
+ * Account for a write request that has been confirmed.  Returns true if an
+ * extent went from dirty to clean, i.e. the on-disk metadata has to be
+ * updated.
+ */
+bool
+activemap_write_complete(struct activemap *amp, off_t offset, off_t length)
+{
+	bool changed;
+	int ext, last;
+
+	assert(amp->am_magic == ACTIVEMAP_MAGIC);
+	assert(length > 0);
+
+	changed = false;
+	ext = off2ext(amp, offset);
+	last = off2ext(amp, offset + length - 1);
+
+	for (; ext <= last; ext++) {
+		assert(amp->am_memtab[ext] > 0);
+		assert(bit_test(amp->am_memmap, ext));
+		amp->am_memtab[ext]--;
+		if (amp->am_memtab[ext] != 0)
+			continue;
+		/*
+		 * That was the last pending write for this extent: mark it
+		 * clean in memory and tell the caller to flush the bitmap.
+		 */
+		bit_clear(amp->am_memmap, ext);
+		amp->am_ndirty--;
+		changed = true;
+	}
+
+	return (changed);
+}
+
+/*
+ * Account for the completed synchronization of one extent: drop the
+ * requests that were reserved for it.  Returns true if the extent became
+ * clean and the on-disk metadata has to be updated.
+ */
+bool
+activemap_extent_complete(struct activemap *amp, int extent)
+{
+	int reqs;
+
+	assert(amp->am_magic == ACTIVEMAP_MAGIC);
+	assert(extent >= 0 && extent < amp->am_nextents);
+
+	reqs = ext2reqs(amp, extent);
+	assert(amp->am_memtab[extent] >= reqs);
+	amp->am_memtab[extent] -= reqs;
+	assert(bit_test(amp->am_memmap, extent));
+
+	/* Regular writes are still pending, so the extent stays dirty. */
+	if (amp->am_memtab[extent] != 0)
+		return (false);
+
+	bit_clear(amp->am_memmap, extent);
+	amp->am_ndirty--;
+
+	return (true);
+}
+
+/*
+ * Return the current count of dirty extents.
+ */
+uint64_t
+activemap_ndirty(const struct activemap *amp)
+{
+	uint64_t ndirty;
+
+	assert(amp->am_magic == ACTIVEMAP_MAGIC);
+
+	ndirty = amp->am_ndirty;
+	return (ndirty);
+}
+
+/*
+ * Compare the on-disk and in-memory bitmaps.  Returns true if they differ,
+ * which means the bitmap should be flushed to disk.
+ */
+bool
+activemap_differ(const struct activemap *amp)
+{
+	int diff;
+
+	assert(amp->am_magic == ACTIVEMAP_MAGIC);
+
+	diff = memcmp(amp->am_diskmap, amp->am_memmap, amp->am_mapsize);
+	return (diff != 0);
+}
+
+/*
+ * Return the size of the bitmap in bytes (not rounded to sector size).
+ */
+size_t
+activemap_size(const struct activemap *amp)
+{
+	size_t mapsize;
+
+	assert(amp->am_magic == ACTIVEMAP_MAGIC);
+
+	mapsize = amp->am_mapsize;
+	return (mapsize);
+}
+
+/*
+ * Return the number of bytes the bitmap occupies on disk, i.e. the value
+ * of activemap_size() rounded up to the sector size.
+ */
+size_t
+activemap_ondisk_size(const struct activemap *amp)
+{
+	size_t ondisk;
+
+	assert(amp->am_magic == ACTIVEMAP_MAGIC);
+
+	ondisk = amp->am_diskmapsize;
+	return (ondisk);
+}
+
+/*
+ * Load a bitmap that was read from disk into the on-disk, in-memory and
+ * synchronization bitmaps.  The buffer must hold at least am_mapsize bytes.
+ */
+void
+activemap_copyin(struct activemap *amp, const unsigned char *buf, size_t size)
+{
+	int first, ext;
+
+	assert(amp->am_magic == ACTIVEMAP_MAGIC);
+	assert(size >= amp->am_mapsize);
+
+	memcpy(amp->am_diskmap, buf, amp->am_mapsize);
+	memcpy(amp->am_memmap, buf, amp->am_mapsize);
+	memcpy(amp->am_syncmap, buf, amp->am_mapsize);
+
+	bit_ffs(amp->am_memmap, amp->am_nextents, &first);
+	/* Nothing is dirty, so there is nothing more to set up. */
+	if (first == -1)
+		return;
+
+	/* Synchronization starts from the first dirty extent. */
+	activemap_sync_rewind(amp);
+
+	/*
+	 * Pin every dirty extent until it has been synchronized by
+	 * charging it with the number of requests a full synchronization
+	 * of that extent takes.
+	 */
+	amp->am_ndirty = 0;
+	for (ext = first; ext < amp->am_nextents; ext++) {
+		if (!bit_test(amp->am_memmap, ext))
+			continue;
+		amp->am_memtab[ext] = ext2reqs(amp, ext);
+		amp->am_ndirty++;
+	}
+}
+
+/*
+ * Merge the given (remote) bitmap into the existing one: every extent that
+ * is dirty remotely and not yet scheduled locally is marked dirty and
+ * scheduled for synchronization.
+ */
+void
+activemap_merge(struct activemap *amp, const unsigned char *buf, size_t size)
+{
+	bitstr_t *remmap = __DECONST(bitstr_t *, buf);
+	int ext;
+
+	assert(amp->am_magic == ACTIVEMAP_MAGIC);
+	assert(size >= amp->am_mapsize);
+
+	bit_ffs(remmap, amp->am_nextents, &ext);
+	/* The remote map has no dirty extents, so nothing changes. */
+	if (ext == -1)
+		return;
+
+	for (; ext < amp->am_nextents; ext++) {
+		/*
+		 * Only extents that are dirty remotely and not already
+		 * scheduled for synchronization locally need work.
+		 */
+		if (!bit_test(amp->am_syncmap, ext) && bit_test(remmap, ext)) {
+			bit_set(amp->am_syncmap, ext);
+			bit_set(amp->am_memmap, ext);
+			bit_set(amp->am_diskmap, ext);
+			if (amp->am_memtab[ext] == 0)
+				amp->am_ndirty++;
+			/*
+			 * Keep the extent dirty until it is synchronized by
+			 * charging it with the requests that takes.
+			 */
+			amp->am_memtab[ext] = ext2reqs(amp, ext);
+		}
+	}
+
+	/* Restart synchronization from the first dirty extent. */
+	activemap_sync_rewind(amp);
+}
+
+/*
+ * Refresh the on-disk bitmap from the in-memory one (plus the extents we
+ * deliberately keep dirty) and return a pointer to it.  If sizep is not
+ * NULL, the number of bytes to write is stored there.
+ */
+const unsigned char *
+activemap_bitmap(struct activemap *amp, size_t *sizep)
+{
+	const unsigned char *diskmap;
+
+	assert(amp->am_magic == ACTIVEMAP_MAGIC);
+
+	memcpy(amp->am_diskmap, amp->am_memmap, amp->am_mapsize);
+	keepdirty_fill(amp);
+	if (sizep != NULL)
+		*sizep = amp->am_diskmapsize;
+
+	diskmap = (const unsigned char *)amp->am_diskmap;
+	return (diskmap);
+}
+
+/*
+ * Compute, without creating an activemap, how many bytes the bitmap for
+ * the given geometry takes on disk (bitmap size rounded up to sector
+ * size).
+ */
+size_t
+activemap_calc_ondisk_size(uint64_t mediasize, uint32_t extentsize,
+    uint32_t sectorsize)
+{
+	uint64_t extents, bytes;
+	size_t ondisk;
+
+	assert(mediasize > 0);
+	assert(extentsize > 0);
+	assert(powerof2(extentsize));
+	assert(sectorsize > 0);
+	assert(powerof2(sectorsize));
+
+	/* Number of extents, counting a trailing partial one. */
+	extents = ((mediasize - 1) / extentsize) + 1;
+	bytes = sizeof(bitstr_t) * bitstr_size(extents);
+	ondisk = roundup2(bytes, sectorsize);
+
+	return (ondisk);
+}
+
+/*
+ * Reset the synchronization position: restart from the first extent that
+ * needs synchronization, or note that there is nothing to synchronize.
+ */
+void
+activemap_sync_rewind(struct activemap *amp)
+{
+	int first;
+
+	assert(amp->am_magic == ACTIVEMAP_MAGIC);
+
+	bit_ffs(amp->am_syncmap, amp->am_nextents, &first);
+	/*
+	 * -2: there are no extents to synchronize.
+	 * -1: start synchronization from the beginning.
+	 */
+	amp->am_syncoff = (first == -1) ? -2 : -1;
+}
+
+/*
+ * Return next offset of where we should synchronize.
+ *
+ * On success the offset is returned and the request length is stored in
+ * *lengthp; the length is at most MAXPHYS and never crosses the end of the
+ * extent or of the media.  -1 is returned when there is nothing (more) to
+ * synchronize; *lengthp is not written in that case.
+ *
+ * *syncextp is set to the number of the extent whose last chunk was handed
+ * out by the previous call (its bit in am_syncmap is cleared here), or to
+ * -1 if no extent was finished.
+ *
+ * am_syncoff doubles as state: -2 means there is nothing to synchronize,
+ * -1 means continue with the first extent set in am_syncmap.
+ */
+off_t
+activemap_sync_offset(struct activemap *amp, off_t *lengthp, int *syncextp)
+{
+ off_t syncoff, left;
+ int ext;
+
+ assert(amp->am_magic == ACTIVEMAP_MAGIC);
+ assert(lengthp != NULL);
+ assert(syncextp != NULL);
+
+ *syncextp = -1;
+
+ if (amp->am_syncoff == -2)
+ return (-1);
+
+ if (amp->am_syncoff >= 0 &&
+ (amp->am_syncoff + MAXPHYS >= amp->am_mediasize ||
+ off2ext(amp, amp->am_syncoff) !=
+ off2ext(amp, amp->am_syncoff + MAXPHYS))) {
+ /*
+ * We are about to change extent, so mark previous one as clean.
+ * The media-size test comes first so that off2ext() is never
+ * called with an offset beyond the media.
+ */
+ ext = off2ext(amp, amp->am_syncoff);
+ bit_clear(amp->am_syncmap, ext);
+ *syncextp = ext;
+ amp->am_syncoff = -1;
+ }
+
+ if (amp->am_syncoff == -1) {
+ /*
+ * Let's find first extent to synchronize.
+ */
+ bit_ffs(amp->am_syncmap, amp->am_nextents, &ext);
+ if (ext == -1) {
+ amp->am_syncoff = -2;
+ return (-1);
+ }
+ amp->am_syncoff = ext2off(amp, ext);
+ } else {
+ /*
+ * We don't change extent, so just increase offset.
+ */
+ amp->am_syncoff += MAXPHYS;
+ if (amp->am_syncoff >= amp->am_mediasize) {
+ amp->am_syncoff = -2;
+ return (-1);
+ }
+ }
+
+ syncoff = amp->am_syncoff;
+ left = ext2off(amp, off2ext(amp, syncoff)) +
+ amp->am_extentsize - syncoff; /* Bytes left in this extent. */
+ if (syncoff + left > amp->am_mediasize)
+ left = amp->am_mediasize - syncoff; /* Last extent may be short. */
+ if (left > MAXPHYS)
+ left = MAXPHYS;
+
+ assert(left >= 0 && left <= MAXPHYS);
+ assert(syncoff >= 0 && syncoff < amp->am_mediasize);
+ assert(syncoff + left >= 0 && syncoff + left <= amp->am_mediasize);
+
+ *lengthp = left;
+ return (syncoff);
+}
+
+/*
+ * Schedule the extent(s) covering the given region for synchronization
+ * (most likely because one of the components is unavailable).  Returns
+ * true if at least one extent was newly scheduled.
+ */
+bool
+activemap_need_sync(struct activemap *amp, off_t offset, off_t length)
+{
+	bool changed;
+	int ext, last;
+
+	assert(amp->am_magic == ACTIVEMAP_MAGIC);
+
+	changed = false;
+	ext = off2ext(amp, offset);
+	last = off2ext(amp, offset + length - 1);
+
+	for (; ext <= last; ext++) {
+		if (!bit_test(amp->am_syncmap, ext)) {
+			bit_set(amp->am_syncmap, ext);
+			if (!bit_test(amp->am_memmap, ext)) {
+				bit_set(amp->am_memmap, ext);
+				amp->am_ndirty++;
+			}
+			/* Keep the extent dirty until it is synchronized. */
+			amp->am_memtab[ext] += ext2reqs(amp, ext);
+			changed = true;
+		} else {
+			/* Already marked for synchronization. */
+			assert(bit_test(amp->am_memmap, ext));
+		}
+	}
+
+	return (changed);
+}
+
+/*
+ * Print one bitmap to stdout as a line of 0/1 digits preceded by label.
+ */
+static void
+activemap_dump_map(const char *label, bitstr_t *map, int nbits)
+{
+	int bit;
+
+	printf("%s", label);
+	for (bit = 0; bit < nbits; bit++)
+		printf("%d", bit_test(map, bit) ? 1 : 0);
+	printf("\n");
+}
+
+/*
+ * Debugging aid: print the in-memory (M), on-disk (D) and synchronization
+ * (S) bitmaps to stdout, one extent per digit.
+ */
+void
+activemap_dump(const struct activemap *amp)
+{
+
+	activemap_dump_map("M: ", amp->am_memmap, amp->am_nextents);
+	activemap_dump_map("D: ", amp->am_diskmap, amp->am_nextents);
+	activemap_dump_map("S: ", amp->am_syncmap, amp->am_nextents);
+}
diff --git a/sbin/hastd/activemap.h b/sbin/hastd/activemap.h
new file mode 100644
index 0000000..42f0221
--- /dev/null
+++ b/sbin/hastd/activemap.h
@@ -0,0 +1,69 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _ACTIVEMAP_H_
+#define _ACTIVEMAP_H_
+
+#include <sys/types.h>	/* off_t */
+
+#include <stdbool.h>
+#include <stdint.h>
+
+/* Opaque; the layout is private to activemap.c. */
+struct activemap;
+
+/*
+ * Create a map (returns 0, or -1 if an allocation failed) and release the
+ * resources held by it.
+ */
+int activemap_init(struct activemap **ampp, uint64_t mediasize,
+    uint32_t extentsize, uint32_t sectorsize, uint32_t keepdirty);
+void activemap_free(struct activemap *amp);
+
+/*
+ * Write and extent accounting.  The bool-returning functions return true
+ * when the on-disk metadata should be updated.
+ */
+bool activemap_write_start(struct activemap *amp, off_t offset, off_t length);
+bool activemap_write_complete(struct activemap *amp, off_t offset,
+    off_t length);
+bool activemap_extent_complete(struct activemap *amp, int extent);
+uint64_t activemap_ndirty(const struct activemap *amp);
+
+/* Access to the bitmap itself: sizes, loading, merging and export. */
+bool activemap_differ(const struct activemap *amp);
+size_t activemap_size(const struct activemap *amp);
+size_t activemap_ondisk_size(const struct activemap *amp);
+void activemap_copyin(struct activemap *amp, const unsigned char *buf,
+    size_t size);
+void activemap_merge(struct activemap *amp, const unsigned char *buf,
+    size_t size);
+const unsigned char *activemap_bitmap(struct activemap *amp, size_t *sizep);
+
+/* On-disk bitmap size for a given geometry; no map is needed. */
+size_t activemap_calc_ondisk_size(uint64_t mediasize, uint32_t extentsize,
+    uint32_t sectorsize);
+
+/* Synchronization cursor. */
+void activemap_sync_rewind(struct activemap *amp);
+off_t activemap_sync_offset(struct activemap *amp, off_t *lengthp,
+    int *syncextp);
+bool activemap_need_sync(struct activemap *amp, off_t offset, off_t length);
+
+/* Debugging aid: print the bitmaps to stdout. */
+void activemap_dump(const struct activemap *amp);
+
+#endif /* !_ACTIVEMAP_H_ */
diff --git a/sbin/hastd/control.c b/sbin/hastd/control.c
new file mode 100644
index 0000000..0ad39b4
--- /dev/null
+++ b/sbin/hastd/control.c
@@ -0,0 +1,426 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <signal.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "hast.h"
+#include "hastd.h"
+#include "hast_proto.h"
+#include "nv.h"
+#include "pjdlog.h"
+#include "proto.h"
+#include "subr.h"
+
+#include "control.h"
+
+/*
+ * Change the role of one resource and record the outcome in nvout under
+ * keys suffixed with no.  If res is NULL, the resource is looked up by
+ * name; an unknown name is reported as EHAST_NOENTRY.  The previous role
+ * is always reported.  On an actual change the existing worker process (if
+ * any) is terminated, and a new one is started when the new role is
+ * primary.
+ */
+static void
+control_set_role(struct hastd_config *cfg, struct nv *nvout, uint8_t role,
+    struct hast_resource *res, const char *name, unsigned int no)
+{
+
+	assert(cfg != NULL);
+	assert(nvout != NULL);
+	assert(name != NULL);
+
+	/* The resource name goes into the reply unconditionally. */
+	nv_add_string(nvout, name, "resource%u", no);
+
+	if (res == NULL) {
+		for (res = TAILQ_FIRST(&cfg->hc_resources); res != NULL;
+		    res = TAILQ_NEXT(res, hr_next)) {
+			if (strcmp(res->hr_name, name) == 0)
+				break;
+		}
+		if (res == NULL) {
+			nv_add_int16(nvout, EHAST_NOENTRY, "error%u", no);
+			return;
+		}
+	}
+	assert(res != NULL);
+
+	/* Report the role the resource had before this request. */
+	nv_add_string(nvout, role2str(res->hr_role), "role%u", no);
+
+	/* Same role requested: there is nothing to do. */
+	if (res->hr_role == role)
+		return;
+
+	pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role));
+	pjdlog_info("Role changed to %s.", role2str(role));
+
+	/* Switch to the new role and make the log prefix reflect it. */
+	res->hr_role = role;
+	pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role));
+
+	/*
+	 * A worker process left over from the previous role has to be
+	 * terminated and reaped.
+	 */
+	if (res->hr_workerpid != 0) {
+		if (kill(res->hr_workerpid, SIGTERM) >= 0) {
+			if (waitpid(res->hr_workerpid, NULL, 0) ==
+			    res->hr_workerpid) {
+				pjdlog_debug(1, "Worker process %u stopped.",
+				    (unsigned int)res->hr_workerpid);
+			} else {
+				pjdlog_errno(LOG_WARNING,
+				    "Error while waiting for worker process %u",
+				    (unsigned int)res->hr_workerpid);
+			}
+		} else {
+			pjdlog_errno(LOG_WARNING,
+			    "Unable to kill worker process %u",
+			    (unsigned int)res->hr_workerpid);
+		}
+		res->hr_workerpid = 0;
+	}
+
+	/* Only the primary role gets a worker started from here. */
+	if (role == HAST_ROLE_PRIMARY)
+		hastd_primary(res);
+	pjdlog_prefix_set("%s", "");
+}
+
+/*
+ * Ask the worker process of the given resource for its status over
+ * res->hr_ctrl and copy the answer ("status", "dirty", "extentsize",
+ * "keepdirty") into nvout under keys suffixed with no.  On any failure an
+ * "error" entry is added to nvout instead.
+ */
+static void
+control_status_worker(struct hast_resource *res, struct nv *nvout,
+    unsigned int no)
+{
+	struct nv *cnvin, *cnvout;
+	const char *str;
+	int error;
+
+	cnvin = cnvout = NULL;
+	error = 0;
+
+	/*
+	 * Prepare and send command to worker process.
+	 */
+	cnvout = nv_alloc();
+	nv_add_uint8(cnvout, HASTCTL_STATUS, "cmd");
+	error = nv_error(cnvout);
+	if (error != 0) {
+		/* LOG */
+		goto end;
+	}
+	if (hast_proto_send(res, res->hr_ctrl, cnvout, NULL, 0) < 0) {
+		error = errno;
+		/* LOG */
+		goto end;
+	}
+
+	/*
+	 * Receive response.
+	 */
+	if (hast_proto_recv_hdr(res->hr_ctrl, &cnvin) < 0) {
+		error = errno;
+		/* LOG */
+		goto end;
+	}
+
+	/*
+	 * The worker stores "error" with nv_add_int16() (see ctrl_thread()),
+	 * so it has to be read back with the getter of the same type.
+	 */
+	error = nv_get_int16(cnvin, "error");
+	if (error != 0)
+		goto end;
+
+	if ((str = nv_get_string(cnvin, "status")) == NULL) {
+		error = ENOENT;
+		/* LOG */
+		goto end;
+	}
+	nv_add_string(nvout, str, "status%u", no);
+	nv_add_uint64(nvout, nv_get_uint64(cnvin, "dirty"), "dirty%u", no);
+	nv_add_uint32(nvout, nv_get_uint32(cnvin, "extentsize"),
+	    "extentsize%u", no);
+	nv_add_uint32(nvout, nv_get_uint32(cnvin, "keepdirty"),
+	    "keepdirty%u", no);
+end:
+	if (cnvin != NULL)
+		nv_free(cnvin);
+	if (cnvout != NULL)
+		nv_free(cnvout);
+	if (error != 0)
+		nv_add_int16(nvout, error, "error");
+}
+
+/*
+ * Describe one resource in nvout under keys suffixed with no.  If res is
+ * NULL, the resource is looked up by name; an unknown name is reported as
+ * EHAST_NOENTRY.  When the resource is primary or secondary and has a
+ * worker process, the worker is queried for the run-time part of the
+ * status.
+ */
+static void
+control_status(struct hastd_config *cfg, struct nv *nvout,
+    struct hast_resource *res, const char *name, unsigned int no)
+{
+	const char *replication;
+
+	assert(cfg != NULL);
+	assert(nvout != NULL);
+	assert(name != NULL);
+
+	/* The resource name goes into the reply unconditionally. */
+	nv_add_string(nvout, name, "resource%u", no);
+
+	if (res == NULL) {
+		for (res = TAILQ_FIRST(&cfg->hc_resources); res != NULL;
+		    res = TAILQ_NEXT(res, hr_next)) {
+			if (strcmp(res->hr_name, name) == 0)
+				break;
+		}
+		if (res == NULL) {
+			nv_add_int16(nvout, EHAST_NOENTRY, "error%u", no);
+			return;
+		}
+	}
+	assert(res != NULL);
+
+	nv_add_string(nvout, res->hr_provname, "provname%u", no);
+	nv_add_string(nvout, res->hr_localpath, "localpath%u", no);
+	nv_add_string(nvout, res->hr_remoteaddr, "remoteaddr%u", no);
+
+	switch (res->hr_replication) {
+	case HAST_REPLICATION_FULLSYNC:
+		replication = "fullsync";
+		break;
+	case HAST_REPLICATION_MEMSYNC:
+		replication = "memsync";
+		break;
+	case HAST_REPLICATION_ASYNC:
+		replication = "async";
+		break;
+	default:
+		replication = "unknown";
+		break;
+	}
+	nv_add_string(nvout, replication, "replication%u", no);
+	nv_add_string(nvout, role2str(res->hr_role), "role%u", no);
+
+	/* Only primary and secondary resources can have a worker. */
+	if (res->hr_role == HAST_ROLE_PRIMARY) {
+		assert(res->hr_workerpid != 0);
+	} else if (res->hr_role != HAST_ROLE_SECONDARY) {
+		return;
+	}
+	if (res->hr_workerpid == 0)
+		return;
+
+	/*
+	 * There is a worker process, so ask it for the rest of the status.
+	 */
+	control_status_worker(res, nvout, no);
+}
+
+void
+control_handle(struct hastd_config *cfg)
+{
+ struct proto_conn *conn;
+ struct nv *nvin, *nvout;
+ unsigned int ii;
+ const char *str;
+ uint8_t cmd, role;
+ int error;
+ /*
+ * Serve one request: accept a connection on cfg->hc_controlconn, read
+ * the request header, run HASTCTL_SET_ROLE or HASTCTL_STATUS either
+ * for all configured resources ("all") or for every resourceN listed
+ * in the request, and send back the reply collected in nvout.
+ * Once accepted, the connection is closed on every path.
+ */
+ if (proto_accept(cfg->hc_controlconn, &conn) < 0) {
+ pjdlog_errno(LOG_ERR, "Unable to accept control connection");
+ return;
+ }
+
+ nvin = nvout = NULL;
+ role = HAST_ROLE_UNDEF;
+
+ if (hast_proto_recv_hdr(conn, &nvin) < 0) {
+ pjdlog_errno(LOG_ERR, "Unable to receive control header");
+ nvin = NULL;
+ goto close;
+ }
+
+ /* Obtain command code. 0 means that nv_get_uint8() failed. */
+ cmd = nv_get_uint8(nvin, "cmd");
+ if (cmd == 0) {
+ pjdlog_error("Control header is missing 'cmd' field.");
+ error = EHAST_INVALID;
+ goto close; /* No reply is sent on this path. */
+ }
+
+ /* Allocate outgoing nv structure. */
+ nvout = nv_alloc();
+ if (nvout == NULL) {
+ pjdlog_error("Unable to allocate header for control response.");
+ error = EHAST_NOMEMORY;
+ goto close; /* No reply is sent on this path. */
+ }
+
+ error = 0;
+
+ str = nv_get_string(nvin, "resource0");
+ if (str == NULL) {
+ pjdlog_error("Control header is missing 'resource0' field.");
+ error = EHAST_INVALID;
+ goto fail;
+ }
+ if (cmd == HASTCTL_SET_ROLE) {
+ role = nv_get_uint8(nvin, "role");
+ switch (role) {
+ case HAST_ROLE_INIT: /* Is that valid to set, hmm? */
+ case HAST_ROLE_PRIMARY:
+ case HAST_ROLE_SECONDARY:
+ break;
+ default:
+ pjdlog_error("Invalid role received (%hhu).", role);
+ error = EHAST_INVALID;
+ goto fail;
+ }
+ }
+ if (strcmp(str, "all") == 0) {
+ struct hast_resource *res;
+
+ /* All configured resources, numbered in list order. */
+
+ ii = 0;
+ TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
+ switch (cmd) {
+ case HASTCTL_SET_ROLE:
+ control_set_role(cfg, nvout, role, res,
+ res->hr_name, ii++);
+ break;
+ case HASTCTL_STATUS:
+ control_status(cfg, nvout, res, res->hr_name,
+ ii++);
+ break;
+ default:
+ pjdlog_error("Invalid command received (%hhu).",
+ cmd);
+ error = EHAST_UNIMPLEMENTED;
+ goto fail;
+ }
+ }
+ } else {
+ /* Only selected resources: resource0, resource1, ... */
+
+ for (ii = 0; ; ii++) {
+ str = nv_get_string(nvin, "resource%u", ii);
+ if (str == NULL)
+ break;
+ switch (cmd) {
+ case HASTCTL_SET_ROLE:
+ control_set_role(cfg, nvout, role, NULL, str,
+ ii);
+ break;
+ case HASTCTL_STATUS:
+ control_status(cfg, nvout, NULL, str, ii);
+ break;
+ default:
+ pjdlog_error("Invalid command received (%hhu).",
+ cmd);
+ error = EHAST_UNIMPLEMENTED;
+ goto fail;
+ }
+ }
+ }
+ if (nv_error(nvout) != 0)
+ goto close; /* nvout reports an error: skip sending the reply. */
+fail: /* Also reached on success; "error" is added only if set. */
+ if (error != 0)
+ nv_add_int16(nvout, error, "error");
+
+ if (hast_proto_send(NULL, conn, nvout, NULL, 0) < 0)
+ pjdlog_errno(LOG_ERR, "Unable to send control response");
+close: /* Common cleanup for every path after a successful accept. */
+ if (nvin != NULL)
+ nv_free(nvin);
+ if (nvout != NULL)
+ nv_free(nvout);
+ proto_close(conn);
+}
+
+/*
+ * Thread handles control requests from the parent.
+ *
+ * arg is the struct hast_resource served by this worker.  The thread loops
+ * forever: it receives a request on res->hr_ctrl, reads its 'cmd' field
+ * and sends a reply on the same connection.  For HASTCTL_STATUS the reply
+ * carries 'status' ("complete" when both hr_remotein and hr_remoteout are
+ * set, "degraded" otherwise), 'extentsize', 'keepdirty' and 'dirty' (dirty
+ * extents multiplied by the extent size); the last two are 0 unless the
+ * role is primary.  Any other command is answered with 'error' = EINVAL.
+ * The thread exits only when receiving fails and sigexit_received is set.
+ */
+void *
+ctrl_thread(void *arg)
+{
+ struct hast_resource *res = arg;
+ struct nv *nvin, *nvout;
+ uint8_t cmd;
+
+ for (;;) {
+ if (hast_proto_recv_hdr(res->hr_ctrl, &nvin) < 0) {
+ if (sigexit_received)
+ pthread_exit(NULL);
+ pjdlog_errno(LOG_ERR,
+ "Unable to receive control message");
+ continue;
+ }
+ cmd = nv_get_uint8(nvin, "cmd"); /* 0 is treated as "missing". */
+ if (cmd == 0) {
+ pjdlog_error("Control message is missing 'cmd' field.");
+ nv_free(nvin);
+ continue;
+ }
+ nv_free(nvin); /* Only 'cmd' is needed from the request. */
+ nvout = nv_alloc(); /* NOTE(review): not NULL-checked; presumably nv_error() below covers it - confirm in nv.c. */
+ switch (cmd) {
+ case HASTCTL_STATUS:
+ if (res->hr_remotein != NULL &&
+ res->hr_remoteout != NULL) {
+ nv_add_string(nvout, "complete", "status");
+ } else {
+ nv_add_string(nvout, "degraded", "status");
+ }
+ nv_add_uint32(nvout, (uint32_t)res->hr_extentsize,
+ "extentsize");
+ if (res->hr_role == HAST_ROLE_PRIMARY) {
+ nv_add_uint32(nvout,
+ (uint32_t)res->hr_keepdirty, "keepdirty");
+ nv_add_uint64(nvout,
+ (uint64_t)(activemap_ndirty(res->hr_amp) *
+ res->hr_extentsize), "dirty");
+ } else {
+ nv_add_uint32(nvout, (uint32_t)0, "keepdirty");
+ nv_add_uint64(nvout, (uint64_t)0, "dirty");
+ }
+ break;
+ default:
+ nv_add_int16(nvout, EINVAL, "error");
+ break;
+ }
+ if (nv_error(nvout) != 0) {
+ pjdlog_error("Unable to create answer on control message.");
+ nv_free(nvout);
+ continue; /* No reply is sent for this request. */
+ }
+ if (hast_proto_send(NULL, res->hr_ctrl, nvout, NULL, 0) < 0) {
+ pjdlog_errno(LOG_ERR,
+ "Unable to send reply to control message");
+ }
+ nv_free(nvout);
+ }
+ /* NOTREACHED */
+ return (NULL);
+}
diff --git a/sbin/hastd/control.h b/sbin/hastd/control.h
new file mode 100644
index 0000000..15ea290
--- /dev/null
+++ b/sbin/hastd/control.h
@@ -0,0 +1,44 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _CONTROL_H_
+#define _CONTROL_H_
+
+/*
+ * Control command codes.  ctrl_thread() answers HASTCTL_STATUS and replies
+ * with EINVAL in the "error" field to every other non-zero command.
+ * NOTE(review): hast.h defines HASTCTL_CMD_SETROLE/HASTCTL_CMD_STATUS with
+ * the same values - confirm which set is meant to be authoritative.
+ */
+#define HASTCTL_SET_ROLE 1
+#define HASTCTL_STATUS 2
+
+struct hastd_config;
+
+/* Handles a control connection; takes the daemon configuration. */
+void control_handle(struct hastd_config *cfg);
+
+/* Thread entry point; 'arg' is used as a struct hast_resource pointer. */
+void *ctrl_thread(void *arg);
+
+#endif /* !_CONTROL_H_ */
diff --git a/sbin/hastd/ebuf.c b/sbin/hastd/ebuf.c
new file mode 100644
index 0000000..47b7530
--- /dev/null
+++ b/sbin/hastd/ebuf.c
@@ -0,0 +1,252 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include "ebuf.h"
+
+/*
+ * Extensible buffer descriptor.
+ *
+ * The allocation spans [eb_start, eb_end); the valid data occupies
+ * [eb_used, eb_used + eb_size).  The free bytes before eb_used let data be
+ * prepended and the free bytes after the data let it be appended without
+ * reallocating on every call.
+ */
+#define EBUF_MAGIC 0xeb0f41c
+struct ebuf {
+ /* Magic to assert the caller uses valid structure. */
+ int eb_magic;
+ /* Address where we did the allocation. */
+ unsigned char *eb_start;
+ /* Allocation end address. */
+ unsigned char *eb_end;
+ /* Start of real data. */
+ unsigned char *eb_used;
+ /* Size of real data. */
+ size_t eb_size;
+};
+
+/* Helpers that enlarge the free room before/after the data. */
+static int ebuf_head_extent(struct ebuf *eb, size_t size);
+static int ebuf_tail_extent(struct ebuf *eb, size_t size);
+
+/*
+ * Create an empty buffer with room for 'size' bytes plus PAGE_SIZE of
+ * slack.  Returns NULL when either allocation fails; errno from the failed
+ * data allocation is preserved across the cleanup.
+ */
+struct ebuf *
+ebuf_alloc(size_t size)
+{
+	struct ebuf *eb;
+	unsigned char *mem;
+	size_t total;
+	int saved_errno;
+
+	eb = malloc(sizeof(*eb));
+	if (eb == NULL)
+		return (NULL);
+
+	total = size + PAGE_SIZE;
+	mem = malloc(total);
+	if (mem == NULL) {
+		saved_errno = errno;
+		free(eb);
+		errno = saved_errno;
+		return (NULL);
+	}
+
+	eb->eb_start = mem;
+	eb->eb_end = mem + total;
+	/*
+	 * Data does not start at the very beginning of the allocation, so
+	 * that there is room to prepend to it later.
+	 */
+	eb->eb_used = mem + PAGE_SIZE / 4;
+	eb->eb_size = 0;
+	eb->eb_magic = EBUF_MAGIC;
+
+	return (eb);
+}
+
+/*
+ * Release the data area and the descriptor itself.  The magic is reset
+ * before the memory is returned.
+ */
+void
+ebuf_free(struct ebuf *eb)
+{
+	unsigned char *mem;
+
+	assert(eb != NULL && eb->eb_magic == EBUF_MAGIC);
+
+	mem = eb->eb_start;
+	eb->eb_magic = 0;
+
+	free(mem);
+	free(eb);
+}
+
+/*
+ * Prepend 'size' bytes to the buffer.  When the free room in front of the
+ * data is too small the buffer is extended first.  A NULL 'data' only
+ * reserves the space.  Returns 0 on success, -1 if extending failed.
+ */
+int
+ebuf_add_head(struct ebuf *eb, const void *data, size_t size)
+{
+
+	assert(eb != NULL && eb->eb_magic == EBUF_MAGIC);
+
+	if ((size_t)(eb->eb_used - eb->eb_start) < size) {
+		/* Not enough free bytes before the data; grow the front. */
+		if (ebuf_head_extent(eb, size) < 0)
+			return (-1);
+	}
+	assert(size <= (size_t)(eb->eb_used - eb->eb_start));
+
+	eb->eb_used -= size;
+	eb->eb_size += size;
+	if (data == NULL) {
+		/* The caller only wanted the space reserved. */
+		return (0);
+	}
+	bcopy(data, eb->eb_used, size);
+
+	return (0);
+}
+
+/*
+ * Append 'size' bytes to the buffer.  When the free room behind the data
+ * is too small the buffer is extended first.  A NULL 'data' only reserves
+ * the space.  Returns 0 on success, -1 if extending failed.
+ */
+int
+ebuf_add_tail(struct ebuf *eb, const void *data, size_t size)
+{
+	unsigned char *tail;
+
+	assert(eb != NULL && eb->eb_magic == EBUF_MAGIC);
+
+	if ((size_t)(eb->eb_end - (eb->eb_used + eb->eb_size)) < size) {
+		/* Not enough free bytes after the data; grow the back. */
+		if (ebuf_tail_extent(eb, size) < 0)
+			return (-1);
+	}
+	assert(size <= (size_t)(eb->eb_end - (eb->eb_used + eb->eb_size)));
+
+	/* Address right behind the current data. */
+	tail = eb->eb_used + eb->eb_size;
+	if (data != NULL)
+		bcopy(data, tail, size);
+	eb->eb_size += size;
+
+	return (0);
+}
+
+/*
+ * Drop the first 'size' bytes of data.  Nothing is copied or released; the
+ * start of the data simply moves forward.  'size' must not exceed the
+ * amount of data stored.
+ */
+void
+ebuf_del_head(struct ebuf *eb, size_t size)
+{
+
+	assert(eb != NULL && eb->eb_magic == EBUF_MAGIC);
+	assert(size <= eb->eb_size);
+
+	eb->eb_size -= size;
+	eb->eb_used += size;
+}
+
+/*
+ * Drop the last 'size' bytes of data by shrinking the recorded data size.
+ * 'size' must not exceed the amount of data stored.
+ */
+void
+ebuf_del_tail(struct ebuf *eb, size_t size)
+{
+
+	assert(eb != NULL && eb->eb_magic == EBUF_MAGIC);
+	assert(size <= eb->eb_size);
+
+	eb->eb_size = eb->eb_size - size;
+}
+
+/*
+ * Give access to the stored data.  The data size is written to *sizep
+ * unless sizep is NULL.  Returns a pointer to the first data byte, or NULL
+ * when the buffer holds no data.
+ */
+void *
+ebuf_data(struct ebuf *eb, size_t *sizep)
+{
+
+	assert(eb != NULL && eb->eb_magic == EBUF_MAGIC);
+
+	if (sizep != NULL)
+		*sizep = eb->eb_size;
+	if (eb->eb_size == 0)
+		return (NULL);
+	return (eb->eb_used);
+}
+
+/*
+ * Report how many bytes of data the buffer currently holds.
+ */
+size_t
+ebuf_size(struct ebuf *eb)
+{
+	size_t stored;
+
+	assert(eb != NULL && eb->eb_magic == EBUF_MAGIC);
+
+	stored = eb->eb_size;
+	return (stored);
+}
+
+/*
+ * Function adds size + (PAGE_SIZE / 4) bytes at the front of the buffer.
+ *
+ * A new, larger area is allocated, the current data is copied into it at
+ * the same distance from the end as before, and the old area is released.
+ * Returns 0 on success and -1 if the allocation fails, in which case the
+ * buffer is left untouched.
+ */
+static int
+ebuf_head_extent(struct ebuf *eb, size_t size)
+{
+	unsigned char *newstart, *newused;
+	size_t newsize;
+
+	assert(eb != NULL && eb->eb_magic == EBUF_MAGIC);
+
+	newsize = eb->eb_end - eb->eb_start + (PAGE_SIZE / 4) + size;
+
+	newstart = malloc(newsize);
+	if (newstart == NULL)
+		return (-1);
+	/* The data moves back by exactly the number of bytes added. */
+	newused =
+	    newstart + (PAGE_SIZE / 4) + size + (eb->eb_used - eb->eb_start);
+
+	bcopy(eb->eb_used, newused, eb->eb_size);
+
+	/*
+	 * The data now lives in the new area; release the old one, which
+	 * would otherwise be leaked on every extension.
+	 */
+	free(eb->eb_start);
+
+	eb->eb_start = newstart;
+	eb->eb_used = newused;
+	eb->eb_end = newstart + newsize;
+
+	return (0);
+}
+
+/*
+ * Function adds size + ((3 * PAGE_SIZE) / 4) bytes at the back.
+ *
+ * The area is grown with realloc(), so the data keeps its offset from the
+ * start of the allocation.  Returns 0 on success and -1 if realloc()
+ * fails, in which case the buffer is left untouched.
+ */
+static int
+ebuf_tail_extent(struct ebuf *eb, size_t size)
+{
+	unsigned char *newstart;
+	size_t newsize, usedoff;
+
+	assert(eb != NULL && eb->eb_magic == EBUF_MAGIC);
+
+	newsize = eb->eb_end - eb->eb_start + size + ((3 * PAGE_SIZE) / 4);
+	/*
+	 * Remember where the data starts before calling realloc(): once it
+	 * succeeds the old pointers must not be used any more, not even for
+	 * pointer arithmetic.
+	 */
+	usedoff = (size_t)(eb->eb_used - eb->eb_start);
+
+	newstart = realloc(eb->eb_start, newsize);
+	if (newstart == NULL)
+		return (-1);
+
+	eb->eb_start = newstart;
+	eb->eb_used = newstart + usedoff;
+	eb->eb_end = newstart + newsize;
+
+	return (0);
+}
diff --git a/sbin/hastd/ebuf.h b/sbin/hastd/ebuf.h
new file mode 100644
index 0000000..06275e7
--- /dev/null
+++ b/sbin/hastd/ebuf.h
@@ -0,0 +1,51 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _EBUF_H_
+#define _EBUF_H_
+
+#include <stdlib.h> /* size_t */
+
+/* Opaque extensible buffer; the layout is private to ebuf.c. */
+struct ebuf;
+
+/* Create an empty buffer sized for 'size' bytes / destroy a buffer. */
+struct ebuf *ebuf_alloc(size_t size);
+void ebuf_free(struct ebuf *eb);
+
+/*
+ * Prepend/append 'size' bytes; a NULL 'data' only reserves the space.
+ * Both return 0 on success and -1 when the buffer cannot be extended.
+ */
+int ebuf_add_head(struct ebuf *eb, const void *data, size_t size);
+int ebuf_add_tail(struct ebuf *eb, const void *data, size_t size);
+
+/* Drop 'size' bytes from the front/back; 'size' must not exceed the data. */
+void ebuf_del_head(struct ebuf *eb, size_t size);
+void ebuf_del_tail(struct ebuf *eb, size_t size);
+
+/*
+ * ebuf_data() returns the data pointer (NULL when the buffer is empty) and
+ * stores the data size in *sizep if sizep is not NULL; ebuf_size() returns
+ * the data size only.
+ */
+void *ebuf_data(struct ebuf *eb, size_t *sizep);
+size_t ebuf_size(struct ebuf *eb);
+
+#endif /* !_EBUF_H_ */
diff --git a/sbin/hastd/hast.conf.5 b/sbin/hastd/hast.conf.5
new file mode 100644
index 0000000..5734ee8
--- /dev/null
+++ b/sbin/hastd/hast.conf.5
@@ -0,0 +1,267 @@
+.\" Copyright (c) 2010 The FreeBSD Foundation
+.\" All rights reserved.
+.\"
+.\" This software was developed by Pawel Jakub Dawidek under sponsorship from
+.\" the FreeBSD Foundation.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd February 1, 2010
+.Dt HAST.CONF 5
+.Os
+.Sh NAME
+.Nm hast.conf
+.Nd configuration file for the
+.Xr hastd 8
+daemon and the
+.Xr hastctl 8
+utility.
+.Sh DESCRIPTION
+The
+.Nm
+file is used by both
+.Xr hastd 8
+daemon
+and
+.Xr hastctl 8
+control utility.
+The configuration file is designed so that exactly the same file can be
+(and should be) used on both HAST nodes.
+Every line starting with # is treated as comment and ignored.
+.Sh CONFIGURATION FILE SYNTAX
+The general syntax of the
+.Nm
+file is as follows:
+.Bd -literal -offset indent
+# Global section
+control <addr>
+listen <addr>
+replication <mode>
+
+on <node> {
+ # Node section
+ control <addr>
+ listen <addr>
+}
+
+on <node> {
+ # Node section
+ control <addr>
+ listen <addr>
+}
+
+resource <name> {
+ # Resource section
+ replication <mode>
+ name <name>
+ local <path>
+
+ on <node> {
+ # Resource-node section
+ name <name>
+ # Required
+ local <path>
+ # Required
+ remote <addr>
+ }
+ on <node> {
+ # Resource-node section
+ name <name>
+ # Required
+ local <path>
+ # Required
+ remote <addr>
+ }
+}
+.Ed
+.Pp
+Most of the available configuration parameters are optional.
+If a parameter is not defined in a particular section, it will be
+inherited from the parent section.
+For example, if the
+.Ic listen
+parameter is not defined in the node section, it will be inherited from
+the global section.
+In case the global section does not define the
+.Ic listen
+parameter at all, the default value will be used.
+.Sh CONFIGURATION FILE DESCRIPTION
+The
+.Aq node
+argument can be replaced either by a full hostname as obtained by
+.Xr gethostname 3 ,
+only the first part of the hostname, or by the node's UUID as found in the
+.Va kern.hostuuid
+.Xr sysctl 8
+variable.
+.Pp
+The following statements are available:
+.Bl -tag -width ".Ic xxxx"
+.It Ic control Aq addr
+.Pp
+Address for communication with
+.Xr hastctl 8 .
+Each of the following examples defines the same control address:
+.Bd -literal -offset indent
+uds:///var/run/hastctl
+unix:///var/run/hastctl
+/var/run/hastctl
+.Ed
+.Pp
+The default value is
+.Pa uds:///var/run/hastctl .
+.It Ic listen Aq addr
+.Pp
+Address to listen on in form of:
+.Bd -literal -offset indent
+protocol://protocol-specific-address
+.Ed
+.Pp
+Each of the following examples defines the same listen address:
+.Bd -literal -offset indent
+0.0.0.0
+0.0.0.0:8457
+tcp://0.0.0.0
+tcp://0.0.0.0:8457
+tcp4://0.0.0.0
+tcp4://0.0.0.0:8457
+.Ed
+.Pp
+The default value is
+.Pa tcp4://0.0.0.0:8457 .
+.It Ic replication Aq mode
+.Pp
+Replication mode should be one of the following:
+.Bl -tag -width ".Ic xxxx"
+.It Ic memsync
+.Pp
+Report the write operation as completed when local write completes and
+when the remote node acknowledges the data receipt, but before it
+actually stores the data.
+The data on remote node will be stored directly after sending
+acknowledgement.
+This mode is intended to reduce latency, but still provides very good
+reliability.
+The only situation where some small amount of data could be lost is when
+the data is stored on primary node and sent to the secondary.
+Secondary node then acknowledges data receipt and primary reports
+success to an application.
+However, it may happen that the secondary goes down before the received
+data is really stored locally.
+Before secondary node returns, primary node dies entirely.
+When the secondary node comes back to life it becomes the new primary.
+Unfortunately some small amount of data which was confirmed to be stored
+to the application was lost.
+The risk of such a situation is very small, which is the reason for this
+mode to be the default.
+.It Ic fullsync
+.Pp
+Mark the write operation as completed when local as well as remote
+write completes.
+This is the safest and the slowest replication mode.
+The
+.Ic fullsync
+replication mode is currently not implemented.
+.It Ic async
+.Pp
+The write operation is reported as complete right after the local write
+completes.
+This is the fastest and the most dangerous replication mode.
+This mode should be used when replicating to a distant node where
+latency is too high for other modes.
+The
+.Ic async
+replication mode is currently not implemented.
+.El
+.It Ic name Aq name
+.Pp
+GEOM provider name that will appear as
+.Pa /dev/hast/<name> .
+If name is not defined, resource name will be used as provider name.
+.It Ic local Aq path
+.Pp
+Path to the local component which will be used as backend provider for
+the resource.
+This can be either GEOM provider or regular file.
+.It Ic remote Aq addr
+.Pp
+Address of the remote
+.Nm hastd
+daemon.
+Format is the same as for the
+.Ic listen
+statement.
+When operating as a primary node this address will be used to connect to
+the secondary node.
+When operating as a secondary node only connections from this address
+will be accepted.
+.El
+.Sh EXAMPLES
+The example configuration file can look as follows:
+.Bd -literal -offset indent
+resource shared {
+ local /dev/da0
+
+ on hasta {
+ remote tcp4://10.0.0.2
+ }
+ on hastb {
+ remote tcp4://10.0.0.1
+ }
+}
+resource tank {
+ on hasta {
+ local /dev/mirror/tanka
+ remote tcp4://10.0.0.2
+ }
+ on hastb {
+ local /dev/mirror/tankb
+ remote tcp4://10.0.0.1
+ }
+}
+.Ed
+.Sh FILES
+.Bl -tag -width ".Pa /var/run/hastctl" -compact
+.It Pa /etc/hast.conf
+The default
+.Nm
+configuration file.
+.It Pa /var/run/hastctl
+Control socket used by the
+.Xr hastctl 8
+control utility to communicate with the
+.Xr hastd 8
+daemon.
+.El
+.Sh SEE ALSO
+.Xr gethostname 3 ,
+.Xr geom 4 ,
+.Xr hastctl 8 ,
+.Xr hastd 8 .
+.Sh AUTHORS
+The
+.Nm
+was written by
+.An Pawel Jakub Dawidek Aq pjd@FreeBSD.org
+under sponsorship of the FreeBSD Foundation.
diff --git a/sbin/hastd/hast.h b/sbin/hastd/hast.h
new file mode 100644
index 0000000..c5220b5
--- /dev/null
+++ b/sbin/hastd/hast.h
@@ -0,0 +1,190 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HAST_H_
+#define _HAST_H_
+
+#include <sys/queue.h>
+#include <sys/socket.h>
+
+#include <arpa/inet.h>
+
+#include <netinet/in.h>
+
+#include <limits.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+#include <activemap.h>
+
+#include "proto.h"
+
+/* Version stored in the main header of every message (see hast_proto.c). */
+#define HAST_PROTO_VERSION 0
+
+/* HAST-specific error codes. */
+#define EHAST_OK 0
+#define EHAST_NOENTRY 1
+#define EHAST_INVALID 2
+#define EHAST_NOMEMORY 3
+#define EHAST_UNIMPLEMENTED 4
+
+/* Control command codes. */
+#define HASTCTL_CMD_UNKNOWN 0
+#define HASTCTL_CMD_SETROLE 1
+#define HASTCTL_CMD_STATUS 2
+
+/* Resource roles (hr_role, hr_previous_role). */
+#define HAST_ROLE_UNDEF 0
+#define HAST_ROLE_INIT 1
+#define HAST_ROLE_PRIMARY 2
+#define HAST_ROLE_SECONDARY 3
+
+/* Synchronization source (hr_syncsrc). */
+#define HAST_SYNCSRC_UNDEF 0
+#define HAST_SYNCSRC_PRIMARY 1
+#define HAST_SYNCSRC_SECONDARY 2
+
+/* I/O request kinds - NOTE(review): users are outside this header. */
+#define HIO_UNDEF 0
+#define HIO_READ 1
+#define HIO_WRITE 2
+#define HIO_DELETE 3
+#define HIO_FLUSH 4
+
+/* Default paths and addresses; hast.conf(5) documents the same values. */
+#define HAST_CONFIG "/etc/hast.conf"
+#define HAST_CONTROL "/var/run/hastctl"
+#define HASTD_PORT 8457
+#define HASTD_LISTEN "tcp4://0.0.0.0:8457"
+#define HASTD_PIDFILE "/var/run/hastd.pid"
+
+/* Default extent size. */
+#define HAST_EXTENTSIZE 2097152
+/* Default maximum number of extents that are kept dirty. */
+#define HAST_KEEPDIRTY 64
+
+/* Size of the address buffers and of the connection token (hr_token). */
+#define HAST_ADDRSIZE 1024
+#define HAST_TOKEN_SIZE 16
+
+/*
+ * Daemon-wide configuration: the control and listen endpoints together
+ * with the list of configured resources.
+ */
+struct hastd_config {
+ /* Address to communicate with hastctl(8). */
+ char hc_controladdr[HAST_ADDRSIZE];
+ /* Protocol-specific data. */
+ struct proto_conn *hc_controlconn;
+ /* Address to listen on. */
+ char hc_listenaddr[HAST_ADDRSIZE];
+ /* Protocol-specific data. */
+ struct proto_conn *hc_listenconn;
+ /* List of resources. */
+ TAILQ_HEAD(, hast_resource) hc_resources;
+};
+
+/* Replication modes stored in hr_replication; see hast.conf(5). */
+#define HAST_REPLICATION_FULLSYNC 0
+#define HAST_REPLICATION_MEMSYNC 1
+#define HAST_REPLICATION_ASYNC 2
+
+/*
+ * Structure that describes single resource.
+ * Resources are linked into hastd_config.hc_resources through hr_next.
+ */
+struct hast_resource {
+ /* Resource name. */
+ char hr_name[NAME_MAX];
+ /* Replication mode (HAST_REPLICATION_*). */
+ int hr_replication;
+ /* Provider name that will appear in /dev/hast/. */
+ char hr_provname[NAME_MAX];
+ /* Synchronization extent size. */
+ int hr_extentsize;
+ /* Maximum number of extents that are kept dirty. */
+ int hr_keepdirty;
+
+ /* Path to local component. */
+ char hr_localpath[PATH_MAX];
+ /* Descriptor to access local component. */
+ int hr_localfd;
+ /* Offset into local component. */
+ off_t hr_localoff;
+ /* Size of usable space. */
+ off_t hr_datasize;
+ /* Size of entire local provider. */
+ off_t hr_local_mediasize;
+ /* Sector size of local provider. */
+ unsigned int hr_local_sectorsize;
+
+ /* Descriptor for /dev/ggctl communication. */
+ int hr_ggatefd;
+ /* Unit number for ggate communication. */
+ int hr_ggateunit;
+
+ /* Address of the remote component. */
+ char hr_remoteaddr[HAST_ADDRSIZE];
+ /* Connection for incoming data. */
+ struct proto_conn *hr_remotein;
+ /* Connection for outgoing data. */
+ struct proto_conn *hr_remoteout;
+ /* Token to verify both in and out connection are coming from
+ the same node (not necessarily from the same address). */
+ unsigned char hr_token[HAST_TOKEN_SIZE];
+
+ /* Resource unique identifier. */
+ uint64_t hr_resuid;
+ /* Primary's local modification count. */
+ uint64_t hr_primary_localcnt;
+ /* Primary's remote modification count. */
+ uint64_t hr_primary_remotecnt;
+ /* Secondary's local modification count. */
+ uint64_t hr_secondary_localcnt;
+ /* Secondary's remote modification count. */
+ uint64_t hr_secondary_remotecnt;
+ /* Synchronization source (HAST_SYNCSRC_*). */
+ uint8_t hr_syncsrc;
+
+ /* Resource role: HAST_ROLE_{INIT,PRIMARY,SECONDARY}. */
+ int hr_role;
+ /* Previous resource role: HAST_ROLE_{INIT,PRIMARY,SECONDARY}. */
+ int hr_previous_role;
+ /* PID of child worker process. 0 - no child. */
+ pid_t hr_workerpid;
+ /* Control connection between parent and child. */
+ struct proto_conn *hr_ctrl;
+
+ /* Activemap structure. */
+ struct activemap *hr_amp;
+ /* Lock used to synchronize access to hr_amp. */
+ pthread_mutex_t hr_amp_lock;
+
+ /* Next resource. */
+ TAILQ_ENTRY(hast_resource) hr_next;
+};
+
+/*
+ * Configuration file parsing: yy_config_parse() takes the path of the
+ * configuration file and returns the resulting configuration, which is
+ * released with yy_config_free().
+ * NOTE(review): presumably implemented in parse.y - confirm there.
+ */
+struct hastd_config *yy_config_parse(const char *config);
+void yy_config_free(struct hastd_config *config);
+
+/* Parser and lexer entry points conventionally provided by yacc/lex. */
+void yyerror(const char *);
+int yylex(void);
+int yyparse(void);
+
+#endif /* !_HAST_H_ */
diff --git a/sbin/hastd/hast_proto.c b/sbin/hastd/hast_proto.c
new file mode 100644
index 0000000..6e66006
--- /dev/null
+++ b/sbin/hastd/hast_proto.c
@@ -0,0 +1,401 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/endian.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <string.h>
+#include <strings.h>
+
+#include <openssl/sha.h>
+
+#include <hast.h>
+#include <ebuf.h>
+#include <nv.h>
+#include <pjdlog.h>
+#include <proto.h>
+
+#include "hast_proto.h"
+
+/*
+ * Fixed header that precedes every message.  hast_proto_send() puts it in
+ * front of the encoded nv headers and hast_proto_recv_hdr() reads it first.
+ * 'size' is converted with htole32()/le32toh() around transmission.
+ */
+struct hast_main_header {
+ /* Protocol version. */
+ uint8_t version;
+ /* Size of nv headers. */
+ uint32_t size;
+} __packed;
+
+/*
+ * Signature of a pipeline stage handler.  A stage receives the resource,
+ * the nv headers and the data (*datap, *sizep) and may replace the data
+ * with a newly allocated buffer; *freedatap then tells the caller that the
+ * buffer has to be freed.  Handlers return 0 on success and -1 on failure.
+ */
+typedef int hps_send_t(struct hast_resource *, struct nv *nv, void **, size_t *, bool *);
+typedef int hps_recv_t(struct hast_resource *, struct nv *nv, void **, size_t *, bool *);
+
+/* One data processing stage: its name plus the send and receive handlers. */
+struct hast_pipe_stage {
+ const char *hps_name;
+ hps_send_t *hps_send;
+ hps_recv_t *hps_recv;
+};
+
+static int compression_send(struct hast_resource *res, struct nv *nv,
+ void **datap, size_t *sizep, bool *freedatap);
+static int compression_recv(struct hast_resource *res, struct nv *nv,
+ void **datap, size_t *sizep, bool *freedatap);
+static int checksum_send(struct hast_resource *res, struct nv *nv,
+ void **datap, size_t *sizep, bool *freedatap);
+static int checksum_recv(struct hast_resource *res, struct nv *nv,
+ void **datap, size_t *sizep, bool *freedatap);
+
+/*
+ * Stages in the order they run on send; hast_proto_recv_data() walks the
+ * table backwards.  Both loops currently sit inside "if (false)" blocks.
+ */
+static struct hast_pipe_stage pipeline[] = {
+ { "compression", compression_send, compression_recv },
+ { "checksum", checksum_send, checksum_recv }
+};
+
+/*
+ * Send-side "compression" stage.
+ *
+ * TODO: For now compression is only emulated.  With 80% probability the
+ * data is copied into a new buffer that starts with the original size as a
+ * little-endian 32-bit value, "compression" = "null" is added to the
+ * headers and *freedatap is set to true.  Otherwise everything is left as
+ * it was, since it is not critical for compression to succeed.
+ *
+ * Returns 0 on success, -1 when memory allocation or adding the header
+ * fails.
+ */
+static int
+compression_send(struct hast_resource *res, struct nv *nv, void **datap,
+    size_t *sizep, bool *freedatap)
+{
+	unsigned char *packed;
+	uint32_t lesize;
+	size_t plain;
+
+	res = res; /* TODO */
+
+	if (arc4random_uniform(100) >= 80) {
+		/* Emulated compression failure: nothing changes. */
+		return (0);
+	}
+
+	/*
+	 * Emulated success (the data grows by 4 bytes instead of
+	 * shrinking, for now).
+	 */
+	plain = *sizep;
+	packed = malloc(sizeof(uint32_t) + plain);
+	if (packed == NULL)
+		return (-1);
+	lesize = htole32((uint32_t)plain);
+	memcpy(packed, &lesize, sizeof(lesize));
+	nv_add_string(nv, "null", "compression");
+	if (nv_error(nv) != 0) {
+		free(packed);
+		errno = nv_error(nv);
+		return (-1);
+	}
+	bcopy(*datap, packed + sizeof(uint32_t), plain);
+	if (*freedatap)
+		free(*datap);
+	*freedatap = true;
+	*datap = packed;
+	*sizep = sizeof(uint32_t) + plain;
+
+	return (0);
+}
+
+/*
+ * Receive-side "compression" stage.
+ *
+ * TODO: For now compression is only emulated.  When the headers carry
+ * "compression" = "null", the data starts with the original size as a
+ * little-endian 32-bit value followed by the payload; the payload is
+ * copied into a new buffer that replaces *datap and *freedatap is set to
+ * true.  Without a "compression" header the data is left alone.
+ *
+ * The size prefix comes from the peer, so it is checked against the number
+ * of bytes actually available (*sizep) before anything is copied.
+ *
+ * Returns 0 on success and -1 on unknown algorithm, malformed data (errno
+ * set to EINVAL) or memory allocation failure.
+ */
+static int
+compression_recv(struct hast_resource *res, struct nv *nv, void **datap,
+    size_t *sizep, bool *freedatap)
+{
+	unsigned char *newbuf;
+	const char *algo;
+	uint32_t lesize;
+	size_t origsize;
+
+	res = res; /* TODO */
+
+	algo = nv_get_string(nv, "compression");
+	if (algo == NULL)
+		return (0);	/* No compression. */
+	if (strcmp(algo, "null") != 0) {
+		pjdlog_error("Unknown compression algorithm '%s'.", algo);
+		return (-1);	/* Unknown compression algorithm. */
+	}
+
+	if (*sizep < sizeof(lesize)) {
+		pjdlog_error("Compressed data is too short (%zu bytes).",
+		    *sizep);
+		errno = EINVAL;
+		return (-1);	/* No room for the size prefix. */
+	}
+	/* The buffer may be unaligned, so don't read through a cast. */
+	memcpy(&lesize, *datap, sizeof(lesize));
+	origsize = le32toh(lesize);
+	if (origsize > *sizep - sizeof(lesize)) {
+		pjdlog_error("Invalid original size (%zu) in compressed data.",
+		    origsize);
+		errno = EINVAL;
+		return (-1);	/* Prefix claims more than was received. */
+	}
+
+	newbuf = malloc(origsize);
+	if (newbuf == NULL)
+		return (-1);
+	bcopy((unsigned char *)*datap + sizeof(uint32_t), newbuf, origsize);
+	if (*freedatap)
+		free(*datap);
+	*freedatap = true;
+	*datap = newbuf;
+	*sizep = origsize;
+
+	return (0);
+}
+
+/*
+ * Send-side checksum stage: computes the SHA256 digest of the data and
+ * adds it to the headers as "checksum" = "sha256" and "hash".  The data
+ * itself is not modified.  Always returns 0; a failure to add the headers
+ * stays recorded in the nv structure.
+ */
+static int
+checksum_send(struct hast_resource *res, struct nv *nv, void **datap,
+    size_t *sizep, bool *freedatap __unused)
+{
+	unsigned char digest[SHA256_DIGEST_LENGTH];
+	SHA256_CTX sha;
+
+	res = res; /* TODO */
+
+	SHA256_Init(&sha);
+	SHA256_Update(&sha, *datap, *sizep);
+	SHA256_Final(digest, &sha);
+
+	nv_add_string(nv, "sha256", "checksum");
+	nv_add_uint8_array(nv, digest, sizeof(digest), "hash");
+
+	return (0);
+}
+
+/*
+ * Receive-side checksum stage: when the headers announce a checksum, the
+ * SHA256 digest of the received data is computed and compared with the
+ * "hash" header.
+ *
+ * Returns 0 when no checksum was sent or when the digests match, and -1 on
+ * unknown algorithm, missing hash, wrong hash size or digest mismatch.
+ */
+static int
+checksum_recv(struct hast_resource *res, struct nv *nv, void **datap,
+    size_t *sizep, bool *freedatap __unused)
+{
+	unsigned char computed[SHA256_DIGEST_LENGTH];
+	const unsigned char *received;
+	const char *algo;
+	size_t rsize;
+	SHA256_CTX sha;
+
+	res = res; /* TODO */
+
+	algo = nv_get_string(nv, "checksum");
+	if (algo == NULL) {
+		/* No checksum. */
+		return (0);
+	}
+	if (strcmp(algo, "sha256") != 0) {
+		/* Unknown checksum algorithm. */
+		pjdlog_error("Unknown checksum algorithm '%s'.", algo);
+		return (-1);
+	}
+	received = nv_get_uint8_array(nv, &rsize, "hash");
+	if (received == NULL) {
+		/* Hash not found. */
+		pjdlog_error("Checksum algorithm is present, but hash is missing.");
+		return (-1);
+	}
+	if (rsize != sizeof(computed)) {
+		/* Different hash size. */
+		pjdlog_error("Invalid hash size (%zu) for %s, should be %zu.",
+		    rsize, algo, sizeof(computed));
+		return (-1);
+	}
+
+	SHA256_Init(&sha);
+	SHA256_Update(&sha, *datap, *sizep);
+	SHA256_Final(computed, &sha);
+
+	if (memcmp(received, computed, sizeof(computed)) != 0) {
+		/* Hash mismatch. */
+		pjdlog_error("Hash mismatch.");
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Send the given nv structure via conn.
+ * The headers travel in the nv structure, the data in a separate argument.
+ * There may be no data at all, in which case 'data' is NULL.
+ *
+ * When data is present its size is added to the headers as "size".  The
+ * main header, the encoded nv headers and then the data are written to
+ * the connection.  Returns 0 on success and -1 on failure.
+ */
+int
+hast_proto_send(struct hast_resource *res, struct proto_conn *conn,
+    struct nv *nv, const void *data, size_t size)
+{
+	struct hast_main_header hdr;
+	struct ebuf *eb;
+	void *payload, *hptr;
+	size_t hsize;
+	bool freedata;
+	int ret;
+
+	payload = (void *)(uintptr_t)data;
+	freedata = false;
+	ret = -1;
+
+	if (data != NULL) {
+		/* The processing pipeline is currently switched off. */
+		if (false) {
+			unsigned int ii;
+
+			for (ii = 0;
+			    ii < sizeof(pipeline) / sizeof(pipeline[0]);
+			    ii++) {
+				ret = pipeline[ii].hps_send(res, nv, &payload,
+				    &size, &freedata);
+				if (ret == -1)
+					goto end;
+			}
+			ret = -1;
+		}
+		nv_add_uint32(nv, size, "size");
+		if (nv_error(nv) != 0) {
+			errno = nv_error(nv);
+			goto end;
+		}
+	}
+
+	eb = nv_hton(nv);
+	if (eb == NULL)
+		goto end;
+
+	/* Put the main header in front of the encoded nv headers. */
+	hdr.version = HAST_PROTO_VERSION;
+	hdr.size = htole32((uint32_t)ebuf_size(eb));
+	if (ebuf_add_head(eb, &hdr, sizeof(hdr)) < 0)
+		goto end;
+
+	hptr = ebuf_data(eb, &hsize);
+	if (proto_send(conn, hptr, hsize) < 0)
+		goto end;
+	if (data != NULL) {
+		if (proto_send(conn, payload, size) < 0)
+			goto end;
+	}
+
+	ret = 0;
+end:
+	if (freedata)
+		free(payload);
+	return (ret);
+}
+
+/*
+ * Receive the main header and the nv headers that follow it.
+ *
+ * On success the decoded headers are stored in *nvp and 0 is returned.  On
+ * failure -1 is returned and everything allocated here is released.  A
+ * protocol version mismatch sets errno to ERPCMISMATCH; a message that
+ * announces no headers at all sets errno to EINVAL.
+ */
+int
+hast_proto_recv_hdr(struct proto_conn *conn, struct nv **nvp)
+{
+	struct hast_main_header hdr;
+	struct nv *nv;
+	struct ebuf *eb;
+	void *hptr;
+
+	eb = NULL;
+	nv = NULL;
+
+	if (proto_recv(conn, &hdr, sizeof(hdr)) < 0)
+		goto fail;
+
+	if (hdr.version != HAST_PROTO_VERSION) {
+		errno = ERPCMISMATCH;
+		goto fail;
+	}
+
+	hdr.size = le32toh(hdr.size);
+	if (hdr.size == 0) {
+		/*
+		 * ebuf_data() returns NULL for an empty buffer, so a peer
+		 * announcing zero bytes of headers would trip the assertion
+		 * below.  Treat such a message as invalid instead.
+		 */
+		errno = EINVAL;
+		goto fail;
+	}
+
+	eb = ebuf_alloc(hdr.size);
+	if (eb == NULL)
+		goto fail;
+	/* Reserve the space; the headers are received straight into it. */
+	if (ebuf_add_tail(eb, NULL, hdr.size) < 0)
+		goto fail;
+	hptr = ebuf_data(eb, NULL);
+	assert(hptr != NULL);
+	if (proto_recv(conn, hptr, hdr.size) < 0)
+		goto fail;
+	nv = nv_ntoh(eb);
+	if (nv == NULL)
+		goto fail;
+
+	*nvp = nv;
+	return (0);
+fail:
+	if (nv != NULL)
+		nv_free(nv);
+	else if (eb != NULL)
+		ebuf_free(eb);
+	return (-1);
+}
+
+/*
+ * Receive the data that belongs to already received headers.
+ *
+ * The amount of data is taken from the "size" header.  'data' is the
+ * caller's buffer and 'size' its capacity; a message announcing more data
+ * than fits is rejected with errno set to EINVAL instead of overflowing
+ * the buffer.  A "size" of 0 means there is no data: the nv error is reset
+ * and nothing is read.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+int
+hast_proto_recv_data(struct hast_resource *res, struct proto_conn *conn,
+    struct nv *nv, void *data, size_t size)
+{
+	unsigned int ii;
+	bool freedata;
+	size_t dsize;
+	void *dptr;
+	int ret;
+
+	assert(data != NULL);
+	assert(size > 0);
+
+	ret = -1;
+	freedata = false;
+	dptr = data;
+
+	dsize = nv_get_uint32(nv, "size");
+	if (dsize == 0) {
+		(void)nv_set_error(nv, 0);
+	} else if (dsize > size) {
+		/* The peer announced more data than the buffer can hold. */
+		errno = EINVAL;
+		goto end;
+	} else {
+		if (proto_recv(conn, data, dsize) < 0)
+			goto end;
+		/* The processing pipeline is currently switched off. */
+		if (false) {
+			for (ii = sizeof(pipeline) / sizeof(pipeline[0]);
+			    ii > 0; ii--) {
+				assert(!"to be verified");
+				ret = pipeline[ii - 1].hps_recv(res, nv, &dptr,
+				    &dsize, &freedata);
+				if (ret == -1)
+					goto end;
+			}
+			ret = -1;
+			if (dsize < size)
+				goto end;
+			/* TODO: 'size' doesn't seem right here. It is maximum data size. */
+			if (dptr != data)
+				bcopy(dptr, data, dsize);
+		}
+	}
+
+	ret = 0;
+end:
+	/* NOTE(review): debugging output left in by the original code. */
+	if (ret < 0) printf("%s:%u %s\n", __func__, __LINE__, strerror(errno));
+	if (freedata)
+		free(dptr);
+	return (ret);
+}
+
+/*
+ * Receive a complete message: the headers and, when the "size" header is
+ * not 0, the data that follows them ('data' and 'size' are handed to
+ * hast_proto_recv_data()).
+ *
+ * On success the headers are stored in *nvp.  On failure a negative value
+ * is returned and the headers are released.
+ */
+int
+hast_proto_recv(struct hast_resource *res, struct proto_conn *conn,
+    struct nv **nvp, void *data, size_t size)
+{
+	struct nv *nv;
+	int ret;
+
+	ret = hast_proto_recv_hdr(conn, &nv);
+	if (ret < 0)
+		return (ret);
+
+	if (nv_get_uint32(nv, "size") == 0) {
+		/* No data follows the headers. */
+		(void)nv_set_error(nv, 0);
+	} else {
+		ret = hast_proto_recv_data(res, conn, nv, data, size);
+		if (ret < 0) {
+			nv_free(nv);
+			return (ret);
+		}
+	}
+
+	*nvp = nv;
+	return (ret);
+}
diff --git a/sbin/hastd/hast_proto.h b/sbin/hastd/hast_proto.h
new file mode 100644
index 0000000..3894e38
--- /dev/null
+++ b/sbin/hastd/hast_proto.h
@@ -0,0 +1,48 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HAST_PROTO_H_
+#define _HAST_PROTO_H_
+
+#include <stdlib.h> /* size_t */
+
+#include <nv.h>
+#include <proto.h>
+
+/*
+ * Send header 'nv' followed by 'size' bytes of 'data' over 'conn'.
+ * NOTE(review): the implementation is outside this view; callers pass NULL
+ * for 'res' and 'data' with size 0 to send a header only and treat a
+ * negative return value as failure.
+ */
+int hast_proto_send(struct hast_resource *res, struct proto_conn *conn,
+ struct nv *nv, const void *data, size_t size);
+/*
+ * Receive a header and, when its "size" field is non-zero, the payload into
+ * 'data'.  On success the header is stored in '*nvp'; on failure a negative
+ * value is returned and the header is freed.
+ */
+int hast_proto_recv(struct hast_resource *res, struct proto_conn *conn,
+ struct nv **nvp, void *data, size_t size);
+/*
+ * Receive only the header.  Returns 0 and stores the header in '*nvp' on
+ * success, -1 on failure.
+ */
+int hast_proto_recv_hdr(struct proto_conn *conn, struct nv **nvp);
+/*
+ * Receive the payload described by the "size" field of an already received
+ * header 'nv' into 'data'.  Returns 0 on success, -1 on failure.
+ */
+int hast_proto_recv_data(struct hast_resource *res, struct proto_conn *conn,
+ struct nv *nv, void *data, size_t size);
+
+#endif /* !_HAST_PROTO_H_ */
diff --git a/sbin/hastd/hastd.8 b/sbin/hastd/hastd.8
new file mode 100644
index 0000000..276b3d3
--- /dev/null
+++ b/sbin/hastd/hastd.8
@@ -0,0 +1,232 @@
+.\" Copyright (c) 2010 The FreeBSD Foundation
+.\" All rights reserved.
+.\"
+.\" This software was developed by Pawel Jakub Dawidek under sponsorship from
+.\" the FreeBSD Foundation.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd February 1, 2010
+.Dt HASTD 8
+.Os
+.Sh NAME
+.Nm hastd
+.Nd "Highly Available Storage daemon"
+.Sh SYNOPSIS
+.Nm
+.Op Fl dFh
+.Op Fl c Ar config
+.Op Fl P Ar pidfile
+.Sh DESCRIPTION
+The
+.Nm
+daemon is responsible for managing highly available GEOM providers.
+.Pp
+.Nm
+allows for transparent storage of data on two physically separated machines
+connected over the TCP/IP network.
+Only one machine (cluster node) can actively use storage provided by
+.Nm .
+This machine is called primary.
+The
+.Nm
+daemon operates on block level, which makes it transparent for file
+systems and applications.
+.Pp
+There is one main
+.Nm
+daemon which starts a new worker process as soon as the role for the given
+resource is changed to primary, or as soon as the role for the given
+resource is changed to secondary and the remote (primary) node
+successfully connects to it.
+Every worker process gets a new process title (see
+.Xr setproctitle 3 ) ,
+which describes its role and resource it controls.
+The exact format is:
+.Bd -literal -offset indent
+hastd: <resource name> (<role>)
+.Ed
+.Pp
+When (and only when)
+.Nm
+operates in primary role for the given resource, corresponding
+.Pa /dev/hast/<name>
+disk-like device (GEOM provider) is created.
+File systems and applications can use this provider to send I/O
+requests to.
+Every write, delete and flush operation
+.Dv ( BIO_WRITE , BIO_DELETE , BIO_FLUSH )
+is sent to the local component and synchronously replicated
+to the remote (secondary) node if it is available.
+Read operations
+.Dv ( BIO_READ )
+are handled locally unless I/O error occurs or local version of the data
+is not up-to-date yet (synchronization is in progress).
+.Pp
+The
+.Nm
+daemon uses the GEOM Gate class to receive I/O requests from the
+in-kernel GEOM infrastructure.
+The
+.Nm geom_gate.ko
+module is loaded automatically if the kernel was not compiled with the
+following option:
+.Bd -ragged -offset indent
+.Cd "options GEOM_GATE"
+.Ed
+.Pp
+The connection between two
+.Nm
+daemons is always initiated from the one running as primary to the one
+running as secondary.
+When primary
+.Nm
+is unable to connect or connection fails, it will try to re-establish
+connection every few seconds.
+Once connection is established, primary
+.Nm
+will synchronize every extent that was modified during connection outage
+to the secondary
+.Nm .
+.Pp
+It is possible that in case of connection outage between the nodes
+.Nm
+primary role for the given resource will be configured on both nodes.
+This in turn leads to incompatible data modifications.
+Such condition is called split-brain and cannot be automatically
+resolved by the
+.Nm
+daemon as this would most likely lead to data corruption or loss of
+important changes.
+Even though it cannot be fixed by
+.Nm
+itself, it will be detected and further connection between independently
+modified nodes will not be possible.
+Once this situation is manually resolved by an administrator, resource
+on one of the nodes can be initialized (erasing local data), which makes
+connection to the remote node possible again.
+Connection of freshly initialized component will trigger full resource
+synchronization.
+.Pp
+The
+.Nm
+daemon itself never picks its role automatically.
+The role has to be configured with the
+.Xr hastctl 8
+control utility by additional software like
+.Nm ucarp
+or
+.Nm heartbeat
+that can reliably manage role separation and switch secondary node to
+primary role in case of original primary failure.
+.Pp
+The
+.Nm
+daemon can be started with the following command line arguments:
+.Bl -tag -width ".Fl P Ar pidfile"
+.It Fl c Ar config
+Specify alternative location of the configuration file.
+The default location is
+.Pa /etc/hast.conf .
+.It Fl d
+Print or log debugging information.
+This option can be specified multiple times to raise the verbosity
+level.
+.It Fl F
+Start the
+.Nm
+daemon in the foreground.
+By default
+.Nm
+starts in the background.
+.It Fl h
+Print the
+.Nm
+usage message.
+.It Fl P Ar pidfile
+Specify alternative location of a file where main process PID will be
+stored.
+The default location is
+.Pa /var/run/hastd.pid .
+.El
+.Sh EXIT STATUS
+Exit status is 0 on success, or one of the values described in
+.Xr sysexits 3
+on failure.
+.Sh EXAMPLES
+Launch
+.Nm
+on both nodes.
+Set role for resource
+.Nm shared
+to primary on
+.Nm nodeA
+and to secondary on
+.Nm nodeB .
+Create file system on
+.Pa /dev/hast/shared
+provider and mount it.
+.Bd -literal -offset indent
+nodeB# hastd
+nodeB# hastctl role secondary shared
+
+nodeA# hastd
+nodeA# hastctl role primary shared
+nodeA# newfs -U /dev/hast/shared
+nodeA# mount -o noatime /dev/hast/shared /shared
+.Ed
+.Sh FILES
+.Bl -tag -width ".Pa /var/run/hastctl" -compact
+.It Pa /etc/hast.conf
+The configuration file for
+.Nm
+and
+.Xr hastctl 8 .
+.It Pa /var/run/hastctl
+Control socket used by the
+.Xr hastctl 8
+control utility to communicate with
+.Nm .
+.It Pa /var/run/hastd.pid
+The default location of the
+.Nm
+PID file.
+.El
+.Sh SEE ALSO
+.Xr sysexits 3 ,
+.Xr geom 4 ,
+.Xr hast.conf 5 ,
+.Xr ggatec 8 ,
+.Xr ggated 8 ,
+.Xr ggatel 8 ,
+.Xr hastctl 8 ,
+.Xr mount 8 ,
+.Xr newfs 8 ,
+.Xr g_bio 9 .
+.Sh AUTHORS
+The
+.Nm
+was developed by
+.An Pawel Jakub Dawidek Aq pjd@FreeBSD.org
+under sponsorship of the FreeBSD Foundation.
diff --git a/sbin/hastd/hastd.c b/sbin/hastd/hastd.c
new file mode 100644
index 0000000..19f0893
--- /dev/null
+++ b/sbin/hastd/hastd.c
@@ -0,0 +1,522 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/linker.h>
+#include <sys/module.h>
+#include <sys/wait.h>
+
+#include <assert.h>
+#include <err.h>
+#include <errno.h>
+#include <libutil.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include <activemap.h>
+#include <pjdlog.h>
+
+#include "control.h"
+#include "hast.h"
+#include "hast_proto.h"
+#include "hastd.h"
+#include "subr.h"
+
+/* Path to configuration file. */
+static const char *cfgpath = HAST_CONFIG;
+/* Hastd configuration. */
+static struct hastd_config *cfg;
+/*
+ * Was SIGCHLD signal received?
+ * Set from the signal handler and polled by the main loop, hence
+ * volatile sig_atomic_t rather than plain bool.
+ */
+static volatile sig_atomic_t sigchld_received = false;
+/* Was SIGHUP signal received?  Same access pattern as above. */
+static volatile sig_atomic_t sighup_received = false;
+/*
+ * Was SIGINT or SIGTERM signal received?
+ * The type stays bool because it has to match the extern declaration
+ * in hastd.h.
+ */
+bool sigexit_received = false;
+/* PID file handle. */
+struct pidfh *pfh;
+
+/*
+ * Print the command-line synopsis through errx(3), which terminates the
+ * process with the EX_USAGE exit code.
+ */
+static void
+usage(void)
+{
+
+ errx(EX_USAGE, "[-dFh] [-c config] [-P pidfile]");
+}
+
+/*
+ * Signal handler for SIGCHLD and SIGHUP.  It only records that the signal
+ * arrived; the actual work is done later by the main loop.
+ */
+static void
+sighandler(int sig)
+{
+
+	if (sig == SIGCHLD)
+		sigchld_received = true;
+	else if (sig == SIGHUP)
+		sighup_received = true;
+	else
+		assert(!"invalid condition");
+}
+
+/*
+ * Make sure the g_gate module is available, loading geom_gate if needed.
+ * Exits with EX_OSERR when the module cannot be loaded, unless the failure
+ * is EEXIST.
+ */
+static void
+g_gate_load(void)
+{
+
+	/* Nothing to do when the module is already present in the kernel. */
+	if (modfind("g_gate") != -1)
+		return;
+
+	/* Not present in kernel, try loading it. */
+	if (kldload("geom_gate") != -1 && modfind("g_gate") != -1)
+		return;
+
+	if (errno != EEXIST) {
+		pjdlog_exit(EX_OSERR,
+		    "Unable to load geom_gate module");
+	}
+}
+
+/*
+ * Reap every worker process that has terminated, log how it ended and,
+ * for resources in the primary role, start a replacement worker.
+ *
+ * The wait status is decoded with WIFEXITED()/WIFSIGNALED() first, so a
+ * worker killed by a signal is not mistaken for one that exited with
+ * status 0.
+ */
+static void
+child_exit(void)
+{
+	struct hast_resource *res;
+	int status;
+	pid_t pid;
+
+	while ((pid = wait3(&status, WNOHANG, NULL)) > 0) {
+		/* Find resource related to the process that just exited. */
+		TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
+			if (pid == res->hr_workerpid)
+				break;
+		}
+		if (res == NULL) {
+			/*
+			 * This can happen when new connection arrives and we
+			 * cancel child responsible for the old one.
+			 */
+			continue;
+		}
+		pjdlog_prefix_set("[%s] (%s) ", res->hr_name,
+		    role2str(res->hr_role));
+		if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
+			pjdlog_debug(1,
+			    "Worker process exited gracefully (pid=%u).",
+			    (unsigned int)pid);
+		} else if (WIFSIGNALED(status)) {
+			pjdlog_error("Worker process killed (pid=%u, signal=%d).",
+			    (unsigned int)pid, WTERMSIG(status));
+		} else {
+			pjdlog_error("Worker process failed (pid=%u, status=%d).",
+			    (unsigned int)pid, WEXITSTATUS(status));
+		}
+		res->hr_workerpid = 0;
+		if (res->hr_role == HAST_ROLE_PRIMARY) {
+			/* Short pause before the primary worker is respawned. */
+			sleep(1);
+			pjdlog_info("Restarting worker process.");
+			hastd_primary(res);
+		}
+		pjdlog_prefix_set("%s", "");
+	}
+}
+
+/*
+ * Called from the main loop after SIGHUP was received.  Reloading the
+ * configuration is not implemented yet, so this only logs a warning.
+ */
+static void
+hastd_reload(void)
+{
+
+ /* TODO */
+ pjdlog_warning("Configuration reload is not implemented.");
+}
+
+/*
+ * Accept one connection on the listening socket and perform the initial
+ * handshake with the remote node.
+ *
+ * The remote address must match at least one configured resource before any
+ * data is read.  The received header has to name a resource ("resource")
+ * that is configured here, accessible from the remote address and currently
+ * in the secondary role.
+ *
+ * A header without "token" is an initial connection: any existing worker or
+ * half-open connection for the resource is cancelled, a fresh token is
+ * generated and sent back, and the connection is kept as hr_remotein.
+ * A header with a matching "token" completes the pair: the connection
+ * becomes hr_remoteout and hastd_secondary() is called.
+ *
+ * Once the header is parsed, failures are reported to the remote node in an
+ * "errmsg" field before the connection is closed.
+ */
+static void
+listen_accept(void)
+{
+	struct hast_resource *res;
+	struct proto_conn *conn;
+	struct nv *nvin, *nvout, *nverr;
+	const char *resname;
+	const unsigned char *token;
+	char laddr[256], raddr[256];
+	size_t size;
+	pid_t pid;
+	int status;
+
+	proto_local_address(cfg->hc_listenconn, laddr, sizeof(laddr));
+	pjdlog_debug(1, "Accepting connection to %s.", laddr);
+
+	if (proto_accept(cfg->hc_listenconn, &conn) < 0) {
+		pjdlog_errno(LOG_ERR, "Unable to accept connection %s", laddr);
+		return;
+	}
+
+	proto_local_address(conn, laddr, sizeof(laddr));
+	proto_remote_address(conn, raddr, sizeof(raddr));
+	/* The connection comes from the remote address to the local one. */
+	pjdlog_info("Connection from %s to %s.", raddr, laddr);
+
+	nvin = nvout = nverr = NULL;
+
+	/*
+	 * Before receiving any data see if remote host have access to any
+	 * resource.
+	 */
+	TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
+		if (proto_address_match(conn, res->hr_remoteaddr))
+			break;
+	}
+	if (res == NULL) {
+		pjdlog_error("Client %s isn't known.", raddr);
+		goto close;
+	}
+	/* Ok, remote host can access at least one resource. */
+
+	if (hast_proto_recv_hdr(conn, &nvin) < 0) {
+		pjdlog_errno(LOG_ERR, "Unable to receive header from %s",
+		    raddr);
+		goto close;
+	}
+
+	resname = nv_get_string(nvin, "resource");
+	if (resname == NULL) {
+		pjdlog_error("No 'resource' field in the header received from %s.",
+		    raddr);
+		goto close;
+	}
+	pjdlog_debug(2, "%s: resource=%s", raddr, resname);
+	token = nv_get_uint8_array(nvin, &size, "token");
+	/*
+	 * NULL token means that this is first connection.
+	 * Only sizeof() is applied to 'res' here, so it does not matter that
+	 * it still points at the resource found by the address check above.
+	 */
+	if (token != NULL && size != sizeof(res->hr_token)) {
+		pjdlog_error("Received token of invalid size from %s (expected %zu, got %zu).",
+		    raddr, sizeof(res->hr_token), size);
+		goto close;
+	}
+
+	/*
+	 * From now on we want to send errors to the remote node.
+	 */
+	nverr = nv_alloc();
+
+	/* Find resource related to this connection. */
+	TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
+		if (strcmp(resname, res->hr_name) == 0)
+			break;
+	}
+	/* Have we found the resource? */
+	if (res == NULL) {
+		pjdlog_error("No resource '%s' as requested by %s.",
+		    resname, raddr);
+		nv_add_stringf(nverr, "errmsg", "Resource not configured.");
+		goto fail;
+	}
+
+	/* Now that we know resource name setup log prefix. */
+	pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role));
+
+	/* Does the remote host have access to this resource? */
+	if (!proto_address_match(conn, res->hr_remoteaddr)) {
+		pjdlog_error("Client %s has no access to the resource.", raddr);
+		nv_add_stringf(nverr, "errmsg", "No access to the resource.");
+		goto fail;
+	}
+	/* Is the resource marked as secondary? */
+	if (res->hr_role != HAST_ROLE_SECONDARY) {
+		pjdlog_error("We act as %s for the resource and not as %s as requested by %s.",
+		    role2str(res->hr_role), role2str(HAST_ROLE_SECONDARY),
+		    raddr);
+		nv_add_stringf(nverr, "errmsg",
+		    "Remote node acts as %s for the resource and not as %s.",
+		    role2str(res->hr_role), role2str(HAST_ROLE_SECONDARY));
+		goto fail;
+	}
+	/* Does token (if exists) match? */
+	if (token != NULL && memcmp(token, res->hr_token,
+	    sizeof(res->hr_token)) != 0) {
+		pjdlog_error("Token received from %s doesn't match.", raddr);
+		nv_add_stringf(nverr, "errmsg", "Token doesn't match.");
+		goto fail;
+	}
+	/*
+	 * If there is no token, but we have half-open connection
+	 * (only remotein) or full connection (worker process is running)
+	 * we have to cancel those and accept the new connection.
+	 */
+	if (token == NULL) {
+		assert(res->hr_remoteout == NULL);
+		pjdlog_debug(1, "Initial connection from %s.", raddr);
+		if (res->hr_workerpid != 0) {
+			assert(res->hr_remotein == NULL);
+			pjdlog_debug(1,
+			    "Worker process exists (pid=%u), stopping it.",
+			    (unsigned int)res->hr_workerpid);
+			/* Stop child process. */
+			if (kill(res->hr_workerpid, SIGINT) < 0) {
+				pjdlog_errno(LOG_ERR,
+				    "Unable to stop worker process (pid=%u)",
+				    (unsigned int)res->hr_workerpid);
+				/*
+				 * Other than logging the problem we
+				 * ignore it - nothing smart to do.
+				 */
+			}
+			/* Wait for it to exit. */
+			else if ((pid = waitpid(res->hr_workerpid,
+			    &status, 0)) != res->hr_workerpid) {
+				pjdlog_errno(LOG_ERR,
+				    "Waiting for worker process (pid=%u) failed",
+				    (unsigned int)res->hr_workerpid);
+				/* See above. */
+			} else if (status != 0) {
+				pjdlog_error("Worker process (pid=%u) exited ungracefully: status=%d.",
+				    (unsigned int)res->hr_workerpid, status);
+				/* See above. */
+			} else {
+				pjdlog_debug(1,
+				    "Worker process (pid=%u) exited gracefully.",
+				    (unsigned int)res->hr_workerpid);
+			}
+			res->hr_workerpid = 0;
+		} else if (res->hr_remotein != NULL) {
+			char oaddr[256];
+
+			/*
+			 * Report the peer of the old, half-open connection,
+			 * not the peer of the one just accepted.
+			 */
+			proto_remote_address(res->hr_remotein, oaddr,
+			    sizeof(oaddr));
+			pjdlog_debug(1,
+			    "Canceling half-open connection from %s on connection from %s.",
+			    oaddr, raddr);
+			proto_close(res->hr_remotein);
+			res->hr_remotein = NULL;
+		}
+	}
+
+	/*
+	 * Checks and cleanups are done.
+	 */
+
+	if (token == NULL) {
+		arc4random_buf(res->hr_token, sizeof(res->hr_token));
+		nvout = nv_alloc();
+		nv_add_uint8_array(nvout, res->hr_token,
+		    sizeof(res->hr_token), "token");
+		if (nv_error(nvout) != 0) {
+			pjdlog_common(LOG_ERR, 0, nv_error(nvout),
+			    "Unable to prepare return header for %s", raddr);
+			nv_add_stringf(nverr, "errmsg",
+			    "Remote node was unable to prepare return header: %s.",
+			    strerror(nv_error(nvout)));
+			goto fail;
+		}
+		if (hast_proto_send(NULL, conn, nvout, NULL, 0) < 0) {
+			/* Save errno before logging can overwrite it. */
+			int error = errno;
+
+			pjdlog_errno(LOG_ERR, "Unable to send response to %s",
+			    raddr);
+			nv_add_stringf(nverr, "errmsg",
+			    "Remote node was unable to send response: %s.",
+			    strerror(error));
+			goto fail;
+		}
+		res->hr_remotein = conn;
+		pjdlog_debug(1, "Incoming connection from %s configured.",
+		    raddr);
+	} else {
+		res->hr_remoteout = conn;
+		pjdlog_debug(1, "Outgoing connection to %s configured.", raddr);
+		hastd_secondary(res, nvin);
+	}
+	nv_free(nvin);
+	/* nvout is only allocated for initial (token-less) connections. */
+	if (nvout != NULL)
+		nv_free(nvout);
+	nv_free(nverr);
+	pjdlog_prefix_set("%s", "");
+	return;
+fail:
+	if (nv_error(nverr) != 0) {
+		pjdlog_common(LOG_ERR, 0, nv_error(nverr),
+		    "Unable to prepare error header for %s", raddr);
+		goto close;
+	}
+	if (hast_proto_send(NULL, conn, nverr, NULL, 0) < 0) {
+		pjdlog_errno(LOG_ERR, "Unable to send error to %s", raddr);
+		goto close;
+	}
+close:
+	if (nvin != NULL)
+		nv_free(nvin);
+	if (nvout != NULL)
+		nv_free(nvout);
+	if (nverr != NULL)
+		nv_free(nverr);
+	proto_close(conn);
+	pjdlog_prefix_set("%s", "");
+}
+
+/*
+ * Register the descriptor behind 'conn' (if it has one) in both the read
+ * and the write set and keep track of the highest descriptor seen.
+ */
+static void
+main_loop_watch(struct proto_conn *conn, fd_set *rfdsp, fd_set *wfdsp,
+    int *maxfdp)
+{
+	int fd;
+
+	fd = proto_descriptor(conn);
+	if (fd < 0)
+		return;
+	if (fd > *maxfdp)
+		*maxfdp = fd;
+	FD_SET(fd, rfdsp);
+	FD_SET(fd, wfdsp);
+}
+
+/*
+ * Tell whether select(2) reported the descriptor behind 'conn' in either
+ * the read or the write set.
+ */
+static bool
+main_loop_ready(struct proto_conn *conn, fd_set *rfdsp, fd_set *wfdsp)
+{
+	int fd;
+
+	fd = proto_descriptor(conn);
+	return (FD_ISSET(fd, rfdsp) || FD_ISSET(fd, wfdsp));
+}
+
+/*
+ * Main daemon loop: act on pending SIGCHLD/SIGHUP notifications, then wait
+ * in select(2) for activity on the control and listen connections and
+ * dispatch it.  Never returns normally.
+ */
+static void
+main_loop(void)
+{
+	fd_set rfds, wfds;
+	int maxfd, ret;
+
+	for (;;) {
+		if (sigchld_received) {
+			sigchld_received = false;
+			child_exit();
+		}
+		if (sighup_received) {
+			sighup_received = false;
+			hastd_reload();
+		}
+
+		/* Setup descriptors for select(2). */
+		maxfd = 0;
+		FD_ZERO(&rfds);
+		FD_ZERO(&wfds);
+		main_loop_watch(cfg->hc_controlconn, &rfds, &wfds, &maxfd);
+		main_loop_watch(cfg->hc_listenconn, &rfds, &wfds, &maxfd);
+
+		ret = select(maxfd + 1, &rfds, &wfds, NULL, NULL);
+		if (ret == -1) {
+			/* Interrupted by a signal; go handle its flag. */
+			if (errno == EINTR)
+				continue;
+			KEEP_ERRNO((void)pidfile_remove(pfh));
+			pjdlog_exit(EX_OSERR, "select() failed");
+		}
+
+		if (main_loop_ready(cfg->hc_controlconn, &rfds, &wfds))
+			control_handle(cfg);
+		if (main_loop_ready(cfg->hc_listenconn, &rfds, &wfds))
+			listen_accept();
+	}
+}
+
+/*
+ * Entry point of the hastd daemon: load the GEOM Gate module, parse the
+ * command line, take the PID file, read the configuration, install signal
+ * handlers, start listening on the control and remote addresses, optionally
+ * daemonize and finally enter the main loop.
+ */
+int
+main(int argc, char *argv[])
+{
+	const char *pidpath;
+	pid_t runningpid;
+	bool stay_foreground;
+	int verbosity;
+	int ch;
+
+	g_gate_load();
+
+	stay_foreground = false;
+	verbosity = 0;
+	pidpath = HASTD_PIDFILE;
+
+	while ((ch = getopt(argc, argv, "c:dFhP:")) != -1) {
+		switch (ch) {
+		case 'c':
+			cfgpath = optarg;
+			break;
+		case 'd':
+			verbosity++;
+			break;
+		case 'F':
+			stay_foreground = true;
+			break;
+		case 'P':
+			pidpath = optarg;
+			break;
+		case 'h':
+		default:
+			usage();
+		}
+	}
+	argc -= optind;
+	argv += optind;
+
+	pjdlog_debug_set(verbosity);
+
+	pfh = pidfile_open(pidpath, 0600, &runningpid);
+	if (pfh == NULL) {
+		if (errno == EEXIST) {
+			pjdlog_exitx(EX_TEMPFAIL,
+			    "Another hastd is already running, pid: %jd.",
+			    (intmax_t)runningpid);
+		}
+		/* If we cannot create pidfile from other reasons, only warn. */
+		pjdlog_errno(LOG_WARNING, "Cannot open or create pidfile");
+	}
+
+	cfg = yy_config_parse(cfgpath);
+	assert(cfg != NULL);
+
+	signal(SIGHUP, sighandler);
+	signal(SIGCHLD, sighandler);
+
+	/* Listen on control address. */
+	if (proto_server(cfg->hc_controladdr, &cfg->hc_controlconn) < 0) {
+		KEEP_ERRNO((void)pidfile_remove(pfh));
+		pjdlog_exit(EX_OSERR, "Unable to listen on control address %s",
+		    cfg->hc_controladdr);
+	}
+	/* Listen for remote connections. */
+	if (proto_server(cfg->hc_listenaddr, &cfg->hc_listenconn) < 0) {
+		KEEP_ERRNO((void)pidfile_remove(pfh));
+		pjdlog_exit(EX_OSERR, "Unable to listen on address %s",
+		    cfg->hc_listenaddr);
+	}
+
+	if (!stay_foreground) {
+		if (daemon(0, 0) < 0) {
+			KEEP_ERRNO((void)pidfile_remove(pfh));
+			pjdlog_exit(EX_OSERR, "Unable to daemonize");
+		}
+
+		/* Start logging to syslog. */
+		pjdlog_mode_set(PJDLOG_MODE_SYSLOG);
+
+		/* Write PID to a file. */
+		if (pidfile_write(pfh) < 0) {
+			pjdlog_errno(LOG_WARNING,
+			    "Unable to write PID to a file");
+		}
+	}
+
+	main_loop();
+
+	exit(0);
+}
diff --git a/sbin/hastd/hastd.h b/sbin/hastd/hastd.h
new file mode 100644
index 0000000..199de8c
--- /dev/null
+++ b/sbin/hastd/hastd.h
@@ -0,0 +1,48 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HASTD_H_
+#define _HASTD_H_
+
+#include <sys/param.h>
+#include <libutil.h>
+
+#include <nv.h>
+
+#include "hast.h"
+
+/* Was SIGINT or SIGTERM signal received?  Defined in hastd.c. */
+extern bool sigexit_received;
+/* PID file handle.  Defined in hastd.c. */
+extern struct pidfh *pfh;
+
+/*
+ * Worker entry points for the two roles.
+ * NOTE(review): implementations are outside this view.  hastd.c calls
+ * hastd_primary() to restart the worker of a primary resource and
+ * hastd_secondary() with the received header once the outgoing connection
+ * from the remote node has been configured.
+ */
+void hastd_primary(struct hast_resource *res);
+void hastd_secondary(struct hast_resource *res, struct nv *nvin);
+
+#endif /* !_HASTD_H_ */
diff --git a/sbin/hastd/hooks.c b/sbin/hastd/hooks.c
new file mode 100644
index 0000000..1fdeb75
--- /dev/null
+++ b/sbin/hastd/hooks.c
@@ -0,0 +1,148 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include <assert.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <syslog.h>
+#include <libgen.h>
+#include <paths.h>
+
+#include <pjdlog.h>
+
+#include "hooks.h"
+
+/*
+ * Prepare the descriptor table of a freshly forked hook process: close
+ * everything and point stdin, stdout and stderr at /dev/null.  Failures are
+ * logged as warnings and otherwise ignored.
+ */
+static void
+descriptors(void)
+{
+	long limit;
+	int fd;
+
+	/*
+	 * Close all descriptors.
+	 */
+	limit = sysconf(_SC_OPEN_MAX);
+	if (limit < 0) {
+		pjdlog_errno(LOG_WARNING, "sysconf(_SC_OPEN_MAX) failed");
+		limit = 1024;
+	}
+	fd = 0;
+	while (fd <= limit) {
+		close(fd);
+		fd++;
+	}
+
+	/*
+	 * Redirect stdin to /dev/null.
+	 */
+	fd = open(_PATH_DEVNULL, O_RDONLY);
+	if (fd < 0) {
+		pjdlog_errno(LOG_WARNING, "Unable to open %s for reading",
+		    _PATH_DEVNULL);
+	} else if (fd != STDIN_FILENO) {
+		if (dup2(fd, STDIN_FILENO) < 0) {
+			pjdlog_errno(LOG_WARNING,
+			    "Unable to duplicate descriptor for stdin");
+		}
+		close(fd);
+	}
+
+	/*
+	 * Redirect stdout and stderr to /dev/null.
+	 */
+	fd = open(_PATH_DEVNULL, O_WRONLY);
+	if (fd < 0) {
+		pjdlog_errno(LOG_WARNING, "Unable to open %s for writing",
+		    _PATH_DEVNULL);
+		return;
+	}
+	if (fd != STDOUT_FILENO && dup2(fd, STDOUT_FILENO) < 0) {
+		pjdlog_errno(LOG_WARNING,
+		    "Unable to duplicate descriptor for stdout");
+	}
+	if (fd != STDERR_FILENO && dup2(fd, STDERR_FILENO) < 0) {
+		pjdlog_errno(LOG_WARNING,
+		    "Unable to duplicate descriptor for stderr");
+	}
+	/* Drop the temporary descriptor unless it is one of the targets. */
+	if (!(fd == STDOUT_FILENO || fd == STDERR_FILENO))
+		close(fd);
+}
+
+/*
+ * Variadic front end for hook_execv(): collects the arguments that follow
+ * 'path' into a va_list and returns whatever hook_execv() returns.
+ */
+int
+hook_exec(const char *path, ...)
+{
+	va_list args;
+	int result;
+
+	va_start(args, path);
+	result = hook_execv(path, args);
+	va_end(args);
+
+	return (result);
+}
+
+/*
+ * Run the hook program 'path' and wait for it to finish.
+ *
+ * 'ap' supplies the hook's arguments as char pointers terminated by NULL;
+ * argv[0] is set to the base name of 'path'.  At most 62 arguments fit into
+ * the argument vector.
+ *
+ * Returns 0 without running anything when 'path' is NULL or empty, the
+ * hook's exit code when it exited normally, and -1 when fork(2) or
+ * waitpid(2) failed or the hook did not exit normally (e.g. it was killed
+ * by a signal), so an abnormal termination is never reported as success.
+ */
+int
+hook_execv(const char *path, va_list ap)
+{
+	char *args[64];
+	unsigned int ii;
+	pid_t pid, wpid;
+	int status;
+
+	if (path == NULL || path[0] == '\0')
+		return (0);
+
+	memset(args, 0, sizeof(args));
+	args[0] = basename(path);
+	for (ii = 1; ii < sizeof(args) / sizeof(args[0]); ii++) {
+		args[ii] = va_arg(ap, char *);
+		if (args[ii] == NULL)
+			break;
+	}
+	/* The loop must have stopped on the NULL terminator. */
+	assert(ii < sizeof(args) / sizeof(args[0]));
+
+	pid = fork();
+	switch (pid) {
+	case -1:	/* Error. */
+		pjdlog_errno(LOG_ERR, "Unable to fork %s", path);
+		return (-1);
+	case 0:		/* Child. */
+		descriptors();
+		execv(path, args);
+		pjdlog_errno(LOG_ERR, "Unable to execute %s", path);
+		exit(EX_SOFTWARE);
+	default:	/* Parent. */
+		break;
+	}
+
+	wpid = waitpid(pid, &status, 0);
+	if (wpid != pid) {
+		pjdlog_errno(LOG_ERR, "Unable to wait for %s", path);
+		return (-1);
+	}
+	if (!WIFEXITED(status)) {
+		/* WEXITSTATUS() is only meaningful after a normal exit. */
+		pjdlog_error("Hook %s terminated abnormally (status=%d).",
+		    path, status);
+		return (-1);
+	}
+
+	return (WEXITSTATUS(status));
+}
diff --git a/sbin/hastd/hooks.h b/sbin/hastd/hooks.h
new file mode 100644
index 0000000..799b781
--- /dev/null
+++ b/sbin/hastd/hooks.h
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HOOKS_H_
+#define _HOOKS_H_
+
+#include <stdarg.h>
+
+/*
+ * Run the hook program 'path' with the given arguments and wait for it.
+ * The argument list consists of char pointers and must end with NULL.
+ * Nothing is run and 0 is returned when 'path' is NULL or empty; -1 is
+ * returned when the process cannot be forked; otherwise the result is
+ * derived from the hook's wait status (its exit code on normal exit).
+ * hook_exec() is the variadic wrapper around hook_execv().
+ */
+int hook_exec(const char *path, ...);
+int hook_execv(const char *path, va_list ap);
+
+#endif /* !_HOOKS_H_ */
diff --git a/sbin/hastd/metadata.c b/sbin/hastd/metadata.c
new file mode 100644
index 0000000..9bca66b
--- /dev/null
+++ b/sbin/hastd/metadata.c
@@ -0,0 +1,222 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include <ebuf.h>
+#include <nv.h>
+#include <pjdlog.h>
+#include <subr.h>
+
+#include "metadata.h"
+
+/*
+ * Read the metadata block stored at offset 0 of the local provider and load
+ * it into 'res'.
+ *
+ * When res->hr_localfd is -1 this is treated as the first call for the
+ * resource: provinfo() is invoked (it is expected to open the provider) and,
+ * if 'openrw' is true, an exclusive non-blocking flock(2) is attempted.  A
+ * provider that does not support locking (EOPNOTSUPP) is tolerated.
+ *
+ * Returns 0 on success.  On failure returns -1 with errno set; if the
+ * descriptor was obtained by this call it is closed and hr_localfd is reset
+ * to -1.
+ */
+int
+metadata_read(struct hast_resource *res, bool openrw)
+{
+	unsigned char *buf;
+	struct ebuf *eb;
+	struct nv *nv;
+	ssize_t done;
+	const char *str;
+	int rerrno;
+	bool opened_here;
+
+	opened_here = false;
+	rerrno = 0;
+
+	/*
+	 * Is this first metadata_read() call for this resource?
+	 */
+	if (res->hr_localfd == -1) {
+		if (provinfo(res, openrw) < 0) {
+			rerrno = errno;
+			goto fail;
+		}
+		opened_here = true;
+		pjdlog_debug(1, "Obtained info about %s.", res->hr_localpath);
+		if (openrw) {
+			if (flock(res->hr_localfd, LOCK_EX | LOCK_NB) == 0) {
+				pjdlog_debug(1, "Locked %s.",
+				    res->hr_localpath);
+			} else if (errno == EOPNOTSUPP) {
+				/* Not fatal: continue without the lock. */
+				pjdlog_warning("Unable to lock %s (operation not supported), but continuing.",
+				    res->hr_localpath);
+			} else {
+				rerrno = errno;
+				pjdlog_errno(LOG_ERR,
+				    "Unable to lock %s",
+				    res->hr_localpath);
+				goto fail;
+			}
+		}
+	}
+
+	eb = ebuf_alloc(METADATA_SIZE);
+	if (eb == NULL) {
+		rerrno = errno;
+		pjdlog_errno(LOG_ERR,
+		    "Unable to allocate memory to read metadata");
+		goto fail;
+	}
+	if (ebuf_add_tail(eb, NULL, METADATA_SIZE) < 0) {
+		rerrno = errno;
+		pjdlog_errno(LOG_ERR,
+		    "Unable to allocate memory to read metadata");
+		/* The buffer is not owned by anything else yet. */
+		ebuf_free(eb);
+		goto fail;
+	}
+	buf = ebuf_data(eb, NULL);
+	assert(buf != NULL);
+	done = pread(res->hr_localfd, buf, METADATA_SIZE, 0);
+	if (done != METADATA_SIZE) {
+		/*
+		 * A short read does not set errno; report it as an I/O
+		 * error instead of whatever errno happened to hold.
+		 */
+		rerrno = (done < 0) ? errno : EIO;
+		errno = rerrno;
+		pjdlog_errno(LOG_ERR, "Unable to read metadata");
+		ebuf_free(eb);
+		goto fail;
+	}
+	nv = nv_ntoh(eb);
+	if (nv == NULL) {
+		rerrno = errno;
+		pjdlog_errno(LOG_ERR, "Metadata read from %s is invalid",
+		    res->hr_localpath);
+		/* nv_ntoh() does not take ownership of eb on failure. */
+		ebuf_free(eb);
+		goto fail;
+	}
+
+	/*
+	 * From here on 'eb' belongs to 'nv' and is released by nv_free().
+	 * A missing "resource" entry yields NULL here and is reported by the
+	 * nv_error() check below.
+	 */
+	str = nv_get_string(nv, "resource");
+	if (str != NULL && strcmp(str, res->hr_name) != 0) {
+		pjdlog_error("Provider %s is not part of resource %s.",
+		    res->hr_localpath, res->hr_name);
+		rerrno = EINVAL;
+		nv_free(nv);
+		goto fail;
+	}
+
+	res->hr_datasize = nv_get_uint64(nv, "datasize");
+	res->hr_extentsize = (int)nv_get_uint32(nv, "extentsize");
+	res->hr_keepdirty = (int)nv_get_uint32(nv, "keepdirty");
+	res->hr_localoff = nv_get_uint64(nv, "offset");
+	res->hr_resuid = nv_get_uint64(nv, "resuid");
+	if (res->hr_role != HAST_ROLE_PRIMARY) {
+		/* Secondary or init role. */
+		res->hr_secondary_localcnt = nv_get_uint64(nv, "localcnt");
+		res->hr_secondary_remotecnt = nv_get_uint64(nv, "remotecnt");
+	}
+	if (res->hr_role != HAST_ROLE_SECONDARY) {
+		/* Primary or init role. */
+		res->hr_primary_localcnt = nv_get_uint64(nv, "localcnt");
+		res->hr_primary_remotecnt = nv_get_uint64(nv, "remotecnt");
+	}
+	str = nv_get_string(nv, "prevrole");
+	if (str != NULL) {
+		if (strcmp(str, "primary") == 0)
+			res->hr_previous_role = HAST_ROLE_PRIMARY;
+		else if (strcmp(str, "secondary") == 0)
+			res->hr_previous_role = HAST_ROLE_SECONDARY;
+	}
+
+	/* Any failed lookup above left an error recorded in nv. */
+	if (nv_error(nv) != 0) {
+		errno = rerrno = nv_error(nv);
+		pjdlog_errno(LOG_ERR, "Unable to read metadata from %s",
+		    res->hr_localpath);
+		nv_free(nv);
+		goto fail;
+	}
+	/* All values were copied into res; nothing points into nv anymore. */
+	nv_free(nv);
+	return (0);
+fail:
+	if (opened_here) {
+		close(res->hr_localfd);
+		res->hr_localfd = -1;
+	}
+	errno = rerrno;
+	return (-1);
+}
+
+/*
+ * Serialize the metadata of 'res' and write it as one METADATA_SIZE block
+ * at offset 0 of res->hr_localfd.  The block is zero-padded past the end of
+ * the encoded name/value list.
+ *
+ * Returns 0 on success and -1 on failure.  The temporary buffer and the nv
+ * list are released on every path.
+ */
+int
+metadata_write(struct hast_resource *res)
+{
+	struct ebuf *eb;
+	struct nv *nv;
+	unsigned char *buf, *ptr;
+	size_t size;
+	ssize_t done;
+	int ret;
+
+	buf = calloc(1, METADATA_SIZE);
+	if (buf == NULL) {
+		pjdlog_error("Unable to allocate %zu bytes for metadata.",
+		    (size_t)METADATA_SIZE);
+		return (-1);
+	}
+
+	ret = -1;
+
+	/*
+	 * nv_alloc() may return NULL; the nv_add_*() calls tolerate that and
+	 * nv_error(NULL) reports ENOMEM, so a single check below suffices.
+	 */
+	nv = nv_alloc();
+	nv_add_string(nv, res->hr_name, "resource");
+	nv_add_uint64(nv, (uint64_t)res->hr_datasize, "datasize");
+	nv_add_uint32(nv, (uint32_t)res->hr_extentsize, "extentsize");
+	nv_add_uint32(nv, (uint32_t)res->hr_keepdirty, "keepdirty");
+	nv_add_uint64(nv, (uint64_t)res->hr_localoff, "offset");
+	nv_add_uint64(nv, res->hr_resuid, "resuid");
+	if (res->hr_role == HAST_ROLE_PRIMARY ||
+	    res->hr_role == HAST_ROLE_INIT) {
+		nv_add_uint64(nv, res->hr_primary_localcnt, "localcnt");
+		nv_add_uint64(nv, res->hr_primary_remotecnt, "remotecnt");
+	} else /* if (res->hr_role == HAST_ROLE_SECONDARY) */ {
+		assert(res->hr_role == HAST_ROLE_SECONDARY);
+		nv_add_uint64(nv, res->hr_secondary_localcnt, "localcnt");
+		nv_add_uint64(nv, res->hr_secondary_remotecnt, "remotecnt");
+	}
+	nv_add_string(nv, role2str(res->hr_role), "prevrole");
+	if (nv_error(nv) != 0) {
+		pjdlog_error("Unable to create metadata.");
+		goto end;
+	}
+	res->hr_previous_role = res->hr_role;
+	/* The returned ebuf is owned by nv and is released by nv_free(). */
+	eb = nv_hton(nv);
+	assert(eb != NULL);
+	ptr = ebuf_data(eb, &size);
+	assert(ptr != NULL);
+	assert(size < METADATA_SIZE);
+	bcopy(ptr, buf, size);
+	done = pwrite(res->hr_localfd, buf, METADATA_SIZE, 0);
+	if (done < 0 || done != METADATA_SIZE) {
+		pjdlog_errno(LOG_ERR, "Unable to write metadata");
+		goto end;
+	}
+
+	ret = 0;
+end:
+	/* Shared cleanup: previously the success path leaked both objects. */
+	free(buf);
+	nv_free(nv);
+	return (ret);
+}
diff --git a/sbin/hastd/metadata.h b/sbin/hastd/metadata.h
new file mode 100644
index 0000000..83d35f4
--- /dev/null
+++ b/sbin/hastd/metadata.h
@@ -0,0 +1,48 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _METADATA_H_
+#define _METADATA_H_
+
+#include <stdbool.h>
+
+#include <hast.h>
+
+/*
+ * Maximum size of metadata.
+ * XXX: We should take sector size into account.
+ */
+#define METADATA_SIZE 4096
+
+int metadata_read(struct hast_resource *res, bool openrw);
+int metadata_write(struct hast_resource *res);
+
+#endif /* !_METADATA_H_ */
diff --git a/sbin/hastd/nv.c b/sbin/hastd/nv.c
new file mode 100644
index 0000000..0b4e362
--- /dev/null
+++ b/sbin/hastd/nv.c
@@ -0,0 +1,882 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/endian.h>
+
+#include <assert.h>
+#include <bitstring.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <ebuf.h>
+#include <nv.h>
+
+/* Value kept in nv_magic while a struct nv is live; tested by NV_CHECK(). */
+#define NV_MAGIC 0xaea1e
+/*
+ * Handle for a name/value list.  The entries themselves live serialized
+ * inside nv_ebuf.
+ */
+struct nv {
+ int nv_magic; /* NV_MAGIC while valid; zeroed before the struct is freed. */
+ int nv_error; /* Recorded errno-style error code, 0 when none. */
+ struct ebuf *nv_ebuf; /* Buffer holding the encoded entries. */
+};
+
+/*
+ * Header placed in front of every entry in the buffer.  It is followed by
+ * the NUL-terminated name (padded to a multiple of 8 bytes) and then by the
+ * data (also padded to a multiple of 8 bytes).
+ */
+struct nvhdr {
+ uint8_t nvh_type; /* NV_TYPE_* value combined with an NV_ORDER_* flag. */
+ uint8_t nvh_namesize; /* Name length including the terminating '\0'. */
+ uint32_t nvh_dsize; /* Data size in bytes, excluding padding. */
+ char nvh_name[0]; /* Start of the name. */
+} __packed;
+/* Pointer to the first data byte of the entry. */
+#define NVH_DATA(nvh) ((unsigned char *)nvh + NVH_HSIZE(nvh))
+/* Size of the header including the padded name. */
+#define NVH_HSIZE(nvh) \
+ (sizeof(struct nvhdr) + roundup2((nvh)->nvh_namesize, 8))
+/* Data size in host byte order, whichever order the entry is stored in. */
+#define NVH_DSIZE(nvh) \
+ (((nvh)->nvh_type & NV_ORDER_MASK) == NV_ORDER_HOST ? \
+ (nvh)->nvh_dsize : \
+ le32toh((nvh)->nvh_dsize))
+/* Total size of the entry: header, padded name and padded data. */
+#define NVH_SIZE(nvh) (NVH_HSIZE(nvh) + roundup2(NVH_DSIZE(nvh), 8))
+
+/* Assert that nv is non-NULL and carries the expected magic value. */
+#define NV_CHECK(nv) do { \
+ assert((nv) != NULL); \
+ assert((nv)->nv_magic == NV_MAGIC); \
+} while (0)
+
+static void nv_add(struct nv *nv, const unsigned char *value, size_t vsize,
+ int type, const char *name);
+static void nv_addv(struct nv *nv, const unsigned char *value, size_t vsize,
+ int type, const char *namefmt, va_list nameap);
+static struct nvhdr *nv_find(struct nv *nv, int type, const char *namefmt,
+ va_list nameap);
+static void nv_swap(struct nvhdr *nvh, bool tohost);
+
+/*
+ * Create an empty nv structure backed by a freshly allocated ebuf.
+ * Returns NULL if either allocation fails.
+ */
+struct nv *
+nv_alloc(void)
+{
+	struct nv *newnv;
+
+	if ((newnv = malloc(sizeof(*newnv))) == NULL)
+		return (NULL);
+	if ((newnv->nv_ebuf = ebuf_alloc(0)) == NULL) {
+		/* Do not leave a half-built structure behind. */
+		free(newnv);
+		return (NULL);
+	}
+	newnv->nv_magic = NV_MAGIC;
+	newnv->nv_error = 0;
+	return (newnv);
+}
+
+/*
+ * Release the given nv structure together with its ebuf.
+ * Passing NULL is allowed and does nothing.
+ */
+void
+nv_free(struct nv *nv)
+{
+
+	if (nv != NULL) {
+		NV_CHECK(nv);
+
+		/* Clear the magic so NV_CHECK() trips on a stale pointer. */
+		nv->nv_magic = 0;
+		ebuf_free(nv->nv_ebuf);
+		free(nv);
+	}
+}
+
+/*
+ * Report the error recorded in the given nv structure.
+ * A NULL nv is reported as ENOMEM.
+ */
+int
+nv_error(const struct nv *nv)
+{
+
+	if (nv != NULL) {
+		NV_CHECK(nv);
+		return (nv->nv_error);
+	}
+	return (ENOMEM);
+}
+
+/*
+ * Replace the error stored in the given nv structure and hand back the one
+ * that was stored before.  A NULL nv yields ENOMEM and nothing is stored.
+ */
+int
+nv_set_error(struct nv *nv, int error)
+{
+	int olderr;
+
+	if (nv != NULL) {
+		NV_CHECK(nv);
+
+		olderr = nv->nv_error;
+		nv->nv_error = error;
+	} else {
+		olderr = ENOMEM;
+	}
+	return (olderr);
+}
+
+/*
+ * Validate correctness of the entire nv structure and all its elements.
+ * If extrap is not NULL, store number of extra bytes at the end of the buffer.
+ *
+ * Returns 0 when every entry is well formed.  Otherwise returns -1, sets
+ * errno and records the error in the nv structure.  A NULL nv fails with
+ * ENOMEM.
+ */
+int
+nv_validate(struct nv *nv, size_t *extrap)
+{
+	struct nvhdr *nvh;
+	unsigned char *data, *ptr;
+	size_t dsize, size, vsize;
+	int error;
+
+	if (nv == NULL) {
+		errno = ENOMEM;
+		return (-1);
+	}
+
+	NV_CHECK(nv);
+	assert(nv->nv_error == 0);
+
+	/* TODO: Check that names are unique? */
+
+	error = 0;
+	ptr = ebuf_data(nv->nv_ebuf, &size);
+	while (size > 0) {
+		/*
+		 * Zeros at the end of the buffer are acceptable.
+		 */
+		if (ptr[0] == '\0')
+			break;
+		/*
+		 * Minimum size at this point is size of nvhdr structure, one
+		 * character long name plus terminating '\0'.
+		 */
+		if (size < sizeof(*nvh) + 2) {
+			error = EINVAL;
+			break;
+		}
+		nvh = (struct nvhdr *)ptr;
+		/*
+		 * A zero name size would make the terminator check below
+		 * index nvh_name[-1].
+		 */
+		if (nvh->nvh_namesize == 0) {
+			error = EINVAL;
+			break;
+		}
+		if (size < NVH_HSIZE(nvh)) {
+			error = EINVAL;
+			break;
+		}
+		if (nvh->nvh_name[nvh->nvh_namesize - 1] != '\0') {
+			error = EINVAL;
+			break;
+		}
+		if (strlen(nvh->nvh_name) !=
+		    (size_t)(nvh->nvh_namesize - 1)) {
+			error = EINVAL;
+			break;
+		}
+		if ((nvh->nvh_type & NV_TYPE_MASK) < NV_TYPE_FIRST ||
+		    (nvh->nvh_type & NV_TYPE_MASK) > NV_TYPE_LAST) {
+			error = EINVAL;
+			break;
+		}
+		dsize = NVH_DSIZE(nvh);
+		if (dsize == 0) {
+			error = EINVAL;
+			break;
+		}
+		/*
+		 * Reject data sizes larger than what is left in the buffer
+		 * before NVH_SIZE() rounds them up: the rounding is done on
+		 * a 32-bit value and wraps to 0 for sizes close to
+		 * UINT32_MAX, which would defeat the check below.
+		 */
+		if (dsize > size) {
+			error = EINVAL;
+			break;
+		}
+		if (size < NVH_SIZE(nvh)) {
+			error = EINVAL;
+			break;
+		}
+		/*
+		 * The cases below fall through on purpose: the first label
+		 * reached fixes the element size and later ones leave it
+		 * alone.
+		 */
+		vsize = 0;
+		switch (nvh->nvh_type & NV_TYPE_MASK) {
+		case NV_TYPE_INT8:
+		case NV_TYPE_UINT8:
+			if (vsize == 0)
+				vsize = 1;
+			/* FALLTHROUGH */
+		case NV_TYPE_INT16:
+		case NV_TYPE_UINT16:
+			if (vsize == 0)
+				vsize = 2;
+			/* FALLTHROUGH */
+		case NV_TYPE_INT32:
+		case NV_TYPE_UINT32:
+			if (vsize == 0)
+				vsize = 4;
+			/* FALLTHROUGH */
+		case NV_TYPE_INT64:
+		case NV_TYPE_UINT64:
+			if (vsize == 0)
+				vsize = 8;
+			/* Scalars must be exactly one element long. */
+			if (dsize != vsize)
+				error = EINVAL;
+			break;
+		case NV_TYPE_INT8_ARRAY:
+		case NV_TYPE_UINT8_ARRAY:
+			/* Any non-zero length is a valid byte array. */
+			break;
+		case NV_TYPE_INT16_ARRAY:
+		case NV_TYPE_UINT16_ARRAY:
+			if (vsize == 0)
+				vsize = 2;
+			/* FALLTHROUGH */
+		case NV_TYPE_INT32_ARRAY:
+		case NV_TYPE_UINT32_ARRAY:
+			if (vsize == 0)
+				vsize = 4;
+			/* FALLTHROUGH */
+		case NV_TYPE_INT64_ARRAY:
+		case NV_TYPE_UINT64_ARRAY:
+			if (vsize == 0)
+				vsize = 8;
+			/* Arrays must hold a whole number of elements. */
+			if ((dsize % vsize) != 0)
+				error = EINVAL;
+			break;
+		case NV_TYPE_STRING:
+			/* Exactly one '\0', located at the very end. */
+			data = NVH_DATA(nvh);
+			if (data[dsize - 1] != '\0') {
+				error = EINVAL;
+				break;
+			}
+			if (strlen((char *)data) != dsize - 1) {
+				error = EINVAL;
+				break;
+			}
+			break;
+		default:
+			assert(!"invalid condition");
+		}
+		if (error != 0)
+			break;
+		ptr += NVH_SIZE(nvh);
+		size -= NVH_SIZE(nvh);
+	}
+	if (error != 0) {
+		errno = error;
+		if (nv->nv_error == 0)
+			nv->nv_error = error;
+		return (-1);
+	}
+	if (extrap != NULL)
+		*extrap = size;
+	return (0);
+}
+
+/*
+ * Switch every entry of the given nv structure to network byte order and
+ * hand back the ebuf that holds them.  The ebuf stays owned by the nv.
+ */
+struct ebuf *
+nv_hton(struct nv *nv)
+{
+	struct nvhdr *hdr;
+	unsigned char *cur;
+	size_t left, step;
+
+	NV_CHECK(nv);
+	assert(nv->nv_error == 0);
+
+	for (cur = ebuf_data(nv->nv_ebuf, &left); left > 0;
+	    cur += step, left -= step) {
+		/*
+		 * An entry can never be smaller than the nvhdr structure
+		 * plus a one character name and its terminating '\0'.
+		 */
+		assert(left >= sizeof(*hdr) + 2);
+		hdr = (struct nvhdr *)cur;
+		assert(NVH_SIZE(hdr) <= left);
+		nv_swap(hdr, false);
+		/* NVH_SIZE() copes with either byte order. */
+		step = NVH_SIZE(hdr);
+	}
+
+	return (nv->nv_ebuf);
+}
+
+/*
+ * Wrap an ebuf received from the network in a new nv structure.
+ * Returns NULL with errno set if allocation or validation fails; the ebuf
+ * is left untouched in that case and remains the caller's to free.
+ */
+struct nv *
+nv_ntoh(struct ebuf *eb)
+{
+	struct nv *wrapped;
+	size_t trailing;
+	int saved;
+
+	assert(eb != NULL);
+
+	wrapped = malloc(sizeof(*wrapped));
+	if (wrapped == NULL)
+		return (NULL);
+	wrapped->nv_magic = NV_MAGIC;
+	wrapped->nv_ebuf = eb;
+	wrapped->nv_error = 0;
+
+	if (nv_validate(wrapped, &trailing) >= 0) {
+		/*
+		 * Remove extra zeros at the end of the buffer.
+		 */
+		ebuf_del_tail(eb, trailing);
+		return (wrapped);
+	}
+
+	/* Keep the errno reported by nv_validate() across free(). */
+	saved = errno;
+	wrapped->nv_magic = 0;
+	free(wrapped);
+	errno = saved;
+	return (NULL);
+}
+
+/*
+ * Generate nv_add_<type>(): append one scalar of the given fixed-width
+ * integer type to nv.  The entry name is built printf-style from namefmt and
+ * the variable arguments.  The value is passed on to nv_addv() as raw bytes.
+ */
+#define NV_DEFINE_ADD(type, TYPE) \
+void \
+nv_add_##type(struct nv *nv, type##_t value, const char *namefmt, ...) \
+{ \
+ va_list nameap; \
+ \
+ va_start(nameap, namefmt); \
+ nv_addv(nv, (unsigned char *)&value, sizeof(value), \
+ NV_TYPE_##TYPE, namefmt, nameap); \
+ va_end(nameap); \
+}
+
+/* One adder per supported scalar type. */
+NV_DEFINE_ADD(int8, INT8)
+NV_DEFINE_ADD(uint8, UINT8)
+NV_DEFINE_ADD(int16, INT16)
+NV_DEFINE_ADD(uint16, UINT16)
+NV_DEFINE_ADD(int32, INT32)
+NV_DEFINE_ADD(uint32, UINT32)
+NV_DEFINE_ADD(int64, INT64)
+NV_DEFINE_ADD(uint64, UINT64)
+
+#undef NV_DEFINE_ADD
+
+/*
+ * Generate nv_add_<type>_array(): append an array of nsize elements of the
+ * given fixed-width integer type to nv.  The entry name is built
+ * printf-style from namefmt and the variable arguments.  The stored data
+ * size is sizeof(element) * nsize bytes.
+ */
+#define NV_DEFINE_ADD_ARRAY(type, TYPE) \
+void \
+nv_add_##type##_array(struct nv *nv, const type##_t *value, \
+ size_t nsize, const char *namefmt, ...) \
+{ \
+ va_list nameap; \
+ \
+ va_start(nameap, namefmt); \
+ nv_addv(nv, (const unsigned char *)value, \
+ sizeof(value[0]) * nsize, NV_TYPE_##TYPE##_ARRAY, namefmt, \
+ nameap); \
+ va_end(nameap); \
+}
+
+/* One array adder per supported element type. */
+NV_DEFINE_ADD_ARRAY(int8, INT8)
+NV_DEFINE_ADD_ARRAY(uint8, UINT8)
+NV_DEFINE_ADD_ARRAY(int16, INT16)
+NV_DEFINE_ADD_ARRAY(uint16, UINT16)
+NV_DEFINE_ADD_ARRAY(int32, INT32)
+NV_DEFINE_ADD_ARRAY(uint32, UINT32)
+NV_DEFINE_ADD_ARRAY(int64, INT64)
+NV_DEFINE_ADD_ARRAY(uint64, UINT64)
+
+#undef NV_DEFINE_ADD_ARRAY
+
+/*
+ * Append a string entry to nv.  The entry name is built printf-style from
+ * namefmt; the value is stored together with its terminating '\0'.
+ */
+void
+nv_add_string(struct nv *nv, const char *value, const char *namefmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, namefmt);
+	nv_addv(nv, (const unsigned char *)value, strlen(value) + 1,
+	    NV_TYPE_STRING, namefmt, ap);
+	va_end(ap);
+}
+
+/*
+ * Append a string entry whose value is built printf-style from valuefmt.
+ * The entry name is used verbatim.  Thin varargs front end for
+ * nv_add_stringv().
+ */
+void
+nv_add_stringf(struct nv *nv, const char *name, const char *valuefmt, ...)
+{
+	va_list args;
+
+	va_start(args, valuefmt);
+	nv_add_stringv(nv, name, valuefmt, args);
+	va_end(args);
+}
+
+/*
+ * Append a string entry whose value is formatted from valuefmt/valueap.
+ * The entry name is used verbatim.
+ *
+ * A NULL nv is tolerated like in the other add routines: errno is set to
+ * ENOMEM and nothing else happens.  If formatting fails, ENOMEM is recorded
+ * in nv unless an earlier error is already stored.
+ */
+void
+nv_add_stringv(struct nv *nv, const char *name, const char *valuefmt,
+    va_list valueap)
+{
+	char *value;
+	int len;
+
+	/*
+	 * Check nv up front: the vasprintf() failure path below would
+	 * otherwise dereference a NULL nv.
+	 */
+	if (nv == NULL) {
+		errno = ENOMEM;
+		return;
+	}
+
+	NV_CHECK(nv);
+
+	len = vasprintf(&value, valuefmt, valueap);
+	if (len < 0) {
+		if (nv->nv_error == 0)
+			nv->nv_error = ENOMEM;
+		return;
+	}
+	/* Store the terminating '\0' as part of the value. */
+	nv_add(nv, (const unsigned char *)value, (size_t)len + 1,
+	    NV_TYPE_STRING, name);
+	free(value);
+}
+
+/*
+ * Generate nv_get_<type>(): look up the scalar entry whose name is built
+ * printf-style from namefmt and return its value.  Returns 0 when nv_find()
+ * does not return an entry.  The value is copied out with bcopy() rather
+ * than read through a typed pointer.
+ */
+#define NV_DEFINE_GET(type, TYPE) \
+type##_t \
+nv_get_##type(struct nv *nv, const char *namefmt, ...) \
+{ \
+ struct nvhdr *nvh; \
+ va_list nameap; \
+ type##_t value; \
+ \
+ va_start(nameap, namefmt); \
+ nvh = nv_find(nv, NV_TYPE_##TYPE, namefmt, nameap); \
+ va_end(nameap); \
+ if (nvh == NULL) \
+ return (0); \
+ assert((nvh->nvh_type & NV_ORDER_MASK) == NV_ORDER_HOST); \
+ assert(sizeof(value) == nvh->nvh_dsize); \
+ bcopy(NVH_DATA(nvh), &value, sizeof(value)); \
+ \
+ return (value); \
+}
+
+/* One getter per supported scalar type. */
+NV_DEFINE_GET(int8, INT8)
+NV_DEFINE_GET(uint8, UINT8)
+NV_DEFINE_GET(int16, INT16)
+NV_DEFINE_GET(uint16, UINT16)
+NV_DEFINE_GET(int32, INT32)
+NV_DEFINE_GET(uint32, UINT32)
+NV_DEFINE_GET(int64, INT64)
+NV_DEFINE_GET(uint64, UINT64)
+
+#undef NV_DEFINE_GET
+
+/*
+ * Generate nv_get_<type>_array(): look up the array entry whose name is
+ * built printf-style from namefmt.  Returns a pointer into the nv buffer
+ * itself (no copy is made), or NULL when nv_find() does not return an
+ * entry.  If sizep is not NULL it receives the number of elements.
+ */
+#define NV_DEFINE_GET_ARRAY(type, TYPE) \
+const type##_t * \
+nv_get_##type##_array(struct nv *nv, size_t *sizep, \
+ const char *namefmt, ...) \
+{ \
+ struct nvhdr *nvh; \
+ va_list nameap; \
+ \
+ va_start(nameap, namefmt); \
+ nvh = nv_find(nv, NV_TYPE_##TYPE##_ARRAY, namefmt, nameap); \
+ va_end(nameap); \
+ if (nvh == NULL) \
+ return (NULL); \
+ assert((nvh->nvh_type & NV_ORDER_MASK) == NV_ORDER_HOST); \
+ assert((nvh->nvh_dsize % sizeof(type##_t)) == 0); \
+ if (sizep != NULL) \
+ *sizep = nvh->nvh_dsize / sizeof(type##_t); \
+ return ((type##_t *)(void *)NVH_DATA(nvh)); \
+}
+
+/* One array getter per supported element type. */
+NV_DEFINE_GET_ARRAY(int8, INT8)
+NV_DEFINE_GET_ARRAY(uint8, UINT8)
+NV_DEFINE_GET_ARRAY(int16, INT16)
+NV_DEFINE_GET_ARRAY(uint16, UINT16)
+NV_DEFINE_GET_ARRAY(int32, INT32)
+NV_DEFINE_GET_ARRAY(uint32, UINT32)
+NV_DEFINE_GET_ARRAY(int64, INT64)
+NV_DEFINE_GET_ARRAY(uint64, UINT64)
+
+#undef NV_DEFINE_GET_ARRAY
+
+/*
+ * Look up the string entry whose name is built printf-style from namefmt.
+ * Returns a pointer into the nv buffer itself (no copy is made), or NULL
+ * when nv_find() does not return an entry.
+ */
+const char *
+nv_get_string(struct nv *nv, const char *namefmt, ...)
+{
+	struct nvhdr *nvh;
+	va_list nameap;
+	const char *str;
+
+	va_start(nameap, namefmt);
+	nvh = nv_find(nv, NV_TYPE_STRING, namefmt, nameap);
+	va_end(nameap);
+	if (nvh == NULL)
+		return (NULL);
+	assert((nvh->nvh_type & NV_ORDER_MASK) == NV_ORDER_HOST);
+	assert(nvh->nvh_dsize >= 1);
+	/* NVH_DATA() yields unsigned char *; convert explicitly. */
+	str = (const char *)NVH_DATA(nvh);
+	assert(str[nvh->nvh_dsize - 1] == '\0');
+	assert(strlen(str) == nvh->nvh_dsize - 1);
+	return (str);
+}
+
+/*
+ * Dump content of the nv structure to standard output, one entry per line
+ * in the form " name(type): value...".  Entries still in network byte order
+ * are converted for printing only; the buffer is not modified.
+ * If validation fails, only "error: <errno>" is printed.
+ */
+void
+nv_dump(struct nv *nv)
+{
+	struct nvhdr *nvh;
+	unsigned char *data, *ptr;
+	size_t dsize, size;
+	unsigned int ii;
+	bool swap;
+
+	if (nv_validate(nv, NULL) < 0) {
+		printf("error: %d\n", errno);
+		return;
+	}
+
+	NV_CHECK(nv);
+	assert(nv->nv_error == 0);
+
+	ptr = ebuf_data(nv->nv_ebuf, &size);
+	while (size > 0) {
+		assert(size >= sizeof(*nvh) + 2);
+		nvh = (struct nvhdr *)ptr;
+		assert(size >= NVH_SIZE(nvh));
+		/* Values need converting only when stored in network order. */
+		swap = ((nvh->nvh_type & NV_ORDER_MASK) == NV_ORDER_NETWORK);
+		dsize = NVH_DSIZE(nvh);
+		data = NVH_DATA(nvh);
+		printf(" %s", nvh->nvh_name);
+		switch (nvh->nvh_type & NV_TYPE_MASK) {
+		case NV_TYPE_INT8:
+			printf("(int8): %jd", (intmax_t)(*(int8_t *)data));
+			break;
+		case NV_TYPE_UINT8:
+			printf("(uint8): %ju", (uintmax_t)(*(uint8_t *)data));
+			break;
+		case NV_TYPE_INT16:
+			printf("(int16): %jd", swap ?
+			    (intmax_t)le16toh(*(int16_t *)(void *)data) :
+			    (intmax_t)*(int16_t *)(void *)data);
+			break;
+		case NV_TYPE_UINT16:
+			printf("(uint16): %ju", swap ?
+			    (uintmax_t)le16toh(*(uint16_t *)(void *)data) :
+			    (uintmax_t)*(uint16_t *)(void *)data);
+			break;
+		case NV_TYPE_INT32:
+			printf("(int32): %jd", swap ?
+			    (intmax_t)le32toh(*(int32_t *)(void *)data) :
+			    (intmax_t)*(int32_t *)(void *)data);
+			break;
+		case NV_TYPE_UINT32:
+			printf("(uint32): %ju", swap ?
+			    (uintmax_t)le32toh(*(uint32_t *)(void *)data) :
+			    (uintmax_t)*(uint32_t *)(void *)data);
+			break;
+		case NV_TYPE_INT64:
+			printf("(int64): %jd", swap ?
+			    (intmax_t)le64toh(*(int64_t *)(void *)data) :
+			    (intmax_t)*(int64_t *)(void *)data);
+			break;
+		case NV_TYPE_UINT64:
+			printf("(uint64): %ju", swap ?
+			    (uintmax_t)le64toh(*(uint64_t *)(void *)data) :
+			    (uintmax_t)*(uint64_t *)(void *)data);
+			break;
+		case NV_TYPE_INT8_ARRAY:
+			printf("(int8 array):");
+			for (ii = 0; ii < dsize; ii++)
+				printf(" %jd", (intmax_t)((int8_t *)data)[ii]);
+			break;
+		case NV_TYPE_UINT8_ARRAY:
+			printf("(uint8 array):");
+			for (ii = 0; ii < dsize; ii++)
+				printf(" %ju", (uintmax_t)((uint8_t *)data)[ii]);
+			break;
+		case NV_TYPE_INT16_ARRAY:
+			printf("(int16 array):");
+			for (ii = 0; ii < dsize / 2; ii++) {
+				printf(" %jd", swap ?
+				    (intmax_t)le16toh(((int16_t *)(void *)data)[ii]) :
+				    (intmax_t)((int16_t *)(void *)data)[ii]);
+			}
+			break;
+		case NV_TYPE_UINT16_ARRAY:
+			printf("(uint16 array):");
+			for (ii = 0; ii < dsize / 2; ii++) {
+				printf(" %ju", swap ?
+				    (uintmax_t)le16toh(((uint16_t *)(void *)data)[ii]) :
+				    (uintmax_t)((uint16_t *)(void *)data)[ii]);
+			}
+			break;
+		case NV_TYPE_INT32_ARRAY:
+			printf("(int32 array):");
+			for (ii = 0; ii < dsize / 4; ii++) {
+				printf(" %jd", swap ?
+				    (intmax_t)le32toh(((int32_t *)(void *)data)[ii]) :
+				    (intmax_t)((int32_t *)(void *)data)[ii]);
+			}
+			break;
+		case NV_TYPE_UINT32_ARRAY:
+			printf("(uint32 array):");
+			for (ii = 0; ii < dsize / 4; ii++) {
+				printf(" %ju", swap ?
+				    (uintmax_t)le32toh(((uint32_t *)(void *)data)[ii]) :
+				    (uintmax_t)((uint32_t *)(void *)data)[ii]);
+			}
+			break;
+		case NV_TYPE_INT64_ARRAY:
+			printf("(int64 array):");
+			/*
+			 * Print as signed, matching the scalar int64 case;
+			 * negative elements used to show up as huge
+			 * unsigned numbers.
+			 */
+			for (ii = 0; ii < dsize / 8; ii++) {
+				printf(" %jd", swap ?
+				    (intmax_t)le64toh(((int64_t *)(void *)data)[ii]) :
+				    (intmax_t)((int64_t *)(void *)data)[ii]);
+			}
+			break;
+		case NV_TYPE_UINT64_ARRAY:
+			printf("(uint64 array):");
+			for (ii = 0; ii < dsize / 8; ii++) {
+				printf(" %ju", swap ?
+				    (uintmax_t)le64toh(((uint64_t *)(void *)data)[ii]) :
+				    (uintmax_t)((uint64_t *)(void *)data)[ii]);
+			}
+			break;
+		case NV_TYPE_STRING:
+			printf("(string): %s", (char *)data);
+			break;
+		default:
+			assert(!"invalid condition");
+		}
+		printf("\n");
+		ptr += NVH_SIZE(nvh);
+		size -= NVH_SIZE(nvh);
+	}
+}
+
+/*
+ * Local routines below.
+ */
+
+/*
+ * Append one entry (header, name, data, padding) to the nv buffer.
+ *
+ * 'value'/'vsize' describe the raw data, 'type' is one of NV_TYPE_* and
+ * 'name' is the entry name, which must fit in 255 bytes including its '\0'.
+ * A NULL nv only sets errno to ENOMEM.  Any other failure is recorded in nv
+ * unless an earlier error is already stored.
+ */
+static void
+nv_add(struct nv *nv, const unsigned char *value, size_t vsize, int type,
+    const char *name)
+{
+	static unsigned char align[7];
+	struct nvhdr *nvh;
+	size_t namesize;
+
+	if (nv == NULL) {
+		errno = ENOMEM;
+		return;
+	}
+
+	NV_CHECK(nv);
+
+	namesize = strlen(name) + 1;
+	/*
+	 * nvh_namesize is a uint8_t; refuse names that would be silently
+	 * truncated and produce a corrupted entry.
+	 */
+	if (namesize > UINT8_MAX) {
+		if (nv->nv_error == 0)
+			nv->nv_error = ENAMETOOLONG;
+		return;
+	}
+
+	/*
+	 * Zeroed allocation: the padding after the name is copied into the
+	 * buffer and must not carry stale heap contents.
+	 */
+	nvh = calloc(1, sizeof(*nvh) + roundup2(namesize, 8));
+	if (nvh == NULL) {
+		if (nv->nv_error == 0)
+			nv->nv_error = ENOMEM;
+		return;
+	}
+	nvh->nvh_type = NV_ORDER_HOST | type;
+	nvh->nvh_namesize = (uint8_t)namesize;
+	nvh->nvh_dsize = (uint32_t)vsize;
+	bcopy(name, nvh->nvh_name, namesize);
+
+	/* Add header first. */
+	if (ebuf_add_tail(nv->nv_ebuf, nvh, NVH_HSIZE(nvh)) < 0) {
+		assert(errno != 0);
+		if (nv->nv_error == 0)
+			nv->nv_error = errno;
+		free(nvh);
+		return;
+	}
+	/*
+	 * The header was copied into the buffer (callers also pass
+	 * short-lived storage to ebuf_add_tail()), so the temporary can go.
+	 */
+	free(nvh);
+
+	/* Add the actual data. */
+	if (ebuf_add_tail(nv->nv_ebuf, value, vsize) < 0) {
+		assert(errno != 0);
+		if (nv->nv_error == 0)
+			nv->nv_error = errno;
+		return;
+	}
+	/* Align the data (if needed). */
+	vsize = roundup2(vsize, 8) - vsize;
+	if (vsize == 0)
+		return;
+	assert(vsize > 0 && vsize <= sizeof(align));
+	if (ebuf_add_tail(nv->nv_ebuf, align, vsize) < 0) {
+		assert(errno != 0);
+		if (nv->nv_error == 0)
+			nv->nv_error = errno;
+		return;
+	}
+}
+
+/*
+ * Format the entry name from namefmt/nameap into a local buffer and pass
+ * everything on to nv_add().  The formatted name must be non-empty and fit
+ * the buffer; this is asserted.
+ */
+static void
+nv_addv(struct nv *nv, const unsigned char *value, size_t vsize, int type,
+    const char *namefmt, va_list nameap)
+{
+	char namebuf[255];
+	size_t len;
+
+	len = vsnprintf(namebuf, sizeof(namebuf), namefmt, nameap);
+	assert(len > 0 && len < sizeof(namebuf));
+
+	nv_add(nv, value, vsize, type, namebuf);
+}
+
+/*
+ * Locate the entry whose name is built printf-style from namefmt/nameap.
+ *
+ * Every entry visited on the way is converted to host byte order in place.
+ * Returns the entry header, or NULL with errno set: ENOMEM for a NULL nv,
+ * EINVAL when the name exists with a different type, ENOENT when there is
+ * no such name.  EINVAL and ENOENT are also recorded in nv unless an earlier
+ * error is already stored.
+ */
+static struct nvhdr *
+nv_find(struct nv *nv, int type, const char *namefmt, va_list nameap)
+{
+	char wanted[255];
+	struct nvhdr *hdr;
+	unsigned char *cur;
+	size_t left, wantedsize;
+
+	if (nv == NULL) {
+		errno = ENOMEM;
+		return (NULL);
+	}
+
+	NV_CHECK(nv);
+
+	wantedsize = vsnprintf(wanted, sizeof(wanted), namefmt, nameap);
+	assert(wantedsize > 0 && wantedsize < sizeof(wanted));
+	/* Account for the terminating '\0'; not used any further. */
+	wantedsize++;
+
+	cur = ebuf_data(nv->nv_ebuf, &left);
+	for (; left > 0; cur += NVH_SIZE(hdr), left -= NVH_SIZE(hdr)) {
+		assert(left >= sizeof(*hdr) + 2);
+		hdr = (struct nvhdr *)cur;
+		assert(left >= NVH_SIZE(hdr));
+		nv_swap(hdr, true);
+		if (strcmp(hdr->nvh_name, wanted) != 0)
+			continue;
+		if ((hdr->nvh_type & NV_TYPE_MASK) == type)
+			return (hdr);
+		/* Right name, wrong type. */
+		errno = EINVAL;
+		if (nv->nv_error == 0)
+			nv->nv_error = EINVAL;
+		return (NULL);
+	}
+
+	errno = ENOENT;
+	if (nv->nv_error == 0)
+		nv->nv_error = ENOENT;
+	return (NULL);
+}
+
+/*
+ * Convert a single entry in place between host byte order and the
+ * little-endian form marked NV_ORDER_NETWORK.
+ *
+ * tohost == true converts to host order, false converts to network order.
+ * Entries already in the requested order are left untouched.  The data size
+ * field, the order flag and every multi-byte data element are updated;
+ * 8-bit values, 8-bit arrays and strings need no data conversion.
+ */
+static void
+nv_swap(struct nvhdr *nvh, bool tohost)
+{
+ unsigned char *data, *end, *p;
+ size_t vsize;
+
+ data = NVH_DATA(nvh);
+ if (tohost) {
+ if ((nvh->nvh_type & NV_ORDER_MASK) == NV_ORDER_HOST)
+ return;
+ /* Convert the size first so that 'end' is computed in host order. */
+ nvh->nvh_dsize = le32toh(nvh->nvh_dsize);
+ end = data + nvh->nvh_dsize;
+ nvh->nvh_type &= ~NV_ORDER_MASK;
+ nvh->nvh_type |= NV_ORDER_HOST;
+ } else {
+ if ((nvh->nvh_type & NV_ORDER_MASK) == NV_ORDER_NETWORK)
+ return;
+ /* Compute 'end' while the size is still in host order. */
+ end = data + nvh->nvh_dsize;
+ nvh->nvh_dsize = htole32(nvh->nvh_dsize);
+ nvh->nvh_type &= ~NV_ORDER_MASK;
+ nvh->nvh_type |= NV_ORDER_NETWORK;
+ }
+
+ vsize = 0;
+
+ /*
+ * The cases fall through on purpose: the first label reached fixes the
+ * element size and the later ones leave it alone.
+ */
+ switch (nvh->nvh_type & NV_TYPE_MASK) {
+ case NV_TYPE_INT8:
+ case NV_TYPE_UINT8:
+ case NV_TYPE_INT8_ARRAY:
+ case NV_TYPE_UINT8_ARRAY:
+ break;
+ case NV_TYPE_INT16:
+ case NV_TYPE_UINT16:
+ case NV_TYPE_INT16_ARRAY:
+ case NV_TYPE_UINT16_ARRAY:
+ if (vsize == 0)
+ vsize = 2;
+ /* FALLTHROUGH */
+ case NV_TYPE_INT32:
+ case NV_TYPE_UINT32:
+ case NV_TYPE_INT32_ARRAY:
+ case NV_TYPE_UINT32_ARRAY:
+ if (vsize == 0)
+ vsize = 4;
+ /* FALLTHROUGH */
+ case NV_TYPE_INT64:
+ case NV_TYPE_UINT64:
+ case NV_TYPE_INT64_ARRAY:
+ case NV_TYPE_UINT64_ARRAY:
+ if (vsize == 0)
+ vsize = 8;
+ /* Convert each element; a scalar is simply one element. */
+ for (p = data; p < end; p += vsize) {
+ if (tohost) {
+ switch (vsize) {
+ case 2:
+ *(uint16_t *)(void *)p =
+ le16toh(*(uint16_t *)(void *)p);
+ break;
+ case 4:
+ *(uint32_t *)(void *)p =
+ le32toh(*(uint32_t *)(void *)p);
+ break;
+ case 8:
+ *(uint64_t *)(void *)p =
+ le64toh(*(uint64_t *)(void *)p);
+ break;
+ default:
+ assert(!"invalid condition");
+ }
+ } else {
+ switch (vsize) {
+ case 2:
+ *(uint16_t *)(void *)p =
+ htole16(*(uint16_t *)(void *)p);
+ break;
+ case 4:
+ *(uint32_t *)(void *)p =
+ htole32(*(uint32_t *)(void *)p);
+ break;
+ case 8:
+ *(uint64_t *)(void *)p =
+ htole64(*(uint64_t *)(void *)p);
+ break;
+ default:
+ assert(!"invalid condition");
+ }
+ }
+ }
+ break;
+ case NV_TYPE_STRING:
+ break;
+ default:
+ assert(!"unrecognized type");
+ }
+}
diff --git a/sbin/hastd/nv.h b/sbin/hastd/nv.h
new file mode 100644
index 0000000..1677548
--- /dev/null
+++ b/sbin/hastd/nv.h
@@ -0,0 +1,158 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NV_H_
+#define _NV_H_
+
+#include <sys/cdefs.h>
+
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <ebuf.h>
+
+/* Entry types, stored in the low 7 bits of the entry's type byte. */
+#define NV_TYPE_INT8 1
+#define NV_TYPE_UINT8 2
+#define NV_TYPE_INT16 3
+#define NV_TYPE_UINT16 4
+#define NV_TYPE_INT32 5
+#define NV_TYPE_UINT32 6
+#define NV_TYPE_INT64 7
+#define NV_TYPE_UINT64 8
+#define NV_TYPE_INT8_ARRAY 9
+#define NV_TYPE_UINT8_ARRAY 10
+#define NV_TYPE_INT16_ARRAY 11
+#define NV_TYPE_UINT16_ARRAY 12
+#define NV_TYPE_INT32_ARRAY 13
+#define NV_TYPE_UINT32_ARRAY 14
+#define NV_TYPE_INT64_ARRAY 15
+#define NV_TYPE_UINT64_ARRAY 16
+#define NV_TYPE_STRING 17
+
+/* Mask selecting the type bits, and the inclusive range of valid types. */
+#define NV_TYPE_MASK 0x7f
+#define NV_TYPE_FIRST NV_TYPE_INT8
+#define NV_TYPE_LAST NV_TYPE_STRING
+
+/* Byte-order flag kept in the top bit of the entry's type byte. */
+#define NV_ORDER_NETWORK 0x00
+#define NV_ORDER_HOST 0x80
+
+/* Mask selecting the byte-order flag. */
+#define NV_ORDER_MASK 0x80
+
+struct nv;
+
+struct nv *nv_alloc(void);
+void nv_free(struct nv *nv);
+int nv_error(const struct nv *nv);
+int nv_set_error(struct nv *nv, int error);
+int nv_validate(struct nv *nv, size_t *extrap);
+
+struct ebuf *nv_hton(struct nv *nv);
+struct nv *nv_ntoh(struct ebuf *eb);
+
+void nv_add_int8(struct nv *nv, int8_t value, const char *namefmt, ...)
+ __printflike(3, 4);
+void nv_add_uint8(struct nv *nv, uint8_t value, const char *namefmt, ...)
+ __printflike(3, 4);
+void nv_add_int16(struct nv *nv, int16_t value, const char *namefmt, ...)
+ __printflike(3, 4);
+void nv_add_uint16(struct nv *nv, uint16_t value, const char *namefmt, ...)
+ __printflike(3, 4);
+void nv_add_int32(struct nv *nv, int32_t value, const char *namefmt, ...)
+ __printflike(3, 4);
+void nv_add_uint32(struct nv *nv, uint32_t value, const char *namefmt, ...)
+ __printflike(3, 4);
+void nv_add_int64(struct nv *nv, int64_t value, const char *namefmt, ...)
+ __printflike(3, 4);
+void nv_add_uint64(struct nv *nv, uint64_t value, const char *namefmt, ...)
+ __printflike(3, 4);
+void nv_add_int8_array(struct nv *nv, const int8_t *value, size_t size,
+ const char *namefmt, ...) __printflike(4, 5);
+void nv_add_uint8_array(struct nv *nv, const uint8_t *value, size_t size,
+ const char *namefmt, ...) __printflike(4, 5);
+void nv_add_int16_array(struct nv *nv, const int16_t *value, size_t size,
+ const char *namefmt, ...) __printflike(4, 5);
+void nv_add_uint16_array(struct nv *nv, const uint16_t *value, size_t size,
+ const char *namefmt, ...) __printflike(4, 5);
+void nv_add_int32_array(struct nv *nv, const int32_t *value, size_t size,
+ const char *namefmt, ...) __printflike(4, 5);
+void nv_add_uint32_array(struct nv *nv, const uint32_t *value, size_t size,
+ const char *namefmt, ...) __printflike(4, 5);
+void nv_add_int64_array(struct nv *nv, const int64_t *value, size_t size,
+ const char *namefmt, ...) __printflike(4, 5);
+void nv_add_uint64_array(struct nv *nv, const uint64_t *value, size_t size,
+ const char *namefmt, ...) __printflike(4, 5);
+void nv_add_string(struct nv *nv, const char *value, const char *namefmt, ...)
+ __printflike(3, 4);
+void nv_add_stringf(struct nv *nv, const char *name, const char *valuefmt, ...)
+ __printflike(3, 4);
+void nv_add_stringv(struct nv *nv, const char *name, const char *valuefmt,
+ va_list valueap) __printflike(3, 0);
+
+int8_t nv_get_int8(struct nv *nv, const char *namefmt, ...)
+ __printflike(2, 3);
+uint8_t nv_get_uint8(struct nv *nv, const char *namefmt, ...)
+ __printflike(2, 3);
+int16_t nv_get_int16(struct nv *nv, const char *namefmt, ...)
+ __printflike(2, 3);
+uint16_t nv_get_uint16(struct nv *nv, const char *namefmt, ...)
+ __printflike(2, 3);
+int32_t nv_get_int32(struct nv *nv, const char *namefmt, ...)
+ __printflike(2, 3);
+uint32_t nv_get_uint32(struct nv *nv, const char *namefmt, ...)
+ __printflike(2, 3);
+int64_t nv_get_int64(struct nv *nv, const char *namefmt, ...)
+ __printflike(2, 3);
+uint64_t nv_get_uint64(struct nv *nv, const char *namefmt, ...)
+ __printflike(2, 3);
+const int8_t *nv_get_int8_array(struct nv *nv, size_t *sizep,
+ const char *namefmt, ...) __printflike(3, 4);
+const uint8_t *nv_get_uint8_array(struct nv *nv, size_t *sizep,
+ const char *namefmt, ...) __printflike(3, 4);
+const int16_t *nv_get_int16_array(struct nv *nv, size_t *sizep,
+ const char *namefmt, ...) __printflike(3, 4);
+const uint16_t *nv_get_uint16_array(struct nv *nv, size_t *sizep,
+ const char *namefmt, ...) __printflike(3, 4);
+const int32_t *nv_get_int32_array(struct nv *nv, size_t *sizep,
+ const char *namefmt, ...) __printflike(3, 4);
+const uint32_t *nv_get_uint32_array(struct nv *nv, size_t *sizep,
+ const char *namefmt, ...) __printflike(3, 4);
+const int64_t *nv_get_int64_array(struct nv *nv, size_t *sizep,
+ const char *namefmt, ...) __printflike(3, 4);
+const uint64_t *nv_get_uint64_array(struct nv *nv, size_t *sizep,
+ const char *namefmt, ...) __printflike(3, 4);
+const char *nv_get_string(struct nv *nv, const char *namefmt, ...)
+ __printflike(2, 3);
+
+void nv_dump(struct nv *nv);
+
+#endif /* !_NV_H_ */
diff --git a/sbin/hastd/parse.y b/sbin/hastd/parse.y
new file mode 100644
index 0000000..6755320
--- /dev/null
+++ b/sbin/hastd/parse.y
@@ -0,0 +1,507 @@
+%{
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h> /* MAXHOSTNAMELEN */
+#include <sys/queue.h>
+#include <sys/sysctl.h>
+
+#include <arpa/inet.h>
+
+#include <assert.h>
+#include <err.h>
+#include <stdio.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include "hast.h"
+
+extern int depth;
+extern int lineno;
+
+extern FILE *yyin;
+extern char *yytext;
+
+static struct hastd_config lconfig;
+static struct hast_resource *curres;
+static bool mynode;
+
+static char depth0_control[HAST_ADDRSIZE];
+static char depth0_listen[HAST_ADDRSIZE];
+static int depth0_replication;
+
+static char depth1_provname[PATH_MAX];
+static char depth1_localpath[PATH_MAX];
+
+/*
+ * Check whether the given node name refers to the host we are running on.
+ *
+ * The name matches when it is equal to the full host name, to the first
+ * label of the host name (the part before the first '.'), or - compared
+ * case-insensitively - to the value of the kern.hostuuid sysctl.
+ *
+ * Returns true on match, false otherwise. Exits with EX_OSERR if the host
+ * name or the host UUID cannot be obtained.
+ */
+static bool
+isitme(const char *name)
+{
+	char buf[MAXHOSTNAMELEN];
+	char *pos;
+	size_t bufsize;
+
+	/*
+	 * First check if the given name matches our full hostname.
+	 */
+	if (gethostname(buf, sizeof(buf)) < 0)
+		err(EX_OSERR, "gethostname() failed");
+	if (strcmp(buf, name) == 0)
+		return (true);
+
+	/*
+	 * Now check if it matches first part of the host name.
+	 * The lengths have to be equal too, otherwise every name that merely
+	 * starts with our short host name (eg. "foobar" on "foo.example.org")
+	 * would be taken for us.
+	 */
+	pos = strchr(buf, '.');
+	if (pos != NULL && pos != buf &&
+	    strlen(name) == (size_t)(pos - buf) &&
+	    strncmp(buf, name, pos - buf) == 0) {
+		return (true);
+	}
+
+	/*
+	 * At the end check if name is equal to our host's UUID.
+	 */
+	bufsize = sizeof(buf);
+	if (sysctlbyname("kern.hostuuid", buf, &bufsize, NULL, 0) < 0)
+		err(EX_OSERR, "sysctlbyname(kern.hostuuid) failed");
+	if (strcasecmp(buf, name) == 0)
+		return (true);
+
+	/*
+	 * Looks like this isn't about us.
+	 */
+	return (false);
+}
+
+/*
+ * Error reporting routine of the parser: print the message to stderr along
+ * with the current line number and the text of the token being scanned.
+ */
+void
+yyerror(const char *str)
+{
+	const char *token;
+
+	token = yytext;
+	fprintf(stderr, "error at line %d near '%s': %s\n", lineno, token, str);
+}
+
+/*
+ * Parse the given configuration file.
+ *
+ * Returns a pointer to the statically allocated configuration, which holds
+ * the control and listen addresses and the list of resources.
+ * Exits with EX_OSFILE if the file cannot be opened and with EX_CONFIG if
+ * the parser reports an error.
+ */
+struct hastd_config *
+yy_config_parse(const char *config)
+{
+	int error;
+
+	/* Reset parser state and load built-in defaults. */
+	mynode = false;
+	curres = NULL;
+
+	strlcpy(depth0_listen, HASTD_LISTEN, sizeof(depth0_listen));
+	strlcpy(depth0_control, HAST_CONTROL, sizeof(depth0_control));
+	depth0_replication = HAST_REPLICATION_MEMSYNC;
+
+	TAILQ_INIT(&lconfig.hc_resources);
+
+	if ((yyin = fopen(config, "r")) == NULL)
+		err(EX_OSFILE, "cannot open configuration file %s", config);
+	error = yyparse();
+	fclose(yyin);
+	if (error != 0) {
+		yy_config_free(&lconfig);
+		exit(EX_CONFIG);
+	}
+
+	/*
+	 * Addresses not given for our node fall back to the global
+	 * (or default) ones.
+	 */
+	if (lconfig.hc_listenaddr[0] == '\0') {
+		strlcpy(lconfig.hc_listenaddr, depth0_listen,
+		    sizeof(lconfig.hc_listenaddr));
+	}
+	if (lconfig.hc_controladdr[0] == '\0') {
+		strlcpy(lconfig.hc_controladdr, depth0_control,
+		    sizeof(lconfig.hc_controladdr));
+	}
+
+	TAILQ_FOREACH(curres, &lconfig.hc_resources, hr_next) {
+		assert(curres->hr_provname[0] != '\0');
+		assert(curres->hr_localpath[0] != '\0');
+		assert(curres->hr_remoteaddr[0] != '\0');
+
+		/* Replication mode given at resource-level wins. */
+		if (curres->hr_replication != -1)
+			continue;
+		/* Otherwise use global or default setting. */
+		curres->hr_replication = depth0_replication;
+	}
+
+	return (&lconfig);
+}
+
+/*
+ * Unlink and free every resource on the configuration's resource list.
+ * The configuration structure itself is left alone.
+ */
+void
+yy_config_free(struct hastd_config *config)
+{
+	struct hast_resource *res;
+
+	while (!TAILQ_EMPTY(&config->hc_resources)) {
+		res = TAILQ_FIRST(&config->hc_resources);
+		TAILQ_REMOVE(&config->hc_resources, res, hr_next);
+		free(res);
+	}
+}
+%}
+
+/*
+ * Keywords of the configuration language.
+ * NOTE(review): PORT and EXTENTSIZE are declared, but no rule in this
+ * grammar refers to them.
+ */
+%token CONTROL LISTEN PORT REPLICATION EXTENTSIZE RESOURCE NAME LOCAL REMOTE ON
+/* Replication modes accepted by the replication_type rule. */
+%token FULLSYNC MEMSYNC ASYNC
+/*
+ * Values and block delimiters (OB/CB open and close a block; the exact
+ * characters are defined by the lexer).
+ */
+%token NUM STR OB CB
+
+/* replication_type yields one of the HAST_REPLICATION_* constants. */
+%type <num> replication_type
+
+/* Semantic value of a token or rule: a number or a string. */
+%union
+{
+ int num;
+ char *str;
+}
+
+%token <num> NUM
+%token <str> STR
+
+%%
+
+/* A configuration is a (possibly empty) sequence of statements. */
+statements:
+ |
+ statements statement
+ ;
+
+/* Statements that may appear outside of any block. */
+statement:
+ control_statement
+ |
+ listen_statement
+ |
+ replication_statement
+ |
+ node_statement
+ |
+ resource_statement
+ ;
+
+/*
+ * "control <address>"
+ * At depth 0 the address becomes the global default (depth0_control).
+ * At depth 1 it is stored as the control address of the configuration, but
+ * only when the enclosing node section describes this host (mynode).
+ * An address that does not fit into the destination buffer is a fatal
+ * configuration error.
+ */
+control_statement: CONTROL STR
+ {
+ switch (depth) {
+ case 0:
+ if (strlcpy(depth0_control, $2,
+ sizeof(depth0_control)) >=
+ sizeof(depth0_control)) {
+ errx(EX_CONFIG, "control argument too long");
+ }
+ break;
+ case 1:
+ if (mynode) {
+ if (strlcpy(lconfig.hc_controladdr, $2,
+ sizeof(lconfig.hc_controladdr)) >=
+ sizeof(lconfig.hc_controladdr)) {
+ errx(EX_CONFIG,
+ "control argument too long");
+ }
+ }
+ break;
+ default:
+ assert(!"control at wrong depth level");
+ }
+ }
+ ;
+
+/*
+ * "listen <address>"
+ * At depth 0 the address becomes the global default (depth0_listen).
+ * At depth 1 it is stored as the listen address of the configuration, but
+ * only when the enclosing node section describes this host (mynode).
+ * An address that does not fit into the destination buffer is a fatal
+ * configuration error.
+ */
+listen_statement: LISTEN STR
+ {
+ switch (depth) {
+ case 0:
+ if (strlcpy(depth0_listen, $2,
+ sizeof(depth0_listen)) >=
+ sizeof(depth0_listen)) {
+ errx(EX_CONFIG, "listen argument too long");
+ }
+ break;
+ case 1:
+ if (mynode) {
+ if (strlcpy(lconfig.hc_listenaddr, $2,
+ sizeof(lconfig.hc_listenaddr)) >=
+ sizeof(lconfig.hc_listenaddr)) {
+ errx(EX_CONFIG,
+ "listen argument too long");
+ }
+ }
+ break;
+ default:
+ assert(!"listen at wrong depth level");
+ }
+ }
+ ;
+
+/*
+ * "replication <mode>"
+ * At depth 0 the mode becomes the global default (depth0_replication).
+ * At depth 1 it is stored in the resource being defined, if there is one.
+ */
+replication_statement: REPLICATION replication_type
+ {
+ switch (depth) {
+ case 0:
+ depth0_replication = $2;
+ break;
+ case 1:
+ if (curres != NULL)
+ curres->hr_replication = $2;
+ break;
+ default:
+ assert(!"replication at wrong depth level");
+ }
+ }
+ ;
+
+/* Map the replication mode keyword to its HAST_REPLICATION_* constant. */
+replication_type:
+ FULLSYNC { $$ = HAST_REPLICATION_FULLSYNC; }
+ |
+ MEMSYNC { $$ = HAST_REPLICATION_MEMSYNC; }
+ |
+ ASYNC { $$ = HAST_REPLICATION_ASYNC; }
+ ;
+
+/*
+ * "on <node> { ... }" outside of a resource.
+ * mynode is set by node_start when the node name refers to this host and
+ * is cleared again once the whole block has been parsed.
+ */
+node_statement: ON node_start OB node_entries CB
+ {
+ mynode = false;
+ }
+ ;
+
+/* Node name: remember whether the block that follows is about this host. */
+node_start: STR
+ {
+ if (isitme($1))
+ mynode = true;
+ }
+ ;
+
+/* A node block holds a (possibly empty) sequence of node entries. */
+node_entries:
+ |
+ node_entries node_entry
+ ;
+
+/* Only control and listen statements are allowed inside a node block. */
+node_entry:
+ control_statement
+ |
+ listen_statement
+ ;
+
+/*
+ * "resource <name> { ... }"
+ * Runs after the whole resource block has been parsed: fills in settings
+ * missing at node-level from the resource-level ones, verifies that the
+ * remote address and the local path are known (a fatal configuration error
+ * otherwise) and appends the resource to the configuration's resource list.
+ */
+resource_statement: RESOURCE resource_start OB resource_entries CB
+ {
+ if (curres != NULL) {
+ /*
+ * Let's see if there are some resource-level settings
+ * that we can use for node-level settings.
+ */
+ if (curres->hr_provname[0] == '\0' &&
+ depth1_provname[0] != '\0') {
+ /*
+ * Provider name is not set at node-level,
+ * but is set at resource-level, use it.
+ */
+ strlcpy(curres->hr_provname, depth1_provname,
+ sizeof(curres->hr_provname));
+ }
+ if (curres->hr_localpath[0] == '\0' &&
+ depth1_localpath[0] != '\0') {
+ /*
+ * Path to local provider is not set at
+ * node-level, but is set at resource-level,
+ * use it.
+ */
+ strlcpy(curres->hr_localpath, depth1_localpath,
+ sizeof(curres->hr_localpath));
+ }
+
+ /*
+ * If provider name is not given, use resource name
+ * as provider name.
+ */
+ if (curres->hr_provname[0] == '\0') {
+ strlcpy(curres->hr_provname, curres->hr_name,
+ sizeof(curres->hr_provname));
+ }
+
+ /*
+ * Remote address has to be configured at this point.
+ */
+ if (curres->hr_remoteaddr[0] == '\0') {
+ errx(EX_CONFIG,
+ "remote address not configured for resource %s",
+ curres->hr_name);
+ }
+ /*
+ * Path to local provider has to be configured at this
+ * point.
+ */
+ if (curres->hr_localpath[0] == '\0') {
+ errx(EX_CONFIG,
+ "path local component not configured for resource %s",
+ curres->hr_name);
+ }
+
+ /* Put it onto resource list. */
+ TAILQ_INSERT_TAIL(&lconfig.hc_resources, curres, hr_next);
+ curres = NULL;
+ }
+ }
+ ;
+
+/*
+ * Resource name: allocate a zeroed resource, store its name and mark all
+ * the remaining settings as not configured yet. Allocation failure and
+ * a name that does not fit into hr_name are fatal.
+ */
+resource_start: STR
+ {
+ /*
+ * Clear those, so we can tell if they were set at
+ * resource-level or not.
+ */
+ depth1_provname[0] = '\0';
+ depth1_localpath[0] = '\0';
+
+ curres = calloc(1, sizeof(*curres));
+ if (curres == NULL) {
+ errx(EX_TEMPFAIL,
+ "cannot allocate memory for resource");
+ }
+ if (strlcpy(curres->hr_name, $1,
+ sizeof(curres->hr_name)) >=
+ sizeof(curres->hr_name)) {
+ errx(EX_CONFIG,
+ "resource name (%s) too long", $1);
+ }
+ curres->hr_role = HAST_ROLE_INIT;
+ curres->hr_previous_role = HAST_ROLE_INIT;
+ curres->hr_replication = -1;
+ curres->hr_provname[0] = '\0';
+ curres->hr_localpath[0] = '\0';
+ curres->hr_localfd = -1;
+ curres->hr_remoteaddr[0] = '\0';
+ curres->hr_ggateunit = -1;
+ }
+ ;
+
+/* A resource block holds a (possibly empty) sequence of resource entries. */
+resource_entries:
+ |
+ resource_entries resource_entry
+ ;
+
+/* Statements allowed directly inside a resource block. */
+resource_entry:
+ replication_statement
+ |
+ name_statement
+ |
+ local_statement
+ |
+ resource_node_statement
+ ;
+
+/*
+ * "name <provider name>"
+ * At depth 1 (resource-level) the name is remembered in depth1_provname.
+ * At depth 2 (node-level inside a resource) it is stored in the current
+ * resource, but only when the node section describes this host (mynode).
+ * A name that does not fit into the destination buffer is a fatal
+ * configuration error.
+ */
+name_statement: NAME STR
+ {
+ switch (depth) {
+ case 1:
+ if (strlcpy(depth1_provname, $2,
+ sizeof(depth1_provname)) >=
+ sizeof(depth1_provname)) {
+ errx(EX_CONFIG, "name argument too long");
+ }
+ break;
+ case 2:
+ if (mynode) {
+ assert(curres != NULL);
+ if (strlcpy(curres->hr_provname, $2,
+ sizeof(curres->hr_provname)) >=
+ sizeof(curres->hr_provname)) {
+ errx(EX_CONFIG,
+ "name argument too long");
+ }
+ }
+ break;
+ default:
+ assert(!"name at wrong depth level");
+ }
+ }
+ ;
+
+/*
+ * "local <path>"
+ * At depth 1 (resource-level) the path is remembered in depth1_localpath.
+ * At depth 2 (node-level inside a resource) it is stored in the current
+ * resource, but only when the node section describes this host (mynode).
+ * A path that does not fit into the destination buffer is a fatal
+ * configuration error.
+ */
+local_statement: LOCAL STR
+ {
+ switch (depth) {
+ case 1:
+ if (strlcpy(depth1_localpath, $2,
+ sizeof(depth1_localpath)) >=
+ sizeof(depth1_localpath)) {
+ errx(EX_CONFIG, "local argument too long");
+ }
+ break;
+ case 2:
+ if (mynode) {
+ assert(curres != NULL);
+ if (strlcpy(curres->hr_localpath, $2,
+ sizeof(curres->hr_localpath)) >=
+ sizeof(curres->hr_localpath)) {
+ errx(EX_CONFIG,
+ "local argument too long");
+ }
+ }
+ break;
+ default:
+ assert(!"local at wrong depth level");
+ }
+ }
+ ;
+
+/*
+ * "on <node> { ... }" inside of a resource.
+ * mynode is set by resource_node_start when the node name refers to this
+ * host and is cleared again once the whole block has been parsed.
+ */
+resource_node_statement:ON resource_node_start OB resource_node_entries CB
+ {
+ mynode = false;
+ }
+ ;
+
+/*
+ * Node name inside a resource: remember whether the block that follows is
+ * about this host. Nothing is matched when there is no current resource.
+ */
+resource_node_start: STR
+ {
+ if (curres != NULL && isitme($1))
+ mynode = true;
+ }
+ ;
+
+/* A node block holds a (possibly empty) sequence of entries. */
+resource_node_entries:
+ |
+ resource_node_entries resource_node_entry
+ ;
+
+/* Statements allowed inside a node block of a resource. */
+resource_node_entry:
+ name_statement
+ |
+ local_statement
+ |
+ remote_statement
+ ;
+
+/*
+ * "remote <address>"
+ * Valid only at depth 2. The address is stored in the current resource
+ * when the enclosing node section describes this host (mynode); an address
+ * that does not fit into hr_remoteaddr is a fatal configuration error.
+ */
+remote_statement: REMOTE STR
+ {
+ assert(depth == 2);
+ if (mynode) {
+ assert(curres != NULL);
+ if (strlcpy(curres->hr_remoteaddr, $2,
+ sizeof(curres->hr_remoteaddr)) >=
+ sizeof(curres->hr_remoteaddr)) {
+ errx(EX_CONFIG, "remote argument too long");
+ }
+ }
+ }
+ ;
diff --git a/sbin/hastd/pjdlog.c b/sbin/hastd/pjdlog.c
new file mode 100644
index 0000000..38c5539
--- /dev/null
+++ b/sbin/hastd/pjdlog.c
@@ -0,0 +1,367 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <assert.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syslog.h>
+
+#include "pjdlog.h"
+
+static int pjdlog_mode = PJDLOG_MODE_STD;
+static int pjdlog_debug_level = 0;
+static char pjdlog_prefix[128];
+
+/*
+ * Configure where the logs should go.
+ * By default they are sent to stdout/stderr, but after going into background
+ * (eg. by calling daemon(3)) application is responsible for changing mode to
+ * PJDLOG_MODE_SYSLOG, so logs will be sent to syslog.
+ * The mode has to be PJDLOG_MODE_STD or PJDLOG_MODE_SYSLOG.
+ */
+void
+pjdlog_mode_set(int mode)
+{
+
+ assert(mode == PJDLOG_MODE_STD || mode == PJDLOG_MODE_SYSLOG);
+
+ pjdlog_mode = mode;
+}
+
+/*
+ * Return current mode (PJDLOG_MODE_STD unless changed with
+ * pjdlog_mode_set()).
+ */
+int
+pjdlog_mode_get(void)
+{
+
+ return (pjdlog_mode);
+}
+
+/*
+ * Set debug level. All the debug logs above the level specified here will
+ * be ignored. The level must not be negative.
+ */
+void
+pjdlog_debug_set(int level)
+{
+
+ assert(level >= 0);
+
+ pjdlog_debug_level = level;
+}
+
+/*
+ * Return current debug level (0 unless changed with pjdlog_debug_set()).
+ */
+int
+pjdlog_debug_get(void)
+{
+
+ return (pjdlog_debug_level);
+}
+
+/*
+ * Set prefix that will be used before each log.
+ * The prefix is given as printf(3)-like format with arguments.
+ * The format must not be NULL; a format that expands to an empty string
+ * removes the prefix.
+ */
+void
+pjdlog_prefix_set(const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	pjdlog_prefix_setv(fmt, args);
+	va_end(args);
+}
+
+/*
+ * Set prefix that will be used before each log (va_list variant).
+ * The format must not be NULL; a format that expands to an empty string
+ * removes the prefix. The formatted prefix is truncated to the size of the
+ * internal buffer.
+ */
+void
+pjdlog_prefix_setv(const char *fmt, va_list ap)
+{
+
+ assert(fmt != NULL);
+
+ vsnprintf(pjdlog_prefix, sizeof(pjdlog_prefix), fmt, ap);
+}
+
+/*
+ * Translate syslog(3) priority into the tag printed in front of messages
+ * sent to stdout/stderr. Aborts on a priority it does not know.
+ */
+static const char *
+pjdlog_level_string(int loglevel)
+{
+	const char *name;
+
+	switch (loglevel) {
+	case LOG_EMERG:
+		name = "EMERG";
+		break;
+	case LOG_ALERT:
+		name = "ALERT";
+		break;
+	case LOG_CRIT:
+		name = "CRIT";
+		break;
+	case LOG_ERR:
+		name = "ERROR";
+		break;
+	case LOG_WARNING:
+		name = "WARNING";
+		break;
+	case LOG_NOTICE:
+		name = "NOTICE";
+		break;
+	case LOG_INFO:
+		name = "INFO";
+		break;
+	case LOG_DEBUG:
+		name = "DEBUG";
+		break;
+	default:
+		assert(!"Invalid log level.");
+		abort();	/* XXX: gcc */
+	}
+	return (name);
+}
+
+/*
+ * Common log routine, variadic front-end of pjdlogv_common().
+ */
+void
+pjdlog_common(int loglevel, int debuglevel, int error, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	pjdlogv_common(loglevel, debuglevel, error, fmt, args);
+	va_end(args);
+}
+
+/*
+ * Common log routine, which can handle regular log level as well as debug
+ * level. We decide here where to send the logs (stdout/stderr or syslog).
+ *
+ * loglevel	one of the syslog(3) LOG_* priorities
+ * debuglevel	verbosity of a LOG_DEBUG message (has to be > 0 for those);
+ *		debug messages above the configured level are dropped
+ * error	errno value whose description is appended to the message,
+ *		or -1 if there is none
+ * fmt, ap	printf(3)-like format and its arguments
+ */
+void
+pjdlogv_common(int loglevel, int debuglevel, int error, const char *fmt,
+    va_list ap)
+{
+
+	assert(loglevel == LOG_EMERG || loglevel == LOG_ALERT ||
+	    loglevel == LOG_CRIT || loglevel == LOG_ERR ||
+	    loglevel == LOG_WARNING || loglevel == LOG_NOTICE ||
+	    loglevel == LOG_INFO || loglevel == LOG_DEBUG);
+	assert(loglevel != LOG_DEBUG || debuglevel > 0);
+	assert(error >= -1);
+
+	/* Ignore debug above configured level. */
+	if (loglevel == LOG_DEBUG && debuglevel > pjdlog_debug_level)
+		return;
+
+	switch (pjdlog_mode) {
+	case PJDLOG_MODE_STD:
+	    {
+		FILE *out;
+
+		/*
+		 * We send errors and warning to stderr and the rest to stdout.
+		 */
+		switch (loglevel) {
+		case LOG_EMERG:
+		case LOG_ALERT:
+		case LOG_CRIT:
+		case LOG_ERR:
+		case LOG_WARNING:
+			out = stderr;
+			break;
+		case LOG_NOTICE:
+		case LOG_INFO:
+		case LOG_DEBUG:
+			out = stdout;
+			break;
+		default:
+			assert(!"Invalid loglevel.");
+			abort();	/* XXX: gcc */
+		}
+
+		fprintf(out, "[%s]", pjdlog_level_string(loglevel));
+		/* Attach debuglevel if this is debug log. */
+		if (loglevel == LOG_DEBUG)
+			fprintf(out, "[%d]", debuglevel);
+		fprintf(out, " ");
+		fprintf(out, "%s", pjdlog_prefix);
+		vfprintf(out, fmt, ap);
+		if (error != -1)
+			fprintf(out, ": %s.", strerror(error));
+		fprintf(out, "\n");
+		break;
+	    }
+	case PJDLOG_MODE_SYSLOG:
+	    {
+		char log[1024];
+		int len;
+
+		/*
+		 * Build "<prefix><message>[: <error>.]" in one buffer.
+		 * len is the running offset into the buffer, so the length
+		 * of the message has to be added to the length of the
+		 * prefix; otherwise the error string would land in the
+		 * middle of the message.
+		 */
+		len = snprintf(log, sizeof(log), "%s", pjdlog_prefix);
+		if ((size_t)len < sizeof(log))
+			len += vsnprintf(log + len, sizeof(log) - len, fmt, ap);
+		if (error != -1 && (size_t)len < sizeof(log)) {
+			(void)snprintf(log + len, sizeof(log) - len, ": %s.",
+			    strerror(error));
+		}
+		syslog(loglevel, "%s", log);
+		break;
+	    }
+	default:
+		assert(!"Invalid mode.");
+	}
+}
+
+/*
+ * Regular logs (va_list variant).
+ * Logs the message at the given level with no debug level and no errno
+ * description attached.
+ */
+void
+pjdlogv(int loglevel, const char *fmt, va_list ap)
+{
+
+ /* LOG_DEBUG is invalid here, pjdlogv?_debug() should be used. */
+ assert(loglevel == LOG_EMERG || loglevel == LOG_ALERT ||
+ loglevel == LOG_CRIT || loglevel == LOG_ERR ||
+ loglevel == LOG_WARNING || loglevel == LOG_NOTICE ||
+ loglevel == LOG_INFO);
+
+ pjdlogv_common(loglevel, 0, -1, fmt, ap);
+}
+
+/*
+ * Regular logs, variadic front-end of pjdlogv().
+ */
+void
+pjdlog(int loglevel, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	pjdlogv(loglevel, fmt, args);
+	va_end(args);
+}
+
+/*
+ * Debug logs (va_list variant).
+ * Logs the message as LOG_DEBUG with the given debug level and no errno
+ * description attached.
+ */
+void
+pjdlogv_debug(int debuglevel, const char *fmt, va_list ap)
+{
+
+ pjdlogv_common(LOG_DEBUG, debuglevel, -1, fmt, ap);
+}
+
+/*
+ * Debug logs, variadic front-end of pjdlogv_debug().
+ */
+void
+pjdlog_debug(int debuglevel, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	pjdlogv_debug(debuglevel, fmt, args);
+	va_end(args);
+}
+
+/*
+ * Error logs with errno logging (va_list variant).
+ * The description of the errno value seen on entry is appended to the
+ * message.
+ */
+void
+pjdlogv_errno(int loglevel, const char *fmt, va_list ap)
+{
+	int error;
+
+	error = errno;
+	pjdlogv_common(loglevel, 0, error, fmt, ap);
+}
+
+/*
+ * Error logs with errno logging, variadic front-end of pjdlogv_errno().
+ */
+void
+pjdlog_errno(int loglevel, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	pjdlogv_errno(loglevel, fmt, args);
+	va_end(args);
+}
+
+/*
+ * Log error, errno and exit (va_list variant).
+ * The message is logged as LOG_ERR with the description of the current
+ * errno attached, then the process exits with the given exit code.
+ */
+void
+pjdlogv_exit(int exitcode, const char *fmt, va_list ap)
+{
+
+ pjdlogv_errno(LOG_ERR, fmt, ap);
+ exit(exitcode);
+}
+
+/*
+ * Log error, errno and exit; variadic front-end of pjdlogv_exit().
+ */
+void
+pjdlog_exit(int exitcode, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	pjdlogv_exit(exitcode, fmt, args);
+	/* NOTREACHED */
+	va_end(args);
+}
+
+/*
+ * Log error and exit (va_list variant).
+ * The message is logged as LOG_ERR without errno description, then the
+ * process exits with the given exit code.
+ */
+void
+pjdlogv_exitx(int exitcode, const char *fmt, va_list ap)
+{
+
+ pjdlogv(LOG_ERR, fmt, ap);
+ exit(exitcode);
+}
+
+/*
+ * Log error and exit; variadic front-end of pjdlogv_exitx().
+ */
+void
+pjdlog_exitx(int exitcode, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	pjdlogv_exitx(exitcode, fmt, args);
+	/* NOTREACHED */
+	va_end(args);
+}
diff --git a/sbin/hastd/pjdlog.h b/sbin/hastd/pjdlog.h
new file mode 100644
index 0000000..2136b12
--- /dev/null
+++ b/sbin/hastd/pjdlog.h
@@ -0,0 +1,88 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _PJDLOG_H_
+#define _PJDLOG_H_
+
+#include <sys/cdefs.h>
+
+#include <stdarg.h>
+#include <sysexits.h>
+#include <syslog.h>
+
+/* Log destinations accepted by pjdlog_mode_set(). */
+#define PJDLOG_MODE_STD 0
+#define PJDLOG_MODE_SYSLOG 1
+
+/* Select and query the log destination. */
+void pjdlog_mode_set(int mode);
+int pjdlog_mode_get(void);
+
+/* Select and query the debug level. */
+void pjdlog_debug_set(int level);
+int pjdlog_debug_get(void);
+
+/* Set the prefix put before each log; the format must not be NULL. */
+void pjdlog_prefix_set(const char *fmt, ...) __printflike(1, 2);
+void pjdlog_prefix_setv(const char *fmt, va_list ap) __printflike(1, 0);
+
+/*
+ * Common log routines: error is an errno value to describe in the log or -1.
+ */
+void pjdlog_common(int loglevel, int debuglevel, int error, const char *fmt,
+ ...) __printflike(4, 5);
+void pjdlogv_common(int loglevel, int debuglevel, int error, const char *fmt,
+ va_list ap) __printflike(4, 0);
+
+/* Regular logs; LOG_DEBUG is not a valid level for those. */
+void pjdlog(int loglevel, const char *fmt, ...) __printflike(2, 3);
+void pjdlogv(int loglevel, const char *fmt, va_list ap) __printflike(2, 0);
+
+/* Shortcuts for regular logs, one pair for every log level. */
+#define pjdlogv_emergency(fmt, ap) pjdlogv(LOG_EMERG, (fmt), (ap))
+#define pjdlog_emergency(...) pjdlog(LOG_EMERG, __VA_ARGS__)
+#define pjdlogv_alert(fmt, ap) pjdlogv(LOG_ALERT, (fmt), (ap))
+#define pjdlog_alert(...) pjdlog(LOG_ALERT, __VA_ARGS__)
+#define pjdlogv_critical(fmt, ap) pjdlogv(LOG_CRIT, (fmt), (ap))
+#define pjdlog_critical(...) pjdlog(LOG_CRIT, __VA_ARGS__)
+#define pjdlogv_error(fmt, ap) pjdlogv(LOG_ERR, (fmt), (ap))
+#define pjdlog_error(...) pjdlog(LOG_ERR, __VA_ARGS__)
+#define pjdlogv_warning(fmt, ap) pjdlogv(LOG_WARNING, (fmt), (ap))
+#define pjdlog_warning(...) pjdlog(LOG_WARNING, __VA_ARGS__)
+#define pjdlogv_notice(fmt, ap) pjdlogv(LOG_NOTICE, (fmt), (ap))
+#define pjdlog_notice(...) pjdlog(LOG_NOTICE, __VA_ARGS__)
+#define pjdlogv_info(fmt, ap) pjdlogv(LOG_INFO, (fmt), (ap))
+#define pjdlog_info(...) pjdlog(LOG_INFO, __VA_ARGS__)
+
+/* Debug logs; ignored when debuglevel is above the configured level. */
+void pjdlog_debug(int debuglevel, const char *fmt, ...) __printflike(2, 3);
+void pjdlogv_debug(int debuglevel, const char *fmt, va_list ap) __printflike(2, 0);
+
+/* Logs with description of the current errno attached. */
+void pjdlog_errno(int loglevel, const char *fmt, ...) __printflike(2, 3);
+void pjdlogv_errno(int loglevel, const char *fmt, va_list ap) __printflike(2, 0);
+
+/* Log error with errno description and exit with the given exit code. */
+void pjdlog_exit(int exitcode, const char *fmt, ...) __printflike(2, 3) __dead2;
+void pjdlogv_exit(int exitcode, const char *fmt, va_list ap) __printflike(2, 0) __dead2;
+
+/* Log error without errno description and exit with the given exit code. */
+void pjdlog_exitx(int exitcode, const char *fmt, ...) __printflike(2, 3) __dead2;
+void pjdlogv_exitx(int exitcode, const char *fmt, va_list ap) __printflike(2, 0) __dead2;
+
+#endif /* !_PJDLOG_H_ */
diff --git a/sbin/hastd/primary.c b/sbin/hastd/primary.c
new file mode 100644
index 0000000..ed6e91c
--- /dev/null
+++ b/sbin/hastd/primary.c
@@ -0,0 +1,1769 @@
+/*-
+ * Copyright (c) 2009 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/bio.h>
+#include <sys/disk.h>
+#include <sys/refcount.h>
+#include <sys/stat.h>
+
+#include <geom/gate/g_gate.h>
+
+#include <assert.h>
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libgeom.h>
+#include <pthread.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include <activemap.h>
+#include <nv.h>
+#include <rangelock.h>
+
+#include "control.h"
+#include "hast.h"
+#include "hast_proto.h"
+#include "hastd.h"
+#include "metadata.h"
+#include "proto.h"
+#include "pjdlog.h"
+#include "subr.h"
+#include "synch.h"
+
+/*
+ * Single I/O request as it travels between the lists below.
+ */
+struct hio {
+ /*
+ * Number of components we are still waiting for.
+ * When this field goes to 0, we can send the request back to the
+ * kernel. Each component has to decrease this counter by one
+ * even on failure.
+ */
+ unsigned int hio_countdown;
+ /*
+ * Each component has a place to store its own error.
+ * Once the request is handled by all components we can decide if the
+ * request overall is successful or not.
+ */
+ int *hio_errors;
+ /*
+ * Structure used to communicate with GEOM Gate class.
+ */
+ struct g_gate_ctl_io hio_ggio;
+ /*
+ * List linkage. The per-component lists index it by component
+ * number.
+ */
+ TAILQ_ENTRY(hio) *hio_next;
+};
+/* The free and done lists are not per-component; they use entry 0. */
+#define hio_free_next hio_next[0]
+#define hio_done_next hio_next[0]
+
+/*
+ * Free list holds unused structures. When free list is empty, we have to wait
+ * until some in-progress requests are freed.
+ */
+static TAILQ_HEAD(, hio) hio_free_list;
+static pthread_mutex_t hio_free_list_lock;
+static pthread_cond_t hio_free_list_cond;
+/*
+ * There is one send list for every component. Each request is placed on all
+ * send lists - each component gets the same request, but each component is
+ * responsible for managing its own send list.
+ */
+static TAILQ_HEAD(, hio) *hio_send_list;
+static pthread_mutex_t *hio_send_list_lock;
+static pthread_cond_t *hio_send_list_cond;
+/*
+ * There is one recv list for every component, although local components don't
+ * use recv lists as local requests are done synchronously.
+ */
+static TAILQ_HEAD(, hio) *hio_recv_list;
+static pthread_mutex_t *hio_recv_list_lock;
+static pthread_cond_t *hio_recv_list_cond;
+/*
+ * Request is placed on done list by the slowest component (the one that
+ * decreased hio_countdown from 1 to 0).
+ */
+static TAILQ_HEAD(, hio) hio_done_list;
+static pthread_mutex_t hio_done_list_lock;
+static pthread_cond_t hio_done_list_cond;
+/*
+ * The structures below are used for interaction with the sync thread.
+ */
+static bool sync_inprogress;
+static pthread_mutex_t sync_lock;
+static pthread_cond_t sync_cond;
+/*
+ * The lock below is used to synchronize access to remote connections.
+ */
+static pthread_rwlock_t *hio_remote_lock;
+static pthread_mutex_t hio_guard_lock;
+static pthread_cond_t hio_guard_cond;
+
+/*
+ * Lock to synchronize metadata updates. Also synchronize access to
+ * hr_primary_localcnt and hr_primary_remotecnt fields.
+ */
+static pthread_mutex_t metadata_lock;
+
+/*
+ * Maximum number of outstanding I/O requests.
+ */
+#define HAST_HIO_MAX 256
+/*
+ * Number of components. At this point there are only two components: local
+ * and remote, but in the future it might be possible to use multiple local
+ * and remote components.
+ */
+#define HAST_NCOMPONENTS 2
+/*
+ * Number of seconds to sleep before next reconnect try.
+ */
+#define RECONNECT_SLEEP 5
+
+/*
+ * True when both the incoming and the outgoing remote connection of the
+ * resource are set up. The component number (no) is currently not used.
+ */
+#define ISCONNECTED(res, no) \
+ ((res)->hr_remotein != NULL && (res)->hr_remoteout != NULL)
+
+/*
+ * Append the request to the tail of the per-component list
+ * hio_<name>_list[ncomp] while holding the list's lock. The list's condition
+ * variable is signalled only if the list was empty before the insertion.
+ * Every use of the ncomp argument is parenthesized, so an expression can be
+ * passed safely.
+ */
+#define	QUEUE_INSERT1(hio, name, ncomp)	do {				\
+	bool _wakeup;							\
+									\
+	mtx_lock(&hio_##name##_list_lock[(ncomp)]);			\
+	_wakeup = TAILQ_EMPTY(&hio_##name##_list[(ncomp)]);		\
+	TAILQ_INSERT_TAIL(&hio_##name##_list[(ncomp)], (hio),		\
+	    hio_next[(ncomp)]);						\
+	mtx_unlock(&hio_##name##_list_lock[(ncomp)]);			\
+	if (_wakeup)							\
+		cv_signal(&hio_##name##_list_cond[(ncomp)]);		\
+} while (0)
+/*
+ * Append the request to the tail of the global list hio_<name>_list while
+ * holding the list's lock. The list's condition variable is signalled only
+ * if the list was empty before the insertion.
+ */
+#define QUEUE_INSERT2(hio, name) do { \
+ bool _wakeup; \
+ \
+ mtx_lock(&hio_##name##_list_lock); \
+ _wakeup = TAILQ_EMPTY(&hio_##name##_list); \
+ TAILQ_INSERT_TAIL(&hio_##name##_list, (hio), hio_##name##_next);\
+ mtx_unlock(&hio_##name##_list_lock); \
+ if (_wakeup) \
+ cv_signal(&hio_##name##_list_cond); \
+} while (0)
+/*
+ * Remove the first request from the per-component list
+ * hio_<name>_list[ncomp] and store it in hio. Waits on the list's condition
+ * variable for as long as the list is empty.
+ */
+#define QUEUE_TAKE1(hio, name, ncomp) do { \
+ mtx_lock(&hio_##name##_list_lock[(ncomp)]); \
+ while (((hio) = TAILQ_FIRST(&hio_##name##_list[(ncomp)])) == NULL) { \
+ cv_wait(&hio_##name##_list_cond[(ncomp)], \
+ &hio_##name##_list_lock[(ncomp)]); \
+ } \
+ TAILQ_REMOVE(&hio_##name##_list[(ncomp)], (hio), \
+ hio_next[(ncomp)]); \
+ mtx_unlock(&hio_##name##_list_lock[(ncomp)]); \
+} while (0)
+/*
+ * Remove the first request from the global list hio_<name>_list and store
+ * it in hio. Waits on the list's condition variable for as long as the list
+ * is empty.
+ */
+#define QUEUE_TAKE2(hio, name) do { \
+ mtx_lock(&hio_##name##_list_lock); \
+ while (((hio) = TAILQ_FIRST(&hio_##name##_list)) == NULL) { \
+ cv_wait(&hio_##name##_list_cond, \
+ &hio_##name##_list_lock); \
+ } \
+ TAILQ_REMOVE(&hio_##name##_list, (hio), hio_##name##_next); \
+ mtx_unlock(&hio_##name##_list_lock); \
+} while (0)
+
+/*
+ * Synchronization requests are told apart from regular ones by special
+ * values kept in the gctl_unit field of the request: -1 marks a
+ * synchronization request, -2 marks one that is done.
+ */
+#define SYNCREQ(hio) do { (hio)->hio_ggio.gctl_unit = -1; } while (0)
+#define ISSYNCREQ(hio) ((hio)->hio_ggio.gctl_unit == -1)
+#define SYNCREQDONE(hio) do { (hio)->hio_ggio.gctl_unit = -2; } while (0)
+#define ISSYNCREQDONE(hio) ((hio)->hio_ggio.gctl_unit == -2)
+
+static struct hast_resource *gres;
+
+static pthread_mutex_t range_lock;
+static struct rangelocks *range_regular;
+static bool range_regular_wait;
+static pthread_cond_t range_regular_cond;
+static struct rangelocks *range_sync;
+static bool range_sync_wait;
+static pthread_cond_t range_sync_cond;
+
+static void *ggate_recv_thread(void *arg);
+static void *local_send_thread(void *arg);
+static void *remote_send_thread(void *arg);
+static void *remote_recv_thread(void *arg);
+static void *ggate_send_thread(void *arg);
+static void *sync_thread(void *arg);
+static void *guard_thread(void *arg);
+
+static void sighandler(int sig);
+
+/*
+ * Release what the primary worker holds for the given resource: close the
+ * local descriptor and destroy the ggate provider, if one was created.
+ * errno is preserved, so the function can be called on error paths.
+ */
+static void
+cleanup(struct hast_resource *res)
+{
+	struct g_gate_ctl_destroy ggiod;
+	int saved_errno;
+
+	saved_errno = errno;
+
+	/*
+	 * Close descriptor to /dev/hast/<name>
+	 * to work-around race in the kernel.
+	 */
+	close(res->hr_localfd);
+
+	/* Destroy ggate provider if we created one. */
+	if (res->hr_ggateunit >= 0) {
+		ggiod.gctl_version = G_GATE_VERSION;
+		ggiod.gctl_unit = res->hr_ggateunit;
+		ggiod.gctl_force = 1;
+		if (ioctl(res->hr_ggatefd, G_GATE_CMD_DESTROY, &ggiod) < 0) {
+			pjdlog_warning("Unable to destroy hast/%s device",
+			    res->hr_provname);
+		}
+		res->hr_ggateunit = -1;
+	}
+
+	errno = saved_errno;
+}
+
+/*
+ * Log the message as an error together with the description of errno,
+ * clean up after the global resource and exit with the given code.
+ * The exit code must not be EX_OK.
+ */
+static void
+primary_exit(int exitcode, const char *fmt, ...)
+{
+	va_list args;
+
+	assert(exitcode != EX_OK);
+
+	va_start(args, fmt);
+	pjdlogv_errno(LOG_ERR, fmt, args);
+	va_end(args);
+
+	cleanup(gres);
+	exit(exitcode);
+}
+
+/*
+ * Log the message (as info when exiting with EX_OK, as an error otherwise),
+ * clean up after the global resource and exit with the given code.
+ * No errno description is attached to the message.
+ */
+static void
+primary_exitx(int exitcode, const char *fmt, ...)
+{
+	va_list args;
+	int loglevel;
+
+	if (exitcode == EX_OK)
+		loglevel = LOG_INFO;
+	else
+		loglevel = LOG_ERR;
+
+	va_start(args, fmt);
+	pjdlogv(loglevel, fmt, args);
+	va_end(args);
+
+	cleanup(gres);
+	exit(exitcode);
+}
+
+/*
+ * Write the activemap bitmap of the resource to its local descriptor at
+ * offset METADATA_SIZE.
+ * Returns 0 on success. On a failed or short write the problem is logged
+ * and -1 is returned.
+ */
+static int
+hast_activemap_flush(struct hast_resource *res)
+{
+	const unsigned char *buf;
+	size_t size;
+	ssize_t done;
+
+	buf = activemap_bitmap(res->hr_amp, &size);
+	assert(buf != NULL);
+	assert((size % res->hr_local_sectorsize) == 0);
+
+	done = pwrite(res->hr_localfd, buf, size, METADATA_SIZE);
+	if (done == (ssize_t)size)
+		return (0);
+
+	KEEP_ERRNO(pjdlog_errno(LOG_ERR,
+	    "Unable to flush activemap to disk"));
+	return (-1);
+}
+
+/*
+ * Allocate 'size' bytes for the object described by 'what'; on failure log
+ * the problem and exit with EX_TEMPFAIL through primary_exitx().
+ */
+static void *
+init_alloc(size_t size, const char *what)
+{
+	void *mem;
+
+	mem = malloc(size);
+	if (mem == NULL) {
+		primary_exitx(EX_TEMPFAIL,
+		    "Unable to allocate %zu bytes of memory for %s.",
+		    size, what);
+	}
+	return (mem);
+}
+
+/*
+ * Set up the process-wide state used by the worker threads: per-component
+ * send/recv queues with their locks and condition variables, the free and
+ * done queues, the guard and metadata locks, and a pool of HAST_HIO_MAX
+ * preallocated requests placed on the free queue.  Finally install the
+ * SIGINT/SIGTERM handler.
+ *
+ * Any allocation failure terminates the process with EX_TEMPFAIL.
+ */
+static void
+init_environment(struct hast_resource *res __unused)
+{
+	struct hio *hio;
+	unsigned int ii, ncomps;
+
+	/*
+	 * In the future it might be per-resource value.
+	 */
+	ncomps = HAST_NCOMPONENTS;
+
+	/*
+	 * Allocate memory needed by lists.
+	 */
+	hio_send_list = init_alloc(sizeof(hio_send_list[0]) * ncomps,
+	    "send lists");
+	hio_send_list_lock = init_alloc(sizeof(hio_send_list_lock[0]) * ncomps,
+	    "send list locks");
+	hio_send_list_cond = init_alloc(sizeof(hio_send_list_cond[0]) * ncomps,
+	    "send list condition variables");
+	hio_recv_list = init_alloc(sizeof(hio_recv_list[0]) * ncomps,
+	    "recv lists");
+	hio_recv_list_lock = init_alloc(sizeof(hio_recv_list_lock[0]) * ncomps,
+	    "recv list locks");
+	hio_recv_list_cond = init_alloc(sizeof(hio_recv_list_cond[0]) * ncomps,
+	    "recv list condition variables");
+	hio_remote_lock = init_alloc(sizeof(hio_remote_lock[0]) * ncomps,
+	    "remote connections locks");
+
+	/*
+	 * Initialize lists, their locks and their condition variables.
+	 */
+	TAILQ_INIT(&hio_free_list);
+	mtx_init(&hio_free_list_lock);
+	cv_init(&hio_free_list_cond);
+	/* Iterate over exactly as many components as were allocated above. */
+	for (ii = 0; ii < ncomps; ii++) {
+		TAILQ_INIT(&hio_send_list[ii]);
+		mtx_init(&hio_send_list_lock[ii]);
+		cv_init(&hio_send_list_cond[ii]);
+		TAILQ_INIT(&hio_recv_list[ii]);
+		mtx_init(&hio_recv_list_lock[ii]);
+		cv_init(&hio_recv_list_cond[ii]);
+		rw_init(&hio_remote_lock[ii]);
+	}
+	TAILQ_INIT(&hio_done_list);
+	mtx_init(&hio_done_list_lock);
+	cv_init(&hio_done_list_cond);
+	mtx_init(&hio_guard_lock);
+	cv_init(&hio_guard_cond);
+	mtx_init(&metadata_lock);
+
+	/*
+	 * Allocate requests pool and initialize requests.
+	 */
+	for (ii = 0; ii < HAST_HIO_MAX; ii++) {
+		hio = init_alloc(sizeof(*hio), "hio request");
+		hio->hio_countdown = 0;
+		hio->hio_errors = init_alloc(
+		    sizeof(hio->hio_errors[0]) * ncomps, "hio errors");
+		hio->hio_next = init_alloc(
+		    sizeof(hio->hio_next[0]) * ncomps, "hio_next field");
+		hio->hio_ggio.gctl_version = G_GATE_VERSION;
+		/*
+		 * The size reaches the "%zu" conversion as a size_t because
+		 * it is passed through init_alloc()'s size_t parameter.
+		 */
+		hio->hio_ggio.gctl_data = init_alloc(MAXPHYS, "gctl_data");
+		hio->hio_ggio.gctl_length = MAXPHYS;
+		hio->hio_ggio.gctl_error = 0;
+		TAILQ_INSERT_HEAD(&hio_free_list, hio, hio_free_next);
+	}
+
+	/*
+	 * Turn on signals handling.
+	 */
+	signal(SIGINT, sighandler);
+	signal(SIGTERM, sighandler);
+}
+
+/*
+ * Prepare the local component: read metadata, create the activemap and the
+ * regular/sync range locks, and load the on-disk activemap stored at offset
+ * METADATA_SIZE.  When the provider is used for the first time
+ * (hr_resuid == 0) a random resource identifier is generated, the primary
+ * counters are initialized and the metadata is written back.
+ *
+ * Every failure terminates the process.
+ */
+static void
+init_local(struct hast_resource *res)
+{
+	unsigned char *buf;
+	size_t mapsize;
+
+	if (metadata_read(res, true) < 0)
+		exit(EX_NOINPUT);
+	mtx_init(&res->hr_amp_lock);
+	if (activemap_init(&res->hr_amp, res->hr_datasize, res->hr_extentsize,
+	    res->hr_local_sectorsize, res->hr_keepdirty) < 0) {
+		primary_exit(EX_TEMPFAIL, "Unable to create activemap");
+	}
+	mtx_init(&range_lock);
+	cv_init(&range_regular_cond);
+	if (rangelock_init(&range_regular) < 0)
+		primary_exit(EX_TEMPFAIL, "Unable to create regular range lock");
+	cv_init(&range_sync_cond);
+	if (rangelock_init(&range_sync) < 0)
+		primary_exit(EX_TEMPFAIL, "Unable to create sync range lock");
+	mapsize = activemap_ondisk_size(res->hr_amp);
+	buf = calloc(1, mapsize);
+	if (buf == NULL) {
+		primary_exitx(EX_TEMPFAIL,
+		    "Unable to allocate buffer for activemap.");
+	}
+	if (pread(res->hr_localfd, buf, mapsize, METADATA_SIZE) !=
+	    (ssize_t)mapsize) {
+		primary_exit(EX_NOINPUT, "Unable to read activemap");
+	}
+	activemap_copyin(res->hr_amp, buf, mapsize);
+	/*
+	 * The temporary buffer is not referenced again in this function;
+	 * release it instead of leaking it for the worker's lifetime.
+	 * NOTE(review): assumes activemap_copyin() copies the data rather
+	 * than keeping the pointer - confirm against activemap.c.
+	 */
+	free(buf);
+	if (res->hr_resuid != 0)
+		return;
+	/*
+	 * We're using provider for the first time, so we have to generate
+	 * resource unique identifier and initialize local and remote counts.
+	 */
+	arc4random_buf(&res->hr_resuid, sizeof(res->hr_resuid));
+	res->hr_primary_localcnt = 1;
+	res->hr_primary_remotecnt = 0;
+	if (metadata_write(res) < 0)
+		exit(EX_NOINPUT);
+}
+
+/*
+ * Try to connect to the secondary node and perform the two-step handshake.
+ *
+ * Step one opens the outgoing connection, sends the resource name and
+ * receives a token.  Step two opens the incoming connection, sends the
+ * token together with resuid/localcnt/remotecnt, and receives (on the
+ * outgoing connection) the remote data size, extent size, counters, sync
+ * source and, optionally, the remote activemap, which is merged into the
+ * local one and flushed to disk.  On success synchronization is requested
+ * by setting sync_inprogress and signalling sync_cond.
+ *
+ * Failure to create the outgoing connection object is fatal.  Every other
+ * failure is logged and both connections are closed, leaving the resource
+ * disconnected.
+ */
+static void
+init_remote(struct hast_resource *res)
+{
+	struct nv *nvout, *nvin;
+	const unsigned char *token;
+	unsigned char *map;
+	const char *errmsg;
+	int32_t extentsize;
+	int64_t datasize;
+	uint32_t mapsize;
+	size_t size;
+
+	/* Prepare outgoing connection with remote node. */
+	if (proto_client(res->hr_remoteaddr, &res->hr_remoteout) < 0) {
+		primary_exit(EX_OSERR, "Unable to create connection to %s",
+		    res->hr_remoteaddr);
+	}
+	/* Try to connect, but accept failure. */
+	if (proto_connect(res->hr_remoteout) < 0) {
+		pjdlog_errno(LOG_WARNING, "Unable to connect to %s",
+		    res->hr_remoteaddr);
+		goto close;
+	}
+	/*
+	 * First handshake step.
+	 * Setup outgoing connection with remote node.
+	 */
+	nvout = nv_alloc();
+	nv_add_string(nvout, res->hr_name, "resource");
+	if (nv_error(nvout) != 0) {
+		pjdlog_common(LOG_WARNING, 0, nv_error(nvout),
+		    "Unable to allocate header for connection with %s",
+		    res->hr_remoteaddr);
+		nv_free(nvout);
+		goto close;
+	}
+	if (hast_proto_send(res, res->hr_remoteout, nvout, NULL, 0) < 0) {
+		pjdlog_errno(LOG_WARNING,
+		    "Unable to send handshake header to %s",
+		    res->hr_remoteaddr);
+		nv_free(nvout);
+		goto close;
+	}
+	nv_free(nvout);
+	if (hast_proto_recv_hdr(res->hr_remoteout, &nvin) < 0) {
+		pjdlog_errno(LOG_WARNING,
+		    "Unable to receive handshake header from %s",
+		    res->hr_remoteaddr);
+		goto close;
+	}
+	/* The remote side reports handshake problems via "errmsg". */
+	errmsg = nv_get_string(nvin, "errmsg");
+	if (errmsg != NULL) {
+		pjdlog_warning("%s", errmsg);
+		nv_free(nvin);
+		goto close;
+	}
+	token = nv_get_uint8_array(nvin, &size, "token");
+	if (token == NULL) {
+		pjdlog_warning("Handshake header from %s has no 'token' field.",
+		    res->hr_remoteaddr);
+		nv_free(nvin);
+		goto close;
+	}
+	if (size != sizeof(res->hr_token)) {
+		pjdlog_warning("Handshake header from %s contains 'token' of wrong size (got %zu, expected %zu).",
+		    res->hr_remoteaddr, size, sizeof(res->hr_token));
+		nv_free(nvin);
+		goto close;
+	}
+	/* Copy the token out before the header that owns it is freed. */
+	bcopy(token, res->hr_token, sizeof(res->hr_token));
+	nv_free(nvin);
+
+	/*
+	 * Second handshake step.
+	 * Setup incoming connection with remote node.
+	 */
+	if (proto_client(res->hr_remoteaddr, &res->hr_remotein) < 0) {
+		pjdlog_errno(LOG_WARNING, "Unable to create connection to %s",
+		    res->hr_remoteaddr);
+		/*
+		 * There is no connection object to connect with, so give
+		 * up on this attempt instead of passing an unusable
+		 * connection to proto_connect().
+		 */
+		res->hr_remotein = NULL;
+		goto close;
+	}
+	/* Try to connect, but accept failure. */
+	if (proto_connect(res->hr_remotein) < 0) {
+		pjdlog_errno(LOG_WARNING, "Unable to connect to %s",
+		    res->hr_remoteaddr);
+		goto close;
+	}
+	nvout = nv_alloc();
+	nv_add_string(nvout, res->hr_name, "resource");
+	nv_add_uint8_array(nvout, res->hr_token, sizeof(res->hr_token),
+	    "token");
+	nv_add_uint64(nvout, res->hr_resuid, "resuid");
+	nv_add_uint64(nvout, res->hr_primary_localcnt, "localcnt");
+	nv_add_uint64(nvout, res->hr_primary_remotecnt, "remotecnt");
+	if (nv_error(nvout) != 0) {
+		pjdlog_common(LOG_WARNING, 0, nv_error(nvout),
+		    "Unable to allocate header for connection with %s",
+		    res->hr_remoteaddr);
+		nv_free(nvout);
+		goto close;
+	}
+	if (hast_proto_send(res, res->hr_remotein, nvout, NULL, 0) < 0) {
+		pjdlog_errno(LOG_WARNING,
+		    "Unable to send handshake header to %s",
+		    res->hr_remoteaddr);
+		nv_free(nvout);
+		goto close;
+	}
+	nv_free(nvout);
+	/* The reply to the second step is read from the outgoing connection. */
+	if (hast_proto_recv_hdr(res->hr_remoteout, &nvin) < 0) {
+		pjdlog_errno(LOG_WARNING,
+		    "Unable to receive handshake header from %s",
+		    res->hr_remoteaddr);
+		goto close;
+	}
+	errmsg = nv_get_string(nvin, "errmsg");
+	if (errmsg != NULL) {
+		pjdlog_warning("%s", errmsg);
+		nv_free(nvin);
+		goto close;
+	}
+	datasize = nv_get_int64(nvin, "datasize");
+	if (datasize != res->hr_datasize) {
+		pjdlog_warning("Data size differs between nodes (local=%jd, remote=%jd).",
+		    (intmax_t)res->hr_datasize, (intmax_t)datasize);
+		nv_free(nvin);
+		goto close;
+	}
+	extentsize = nv_get_int32(nvin, "extentsize");
+	if (extentsize != res->hr_extentsize) {
+		pjdlog_warning("Extent size differs between nodes (local=%zd, remote=%zd).",
+		    (ssize_t)res->hr_extentsize, (ssize_t)extentsize);
+		nv_free(nvin);
+		goto close;
+	}
+	res->hr_secondary_localcnt = nv_get_uint64(nvin, "localcnt");
+	res->hr_secondary_remotecnt = nv_get_uint64(nvin, "remotecnt");
+	res->hr_syncsrc = nv_get_uint8(nvin, "syncsrc");
+	map = NULL;
+	mapsize = nv_get_uint32(nvin, "mapsize");
+	if (mapsize > 0) {
+		map = malloc(mapsize);
+		if (map == NULL) {
+			pjdlog_error("Unable to allocate memory for remote activemap (mapsize=%ju).",
+			    (uintmax_t)mapsize);
+			nv_free(nvin);
+			goto close;
+		}
+		/*
+		 * Remote node has some dirty extents of its own, let's
+		 * download its activemap.
+		 */
+		if (hast_proto_recv_data(res, res->hr_remoteout, nvin, map,
+		    mapsize) < 0) {
+			pjdlog_errno(LOG_ERR,
+			    "Unable to receive remote activemap");
+			nv_free(nvin);
+			free(map);
+			goto close;
+		}
+		/*
+		 * Merge local and remote bitmaps.
+		 */
+		activemap_merge(res->hr_amp, map, mapsize);
+		free(map);
+		/*
+		 * Now that we merged bitmaps from both nodes, flush it to the
+		 * disk before we start to synchronize.
+		 */
+		(void)hast_activemap_flush(res);
+	}
+	pjdlog_info("Connected to %s.", res->hr_remoteaddr);
+	/* Ask the sync thread to start working. */
+	mtx_lock(&sync_lock);
+	sync_inprogress = true;
+	mtx_unlock(&sync_lock);
+	cv_signal(&sync_cond);
+	return;
+close:
+	/* The outgoing connection always exists when we get here. */
+	proto_close(res->hr_remoteout);
+	res->hr_remoteout = NULL;
+	if (res->hr_remotein != NULL) {
+		proto_close(res->hr_remotein);
+		res->hr_remotein = NULL;
+	}
+}
+
+/*
+ * Open the ggate control device and make the hast/<provname> provider
+ * available.  A new provider is created first; if one with that name
+ * already exists (EEXIST) it is taken over with G_GATE_CMD_CANCEL.
+ * On success hr_ggatefd and hr_ggateunit are set; any other failure
+ * terminates the process.
+ */
+static void
+init_ggate(struct hast_resource *res)
+{
+	struct g_gate_ctl_create create_req;
+	struct g_gate_ctl_cancel cancel_req;
+
+	/*
+	 * All communication with ggate goes through /dev/ggctl.
+	 */
+	res->hr_ggatefd = open("/dev/" G_GATE_CTL_NAME, O_RDWR);
+	if (res->hr_ggatefd < 0)
+		primary_exit(EX_OSFILE, "Unable to open /dev/" G_GATE_CTL_NAME);
+
+	/*
+	 * Create provider before trying to connect, as connection failure
+	 * is not critical, but may take some time.
+	 */
+	create_req.gctl_version = G_GATE_VERSION;
+	create_req.gctl_unit = G_GATE_NAME_GIVEN;
+	create_req.gctl_mediasize = res->hr_datasize;
+	create_req.gctl_sectorsize = res->hr_local_sectorsize;
+	create_req.gctl_flags = 0;
+	create_req.gctl_maxcount = 128;
+	create_req.gctl_timeout = 0;
+	snprintf(create_req.gctl_name, sizeof(create_req.gctl_name), "hast/%s",
+	    res->hr_provname);
+	bzero(create_req.gctl_info, sizeof(create_req.gctl_info));
+	if (ioctl(res->hr_ggatefd, G_GATE_CMD_CREATE, &create_req) == 0) {
+		pjdlog_info("Device hast/%s created.", res->hr_provname);
+		res->hr_ggateunit = create_req.gctl_unit;
+		return;
+	}
+	/* Only an already existing provider can be recovered from. */
+	if (errno != EEXIST) {
+		primary_exit(EX_OSERR, "Unable to create hast/%s device",
+		    res->hr_provname);
+	}
+	pjdlog_debug(1,
+	    "Device hast/%s already exists, we will try to take it over.",
+	    res->hr_provname);
+
+	/*
+	 * If we received EEXIST, we assume that the process that created the
+	 * provider died and didn't clean up. In that case we will start from
+	 * where it left off.
+	 */
+	cancel_req.gctl_version = G_GATE_VERSION;
+	cancel_req.gctl_unit = G_GATE_NAME_GIVEN;
+	snprintf(cancel_req.gctl_name, sizeof(cancel_req.gctl_name), "hast/%s",
+	    res->hr_provname);
+	if (ioctl(res->hr_ggatefd, G_GATE_CMD_CANCEL, &cancel_req) == 0) {
+		pjdlog_info("Device hast/%s recovered.", res->hr_provname);
+		res->hr_ggateunit = cancel_req.gctl_unit;
+		return;
+	}
+	primary_exit(EX_OSERR, "Unable to take over hast/%s device",
+	    res->hr_provname);
+}
+
+/*
+ * Start the primary worker for 'res'.
+ *
+ * A control socketpair is created and the process forks.  The parent
+ * records the worker PID and returns.  The child initializes the local
+ * component, the remote connection, the ggate provider and the thread
+ * environment, starts the worker threads and then runs guard_thread() in
+ * the calling thread.
+ */
+void
+hastd_primary(struct hast_resource *res)
+{
+	/* Worker threads, in the order in which they are started. */
+	static void *(* const workers[])(void *) = {
+		ggate_recv_thread,
+		local_send_thread,
+		remote_send_thread,
+		remote_recv_thread,
+		ggate_send_thread,
+		sync_thread,
+		ctrl_thread
+	};
+	pthread_t td;
+	pid_t pid;
+	size_t widx;
+	int error;
+
+	gres = res;
+
+	/*
+	 * Create communication channel between parent and child.
+	 */
+	if (proto_client("socketpair://", &res->hr_ctrl) < 0) {
+		KEEP_ERRNO((void)pidfile_remove(pfh));
+		primary_exit(EX_OSERR,
+		    "Unable to create control sockets between parent and child");
+	}
+
+	pid = fork();
+	if (pid < 0) {
+		KEEP_ERRNO((void)pidfile_remove(pfh));
+		primary_exit(EX_OSERR, "Unable to fork");
+	}
+	if (pid > 0) {
+		/* This is parent. */
+		res->hr_workerpid = pid;
+		return;
+	}
+
+	/* From here on we are the child (worker) process. */
+	(void)pidfile_close(pfh);
+
+	setproctitle("%s (primary)", res->hr_name);
+
+	init_local(res);
+	init_remote(res);
+	init_ggate(res);
+	init_environment(res);
+
+	for (widx = 0; widx < sizeof(workers) / sizeof(workers[0]); widx++) {
+		error = pthread_create(&td, NULL, workers[widx], res);
+		assert(error == 0);
+	}
+	/* The guard runs in the calling thread. */
+	(void)guard_thread(res);
+}
+
+/*
+ * Log a message about a ggate request.  The printf-style prefix built from
+ * 'fmt' is followed by a description of the request command (READ, WRITE
+ * and DELETE with offset and length, FLUSH, or UNKNOWN with the raw command
+ * number).  The description is appended only when the prefix fits in the
+ * message buffer.
+ */
+static void
+reqlog(int loglevel, int debuglevel, struct g_gate_ctl_io *ggio, const char *fmt, ...)
+{
+	char msg[1024];
+	char *tail;
+	size_t room;
+	va_list args;
+	int used;
+
+	va_start(args, fmt);
+	used = vsnprintf(msg, sizeof(msg), fmt, args);
+	va_end(args);
+
+	if ((size_t)used >= sizeof(msg)) {
+		/* No room left for the request description. */
+		pjdlog_common(loglevel, debuglevel, -1, "%s", msg);
+		return;
+	}
+
+	tail = msg + used;
+	room = sizeof(msg) - used;
+	switch (ggio->gctl_cmd) {
+	case BIO_READ:
+		(void)snprintf(tail, room,
+		    "READ(%ju, %ju).", (uintmax_t)ggio->gctl_offset,
+		    (uintmax_t)ggio->gctl_length);
+		break;
+	case BIO_DELETE:
+		(void)snprintf(tail, room,
+		    "DELETE(%ju, %ju).", (uintmax_t)ggio->gctl_offset,
+		    (uintmax_t)ggio->gctl_length);
+		break;
+	case BIO_FLUSH:
+		(void)snprintf(tail, room, "FLUSH.");
+		break;
+	case BIO_WRITE:
+		(void)snprintf(tail, room,
+		    "WRITE(%ju, %ju).", (uintmax_t)ggio->gctl_offset,
+		    (uintmax_t)ggio->gctl_length);
+		break;
+	default:
+		(void)snprintf(tail, room,
+		    "UNKNOWN(%u).", (unsigned int)ggio->gctl_cmd);
+		break;
+	}
+	pjdlog_common(loglevel, debuglevel, -1, "%s", msg);
+}
+
+/*
+ * Close both connections to the remote node for component 'ncomp', stop
+ * synchronization and wake up the guard thread so it can reconnect.
+ * Does nothing if the connection is already closed.
+ */
+static void
+remote_close(struct hast_resource *res, int ncomp)
+{
+
+	rw_wlock(&hio_remote_lock[ncomp]);
+	if (ISCONNECTED(res, ncomp)) {
+		assert(res->hr_remotein != NULL);
+		assert(res->hr_remoteout != NULL);
+
+		pjdlog_debug(2, "Closing old incoming connection to %s.",
+		    res->hr_remoteaddr);
+		proto_close(res->hr_remotein);
+		res->hr_remotein = NULL;
+		pjdlog_debug(2, "Closing old outgoing connection to %s.",
+		    res->hr_remoteaddr);
+		proto_close(res->hr_remoteout);
+		res->hr_remoteout = NULL;
+
+		rw_unlock(&hio_remote_lock[ncomp]);
+	} else {
+		/*
+		 * Callers drop their read lock before calling us, so another
+		 * thread may have closed the connection in the meantime.
+		 */
+		assert(res->hr_remotein == NULL);
+		assert(res->hr_remoteout == NULL);
+		rw_unlock(&hio_remote_lock[ncomp]);
+		return;
+	}
+
+	/*
+	 * Synchronization cannot continue without the remote node.
+	 */
+	mtx_lock(&sync_lock);
+	sync_inprogress = false;
+	mtx_unlock(&sync_lock);
+
+	/*
+	 * Wake up guard thread, so it can immediately start reconnect.
+	 */
+	mtx_lock(&hio_guard_lock);
+	cv_signal(&hio_guard_cond);
+	mtx_unlock(&hio_guard_lock);
+}
+
+/*
+ * Thread receives ggate I/O requests from the kernel and passes them to
+ * appropriate threads:
+ * WRITE - always goes to both local_send and remote_send threads
+ * READ (when the block is up-to-date on local component) -
+ * only local_send thread
+ * READ (when the block isn't up-to-date on local component) -
+ * only remote_send thread
+ * DELETE - always goes to both local_send and remote_send threads
+ * FLUSH - always goes to both local_send and remote_send threads
+ *
+ * Each iteration takes a request from the free queue, resets its ggio unit,
+ * length (MAXPHYS) and error fields, and issues the G_GATE_CMD_START ioctl
+ * to obtain a request from the kernel.  Every per-component error slot is
+ * preset to EINVAL before the request is dispatched.
+ *
+ * For WRITE the thread first loops until rangelock_islocked() no longer
+ * reports the range as locked in range_sync and rangelock_add() succeeds
+ * on range_regular, then calls activemap_write_start(); the activemap is
+ * flushed to disk when that call returns non-zero.
+ *
+ * The thread ends itself with pthread_exit() when the ioctl fails after
+ * sigexit_received was set, or when the kernel reports ECANCELED.  Any
+ * other failure ends the process via primary_exit()/primary_exitx().
+ *
+ * NOTE(review): the command switch has no default case, so a request with
+ * a command other than READ/WRITE/DELETE/FLUSH is not queued anywhere and
+ * is not returned to the free queue - confirm the kernel never delivers
+ * such requests.
+ *
+ * 'arg' is the struct hast_resource being served.
+ */
+static void *
+ggate_recv_thread(void *arg)
+{
+ struct hast_resource *res = arg;
+ struct g_gate_ctl_io *ggio;
+ struct hio *hio;
+ unsigned int ii, ncomp, ncomps;
+ int error;
+
+ ncomps = HAST_NCOMPONENTS;
+
+ for (;;) {
+ pjdlog_debug(2, "ggate_recv: Taking free request.");
+ QUEUE_TAKE2(hio, free);
+ pjdlog_debug(2, "ggate_recv: (%p) Got free request.", hio);
+ ggio = &hio->hio_ggio;
+ ggio->gctl_unit = res->hr_ggateunit;
+ ggio->gctl_length = MAXPHYS;
+ ggio->gctl_error = 0;
+ pjdlog_debug(2,
+ "ggate_recv: (%p) Waiting for request from the kernel.",
+ hio);
+ if (ioctl(res->hr_ggatefd, G_GATE_CMD_START, ggio) < 0) {
+ if (sigexit_received)
+ pthread_exit(NULL);
+ primary_exit(EX_OSERR, "G_GATE_CMD_START failed");
+ }
+ error = ggio->gctl_error;
+ switch (error) {
+ case 0:
+ break;
+ case ECANCELED:
+ /*
+ * Exit gracefully.  The cancel is logged only when it was
+ * not caused by a signal we have already seen.
+ */
+ if (!sigexit_received) {
+ pjdlog_debug(2,
+ "ggate_recv: (%p) Received cancel from the kernel.",
+ hio);
+ pjdlog_info("Received cancel from the kernel, exiting.");
+ }
+ pthread_exit(NULL);
+ case ENOMEM:
+ /*
+ * Buffer too small? Impossible, we allocate MAXPHYS
+ * bytes - request can't be bigger than that.
+ */
+ /* FALLTHROUGH */
+ case ENXIO:
+ default:
+ primary_exitx(EX_OSERR, "G_GATE_CMD_START failed: %s.",
+ strerror(error));
+ }
+ for (ii = 0; ii < ncomps; ii++)
+ hio->hio_errors[ii] = EINVAL;
+ reqlog(LOG_DEBUG, 2, ggio,
+ "ggate_recv: (%p) Request received from the kernel: ",
+ hio);
+ /*
+ * Inform all components about new write request.
+ * For read request prefer local component unless the given
+ * range is out-of-date, then use remote component.
+ * hio_countdown is set to the number of components the request
+ * is queued to (1 for READ, ncomps otherwise).
+ */
+ switch (ggio->gctl_cmd) {
+ case BIO_READ:
+ pjdlog_debug(2,
+ "ggate_recv: (%p) Moving request to the send queue.",
+ hio);
+ refcount_init(&hio->hio_countdown, 1);
+ mtx_lock(&metadata_lock);
+ if (res->hr_syncsrc == HAST_SYNCSRC_UNDEF ||
+ res->hr_syncsrc == HAST_SYNCSRC_PRIMARY) {
+ /*
+ * This range is up-to-date on local component,
+ * so handle request locally.
+ */
+ /* Local component is 0 for now. */
+ ncomp = 0;
+ } else /* if (res->hr_syncsrc ==
+ HAST_SYNCSRC_SECONDARY) */ {
+ assert(res->hr_syncsrc ==
+ HAST_SYNCSRC_SECONDARY);
+ /*
+ * This range is out-of-date on local component,
+ * so send request to the remote node.
+ */
+ /* Remote component is 1 for now. */
+ ncomp = 1;
+ }
+ mtx_unlock(&metadata_lock);
+ QUEUE_INSERT1(hio, send, ncomp);
+ break;
+ case BIO_WRITE:
+ for (;;) {
+ mtx_lock(&range_lock);
+ if (rangelock_islocked(range_sync,
+ ggio->gctl_offset, ggio->gctl_length)) {
+ pjdlog_debug(2,
+ "regular: Range offset=%jd length=%zu locked.",
+ (intmax_t)ggio->gctl_offset,
+ (size_t)ggio->gctl_length);
+ range_regular_wait = true;
+ cv_wait(&range_regular_cond, &range_lock);
+ range_regular_wait = false;
+ mtx_unlock(&range_lock);
+ continue;
+ }
+ if (rangelock_add(range_regular,
+ ggio->gctl_offset, ggio->gctl_length) < 0) {
+ mtx_unlock(&range_lock);
+ pjdlog_debug(2,
+ "regular: Range offset=%jd length=%zu is already locked, waiting.",
+ (intmax_t)ggio->gctl_offset,
+ (size_t)ggio->gctl_length);
+ sleep(1);
+ continue;
+ }
+ mtx_unlock(&range_lock);
+ break;
+ }
+ mtx_lock(&res->hr_amp_lock);
+ if (activemap_write_start(res->hr_amp,
+ ggio->gctl_offset, ggio->gctl_length)) {
+ (void)hast_activemap_flush(res);
+ }
+ mtx_unlock(&res->hr_amp_lock);
+ /* FALLTHROUGH */
+ case BIO_DELETE:
+ case BIO_FLUSH:
+ pjdlog_debug(2,
+ "ggate_recv: (%p) Moving request to the send queues.",
+ hio);
+ refcount_init(&hio->hio_countdown, ncomps);
+ for (ii = 0; ii < ncomps; ii++)
+ QUEUE_INSERT1(hio, send, ii);
+ break;
+ }
+ }
+ /* NOTREACHED */
+ return (NULL);
+}
+
+/*
+ * Thread performs requests on the local component: READ and WRITE go
+ * through pread()/pwrite() on hr_localfd at gctl_offset + hr_localoff,
+ * DELETE and FLUSH through g_delete()/g_flush().  The outcome is stored in
+ * the local slot of hio_errors.
+ * A READ that does not return the full length is not failed here; it is
+ * handed over to the remote_send thread instead.
+ * When this was the last component to finish, sync requests are reported
+ * through sync_cond and regular requests are moved to the done queue.
+ */
+static void *
+local_send_thread(void *arg)
+{
+	struct hast_resource *res = arg;
+	struct g_gate_ctl_io *ggio;
+	struct hio *hio;
+	unsigned int ncomp, rncomp;
+	ssize_t ret;
+
+	/* Local component is 0 for now. */
+	ncomp = 0;
+	/* Remote component is 1 for now. */
+	rncomp = 1;
+
+	for (;;) {
+		pjdlog_debug(2, "local_send: Taking request.");
+		QUEUE_TAKE1(hio, send, ncomp);
+		pjdlog_debug(2, "local_send: (%p) Got request.", hio);
+		ggio = &hio->hio_ggio;
+		switch (ggio->gctl_cmd) {
+		case BIO_READ:
+			ret = pread(res->hr_localfd, ggio->gctl_data,
+			    ggio->gctl_length,
+			    ggio->gctl_offset + res->hr_localoff);
+			if (ret != ggio->gctl_length) {
+				/*
+				 * If READ failed, try to read from remote
+				 * node.
+				 */
+				QUEUE_INSERT1(hio, send, rncomp);
+				continue;
+			}
+			hio->hio_errors[ncomp] = 0;
+			break;
+		case BIO_WRITE:
+			ret = pwrite(res->hr_localfd, ggio->gctl_data,
+			    ggio->gctl_length,
+			    ggio->gctl_offset + res->hr_localoff);
+			if (ret < 0) {
+				hio->hio_errors[ncomp] = errno;
+			} else if (ret != ggio->gctl_length) {
+				/* Short write: report a generic I/O error. */
+				hio->hio_errors[ncomp] = EIO;
+			} else {
+				hio->hio_errors[ncomp] = 0;
+			}
+			break;
+		case BIO_DELETE:
+			ret = g_delete(res->hr_localfd,
+			    ggio->gctl_offset + res->hr_localoff,
+			    ggio->gctl_length);
+			hio->hio_errors[ncomp] = (ret < 0) ? errno : 0;
+			break;
+		case BIO_FLUSH:
+			ret = g_flush(res->hr_localfd);
+			hio->hio_errors[ncomp] = (ret < 0) ? errno : 0;
+			break;
+		}
+		/* Other components still have to finish this request. */
+		if (!refcount_release(&hio->hio_countdown))
+			continue;
+		if (ISSYNCREQ(hio)) {
+			mtx_lock(&sync_lock);
+			SYNCREQDONE(hio);
+			mtx_unlock(&sync_lock);
+			cv_signal(&sync_cond);
+		} else {
+			pjdlog_debug(2,
+			    "local_send: (%p) Moving request to the done queue.",
+			    hio);
+			QUEUE_INSERT2(hio, done);
+		}
+	}
+	/* NOTREACHED */
+	return (NULL);
+}
+
+/*
+ * Thread sends request to secondary node.
+ *
+ * For every request taken from the remote send queue a header carrying
+ * "cmd", "seq", "offset" and "length" is built; only WRITE requests carry
+ * gctl_data as payload, and FLUSH is sent with offset and length of 0.
+ * The request is linked into the recv queue before it is sent, and the
+ * remote_recv thread is signalled afterwards if that queue was empty.
+ *
+ * When the header cannot be built, the node is not connected, or sending
+ * fails, the error is stored in hio_errors[ncomp] and the request is
+ * finished through the done_queue label: sync requests are reported via
+ * sync_cond; for regular WRITE requests activemap_need_sync() is called
+ * and the activemap flushed when it returns non-zero.  A regular request
+ * is moved to the done queue only once refcount_release() reports that all
+ * components have finished with it.
+ *
+ * 'arg' is the struct hast_resource being served.
+ */
+static void *
+remote_send_thread(void *arg)
+{
+ struct hast_resource *res = arg;
+ struct g_gate_ctl_io *ggio;
+ struct hio *hio;
+ struct nv *nv;
+ unsigned int ncomp;
+ bool wakeup;
+ uint64_t offset, length;
+ uint8_t cmd;
+ void *data;
+
+ /* Remote component is 1 for now. */
+ ncomp = 1;
+
+ for (;;) {
+ pjdlog_debug(2, "remote_send: Taking request.");
+ QUEUE_TAKE1(hio, send, ncomp);
+ pjdlog_debug(2, "remote_send: (%p) Got request.", hio);
+ ggio = &hio->hio_ggio;
+ switch (ggio->gctl_cmd) {
+ case BIO_READ:
+ cmd = HIO_READ;
+ data = NULL;
+ offset = ggio->gctl_offset;
+ length = ggio->gctl_length;
+ break;
+ case BIO_WRITE:
+ cmd = HIO_WRITE;
+ data = ggio->gctl_data;
+ offset = ggio->gctl_offset;
+ length = ggio->gctl_length;
+ break;
+ case BIO_DELETE:
+ cmd = HIO_DELETE;
+ data = NULL;
+ offset = ggio->gctl_offset;
+ length = ggio->gctl_length;
+ break;
+ case BIO_FLUSH:
+ cmd = HIO_FLUSH;
+ data = NULL;
+ offset = 0;
+ length = 0;
+ break;
+ default:
+ assert(!"invalid condition");
+ abort();
+ }
+ nv = nv_alloc();
+ nv_add_uint8(nv, cmd, "cmd");
+ nv_add_uint64(nv, (uint64_t)ggio->gctl_seq, "seq");
+ nv_add_uint64(nv, offset, "offset");
+ nv_add_uint64(nv, length, "length");
+ if (nv_error(nv) != 0) {
+ hio->hio_errors[ncomp] = nv_error(nv);
+ pjdlog_debug(2,
+ "remote_send: (%p) Unable to prepare header to send.",
+ hio);
+ reqlog(LOG_ERR, 0, ggio,
+ "Unable to prepare header to send (%s): ",
+ strerror(nv_error(nv)));
+ /* Move failed request immediately to the done queue. */
+ goto done_queue;
+ }
+ pjdlog_debug(2,
+ "remote_send: (%p) Moving request to the recv queue.",
+ hio);
+ /*
+ * Protect connection from disappearing.
+ * If it is already gone, fail the request with ENOTCONN.
+ */
+ rw_rlock(&hio_remote_lock[ncomp]);
+ if (!ISCONNECTED(res, ncomp)) {
+ rw_unlock(&hio_remote_lock[ncomp]);
+ hio->hio_errors[ncomp] = ENOTCONN;
+ goto done_queue;
+ }
+ /*
+ * Move the request to recv queue before sending it, because
+ * in different order we can get reply before we move request
+ * to recv queue.
+ * 'wakeup' remembers whether the queue was empty, i.e. whether
+ * the remote_recv thread has to be signalled after the send.
+ */
+ mtx_lock(&hio_recv_list_lock[ncomp]);
+ wakeup = TAILQ_EMPTY(&hio_recv_list[ncomp]);
+ TAILQ_INSERT_TAIL(&hio_recv_list[ncomp], hio, hio_next[ncomp]);
+ mtx_unlock(&hio_recv_list_lock[ncomp]);
+ if (hast_proto_send(res, res->hr_remoteout, nv, data,
+ data != NULL ? length : 0) < 0) {
+ hio->hio_errors[ncomp] = errno;
+ rw_unlock(&hio_remote_lock[ncomp]);
+ remote_close(res, ncomp);
+ pjdlog_debug(2,
+ "remote_send: (%p) Unable to send request.", hio);
+ reqlog(LOG_ERR, 0, ggio,
+ "Unable to send request (%s): ",
+ strerror(hio->hio_errors[ncomp]));
+ /*
+ * Take request back from the receive queue and move
+ * it immediately to the done queue.
+ * NOTE(review): the connection is closed before the
+ * request is unlinked here, while remote_recv_thread
+ * unlinks the first pending request when it finds the
+ * connection closed - confirm the two cannot both
+ * remove the same request.
+ */
+ mtx_lock(&hio_recv_list_lock[ncomp]);
+ TAILQ_REMOVE(&hio_recv_list[ncomp], hio, hio_next[ncomp]);
+ mtx_unlock(&hio_recv_list_lock[ncomp]);
+ goto done_queue;
+ }
+ rw_unlock(&hio_remote_lock[ncomp]);
+ nv_free(nv);
+ if (wakeup)
+ cv_signal(&hio_recv_list_cond[ncomp]);
+ continue;
+done_queue:
+ nv_free(nv);
+ if (ISSYNCREQ(hio)) {
+ if (!refcount_release(&hio->hio_countdown))
+ continue;
+ mtx_lock(&sync_lock);
+ SYNCREQDONE(hio);
+ mtx_unlock(&sync_lock);
+ cv_signal(&sync_cond);
+ continue;
+ }
+ if (ggio->gctl_cmd == BIO_WRITE) {
+ mtx_lock(&res->hr_amp_lock);
+ if (activemap_need_sync(res->hr_amp, ggio->gctl_offset,
+ ggio->gctl_length)) {
+ (void)hast_activemap_flush(res);
+ }
+ mtx_unlock(&res->hr_amp_lock);
+ }
+ if (!refcount_release(&hio->hio_countdown))
+ continue;
+ pjdlog_debug(2,
+ "remote_send: (%p) Moving request to the done queue.",
+ hio);
+ QUEUE_INSERT2(hio, done);
+ }
+ /* NOTREACHED */
+ return (NULL);
+}
+
+/*
+ * Thread receives answer from secondary node and passes it to ggate_send
+ * thread.
+ *
+ * The thread sleeps while the recv queue is empty.  When the connection
+ * is found closed, pending requests are completed one per iteration with
+ * whatever error is already stored for them.  Otherwise a reply header is
+ * received, matched to a pending request by its "seq" field, and the
+ * result is recorded in the remote slot of hio_errors: the error reported
+ * by the secondary, a receive error, or 0 on success.  For successful
+ * READ requests the data is received into gctl_data.
+ *
+ * When this was the last component to finish, sync requests are reported
+ * through sync_cond and regular requests are moved to the done queue.
+ *
+ * 'arg' is the struct hast_resource being served.
+ */
+static void *
+remote_recv_thread(void *arg)
+{
+	struct hast_resource *res = arg;
+	struct g_gate_ctl_io *ggio;
+	struct hio *hio;
+	struct nv *nv;
+	unsigned int ncomp;
+	uint64_t seq;
+	int error;
+
+	/* Remote component is 1 for now. */
+	ncomp = 1;
+
+	for (;;) {
+		/* Wait until there is anything to receive. */
+		mtx_lock(&hio_recv_list_lock[ncomp]);
+		while (TAILQ_EMPTY(&hio_recv_list[ncomp])) {
+			pjdlog_debug(2, "remote_recv: No requests, waiting.");
+			cv_wait(&hio_recv_list_cond[ncomp],
+			    &hio_recv_list_lock[ncomp]);
+		}
+		mtx_unlock(&hio_recv_list_lock[ncomp]);
+		rw_rlock(&hio_remote_lock[ncomp]);
+		if (!ISCONNECTED(res, ncomp)) {
+			rw_unlock(&hio_remote_lock[ncomp]);
+			/*
+			 * Connection is dead, so move all pending requests to
+			 * the done queue (one-by-one).
+			 */
+			mtx_lock(&hio_recv_list_lock[ncomp]);
+			hio = TAILQ_FIRST(&hio_recv_list[ncomp]);
+			assert(hio != NULL);
+			TAILQ_REMOVE(&hio_recv_list[ncomp], hio,
+			    hio_next[ncomp]);
+			mtx_unlock(&hio_recv_list_lock[ncomp]);
+			goto done_queue;
+		}
+		if (hast_proto_recv_hdr(res->hr_remotein, &nv) < 0) {
+			pjdlog_errno(LOG_ERR,
+			    "Unable to receive reply header");
+			rw_unlock(&hio_remote_lock[ncomp]);
+			remote_close(res, ncomp);
+			continue;
+		}
+		rw_unlock(&hio_remote_lock[ncomp]);
+		seq = nv_get_uint64(nv, "seq");
+		if (seq == 0) {
+			pjdlog_error("Header contains no 'seq' field.");
+			nv_free(nv);
+			continue;
+		}
+		/* Find the pending request this reply belongs to. */
+		mtx_lock(&hio_recv_list_lock[ncomp]);
+		TAILQ_FOREACH(hio, &hio_recv_list[ncomp], hio_next[ncomp]) {
+			if (hio->hio_ggio.gctl_seq == seq) {
+				TAILQ_REMOVE(&hio_recv_list[ncomp], hio,
+				    hio_next[ncomp]);
+				break;
+			}
+		}
+		mtx_unlock(&hio_recv_list_lock[ncomp]);
+		if (hio == NULL) {
+			pjdlog_error("Found no request matching received 'seq' field (%ju).",
+			    (uintmax_t)seq);
+			nv_free(nv);
+			continue;
+		}
+		error = nv_get_int16(nv, "error");
+		if (error != 0) {
+			/*
+			 * Request failed on remote side.  Record the reported
+			 * error; storing 0 here would make ggate_send treat
+			 * the failed remote request as a success.
+			 */
+			hio->hio_errors[ncomp] = error;
+			nv_free(nv);
+			goto done_queue;
+		}
+		ggio = &hio->hio_ggio;
+		switch (ggio->gctl_cmd) {
+		case BIO_READ:
+			/* Only READ replies carry data to receive. */
+			rw_rlock(&hio_remote_lock[ncomp]);
+			if (!ISCONNECTED(res, ncomp)) {
+				rw_unlock(&hio_remote_lock[ncomp]);
+				nv_free(nv);
+				goto done_queue;
+			}
+			if (hast_proto_recv_data(res, res->hr_remotein, nv,
+			    ggio->gctl_data, ggio->gctl_length) < 0) {
+				hio->hio_errors[ncomp] = errno;
+				pjdlog_errno(LOG_ERR,
+				    "Unable to receive reply data");
+				rw_unlock(&hio_remote_lock[ncomp]);
+				nv_free(nv);
+				remote_close(res, ncomp);
+				goto done_queue;
+			}
+			rw_unlock(&hio_remote_lock[ncomp]);
+			break;
+		case BIO_WRITE:
+		case BIO_DELETE:
+		case BIO_FLUSH:
+			break;
+		default:
+			assert(!"invalid condition");
+			abort();
+		}
+		hio->hio_errors[ncomp] = 0;
+		nv_free(nv);
+done_queue:
+		if (refcount_release(&hio->hio_countdown)) {
+			if (ISSYNCREQ(hio)) {
+				mtx_lock(&sync_lock);
+				SYNCREQDONE(hio);
+				mtx_unlock(&sync_lock);
+				cv_signal(&sync_cond);
+			} else {
+				pjdlog_debug(2,
+				    "remote_recv: (%p) Moving request to the done queue.",
+				    hio);
+				QUEUE_INSERT2(hio, done);
+			}
+		}
+	}
+	/* NOTREACHED */
+	return (NULL);
+}
+
+/*
+ * Thread sends answer to the kernel.
+ *
+ * A request taken from the done queue is reported as successful when at
+ * least one component finished it without error; otherwise the error of
+ * the first component is used.  For WRITE requests the activemap is told
+ * about the completed write (on success only), the regular range lock is
+ * released, and the local counter is bumped in the metadata if the remote
+ * node is disconnected and the counters still match.  The result is handed
+ * to the kernel with G_GATE_CMD_DONE and the request is returned to the
+ * free queue.
+ */
+static void *
+ggate_send_thread(void *arg)
+{
+	struct hast_resource *res = arg;
+	struct g_gate_ctl_io *ggio;
+	struct hio *hio;
+	unsigned int ii, ncomp, ncomps;
+
+	ncomps = HAST_NCOMPONENTS;
+
+	for (;;) {
+		pjdlog_debug(2, "ggate_send: Taking request.");
+		QUEUE_TAKE2(hio, done);
+		pjdlog_debug(2, "ggate_send: (%p) Got request.", hio);
+		ggio = &hio->hio_ggio;
+
+		/* Look for the first component that succeeded. */
+		for (ii = 0; ii < ncomps; ii++) {
+			if (hio->hio_errors[ii] == 0)
+				break;
+		}
+		if (ii < ncomps) {
+			/*
+			 * One successful request is enough to declare
+			 * success.
+			 */
+			ggio->gctl_error = 0;
+		} else {
+			/*
+			 * None of the requests were successful.
+			 * Use first error.
+			 */
+			ggio->gctl_error = hio->hio_errors[0];
+		}
+
+		if (ggio->gctl_cmd == BIO_WRITE) {
+			if (ggio->gctl_error == 0) {
+				mtx_lock(&res->hr_amp_lock);
+				activemap_write_complete(res->hr_amp,
+				    ggio->gctl_offset, ggio->gctl_length);
+				mtx_unlock(&res->hr_amp_lock);
+			}
+			/*
+			 * Unlock range we locked.
+			 */
+			mtx_lock(&range_lock);
+			rangelock_del(range_regular, ggio->gctl_offset,
+			    ggio->gctl_length);
+			if (range_sync_wait)
+				cv_signal(&range_sync_cond);
+			mtx_unlock(&range_lock);
+			/*
+			 * Bump local count if this is first write after
+			 * connection failure with remote node.
+			 */
+			ncomp = 1;
+			rw_rlock(&hio_remote_lock[ncomp]);
+			if (!ISCONNECTED(res, ncomp)) {
+				mtx_lock(&metadata_lock);
+				if (res->hr_primary_localcnt ==
+				    res->hr_secondary_remotecnt) {
+					res->hr_primary_localcnt++;
+					pjdlog_debug(1,
+					    "Increasing localcnt to %ju.",
+					    (uintmax_t)res->hr_primary_localcnt);
+					(void)metadata_write(res);
+				}
+				mtx_unlock(&metadata_lock);
+			}
+			rw_unlock(&hio_remote_lock[ncomp]);
+		}
+
+		if (ioctl(res->hr_ggatefd, G_GATE_CMD_DONE, ggio) < 0)
+			primary_exit(EX_OSERR, "G_GATE_CMD_DONE failed");
+		pjdlog_debug(2,
+		    "ggate_send: (%p) Moving request to the free queue.", hio);
+		QUEUE_INSERT2(hio, free);
+	}
+	/* NOTREACHED */
+	return (NULL);
+}
+
+/*
+ * Thread that synchronizes the local and remote components.
+ *
+ * The thread sleeps on sync_cond until sync_inprogress is set.  It then asks
+ * the activemap for the next offset to synchronize, reads that chunk from
+ * the synchronization source and writes it to the synchronization target.
+ * The chunk is kept in the range_sync lock for the whole read/write cycle so
+ * that regular writes cannot slip in between.
+ *
+ * arg is the struct hast_resource being served.  The loop never terminates.
+ */
+static void *
+sync_thread(void *arg)
+{
+	struct hast_resource *res = arg;
+	struct hio *hio;
+	struct g_gate_ctl_io *ggio;
+	unsigned int ii, ncomp, ncomps;
+	off_t offset, length, synced;
+	bool dorewind;
+	int syncext;
+
+	ncomps = HAST_NCOMPONENTS;
+	dorewind = true;
+	synced = 0;
+
+	for (;;) {
+		/* Wait until somebody requests synchronization. */
+		mtx_lock(&sync_lock);
+		while (!sync_inprogress) {
+			dorewind = true;
+			synced = 0;
+			cv_wait(&sync_cond, &sync_lock);
+		}
+		mtx_unlock(&sync_lock);
+		/*
+		 * Obtain offset at which we should synchronize.
+		 * Rewind synchronization if needed.
+		 */
+		mtx_lock(&res->hr_amp_lock);
+		if (dorewind)
+			activemap_sync_rewind(res->hr_amp);
+		offset = activemap_sync_offset(res->hr_amp, &length, &syncext);
+		if (syncext != -1) {
+			/*
+			 * We synchronized entire syncext extent, we can mark
+			 * it as clean now.
+			 */
+			if (activemap_extent_complete(res->hr_amp, syncext))
+				(void)hast_activemap_flush(res);
+		}
+		mtx_unlock(&res->hr_amp_lock);
+		if (dorewind) {
+			dorewind = false;
+			if (offset < 0)
+				pjdlog_info("Nodes are in sync.");
+			else {
+				pjdlog_info("Synchronization started. %ju bytes to go.",
+				    (uintmax_t)(res->hr_extentsize *
+				    activemap_ndirty(res->hr_amp)));
+			}
+		}
+		if (offset < 0) {
+			mtx_lock(&sync_lock);
+			sync_inprogress = false;
+			mtx_unlock(&sync_lock);
+			pjdlog_debug(1, "Nothing to synchronize.");
+			/*
+			 * Synchronization complete, make both localcnt and
+			 * remotecnt equal.
+			 */
+			ncomp = 1;
+			rw_rlock(&hio_remote_lock[ncomp]);
+			if (ISCONNECTED(res, ncomp)) {
+				if (synced > 0) {
+					pjdlog_info("Synchronization complete. "
+					    "%jd bytes synchronized.",
+					    (intmax_t)synced);
+				}
+				mtx_lock(&metadata_lock);
+				res->hr_syncsrc = HAST_SYNCSRC_UNDEF;
+				res->hr_primary_localcnt =
+				    res->hr_secondary_localcnt;
+				res->hr_primary_remotecnt =
+				    res->hr_secondary_remotecnt;
+				/* Log the two counters that were just stored. */
+				pjdlog_debug(1,
+				    "Setting localcnt to %ju and remotecnt to %ju.",
+				    (uintmax_t)res->hr_primary_localcnt,
+				    (uintmax_t)res->hr_primary_remotecnt);
+				(void)metadata_write(res);
+				mtx_unlock(&metadata_lock);
+			} else if (synced > 0) {
+				pjdlog_info("Synchronization interrupted. "
+				    "%jd bytes synchronized so far.",
+				    (intmax_t)synced);
+			}
+			rw_unlock(&hio_remote_lock[ncomp]);
+			continue;
+		}
+		pjdlog_debug(2, "sync: Taking free request.");
+		QUEUE_TAKE2(hio, free);
+		pjdlog_debug(2, "sync: (%p) Got free request.", hio);
+		/*
+		 * Lock the range we are going to synchronize. We don't want
+		 * race where someone writes between our read and write.
+		 */
+		for (;;) {
+			mtx_lock(&range_lock);
+			if (rangelock_islocked(range_regular, offset, length)) {
+				pjdlog_debug(2,
+				    "sync: Range offset=%jd length=%jd locked.",
+				    (intmax_t)offset, (intmax_t)length);
+				range_sync_wait = true;
+				cv_wait(&range_sync_cond, &range_lock);
+				range_sync_wait = false;
+				mtx_unlock(&range_lock);
+				continue;
+			}
+			if (rangelock_add(range_sync, offset, length) < 0) {
+				mtx_unlock(&range_lock);
+				pjdlog_debug(2,
+				    "sync: Range offset=%jd length=%jd is already locked, waiting.",
+				    (intmax_t)offset, (intmax_t)length);
+				sleep(1);
+				continue;
+			}
+			mtx_unlock(&range_lock);
+			break;
+		}
+		/*
+		 * First read the data from synchronization source.
+		 */
+		SYNCREQ(hio);
+		ggio = &hio->hio_ggio;
+		ggio->gctl_cmd = BIO_READ;
+		ggio->gctl_offset = offset;
+		ggio->gctl_length = length;
+		ggio->gctl_error = 0;
+		for (ii = 0; ii < ncomps; ii++)
+			hio->hio_errors[ii] = EINVAL;
+		reqlog(LOG_DEBUG, 2, ggio, "sync: (%p) Sending sync request: ",
+		    hio);
+		pjdlog_debug(2, "sync: (%p) Moving request to the send queue.",
+		    hio);
+		mtx_lock(&metadata_lock);
+		if (res->hr_syncsrc == HAST_SYNCSRC_PRIMARY) {
+			/*
+			 * This range is up-to-date on local component,
+			 * so handle request locally.
+			 */
+			 /* Local component is 0 for now. */
+			ncomp = 0;
+		} else /* if (res->hr_syncsrc == HAST_SYNCSRC_SECONDARY) */ {
+			assert(res->hr_syncsrc == HAST_SYNCSRC_SECONDARY);
+			/*
+			 * This range is out-of-date on local component,
+			 * so send request to the remote node.
+			 */
+			 /* Remote component is 1 for now. */
+			ncomp = 1;
+		}
+		mtx_unlock(&metadata_lock);
+		refcount_init(&hio->hio_countdown, 1);
+		QUEUE_INSERT1(hio, send, ncomp);
+
+		/*
+		 * Let's wait for READ to finish.
+		 */
+		mtx_lock(&sync_lock);
+		while (!ISSYNCREQDONE(hio))
+			cv_wait(&sync_cond, &sync_lock);
+		mtx_unlock(&sync_lock);
+
+		if (hio->hio_errors[ncomp] != 0) {
+			pjdlog_error("Unable to read synchronization data: %s.",
+			    strerror(hio->hio_errors[ncomp]));
+			goto free_queue;
+		}
+
+		/*
+		 * We read the data from synchronization source, now write it
+		 * to synchronization target.
+		 */
+		SYNCREQ(hio);
+		ggio->gctl_cmd = BIO_WRITE;
+		for (ii = 0; ii < ncomps; ii++)
+			hio->hio_errors[ii] = EINVAL;
+		reqlog(LOG_DEBUG, 2, ggio, "sync: (%p) Sending sync request: ",
+		    hio);
+		pjdlog_debug(2, "sync: (%p) Moving request to the send queue.",
+		    hio);
+		mtx_lock(&metadata_lock);
+		if (res->hr_syncsrc == HAST_SYNCSRC_PRIMARY) {
+			/*
+			 * This range is up-to-date on local component,
+			 * so we update remote component.
+			 */
+			 /* Remote component is 1 for now. */
+			ncomp = 1;
+		} else /* if (res->hr_syncsrc == HAST_SYNCSRC_SECONDARY) */ {
+			assert(res->hr_syncsrc == HAST_SYNCSRC_SECONDARY);
+			/*
+			 * This range is out-of-date on local component,
+			 * so we update it.
+			 */
+			 /* Local component is 0 for now. */
+			ncomp = 0;
+		}
+		mtx_unlock(&metadata_lock);
+
+		pjdlog_debug(2, "sync: (%p) Moving request to the send queues.",
+		    hio);
+		refcount_init(&hio->hio_countdown, 1);
+		QUEUE_INSERT1(hio, send, ncomp);
+
+		/*
+		 * Let's wait for WRITE to finish.
+		 */
+		mtx_lock(&sync_lock);
+		while (!ISSYNCREQDONE(hio))
+			cv_wait(&sync_cond, &sync_lock);
+		mtx_unlock(&sync_lock);
+
+		if (hio->hio_errors[ncomp] != 0) {
+			pjdlog_error("Unable to write synchronization data: %s.",
+			    strerror(hio->hio_errors[ncomp]));
+			goto free_queue;
+		}
+
+		/*
+		 * Only chunks that were both read and written successfully
+		 * count towards the number of synchronized bytes.
+		 */
+		synced += length;
+free_queue:
+		mtx_lock(&range_lock);
+		rangelock_del(range_sync, offset, length);
+		if (range_regular_wait)
+			cv_signal(&range_regular_cond);
+		mtx_unlock(&range_lock);
+
+		pjdlog_debug(2, "sync: (%p) Moving request to the free queue.",
+		    hio);
+		QUEUE_INSERT2(hio, free);
+	}
+	/* NOTREACHED */
+	return (NULL);
+}
+
+/*
+ * Signal handler for SIGINT and SIGTERM.
+ *
+ * Records that a termination signal arrived and signals hio_guard_cond so
+ * that a thread waiting on it can notice the flag.  Any other signal number
+ * trips an assertion.
+ */
+static void
+sighandler(int sig)
+{
+	bool locked;
+
+	if (sig == SIGINT || sig == SIGTERM)
+		sigexit_received = true;
+	else
+		assert(!"invalid condition");
+
+	/*
+	 * XXX: Racy, but if we cannot obtain hio_guard_lock here, we don't
+	 *      want to risk deadlock.  The condition variable is signalled
+	 *      whether or not the lock was obtained.
+	 */
+	locked = mtx_trylock(&hio_guard_lock);
+	cv_signal(&hio_guard_cond);
+	if (locked)
+		mtx_unlock(&hio_guard_lock);
+}
+
+/*
+ * Thread guards remote connections and reconnects when needed, handles
+ * signals, etc.
+ *
+ * arg is the struct hast_resource being served.  On every pass the thread
+ * first checks sigexit_received and exits the process if it is set, then
+ * inspects every remote component and tries to reconnect the ones that are
+ * not connected, and finally waits on hio_guard_cond.  The loop never
+ * terminates on its own.
+ */
+static void *
+guard_thread(void *arg)
+{
+ struct hast_resource *res = arg;
+ unsigned int ii, ncomps;
+ int timeout;
+
+ ncomps = HAST_NCOMPONENTS;
+ /* There is only one remote component for now. */
+#define ISREMOTE(no) ((no) == 1)
+
+ for (;;) {
+ if (sigexit_received) {
+ primary_exitx(EX_OK,
+ "Termination signal received, exiting.");
+ }
+ /*
+ * If all the connections are fine, we will sleep until
+ * someone wakes us up.
+ * If any of the connections is broken and we are not
+ * able to reconnect, we will sleep only for RECONNECT_SLEEP
+ * seconds so we can retry soon.
+ */
+ timeout = 0;
+ pjdlog_debug(2, "remote_guard: Checking connections.");
+ mtx_lock(&hio_guard_lock);
+ for (ii = 0; ii < ncomps; ii++) {
+ if (!ISREMOTE(ii))
+ continue;
+ rw_rlock(&hio_remote_lock[ii]);
+ if (ISCONNECTED(res, ii)) {
+ assert(res->hr_remotein != NULL);
+ assert(res->hr_remoteout != NULL);
+ rw_unlock(&hio_remote_lock[ii]);
+ pjdlog_debug(2,
+ "remote_guard: Connection to %s is ok.",
+ res->hr_remoteaddr);
+ } else {
+ assert(res->hr_remotein == NULL);
+ assert(res->hr_remoteout == NULL);
+ /*
+ * Upgrade the lock. It doesn't have to be
+ * atomic as no other thread can change
+ * connection status from disconnected to
+ * connected.
+ */
+ rw_unlock(&hio_remote_lock[ii]);
+ rw_wlock(&hio_remote_lock[ii]);
+ /* Re-check the state after the lock was dropped. */
+ assert(res->hr_remotein == NULL);
+ assert(res->hr_remoteout == NULL);
+ pjdlog_debug(2,
+ "remote_guard: Reconnecting to %s.",
+ res->hr_remoteaddr);
+ init_remote(res);
+ if (ISCONNECTED(res, ii)) {
+ pjdlog_info("Successfully reconnected to %s.",
+ res->hr_remoteaddr);
+ } else {
+ /* Both connections should be NULL. */
+ assert(res->hr_remotein == NULL);
+ assert(res->hr_remoteout == NULL);
+ pjdlog_debug(2,
+ "remote_guard: Reconnect to %s failed.",
+ res->hr_remoteaddr);
+ timeout = RECONNECT_SLEEP;
+ }
+ rw_unlock(&hio_remote_lock[ii]);
+ }
+ }
+ /*
+ * timeout is still 0 unless a reconnect attempt failed above.
+ * NOTE(review): presumably cv_timedwait() treats 0 as "no
+ * deadline" - confirm against synch.h.
+ */
+ (void)cv_timedwait(&hio_guard_cond, &hio_guard_lock, timeout);
+ mtx_unlock(&hio_guard_lock);
+ }
+#undef ISREMOTE
+ /* NOTREACHED */
+ return (NULL);
+}
diff --git a/sbin/hastd/proto.c b/sbin/hastd/proto.c
new file mode 100644
index 0000000..103f20c
--- /dev/null
+++ b/sbin/hastd/proto.c
@@ -0,0 +1,261 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/queue.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <stdint.h>
+
+#include "proto.h"
+#include "proto_impl.h"
+
+/*
+ * Connection handle used by the proto_*() functions in this file.  It pairs
+ * a protocol implementation with that implementation's private context.
+ */
+#define PROTO_CONN_MAGIC 0x907041c
+struct proto_conn {
+ int pc_magic; /* PROTO_CONN_MAGIC while valid; zeroed by proto_close(). */
+ struct hast_proto *pc_proto; /* Protocol whose hp_*() hooks are called. */
+ void *pc_ctx; /* Context passed to every hp_*() hook. */
+ int pc_side; /* One of the PROTO_SIDE_* values below. */
+#define PROTO_SIDE_CLIENT 0
+#define PROTO_SIDE_SERVER_LISTEN 1
+#define PROTO_SIDE_SERVER_WORK 2
+};
+
+/* List of registered protocols, searched by proto_common_setup(). */
+static LIST_HEAD(, hast_proto) protos = LIST_HEAD_INITIALIZER(protos);
+
+/*
+ * Register a protocol implementation by inserting it at the head of the
+ * protos list.  The structure is linked in place, not copied.
+ */
+void
+proto_register(struct hast_proto *proto)
+{
+
+ LIST_INSERT_HEAD(&protos, proto, hp_next);
+}
+
+/*
+ * Allocate a connection handle and let the first registered protocol that
+ * accepts addr create its context for the requested side.
+ *
+ * side must be PROTO_SIDE_CLIENT or PROTO_SIDE_SERVER_LISTEN.
+ *
+ * Returns 0 and stores the new handle in *connp on success.  Returns -1 on
+ * failure; errno is EINVAL when no protocol accepts addr and the protocol's
+ * own error code when its setup hook fails.
+ */
+static int
+proto_common_setup(const char *addr, struct proto_conn **connp, int side)
+{
+	struct hast_proto *hp;
+	struct proto_conn *pc;
+	void *backend;
+	int error;
+
+	assert(side == PROTO_SIDE_CLIENT || side == PROTO_SIDE_SERVER_LISTEN);
+
+	pc = malloc(sizeof(*pc));
+	if (pc == NULL)
+		return (-1);
+
+	/*
+	 * Each setup hook answers with:
+	 * error == 0  - success
+	 * error == -1 - addr is not for this protocol, try the next one
+	 * error > 0   - right protocol, but an error occurred
+	 */
+	LIST_FOREACH(hp, &protos, hp_next) {
+		error = (side == PROTO_SIDE_CLIENT) ?
+		    hp->hp_client(addr, &backend) :
+		    hp->hp_server(addr, &backend);
+		if (error >= 0)
+			break;
+	}
+	/* hp is NULL when the whole list was walked without a match. */
+	if (hp == NULL || error > 0) {
+		free(pc);
+		errno = (hp == NULL) ? EINVAL : error;
+		return (-1);
+	}
+
+	pc->pc_proto = hp;
+	pc->pc_ctx = backend;
+	pc->pc_side = side;
+	pc->pc_magic = PROTO_CONN_MAGIC;
+	*connp = pc;
+	return (0);
+}
+
+/*
+ * Create a client-side connection handle for addr.
+ * Returns 0 and fills *connp on success, -1 on failure (see
+ * proto_common_setup()).
+ */
+int
+proto_client(const char *addr, struct proto_conn **connp)
+{
+
+ return (proto_common_setup(addr, connp, PROTO_SIDE_CLIENT));
+}
+
+/*
+ * Connect a client-side handle through its protocol's hp_connect hook.
+ * Returns 0 on success; on failure returns -1 with errno set to the value
+ * the hook returned.
+ */
+int
+proto_connect(struct proto_conn *conn)
+{
+	int error;
+
+	assert(conn != NULL);
+	assert(conn->pc_magic == PROTO_CONN_MAGIC);
+	assert(conn->pc_side == PROTO_SIDE_CLIENT);
+	assert(conn->pc_proto != NULL);
+
+	error = conn->pc_proto->hp_connect(conn->pc_ctx);
+	if (error == 0)
+		return (0);
+
+	errno = error;
+	return (-1);
+}
+
+/*
+ * Create a listening server-side connection handle for addr.
+ * Returns 0 and fills *connp on success, -1 on failure (see
+ * proto_common_setup()).
+ */
+int
+proto_server(const char *addr, struct proto_conn **connp)
+{
+
+ return (proto_common_setup(addr, connp, PROTO_SIDE_SERVER_LISTEN));
+}
+
+/*
+ * Accept a connection on a listening handle.
+ *
+ * On success returns 0 and stores a newly allocated handle, marked
+ * PROTO_SIDE_SERVER_WORK and sharing conn's protocol, in *newconnp.
+ * On failure returns -1; when the hp_accept hook fails errno is set to the
+ * value it returned.
+ */
+int
+proto_accept(struct proto_conn *conn, struct proto_conn **newconnp)
+{
+	struct proto_conn *work;
+	int error;
+
+	assert(conn != NULL);
+	assert(conn->pc_magic == PROTO_CONN_MAGIC);
+	assert(conn->pc_side == PROTO_SIDE_SERVER_LISTEN);
+	assert(conn->pc_proto != NULL);
+
+	work = malloc(sizeof(*work));
+	if (work == NULL)
+		return (-1);
+
+	/* The hook stores the accepted connection's context in the handle. */
+	error = conn->pc_proto->hp_accept(conn->pc_ctx, &work->pc_ctx);
+	if (error == 0) {
+		work->pc_proto = conn->pc_proto;
+		work->pc_side = PROTO_SIDE_SERVER_WORK;
+		work->pc_magic = PROTO_CONN_MAGIC;
+		*newconnp = work;
+		return (0);
+	}
+
+	free(work);
+	errno = error;
+	return (-1);
+}
+
+/*
+ * Send size bytes from data through the handle's hp_send hook.
+ * Returns 0 on success; on failure returns -1 with errno set to the value
+ * the hook returned.
+ */
+int
+proto_send(struct proto_conn *conn, const void *data, size_t size)
+{
+	int error;
+
+	assert(conn != NULL);
+	assert(conn->pc_magic == PROTO_CONN_MAGIC);
+	assert(conn->pc_proto != NULL);
+
+	error = conn->pc_proto->hp_send(conn->pc_ctx, data, size);
+	if (error == 0)
+		return (0);
+
+	errno = error;
+	return (-1);
+}
+
+/*
+ * Receive size bytes into data through the handle's hp_recv hook.
+ * Returns 0 on success; on failure returns -1 with errno set to the value
+ * the hook returned.
+ */
+int
+proto_recv(struct proto_conn *conn, void *data, size_t size)
+{
+	int error;
+
+	assert(conn != NULL);
+	assert(conn->pc_magic == PROTO_CONN_MAGIC);
+	assert(conn->pc_proto != NULL);
+
+	error = conn->pc_proto->hp_recv(conn->pc_ctx, data, size);
+	if (error == 0)
+		return (0);
+
+	errno = error;
+	return (-1);
+}
+
+/*
+ * Return whatever the protocol's hp_descriptor hook reports for this
+ * connection.
+ */
+int
+proto_descriptor(const struct proto_conn *conn)
+{
+
+ assert(conn != NULL);
+ assert(conn->pc_magic == PROTO_CONN_MAGIC);
+ assert(conn->pc_proto != NULL);
+
+ return (conn->pc_proto->hp_descriptor(conn->pc_ctx));
+}
+
+/*
+ * Ask the protocol's hp_address_match hook whether addr matches this
+ * connection and return its answer unchanged.
+ */
+bool
+proto_address_match(const struct proto_conn *conn, const char *addr)
+{
+
+ assert(conn != NULL);
+ assert(conn->pc_magic == PROTO_CONN_MAGIC);
+ assert(conn->pc_proto != NULL);
+
+ return (conn->pc_proto->hp_address_match(conn->pc_ctx, addr));
+}
+
+/*
+ * Hand addr and size to the protocol's hp_local_address hook; the hook
+ * receives the caller's buffer together with its size.
+ */
+void
+proto_local_address(const struct proto_conn *conn, char *addr, size_t size)
+{
+
+ assert(conn != NULL);
+ assert(conn->pc_magic == PROTO_CONN_MAGIC);
+ assert(conn->pc_proto != NULL);
+
+ conn->pc_proto->hp_local_address(conn->pc_ctx, addr, size);
+}
+
+/*
+ * Hand addr and size to the protocol's hp_remote_address hook; the hook
+ * receives the caller's buffer together with its size.
+ */
+void
+proto_remote_address(const struct proto_conn *conn, char *addr, size_t size)
+{
+
+ assert(conn != NULL);
+ assert(conn->pc_magic == PROTO_CONN_MAGIC);
+ assert(conn->pc_proto != NULL);
+
+ conn->pc_proto->hp_remote_address(conn->pc_ctx, addr, size);
+}
+
+/*
+ * Close a connection: call the protocol's hp_close hook on the context,
+ * then release the handle itself.  conn must not be used afterwards.
+ */
+void
+proto_close(struct proto_conn *conn)
+{
+
+ assert(conn != NULL);
+ assert(conn->pc_magic == PROTO_CONN_MAGIC);
+ assert(conn->pc_proto != NULL);
+
+ conn->pc_proto->hp_close(conn->pc_ctx);
+ /* Clear the magic so the magic assertions can catch a stale handle. */
+ conn->pc_magic = 0;
+ free(conn);
+}
diff --git a/sbin/hastd/proto.h b/sbin/hastd/proto.h
new file mode 100644
index 0000000..cb196d8
--- /dev/null
+++ b/sbin/hastd/proto.h
@@ -0,0 +1,54 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _PROTO_H_
+#define _PROTO_H_
+
+#include <stdbool.h> /* bool */
+#include <stdlib.h> /* size_t */
+
+/* Opaque connection handle; the definition is private to proto.c. */
+struct proto_conn;
+
+/*
+ * Functions returning int return 0 on success and -1 on failure.
+ */
+/* Create a client-side handle for addr. */
+int proto_client(const char *addr, struct proto_conn **connp);
+/* Connect a handle created by proto_client(). */
+int proto_connect(struct proto_conn *conn);
+/* Create a listening handle for addr. */
+int proto_server(const char *addr, struct proto_conn **connp);
+/* Accept a connection on a listening handle; the result goes to *newconnp. */
+int proto_accept(struct proto_conn *conn, struct proto_conn **newconnp);
+/* Send size bytes from data. */
+int proto_send(struct proto_conn *conn, const void *data, size_t size);
+/* Receive size bytes into data. */
+int proto_recv(struct proto_conn *conn, void *data, size_t size);
+/* Return the descriptor reported by the underlying protocol. */
+int proto_descriptor(const struct proto_conn *conn);
+/* Ask the underlying protocol whether addr matches this connection. */
+bool proto_address_match(const struct proto_conn *conn, const char *addr);
+/* Pass the addr buffer of the given size to the protocol's local-address hook. */
+void proto_local_address(const struct proto_conn *conn, char *addr,
+ size_t size);
+/* Pass the addr buffer of the given size to the protocol's remote-address hook. */
+void proto_remote_address(const struct proto_conn *conn, char *addr,
+ size_t size);
+/* Close the connection and free the handle. */
+void proto_close(struct proto_conn *conn);
+
+#endif /* !_PROTO_H_ */
diff --git a/sbin/hastd/proto_common.c b/sbin/hastd/proto_common.c
new file mode 100644
index 0000000..22102d8
--- /dev/null
+++ b/sbin/hastd/proto_common.c
@@ -0,0 +1,85 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/socket.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <strings.h>
+
+#include "proto_impl.h"
+
+/*
+ * Maximum size of packet we want to use when sending data.
+ * A definition supplied before this point (e.g. by the build) takes
+ * precedence over the default below.
+ */
+#ifndef MAX_SEND_SIZE
+//#define MAX_SEND_SIZE 32768
+#define MAX_SEND_SIZE 131072
+#endif
+
+/*
+ * Send exactly size bytes from data over the socket fd, in pieces of at most
+ * MAX_SEND_SIZE bytes.
+ *
+ * Returns 0 once everything has been sent (immediately when size is 0),
+ * ENOTCONN when send(2) reports that nothing could be sent, or the errno
+ * value of a failed send(2).  EAGAIN and EINTR are not treated as failures;
+ * the send is simply retried.
+ */
+int
+proto_common_send(int fd, const unsigned char *data, size_t size)
+{
+	ssize_t done;
+	size_t sendsize;
+
+	while (size > 0) {
+		sendsize = size < MAX_SEND_SIZE ? size : MAX_SEND_SIZE;
+		done = send(fd, data, sendsize, MSG_NOSIGNAL);
+		if (done == 0)
+			return (ENOTCONN);
+		if (done < 0) {
+			/* Transient condition or interrupted by a signal. */
+			if (errno == EAGAIN || errno == EINTR)
+				continue;
+			return (errno);
+		}
+		data += done;
+		size -= (size_t)done;
+	}
+
+	return (0);
+}
+
+/*
+ * Receive exactly size bytes from the socket fd into data.
+ *
+ * recv(2) with MSG_WAITALL may still return fewer bytes than requested, for
+ * example when a signal is caught, so keep reading until the whole buffer is
+ * filled instead of reporting a partial read as success.
+ *
+ * Returns 0 once size bytes have been received (immediately when size is 0),
+ * ENOTCONN when the peer closed the connection, or the errno value of a
+ * failed recv(2).  EAGAIN and EINTR are retried.
+ */
+int
+proto_common_recv(int fd, unsigned char *data, size_t size)
+{
+	ssize_t done;
+
+	while (size > 0) {
+		done = recv(fd, data, size, MSG_WAITALL);
+		if (done == 0)
+			return (ENOTCONN);
+		if (done < 0) {
+			if (errno == EAGAIN || errno == EINTR)
+				continue;
+			return (errno);
+		}
+		data += done;
+		size -= (size_t)done;
+	}
+
+	return (0);
+}
diff --git a/sbin/hastd/proto_impl.h b/sbin/hastd/proto_impl.h
new file mode 100644
index 0000000..ea6548d
--- /dev/null
+++ b/sbin/hastd/proto_impl.h
@@ -0,0 +1,75 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _PROTO_IMPL_H_
+#define _PROTO_IMPL_H_
+
+#include <sys/queue.h>
+
+#include <stdbool.h> /* bool */
+#include <stdlib.h> /* size_t */
+
+/* Marks a function with the compiler's constructor attribute. */
+#define __constructor __attribute__((constructor))
+
+/*
+ * Signatures of the hooks a protocol implementation provides in
+ * struct hast_proto.  The void pointer arguments carry the protocol's own
+ * context.
+ */
+typedef int hp_client_t(const char *, void **);
+typedef int hp_connect_t(void *);
+typedef int hp_server_t(const char *, void **);
+typedef int hp_accept_t(void *, void **);
+typedef int hp_send_t(void *, const unsigned char *, size_t);
+typedef int hp_recv_t(void *, unsigned char *, size_t);
+typedef int hp_descriptor_t(const void *);
+typedef bool hp_address_match_t(const void *, const char *);
+typedef void hp_local_address_t(const void *, char *, size_t);
+typedef void hp_remote_address_t(const void *, char *, size_t);
+typedef void hp_close_t(void *);
+
+/*
+ * Description of one protocol implementation: a name, the set of hooks and
+ * the linkage used by proto_register() to keep it on the protocol list.
+ */
+struct hast_proto {
+ const char *hp_name;
+ hp_client_t *hp_client;
+ hp_connect_t *hp_connect;
+ hp_server_t *hp_server;
+ hp_accept_t *hp_accept;
+ hp_send_t *hp_send;
+ hp_recv_t *hp_recv;
+ hp_descriptor_t *hp_descriptor;
+ hp_address_match_t *hp_address_match;
+ hp_local_address_t *hp_local_address;
+ hp_remote_address_t *hp_remote_address;
+ hp_close_t *hp_close;
+ LIST_ENTRY(hast_proto) hp_next; /* Linkage on the list of protocols. */
+};
+
+/* Add a protocol implementation to the list of known protocols. */
+void proto_register(struct hast_proto *proto);
+
+/*
+ * Socket send/receive helpers for protocol implementations.
+ * Both return 0 on success and an errno value on failure.
+ */
+int proto_common_send(int fd, const unsigned char *data, size_t size);
+int proto_common_recv(int fd, unsigned char *data, size_t size);
+
+#endif /* !_PROTO_IMPL_H_ */
diff --git a/sbin/hastd/proto_socketpair.c b/sbin/hastd/proto_socketpair.c
new file mode 100644
index 0000000..0e2cfa2
--- /dev/null
+++ b/sbin/hastd/proto_socketpair.c
@@ -0,0 +1,272 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/socket.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "hast.h"
+#include "proto_impl.h"
+
+/*
+ * Context of a socketpair connection.  The side is undefined until the
+ * first send or receive decides whether this end acts as client or server.
+ */
+#define SP_CTX_MAGIC 0x50c3741
+struct sp_ctx {
+ int sp_magic; /* SP_CTX_MAGIC while valid; zeroed by sp_close(). */
+ int sp_fd[2]; /* [0] is used by the client side, [1] by the server side. */
+ int sp_side; /* One of the SP_SIDE_* values below. */
+#define SP_SIDE_UNDEF 0
+#define SP_SIDE_CLIENT 1
+#define SP_SIDE_SERVER 2
+};
+
+static void sp_close(void *ctx);
+
+/*
+ * Create a socketpair context for the address "socketpair://".
+ *
+ * Returns -1 when addr is any other string, 0 with the new context stored
+ * in *ctxp on success, or an errno value when allocation or socketpair(2)
+ * fails.
+ */
+static int
+sp_client(const char *addr, void **ctxp)
+{
+	struct sp_ctx *sp;
+	int error;
+
+	if (strcmp(addr, "socketpair://") != 0)
+		return (-1);
+
+	sp = malloc(sizeof(*sp));
+	if (sp == NULL)
+		return (errno);
+
+	if (socketpair(PF_UNIX, SOCK_STREAM, 0, sp->sp_fd) >= 0) {
+		/* Which end we are is decided by the first send or receive. */
+		sp->sp_side = SP_SIDE_UNDEF;
+		sp->sp_magic = SP_CTX_MAGIC;
+		*ctxp = sp;
+		return (0);
+	}
+
+	/* Save errno before free(3) gets a chance to change it. */
+	error = errno;
+	free(sp);
+	return (error);
+}
+
+/*
+ * Connecting is not supported on socketpairs; calling this hook is a
+ * programming error and terminates the process.
+ */
+static int
+sp_connect(void *ctx __unused)
+{
+
+ assert(!"proto_connect() not supported on socketpairs");
+ abort();
+}
+
+/*
+ * Listening is not supported on socketpairs; calling this hook is a
+ * programming error and terminates the process.
+ */
+static int
+sp_server(const char *addr __unused, void **ctxp __unused)
+{
+
+ assert(!"proto_server() not supported on socketpairs");
+ abort();
+}
+
+/*
+ * Accepting is not supported on socketpairs; calling this hook is a
+ * programming error and terminates the process.
+ */
+static int
+sp_accept(void *ctx __unused, void **newctxp __unused)
+{
+
+	assert(!"proto_accept() not supported on socketpairs");
+	abort();
+}
+
+/*
+ * Send size bytes from data over the socketpair.
+ *
+ * If the side is still undefined this call makes the context the client:
+ * the server end of the pair is closed and marked invalid.  Returns the
+ * result of proto_common_send().
+ */
+static int
+sp_send(void *ctx, const unsigned char *data, size_t size)
+{
+	struct sp_ctx *spctx = ctx;
+	int fd;
+
+	assert(spctx != NULL);
+	assert(spctx->sp_magic == SP_CTX_MAGIC);
+
+	switch (spctx->sp_side) {
+	case SP_SIDE_UNDEF:
+		/*
+		 * If the first operation done by the caller is proto_send(),
+		 * we assume this is the client.
+		 */
+		spctx->sp_side = SP_SIDE_CLIENT;
+		/* Close other end and forget the stale descriptor number. */
+		close(spctx->sp_fd[1]);
+		spctx->sp_fd[1] = -1;
+		/* FALLTHROUGH */
+	case SP_SIDE_CLIENT:
+		assert(spctx->sp_fd[0] >= 0);
+		fd = spctx->sp_fd[0];
+		break;
+	case SP_SIDE_SERVER:
+		assert(spctx->sp_fd[1] >= 0);
+		fd = spctx->sp_fd[1];
+		break;
+	default:
+		abort();
+	}
+
+	return (proto_common_send(fd, data, size));
+}
+
+/*
+ * Receive size bytes into data from the socketpair.
+ *
+ * If the side is still undefined this call makes the context the server:
+ * the client end of the pair is closed and marked invalid.  Returns the
+ * result of proto_common_recv().
+ */
+static int
+sp_recv(void *ctx, unsigned char *data, size_t size)
+{
+	struct sp_ctx *spctx = ctx;
+	int fd;
+
+	assert(spctx != NULL);
+	assert(spctx->sp_magic == SP_CTX_MAGIC);
+
+	switch (spctx->sp_side) {
+	case SP_SIDE_UNDEF:
+		/*
+		 * If the first operation done by the caller is proto_recv(),
+		 * we assume this is the server.
+		 */
+		spctx->sp_side = SP_SIDE_SERVER;
+		/* Close other end and forget the stale descriptor number. */
+		close(spctx->sp_fd[0]);
+		spctx->sp_fd[0] = -1;
+		/* FALLTHROUGH */
+	case SP_SIDE_SERVER:
+		assert(spctx->sp_fd[1] >= 0);
+		fd = spctx->sp_fd[1];
+		break;
+	case SP_SIDE_CLIENT:
+		assert(spctx->sp_fd[0] >= 0);
+		fd = spctx->sp_fd[0];
+		break;
+	default:
+		abort();
+	}
+
+	return (proto_common_recv(fd, data, size));
+}
+
+/*
+ * Return the descriptor this end of the socketpair uses: sp_fd[0] for the
+ * client side, sp_fd[1] for the server side.  The side must already have
+ * been decided.
+ */
+static int
+sp_descriptor(const void *ctx)
+{
+	const struct sp_ctx *spctx = ctx;
+	int idx;
+
+	assert(spctx != NULL);
+	assert(spctx->sp_magic == SP_CTX_MAGIC);
+	assert(spctx->sp_side == SP_SIDE_CLIENT ||
+	    spctx->sp_side == SP_SIDE_SERVER);
+
+	if (spctx->sp_side == SP_SIDE_CLIENT)
+		idx = 0;
+	else if (spctx->sp_side == SP_SIDE_SERVER)
+		idx = 1;
+	else
+		abort();
+
+	assert(spctx->sp_fd[idx] >= 0);
+	return (spctx->sp_fd[idx]);
+}
+
+/*
+ * Address matching is not supported on socketpairs; calling this hook is a
+ * programming error and terminates the process.
+ */
+static bool
+sp_address_match(const void *ctx __unused, const char *addr __unused)
+{
+
+ assert(!"proto_address_match() not supported on socketpairs");
+ abort();
+}
+
+/*
+ * Local address lookup is not supported on socketpairs; calling this hook
+ * is a programming error and terminates the process.
+ */
+static void
+sp_local_address(const void *ctx __unused, char *addr __unused,
+ size_t size __unused)
+{
+
+ assert(!"proto_local_address() not supported on socketpairs");
+ abort();
+}
+
+/*
+ * Remote address lookup is not supported on socketpairs; calling this hook
+ * is a programming error and terminates the process.
+ */
+static void
+sp_remote_address(const void *ctx __unused, char *addr __unused,
+ size_t size __unused)
+{
+
+ assert(!"proto_remote_address() not supported on socketpairs");
+ abort();
+}
+
+/*
+ * Close the descriptors this context still owns and free the context:
+ * both ends while the side is undefined, otherwise only the end that
+ * belongs to our side.
+ */
+static void
+sp_close(void *ctx)
+{
+	struct sp_ctx *spctx = ctx;
+
+	assert(spctx != NULL);
+	assert(spctx->sp_magic == SP_CTX_MAGIC);
+
+	if (spctx->sp_side == SP_SIDE_UNDEF) {
+		close(spctx->sp_fd[0]);
+		close(spctx->sp_fd[1]);
+	} else if (spctx->sp_side == SP_SIDE_CLIENT) {
+		close(spctx->sp_fd[0]);
+	} else if (spctx->sp_side == SP_SIDE_SERVER) {
+		close(spctx->sp_fd[1]);
+	} else {
+		abort();
+	}
+
+	/* Clear the magic so the magic assertions can catch a stale context. */
+	spctx->sp_magic = 0;
+	free(spctx);
+}
+
+/* Hook table describing the socketpair protocol; registered by sp_ctor(). */
+static struct hast_proto sp_proto = {
+ .hp_name = "socketpair",
+ .hp_client = sp_client,
+ .hp_connect = sp_connect,
+ .hp_server = sp_server,
+ .hp_accept = sp_accept,
+ .hp_send = sp_send,
+ .hp_recv = sp_recv,
+ .hp_descriptor = sp_descriptor,
+ .hp_address_match = sp_address_match,
+ .hp_local_address = sp_local_address,
+ .hp_remote_address = sp_remote_address,
+ .hp_close = sp_close
+};
+
+/*
+ * Register the socketpair protocol.  The function carries the constructor
+ * attribute, so no explicit call is needed.
+ */
+static __constructor void
+sp_ctor(void)
+{
+
+ proto_register(&sp_proto);
+}
diff --git a/sbin/hastd/proto_tcp4.c b/sbin/hastd/proto_tcp4.c
new file mode 100644
index 0000000..2fba996
--- /dev/null
+++ b/sbin/hastd/proto_tcp4.c
@@ -0,0 +1,447 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h> /* MAXHOSTNAMELEN */
+
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <netdb.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "hast.h"
+#include "pjdlog.h"
+#include "proto_impl.h"
+
+/* Value stored in tc_magic while a context is valid; checked by asserts. */
+#define TCP4_CTX_MAGIC 0x7c441c
+/* Per-socket state of the tcp4 protocol. */
+struct tcp4_ctx {
+ int tc_magic; /* TCP4_CTX_MAGIC; zeroed by tcp4_close(). */
+ struct sockaddr_in tc_sin; /* Address filled in by tcp4_addr(). */
+ int tc_fd; /* Socket descriptor. */
+ int tc_side; /* One of the TCP4_SIDE_* values below. */
+#define TCP4_SIDE_CLIENT 0
+#define TCP4_SIDE_SERVER_LISTEN 1
+#define TCP4_SIDE_SERVER_WORK 2
+};
+
+static void tcp4_close(void *ctx);
+
+/*
+ * Turn a dotted-quad string or a host name into an IPv4 address in network
+ * byte order.  The string is first tried as a literal address; only when
+ * that fails is it resolved with gethostbyname(3).  INADDR_NONE is returned
+ * when neither interpretation works.
+ */
+static in_addr_t
+str2ip(const char *str)
+{
+	const struct hostent *entry;
+	in_addr_t literal;
+
+	literal = inet_addr(str);
+	if (literal == INADDR_NONE) {
+		/* Not a literal address, maybe it is a host name. */
+		entry = gethostbyname(str);
+		if (entry != NULL) {
+			return (((struct in_addr *)
+			    (void *)entry->h_addr)->s_addr);
+		}
+		return (INADDR_NONE);
+	}
+	return (literal);
+}
+
+/*
+ * Convert a string of decimal digits to a non-negative number.
+ *
+ * The whole string must consist of digits (no sign, no white space) and the
+ * value must lie within [minnum, maxnum].  On success the value is stored
+ * in *nump and 0 is returned.  On any failure -1 is returned, errno is set
+ * to EINVAL and *nump is left untouched.
+ */
+static int
+numfromstr(const char *str, intmax_t minnum, intmax_t maxnum, intmax_t *nump)
+{
+	intmax_t digit, num;
+
+	if (str[0] == '\0')
+		goto invalid;	/* Empty string. */
+	num = 0;
+	for (; *str != '\0'; str++) {
+		if (*str < '0' || *str > '9')
+			goto invalid;	/* Non-digit character. */
+		digit = *str - '0';
+		/*
+		 * Signed overflow is undefined, so it cannot be detected
+		 * after the fact; make sure num * 10 + digit fits before
+		 * computing it.
+		 */
+		if (num > (INTMAX_MAX - digit) / 10)
+			goto invalid;	/* Overflow. */
+		num = num * 10 + digit;
+		if (num > maxnum)
+			goto invalid;	/* Too big. */
+	}
+	if (num < minnum)
+		goto invalid;	/* Too small. */
+	*nump = num;
+	return (0);
+invalid:
+	errno = EINVAL;
+	return (-1);
+}
+
+/*
+ * Parse a tcp4 address of the form [tcp4://|tcp://]host[:port] into *sinp.
+ *
+ * Returns 0 on success.  Returns -1 when addr is NULL or is not a tcp4
+ * address at all (it is a path or carries some other "://" prefix).
+ * Returns an errno value (EINVAL, ENAMETOOLONG) when the address is ours
+ * but the port or the host part is not acceptable.  HASTD_PORT is used
+ * when no port is given.
+ */
+static int
+tcp4_addr(const char *addr, struct sockaddr_in *sinp)
+{
+	char iporhost[MAXHOSTNAMELEN];
+	const char *pp;
+	size_t size;
+	in_addr_t ip;
+
+	if (addr == NULL)
+		return (-1);
+
+	if (strncasecmp(addr, "tcp4://", 7) == 0)
+		addr += 7;
+	else if (strncasecmp(addr, "tcp://", 6) == 0)
+		addr += 6;
+	else if (addr[0] != '/' &&	/* If this is not path... */
+	    strstr(addr, "://") == NULL)/* ...and has no prefix... */
+		;			/* ...tcp4 is the default. */
+	else
+		return (-1);
+
+	/*
+	 * The structure comes from malloc(3) or from the caller's stack;
+	 * clear it so that sin_zero is not handed to bind(2)/connect(2)
+	 * uninitialized.
+	 */
+	memset(sinp, 0, sizeof(*sinp));
+	sinp->sin_family = AF_INET;
+	sinp->sin_len = sizeof(*sinp);
+	/* Extract optional port. */
+	pp = strrchr(addr, ':');
+	if (pp == NULL) {
+		/* Port not given, use the default. */
+		sinp->sin_port = htons(HASTD_PORT);
+	} else {
+		intmax_t port;
+
+		/* numfromstr() sets errno on failure. */
+		if (numfromstr(pp + 1, 1, 65535, &port) < 0)
+			return (errno);
+		sinp->sin_port = htons(port);
+	}
+	/* Extract host name or IP address. */
+	if (pp == NULL) {
+		size = sizeof(iporhost);
+		if (strlcpy(iporhost, addr, size) >= size)
+			return (ENAMETOOLONG);
+	} else {
+		/* Copy everything before the colon, plus room for the NUL. */
+		size = (size_t)(pp - addr + 1);
+		if (size > sizeof(iporhost))
+			return (ENAMETOOLONG);
+		strlcpy(iporhost, addr, size);
+	}
+	/* Convert string (IP address or host name) to in_addr_t. */
+	ip = str2ip(iporhost);
+	if (ip == INADDR_NONE)
+		return (EINVAL);
+	sinp->sin_addr.s_addr = ip;
+
+	return (0);
+}
+
+/*
+ * Shared set-up for the client and the listening side: allocate a context,
+ * parse "addr" into it, create the TCP socket and tune it.
+ *
+ * Failing to apply a socket option is not fatal; it is only logged.
+ * On success the new context is stored in *ctxp and 0 is returned.
+ * Otherwise nothing is stored and the value reported by tcp4_addr() or the
+ * errno of the failing call is returned.
+ */
+static int
+tcp4_common_setup(const char *addr, void **ctxp, int side)
+{
+	struct tcp4_ctx *tctx;
+	int ret, val;
+
+	tctx = malloc(sizeof(*tctx));
+	if (tctx == NULL)
+		return (errno);
+
+	/* Parse given address. */
+	if ((ret = tcp4_addr(addr, &tctx->tc_sin)) != 0) {
+		free(tctx);
+		return (ret);
+	}
+
+	tctx->tc_fd = socket(AF_INET, SOCK_STREAM, 0);
+	if (tctx->tc_fd == -1) {
+		/* Save errno before free(3) gets a chance to change it. */
+		ret = errno;
+		free(tctx);
+		return (ret);
+	}
+
+	/* Socket settings. */
+	val = 1;
+	if (setsockopt(tctx->tc_fd, IPPROTO_TCP, TCP_NODELAY, &val,
+	    sizeof(val)) == -1) {
+		pjdlog_warning("Unable to set TCP_NODELAY on %s", addr);
+	}
+	val = 131072;
+	if (setsockopt(tctx->tc_fd, SOL_SOCKET, SO_SNDBUF, &val,
+	    sizeof(val)) == -1) {
+		pjdlog_warning("Unable to set send buffer size on %s", addr);
+	}
+	val = 131072;
+	if (setsockopt(tctx->tc_fd, SOL_SOCKET, SO_RCVBUF, &val,
+	    sizeof(val)) == -1) {
+		pjdlog_warning("Unable to set receive buffer size on %s", addr);
+	}
+
+	tctx->tc_side = side;
+	tctx->tc_magic = TCP4_CTX_MAGIC;
+	*ctxp = tctx;
+
+	return (0);
+}
+
+/*
+ * Create a client-side tcp4 context for "addr".  Simply forwards to
+ * tcp4_common_setup() with TCP4_SIDE_CLIENT and returns its result.
+ */
+static int
+tcp4_client(const char *addr, void **ctxp)
+{
+
+ return (tcp4_common_setup(addr, ctxp, TCP4_SIDE_CLIENT));
+}
+
+/*
+ * Connect a client context to the address stored in it.
+ * Returns 0 on success or the errno left by connect(2).
+ */
+static int
+tcp4_connect(void *ctx)
+{
+	struct tcp4_ctx *tctx = ctx;
+	const struct sockaddr *target;
+	int error;
+
+	assert(tctx != NULL);
+	assert(tctx->tc_magic == TCP4_CTX_MAGIC);
+	assert(tctx->tc_side == TCP4_SIDE_CLIENT);
+	assert(tctx->tc_fd >= 0);
+
+	target = (struct sockaddr *)&tctx->tc_sin;
+	error = 0;
+	if (connect(tctx->tc_fd, target, sizeof(tctx->tc_sin)) < 0)
+		error = errno;
+
+	return (error);
+}
+
+/*
+ * Create a listening tcp4 context bound to "addr".
+ *
+ * The context is built by tcp4_common_setup(); the socket is then bound
+ * and put into listening state with a backlog of 8.  If either step fails
+ * the context is destroyed with tcp4_close() and the errno of the failing
+ * call is returned.  Returns 0 on success.
+ */
+static int
+tcp4_server(const char *addr, void **ctxp)
+{
+	struct tcp4_ctx *tctx;
+	int error, enable;
+
+	error = tcp4_common_setup(addr, ctxp, TCP4_SIDE_SERVER_LISTEN);
+	if (error != 0)
+		return (error);
+
+	tctx = *ctxp;
+
+	/* Address reuse is nice to have; failure to enable it is ignored. */
+	enable = 1;
+	(void)setsockopt(tctx->tc_fd, SOL_SOCKET, SO_REUSEADDR, &enable,
+	    sizeof(enable));
+
+	/* listen(2) is attempted only when bind(2) succeeded. */
+	if (bind(tctx->tc_fd, (struct sockaddr *)&tctx->tc_sin,
+	    sizeof(tctx->tc_sin)) < 0 ||
+	    listen(tctx->tc_fd, 8) < 0) {
+		error = errno;
+		tcp4_close(tctx);
+		return (error);
+	}
+
+	return (0);
+}
+
+/*
+ * Accept one connection on a listening context.
+ *
+ * A fresh context of kind TCP4_SIDE_SERVER_WORK is allocated for the new
+ * connection and stored in *newctxp.  Returns 0 on success or an errno
+ * value when the allocation or accept(2) fails, in which case *newctxp is
+ * not touched.
+ */
+static int
+tcp4_accept(void *ctx, void **newctxp)
+{
+	struct tcp4_ctx *tctx = ctx;
+	struct tcp4_ctx *newtctx;
+	socklen_t fromlen;
+	int ret;
+
+	assert(tctx != NULL);
+	assert(tctx->tc_magic == TCP4_CTX_MAGIC);
+	assert(tctx->tc_side == TCP4_SIDE_SERVER_LISTEN);
+	assert(tctx->tc_fd >= 0);
+
+	newtctx = malloc(sizeof(*newtctx));
+	if (newtctx == NULL)
+		return (errno);
+
+	/*
+	 * Record the peer's address in the new context.  Writing it into
+	 * the listening context would clobber the address we are bound to
+	 * and leave the new context's tc_sin uninitialized.
+	 */
+	fromlen = sizeof(newtctx->tc_sin);
+	newtctx->tc_fd = accept(tctx->tc_fd,
+	    (struct sockaddr *)&newtctx->tc_sin, &fromlen);
+	if (newtctx->tc_fd < 0) {
+		/* Save errno before free(3) gets a chance to change it. */
+		ret = errno;
+		free(newtctx);
+		return (ret);
+	}
+
+	newtctx->tc_side = TCP4_SIDE_SERVER_WORK;
+	newtctx->tc_magic = TCP4_CTX_MAGIC;
+	*newctxp = newtctx;
+
+	return (0);
+}
+
+/*
+ * Send "size" bytes from "data" over the context's socket.
+ * The work is delegated to proto_common_send(), whose result is returned.
+ */
+static int
+tcp4_send(void *ctx, const unsigned char *data, size_t size)
+{
+ struct tcp4_ctx *tctx = ctx;
+
+ assert(tctx != NULL);
+ assert(tctx->tc_magic == TCP4_CTX_MAGIC);
+ assert(tctx->tc_fd >= 0);
+
+ return (proto_common_send(tctx->tc_fd, data, size));
+}
+
+/*
+ * Receive "size" bytes into "data" from the context's socket.
+ * The work is delegated to proto_common_recv(), whose result is returned.
+ */
+static int
+tcp4_recv(void *ctx, unsigned char *data, size_t size)
+{
+ struct tcp4_ctx *tctx = ctx;
+
+ assert(tctx != NULL);
+ assert(tctx->tc_magic == TCP4_CTX_MAGIC);
+ assert(tctx->tc_fd >= 0);
+
+ return (proto_common_recv(tctx->tc_fd, data, size));
+}
+
+/*
+ * Return the socket descriptor held by the context.  Unlike the send and
+ * receive hooks, the descriptor is not asserted to be non-negative here.
+ */
+static int
+tcp4_descriptor(const void *ctx)
+{
+ const struct tcp4_ctx *tctx = ctx;
+
+ assert(tctx != NULL);
+ assert(tctx->tc_magic == TCP4_CTX_MAGIC);
+
+ return (tctx->tc_fd);
+}
+
+/*
+ * Format an IPv4 socket address as "tcp4://a.b.c.d:port" into "addr",
+ * writing at most "size" bytes (snprintf(3) truncates when needed).
+ */
+static void
+sin2str(struct sockaddr_in *sinp, char *addr, size_t size)
+{
+	unsigned int octets[4], portno;
+	in_addr_t hostorder;
+	int i;
+
+	assert(addr != NULL);
+	assert(sinp->sin_family == AF_INET);
+
+	hostorder = ntohl(sinp->sin_addr.s_addr);
+	portno = ntohs(sinp->sin_port);
+	/* octets[0] is the most significant byte of the address. */
+	for (i = 0; i < 4; i++)
+		octets[i] = (hostorder >> (24 - 8 * i)) & 0xff;
+	snprintf(addr, size, "tcp4://%u.%u.%u.%u:%u", octets[0], octets[1],
+	    octets[2], octets[3], portno);
+}
+
+/*
+ * Tell whether the peer connected on this context has the IPv4 address
+ * named by "addr".  Only the address is compared; ports are ignored.
+ * Returns false when "addr" cannot be parsed or the peer is unknown.
+ */
+static bool
+tcp4_address_match(const void *ctx, const char *addr)
+{
+	const struct tcp4_ctx *tctx = ctx;
+	struct sockaddr_in expected, peer;
+	socklen_t peerlen;
+
+	assert(tctx != NULL);
+	assert(tctx->tc_magic == TCP4_CTX_MAGIC);
+
+	if (tcp4_addr(addr, &expected) != 0)
+		return (false);
+
+	peerlen = sizeof(peer);
+	if (getpeername(tctx->tc_fd, (struct sockaddr *)&peer, &peerlen) < 0)
+		return (false);
+
+	return (expected.sin_addr.s_addr == peer.sin_addr.s_addr);
+}
+
+/*
+ * Write the local end of the connection to "addr" in sin2str() format,
+ * or the string "N/A" when getsockname(2) fails.
+ */
+static void
+tcp4_local_address(const void *ctx, char *addr, size_t size)
+{
+	const struct tcp4_ctx *tctx = ctx;
+	struct sockaddr_in local;
+	socklen_t locallen = sizeof(local);
+
+	assert(tctx != NULL);
+	assert(tctx->tc_magic == TCP4_CTX_MAGIC);
+
+	if (getsockname(tctx->tc_fd, (struct sockaddr *)&local,
+	    &locallen) >= 0) {
+		sin2str(&local, addr, size);
+	} else {
+		strlcpy(addr, "N/A", size);
+	}
+}
+
+/*
+ * Write the remote end of the connection to "addr" in sin2str() format,
+ * or the string "N/A" when getpeername(2) fails.
+ */
+static void
+tcp4_remote_address(const void *ctx, char *addr, size_t size)
+{
+	const struct tcp4_ctx *tctx = ctx;
+	struct sockaddr_in remote;
+	socklen_t remotelen = sizeof(remote);
+
+	assert(tctx != NULL);
+	assert(tctx->tc_magic == TCP4_CTX_MAGIC);
+
+	if (getpeername(tctx->tc_fd, (struct sockaddr *)&remote,
+	    &remotelen) >= 0) {
+		sin2str(&remote, addr, size);
+	} else {
+		strlcpy(addr, "N/A", size);
+	}
+}
+
+/*
+ * Destroy a tcp4 context: close the socket if one is open, invalidate the
+ * magic so stale pointers trip the asserts, and free the memory.
+ */
+static void
+tcp4_close(void *ctx)
+{
+	struct tcp4_ctx *tctx = ctx;
+	int sock;
+
+	assert(tctx != NULL);
+	assert(tctx->tc_magic == TCP4_CTX_MAGIC);
+
+	sock = tctx->tc_fd;
+	tctx->tc_magic = 0;
+	if (!(sock < 0))
+		close(sock);
+	free(tctx);
+}
+
+/*
+ * Operations table for the "tcp4" protocol; every hook points at the
+ * tcp4_* implementation in this file.  Handed to proto_register() by
+ * tcp4_ctor().
+ */
+static struct hast_proto tcp4_proto = {
+ .hp_name = "tcp4",
+ .hp_client = tcp4_client,
+ .hp_connect = tcp4_connect,
+ .hp_server = tcp4_server,
+ .hp_accept = tcp4_accept,
+ .hp_send = tcp4_send,
+ .hp_recv = tcp4_recv,
+ .hp_descriptor = tcp4_descriptor,
+ .hp_address_match = tcp4_address_match,
+ .hp_local_address = tcp4_local_address,
+ .hp_remote_address = tcp4_remote_address,
+ .hp_close = tcp4_close
+};
+
+/*
+ * Register the tcp4 protocol with the generic proto layer.
+ * NOTE(review): __constructor presumably makes this run automatically at
+ * program start-up, before main() - confirm against <sys/cdefs.h>.
+ */
+static __constructor void
+tcp4_ctor(void)
+{
+
+ proto_register(&tcp4_proto);
+}
diff --git a/sbin/hastd/proto_uds.c b/sbin/hastd/proto_uds.c
new file mode 100644
index 0000000..0fac82f
--- /dev/null
+++ b/sbin/hastd/proto_uds.c
@@ -0,0 +1,330 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/* UDS - UNIX Domain Socket */
+
+#include <sys/un.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "hast.h"
+#include "proto_impl.h"
+
+/* Value stored in uc_magic while a context is valid; checked by asserts. */
+#define UDS_CTX_MAGIC 0xd541c
+/* Per-socket state of the UNIX domain socket protocol. */
+struct uds_ctx {
+ int uc_magic; /* UDS_CTX_MAGIC; zeroed by uds_close(). */
+ struct sockaddr_un uc_sun; /* Address filled in by uds_addr(). */
+ int uc_fd; /* Socket descriptor. */
+ int uc_side; /* One of the UDS_SIDE_* values below. */
+#define UDS_SIDE_CLIENT 0
+#define UDS_SIDE_SERVER_LISTEN 1
+#define UDS_SIDE_SERVER_WORK 2
+};
+
+static void uds_close(void *ctx);
+
+/*
+ * Parse a UNIX domain socket address into *sunp.
+ *
+ * Accepted forms are "uds://<path>", "unix://<path>" and a bare absolute
+ * path that contains no "://".  Returns 0 on success, -1 when addr is NULL
+ * or is not a UDS address, and ENAMETOOLONG when the path does not fit in
+ * sun_path.
+ */
+static int
+uds_addr(const char *addr, struct sockaddr_un *sunp)
+{
+	const char *path;
+	size_t needed;
+
+	if (addr == NULL)
+		return (-1);
+
+	if (strncasecmp(addr, "uds://", 6) == 0) {
+		path = addr + 6;
+	} else if (strncasecmp(addr, "unix://", 7) == 0) {
+		path = addr + 7;
+	} else if (addr[0] != '/' || strstr(addr, "://") != NULL) {
+		/* Neither our prefix nor a plain absolute path. */
+		return (-1);
+	} else {
+		/* It starts with '/' and has no prefix: assume it is ours. */
+		path = addr;
+	}
+
+	sunp->sun_family = AF_UNIX;
+	needed = strlcpy(sunp->sun_path, path, sizeof(sunp->sun_path));
+	if (needed >= sizeof(sunp->sun_path))
+		return (ENAMETOOLONG);
+	sunp->sun_len = SUN_LEN(sunp);
+
+	return (0);
+}
+
+/*
+ * Shared set-up for the client and the listening side: allocate a context,
+ * parse "addr" into it and create the socket.
+ *
+ * On success the context is stored in *ctxp and 0 is returned.  Otherwise
+ * the context is freed, *ctxp is left alone and the value reported by
+ * uds_addr() or the errno of the failing call is returned.
+ */
+static int
+uds_common_setup(const char *addr, void **ctxp, int side)
+{
+	struct uds_ctx *newctx;
+	int error;
+
+	newctx = malloc(sizeof(*newctx));
+	if (newctx == NULL)
+		return (errno);
+
+	/* Parse given address. */
+	error = uds_addr(addr, &newctx->uc_sun);
+	if (error != 0)
+		goto fail;
+
+	newctx->uc_fd = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (newctx->uc_fd == -1) {
+		error = errno;
+		goto fail;
+	}
+
+	newctx->uc_magic = UDS_CTX_MAGIC;
+	newctx->uc_side = side;
+	*ctxp = newctx;
+
+	return (0);
+fail:
+	free(newctx);
+	return (error);
+}
+
+/*
+ * Create a client-side UDS context for "addr".  Simply forwards to
+ * uds_common_setup() with UDS_SIDE_CLIENT and returns its result.
+ */
+static int
+uds_client(const char *addr, void **ctxp)
+{
+
+ return (uds_common_setup(addr, ctxp, UDS_SIDE_CLIENT));
+}
+
+/*
+ * Connect a client context to the socket path stored in it.
+ * Returns 0 on success or the errno left by connect(2).
+ */
+static int
+uds_connect(void *ctx)
+{
+	struct uds_ctx *uctx = ctx;
+	const struct sockaddr *target;
+	int error;
+
+	assert(uctx != NULL);
+	assert(uctx->uc_magic == UDS_CTX_MAGIC);
+	assert(uctx->uc_side == UDS_SIDE_CLIENT);
+	assert(uctx->uc_fd >= 0);
+
+	target = (struct sockaddr *)&uctx->uc_sun;
+	error = 0;
+	if (connect(uctx->uc_fd, target, sizeof(uctx->uc_sun)) < 0)
+		error = errno;
+
+	return (error);
+}
+
+/*
+ * Create a listening UDS context bound to "addr".
+ *
+ * Any file already present at the socket path is removed first so that
+ * bind(2) can create it.  The socket then listens with a backlog of 8.
+ * If bind(2) or listen(2) fails the context is destroyed with uds_close()
+ * and the errno of the failing call is returned.  Returns 0 on success.
+ */
+static int
+uds_server(const char *addr, void **ctxp)
+{
+	struct uds_ctx *listener;
+	int error;
+
+	error = uds_common_setup(addr, ctxp, UDS_SIDE_SERVER_LISTEN);
+	if (error != 0)
+		return (error);
+
+	listener = *ctxp;
+
+	unlink(listener->uc_sun.sun_path);
+	/* listen(2) is attempted only when bind(2) succeeded. */
+	if (bind(listener->uc_fd, (struct sockaddr *)&listener->uc_sun,
+	    sizeof(listener->uc_sun)) < 0 ||
+	    listen(listener->uc_fd, 8) < 0) {
+		error = errno;
+		uds_close(listener);
+		return (error);
+	}
+
+	return (0);
+}
+
+/*
+ * Accept one connection on a listening context.
+ *
+ * A fresh context of kind UDS_SIDE_SERVER_WORK is allocated for the new
+ * connection and stored in *newctxp.  Returns 0 on success or an errno
+ * value when the allocation or accept(2) fails, in which case *newctxp is
+ * not touched.
+ */
+static int
+uds_accept(void *ctx, void **newctxp)
+{
+	struct uds_ctx *uctx = ctx;
+	struct uds_ctx *newuctx;
+	int ret;
+
+	assert(uctx != NULL);
+	assert(uctx->uc_magic == UDS_CTX_MAGIC);
+	assert(uctx->uc_side == UDS_SIDE_SERVER_LISTEN);
+	assert(uctx->uc_fd >= 0);
+
+	newuctx = malloc(sizeof(*newuctx));
+	if (newuctx == NULL)
+		return (errno);
+
+	/*
+	 * Do not let accept(2) store the peer's address in the listening
+	 * context: uds_close() uses the listener's uc_sun to remove the
+	 * socket file, so it must keep the path we are bound to.  The new
+	 * context gets an empty path instead of uninitialized memory, as
+	 * uds_close() reads uc_sun.sun_path of every context it is given.
+	 * The peer address is not needed here; uds_remote_address() asks
+	 * the kernel for it.
+	 */
+	memset(&newuctx->uc_sun, 0, sizeof(newuctx->uc_sun));
+	newuctx->uc_fd = accept(uctx->uc_fd, NULL, NULL);
+	if (newuctx->uc_fd < 0) {
+		/* Save errno before free(3) gets a chance to change it. */
+		ret = errno;
+		free(newuctx);
+		return (ret);
+	}
+
+	newuctx->uc_side = UDS_SIDE_SERVER_WORK;
+	newuctx->uc_magic = UDS_CTX_MAGIC;
+	*newctxp = newuctx;
+
+	return (0);
+}
+
+/*
+ * Send "size" bytes from "data" over the context's socket.
+ * The work is delegated to proto_common_send(), whose result is returned.
+ */
+static int
+uds_send(void *ctx, const unsigned char *data, size_t size)
+{
+ struct uds_ctx *uctx = ctx;
+
+ assert(uctx != NULL);
+ assert(uctx->uc_magic == UDS_CTX_MAGIC);
+ assert(uctx->uc_fd >= 0);
+
+ return (proto_common_send(uctx->uc_fd, data, size));
+}
+
+/*
+ * Receive "size" bytes into "data" from the context's socket.
+ * The work is delegated to proto_common_recv(), whose result is returned.
+ */
+static int
+uds_recv(void *ctx, unsigned char *data, size_t size)
+{
+ struct uds_ctx *uctx = ctx;
+
+ assert(uctx != NULL);
+ assert(uctx->uc_magic == UDS_CTX_MAGIC);
+ assert(uctx->uc_fd >= 0);
+
+ return (proto_common_recv(uctx->uc_fd, data, size));
+}
+
+/*
+ * Return the socket descriptor held by the context.  Unlike the send and
+ * receive hooks, the descriptor is not asserted to be non-negative here.
+ */
+static int
+uds_descriptor(const void *ctx)
+{
+ const struct uds_ctx *uctx = ctx;
+
+ assert(uctx != NULL);
+ assert(uctx->uc_magic == UDS_CTX_MAGIC);
+
+ return (uctx->uc_fd);
+}
+
+/*
+ * Address matching is not implemented for UNIX domain sockets.  The
+ * assertion fires when assertions are enabled and abort() ends the process
+ * either way, so this function never returns.
+ */
+static bool
+uds_address_match(const void *ctx __unused, const char *addr __unused)
+{
+
+ assert(!"proto_address_match() not supported on UNIX domain sockets");
+ abort();
+}
+
+/*
+ * Write the local socket name to "addr" as "uds://<path>".  The string
+ * "N/A" is written instead when getsockname(2) fails or the socket has no
+ * path.
+ */
+static void
+uds_local_address(const void *ctx, char *addr, size_t size)
+{
+	const struct uds_ctx *uctx = ctx;
+	struct sockaddr_un sun;
+	socklen_t namelen = sizeof(sun);
+
+	assert(uctx != NULL);
+	assert(uctx->uc_magic == UDS_CTX_MAGIC);
+	assert(addr != NULL);
+
+	if (getsockname(uctx->uc_fd, (struct sockaddr *)&sun, &namelen) < 0)
+		goto unknown;
+	assert(sun.sun_family == AF_UNIX);
+	if (sun.sun_path[0] == '\0')
+		goto unknown;
+	snprintf(addr, size, "uds://%s", sun.sun_path);
+	return;
+unknown:
+	strlcpy(addr, "N/A", size);
+}
+
+/*
+ * Write the peer's socket name to "addr" as "uds://<path>".  The string
+ * "N/A" is written instead when getpeername(2) fails or the peer has no
+ * path.
+ */
+static void
+uds_remote_address(const void *ctx, char *addr, size_t size)
+{
+	const struct uds_ctx *uctx = ctx;
+	struct sockaddr_un sun;
+	socklen_t namelen = sizeof(sun);
+
+	assert(uctx != NULL);
+	assert(uctx->uc_magic == UDS_CTX_MAGIC);
+	assert(addr != NULL);
+
+	if (getpeername(uctx->uc_fd, (struct sockaddr *)&sun, &namelen) < 0)
+		goto unknown;
+	assert(sun.sun_family == AF_UNIX);
+	if (sun.sun_path[0] == '\0')
+		goto unknown;
+	snprintf(addr, size, "uds://%s", sun.sun_path);
+	return;
+unknown:
+	strlcpy(addr, "N/A", size);
+}
+
+/*
+ * Destroy a UDS context: close the socket if one is open, remove the
+ * socket file when this context is the listener, invalidate the magic and
+ * free the memory.
+ */
+static void
+uds_close(void *ctx)
+{
+	struct uds_ctx *uctx = ctx;
+
+	assert(uctx != NULL);
+	assert(uctx->uc_magic == UDS_CTX_MAGIC);
+
+	if (uctx->uc_fd >= 0)
+		close(uctx->uc_fd);
+	/*
+	 * Only the listening side bound the path (see uds_server()), so
+	 * only it may remove the file.  A client closing its connection
+	 * must not unlink the server's socket out from under it, and a
+	 * context produced by uds_accept() has no path of its own.
+	 */
+	if (uctx->uc_side == UDS_SIDE_SERVER_LISTEN)
+		unlink(uctx->uc_sun.sun_path);
+	uctx->uc_magic = 0;
+	free(uctx);
+}
+
+/*
+ * Operations table for the "uds" protocol; every hook points at the uds_*
+ * implementation in this file.  Handed to proto_register() by uds_ctor().
+ */
+static struct hast_proto uds_proto = {
+ .hp_name = "uds",
+ .hp_client = uds_client,
+ .hp_connect = uds_connect,
+ .hp_server = uds_server,
+ .hp_accept = uds_accept,
+ .hp_send = uds_send,
+ .hp_recv = uds_recv,
+ .hp_descriptor = uds_descriptor,
+ .hp_address_match = uds_address_match,
+ .hp_local_address = uds_local_address,
+ .hp_remote_address = uds_remote_address,
+ .hp_close = uds_close
+};
+
+/*
+ * Register the UDS protocol with the generic proto layer.
+ * NOTE(review): __constructor presumably makes this run automatically at
+ * program start-up, before main() - confirm against <sys/cdefs.h>.
+ */
+static __constructor void
+uds_ctor(void)
+{
+
+ proto_register(&uds_proto);
+}
diff --git a/sbin/hastd/rangelock.c b/sbin/hastd/rangelock.c
new file mode 100644
index 0000000..02247d6
--- /dev/null
+++ b/sbin/hastd/rangelock.c
@@ -0,0 +1,137 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/queue.h>
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "rangelock.h"
+
+/* Value stored in rls_magic while the set is valid; checked by asserts. */
+#define RANGELOCKS_MAGIC 0x94310c
+/* A set of locked ranges, kept as an unsorted tail queue. */
+struct rangelocks {
+ int rls_magic; /* Magic value. */
+ TAILQ_HEAD(, rlock) rls_locks; /* List of locked ranges. */
+};
+
+/* One locked range; rangelock_add() sets rl_end to offset + length. */
+struct rlock {
+ off_t rl_start; /* First offset covered by the range. */
+ off_t rl_end; /* Offset just past the end of the range. */
+ TAILQ_ENTRY(rlock) rl_next; /* Linkage on rls_locks. */
+};
+
+/*
+ * Allocate an empty set of range locks and hand it back through *rlsp.
+ * Returns 0 on success, or -1 (leaving *rlsp alone) when memory cannot be
+ * allocated.
+ */
+int
+rangelock_init(struct rangelocks **rlsp)
+{
+	struct rangelocks *fresh;
+
+	assert(rlsp != NULL);
+
+	if ((fresh = malloc(sizeof(*fresh))) == NULL)
+		return (-1);
+
+	fresh->rls_magic = RANGELOCKS_MAGIC;
+	TAILQ_INIT(&fresh->rls_locks);
+	*rlsp = fresh;
+
+	return (0);
+}
+
+/*
+ * Destroy a set of range locks together with every range still on it.
+ * The magic is cleared first so stale pointers trip the asserts.
+ */
+void
+rangelock_free(struct rangelocks *rls)
+{
+	struct rlock *head;
+
+	assert(rls->rls_magic == RANGELOCKS_MAGIC);
+
+	rls->rls_magic = 0;
+
+	/* Keep popping the head until the queue is drained. */
+	for (;;) {
+		head = TAILQ_FIRST(&rls->rls_locks);
+		if (head == NULL)
+			break;
+		TAILQ_REMOVE(&rls->rls_locks, head, rl_next);
+		free(head);
+	}
+	free(rls);
+}
+
+/*
+ * Record the range [offset, offset + length) as locked.  No check is made
+ * for overlap with ranges already recorded.
+ * Returns 0 on success or -1 when memory cannot be allocated.
+ */
+int
+rangelock_add(struct rangelocks *rls, off_t offset, off_t length)
+{
+	struct rlock *entry;
+
+	assert(rls->rls_magic == RANGELOCKS_MAGIC);
+
+	entry = malloc(sizeof(*entry));
+	if (entry != NULL) {
+		entry->rl_start = offset;
+		entry->rl_end = offset + length;
+		TAILQ_INSERT_TAIL(&rls->rls_locks, entry, rl_next);
+		return (0);
+	}
+	return (-1);
+}
+
+/*
+ * Drop the first recorded range that matches [offset, offset + length)
+ * exactly.  The range must have been added before; the assert catches a
+ * missing one.
+ */
+void
+rangelock_del(struct rangelocks *rls, off_t offset, off_t length)
+{
+	struct rlock *rl;
+
+	assert(rls->rls_magic == RANGELOCKS_MAGIC);
+
+	/* Walk the queue until both ends of a range match. */
+	rl = TAILQ_FIRST(&rls->rls_locks);
+	while (rl != NULL &&
+	    (rl->rl_start != offset || rl->rl_end != offset + length)) {
+		rl = TAILQ_NEXT(rl, rl_next);
+	}
+	assert(rl != NULL);
+	TAILQ_REMOVE(&rls->rls_locks, rl, rl_next);
+	free(rl);
+}
+
+/*
+ * Tell whether [offset, offset + length) touches any recorded range.
+ *
+ * A recorded range counts when it starts inside the queried range, ends
+ * inside it, or encloses it entirely.
+ */
+bool
+rangelock_islocked(struct rangelocks *rls, off_t offset, off_t length)
+{
+	struct rlock *cur;
+
+	assert(rls->rls_magic == RANGELOCKS_MAGIC);
+
+	for (cur = TAILQ_FIRST(&rls->rls_locks); cur != NULL;
+	    cur = TAILQ_NEXT(cur, rl_next)) {
+		/* Recorded range starts inside the queried one. */
+		if (cur->rl_start >= offset && cur->rl_start < offset + length)
+			return (true);
+		/* Recorded range ends inside the queried one. */
+		if (cur->rl_end > offset && cur->rl_end <= offset + length)
+			return (true);
+		/* Recorded range encloses the queried one. */
+		if (cur->rl_start < offset && cur->rl_end > offset + length)
+			return (true);
+	}
+	return (false);
+}
diff --git a/sbin/hastd/rangelock.h b/sbin/hastd/rangelock.h
new file mode 100644
index 0000000..2ad9895
--- /dev/null
+++ b/sbin/hastd/rangelock.h
@@ -0,0 +1,46 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _RANGELOCK_H_
+#define _RANGELOCK_H_
+
+#include <stdbool.h>
+#include <unistd.h>
+
+/* Opaque set of locked ranges; the layout is private to rangelock.c. */
+struct rangelocks;
+
+/* Allocate an empty set into *rlsp; returns 0, or -1 if out of memory. */
+int rangelock_init(struct rangelocks **rlsp);
+/* Free the set and every range still recorded in it. */
+void rangelock_free(struct rangelocks *rls);
+/* Record [offset, offset + length); returns 0, or -1 if out of memory. */
+int rangelock_add(struct rangelocks *rls, off_t offset, off_t length);
+/* Remove a range previously added with exactly this offset and length. */
+void rangelock_del(struct rangelocks *rls, off_t offset, off_t length);
+/* Return true if the given range touches any recorded range. */
+bool rangelock_islocked(struct rangelocks *rls, off_t offset, off_t length);
+
+#endif /* !_RANGELOCK_H_ */
diff --git a/sbin/hastd/secondary.c b/sbin/hastd/secondary.c
new file mode 100644
index 0000000..6af95b5
--- /dev/null
+++ b/sbin/hastd/secondary.c
@@ -0,0 +1,697 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/bio.h>
+#include <sys/disk.h>
+#include <sys/stat.h>
+
+#include <assert.h>
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libgeom.h>
+#include <pthread.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include <activemap.h>
+#include <nv.h>
+#include <pjdlog.h>
+
+#include "control.h"
+#include "hast.h"
+#include "hast_proto.h"
+#include "hastd.h"
+#include "metadata.h"
+#include "proto.h"
+#include "subr.h"
+#include "synch.h"
+
+/*
+ * One in-flight request received from the primary node.  Requests are
+ * preallocated by init_environment() and travel between the free, disk and
+ * send lists.
+ */
+struct hio {
+ /* Not referenced in this file; send_thread() copies "seq" from hio_nv. */
+ uint64_t hio_seq;
+ /* errno-style result of the request; 0 means success. */
+ int hio_error;
+ /* Request header filled in by recv_thread(), freed by send_thread(). */
+ struct nv *hio_nv;
+ /* MAXPHYS-byte data buffer allocated once in init_environment(). */
+ void *hio_data;
+ /* HIO_* command taken from the "cmd" header field. */
+ uint8_t hio_cmd;
+ /* "offset" header field (READ, WRITE and DELETE requests). */
+ uint64_t hio_offset;
+ /* "length" header field (READ, WRITE and DELETE requests). */
+ uint64_t hio_length;
+ /* Linkage on whichever list currently holds the request. */
+ TAILQ_ENTRY(hio) hio_next;
+};
+
+/*
+ * Free list holds unused structures. When free list is empty, we have to wait
+ * until some in-progress requests are freed.
+ */
+static TAILQ_HEAD(, hio) hio_free_list;
+static pthread_mutex_t hio_free_list_lock;
+static pthread_cond_t hio_free_list_cond;
+/*
+ * Disk thread (the one that does I/O requests) takes requests from this list.
+ */
+static TAILQ_HEAD(, hio) hio_disk_list;
+static pthread_mutex_t hio_disk_list_lock;
+static pthread_cond_t hio_disk_list_cond;
+/*
+ * Send thread takes requests from this list and sends the replies back to
+ * the primary node.
+ */
+static TAILQ_HEAD(, hio) hio_send_list;
+static pthread_mutex_t hio_send_list_lock;
+static pthread_cond_t hio_send_list_cond;
+
+/*
+ * Maximum number of outstanding I/O requests.
+ */
+#define HAST_HIO_MAX 256
+
+static void *recv_thread(void *arg);
+static void *disk_thread(void *arg);
+static void *send_thread(void *arg);
+
+/*
+ * Prepare the worker's request queues and the pool of preallocated requests.
+ * Exits through errx() with EX_TEMPFAIL when memory cannot be allocated.
+ */
+static void
+init_environment(void)
+{
+	struct hio *req;
+	unsigned int left;
+
+	/*
+	 * Set up the three request lists, then the mutex and the condition
+	 * variable that go with each of them.
+	 */
+	TAILQ_INIT(&hio_free_list);
+	TAILQ_INIT(&hio_disk_list);
+	TAILQ_INIT(&hio_send_list);
+	mtx_init(&hio_free_list_lock);
+	mtx_init(&hio_disk_list_lock);
+	mtx_init(&hio_send_list_lock);
+	cv_init(&hio_free_list_cond);
+	cv_init(&hio_disk_list_cond);
+	cv_init(&hio_send_list_cond);
+
+	/*
+	 * Preallocate HAST_HIO_MAX requests, each with its own MAXPHYS-byte
+	 * data buffer, and park them on the free list.
+	 */
+	for (left = HAST_HIO_MAX; left > 0; left--) {
+		req = malloc(sizeof(*req));
+		if (req == NULL) {
+			errx(EX_TEMPFAIL, "cannot allocate %zu bytes of memory "
+			    "for hio request", sizeof(*req));
+		}
+		req->hio_data = malloc(MAXPHYS);
+		if (req->hio_data == NULL) {
+			errx(EX_TEMPFAIL, "cannot allocate %zu bytes of memory "
+			    "for gctl_data", (size_t)MAXPHYS);
+		}
+		req->hio_error = 0;
+		TAILQ_INSERT_HEAD(&hio_free_list, req, hio_next);
+	}
+}
+
+/*
+ * Load the resource metadata; the process exits with EX_NOINPUT when
+ * metadata_read() reports a failure.
+ */
+static void
+init_local(struct hast_resource *res)
+{
+
+	if (metadata_read(res, true) >= 0)
+		return;
+	exit(EX_NOINPUT);
+}
+
+/*
+ * Perform the initial handshake with the primary node.
+ *
+ * The primary's "resuid", "localcnt" and "remotecnt" are read from nvin.  The
+ * reply carries our datasize, extentsize, counters, the activemap size and
+ * the chosen synchronization source ("syncsrc"), followed by the activemap
+ * itself.  On split-brain an "errmsg" is sent instead of the map and the
+ * process exits with EX_CONFIG.  The process also exits when memory cannot be
+ * allocated, metadata cannot be written, the on-disk activemap cannot be read
+ * or the reply cannot be sent.
+ */
+static void
+init_remote(struct hast_resource *res, struct nv *nvin)
+{
+	uint64_t resuid;
+	struct nv *nvout;
+	unsigned char *map;
+	size_t mapsize;
+	bool splitbrain;
+
+	splitbrain = false;
+	nvout = nv_alloc();
+	nv_add_int64(nvout, (int64_t)res->hr_datasize, "datasize");
+	nv_add_int32(nvout, (int32_t)res->hr_extentsize, "extentsize");
+	resuid = nv_get_uint64(nvin, "resuid");
+	res->hr_primary_localcnt = nv_get_uint64(nvin, "localcnt");
+	res->hr_primary_remotecnt = nv_get_uint64(nvin, "remotecnt");
+	nv_add_uint64(nvout, res->hr_secondary_localcnt, "localcnt");
+	nv_add_uint64(nvout, res->hr_secondary_remotecnt, "remotecnt");
+	mapsize = activemap_calc_ondisk_size(res->hr_local_mediasize -
+	    METADATA_SIZE, res->hr_extentsize, res->hr_local_sectorsize);
+	map = malloc(mapsize);
+	if (map == NULL) {
+		pjdlog_exitx(EX_TEMPFAIL,
+		    "Unable to allocate memory (%zu bytes) for activemap.",
+		    mapsize);
+	}
+	nv_add_uint32(nvout, (uint32_t)mapsize, "mapsize");
+	/*
+	 * When we work as primary and secondary is missing we will increase
+	 * localcnt in our metadata. When secondary is connected and synced
+	 * we make localcnt be equal to remotecnt, which means nodes are more
+	 * or less in sync.
+	 * Split-brain condition is when both nodes are not able to communicate
+	 * and are both configured as primary nodes. In turn, they can both
+	 * make incompatible changes to the data and we have to detect that.
+	 * Under split-brain condition we will increase our localcnt on first
+	 * write and remote node will increase its localcnt on first write.
+	 * When we connect we can see that primary's localcnt is greater than
+	 * our remotecnt (primary was modified while we weren't watching) and
+	 * our localcnt is greater than primary's remotecnt (we were modified
+	 * while primary wasn't watching).
+	 * There are many possible combinations which are all gathered below.
+	 * Don't pay too much attention to exact numbers, the more important
+	 * is to compare them. We compare secondary's local with primary's
+	 * remote and secondary's remote with primary's local.
+	 * Note that every case where primary's localcnt is smaller than
+	 * secondary's remotecnt and where secondary's localcnt is smaller than
+	 * primary's remotecnt should be impossible in practice. We will perform
+	 * full synchronization then. Those cases are marked with an asterisk.
+	 * Regular synchronization means that only extents marked as dirty are
+	 * synchronized.
+	 *
+	 * SECONDARY METADATA	PRIMARY METADATA
+	 * local=3 remote=3	local=2 remote=2*  ?! Full sync from secondary.
+	 * local=3 remote=3	local=2 remote=3*  ?! Full sync from primary.
+	 * local=3 remote=3	local=2 remote=4*  ?! Full sync from primary.
+	 * local=3 remote=3	local=3 remote=2   Primary is out-of-date,
+	 *					   regular sync from secondary.
+	 * local=3 remote=3	local=3 remote=3   Regular sync just in case.
+	 * local=3 remote=3	local=3 remote=4*  ?! Full sync from primary.
+	 * local=3 remote=3	local=4 remote=2   Split-brain condition.
+	 * local=3 remote=3	local=4 remote=3   Secondary out-of-date,
+	 *					   regular sync from primary.
+	 * local=3 remote=3	local=4 remote=4*  ?! Full sync from primary.
+	 */
+	if (res->hr_resuid == 0) {
+		/*
+		 * Provider is used for the first time. Initialize everything.
+		 */
+		assert(res->hr_secondary_localcnt == 0);
+		res->hr_resuid = resuid;
+		if (metadata_write(res) < 0)
+			exit(EX_NOINPUT);
+		/* Every extent is marked dirty, i.e. full synchronization. */
+		memset(map, 0xff, mapsize);
+		nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc");
+	} else if (
+	    /* Is primary out-of-date? */
+	    (res->hr_secondary_localcnt > res->hr_primary_remotecnt &&
+	     res->hr_secondary_remotecnt == res->hr_primary_localcnt) ||
+	    /* Are nodes more or less in sync? */
+	    (res->hr_secondary_localcnt == res->hr_primary_remotecnt &&
+	     res->hr_secondary_remotecnt == res->hr_primary_localcnt) ||
+	    /* Is secondary out-of-date? */
+	    (res->hr_secondary_localcnt == res->hr_primary_remotecnt &&
+	     res->hr_secondary_remotecnt < res->hr_primary_localcnt)) {
+		/*
+		 * Nodes are more or less in sync or one of the nodes is
+		 * out-of-date.
+		 * It doesn't matter at this point which one, we just have to
+		 * send out local bitmap to the remote node.
+		 */
+		if (pread(res->hr_localfd, map, mapsize, METADATA_SIZE) !=
+		    (ssize_t)mapsize) {
+			/*
+			 * The first argument of pjdlog_exit() is the exit
+			 * code, not a log level.
+			 */
+			pjdlog_exit(EX_NOINPUT, "Unable to read activemap");
+		}
+		if (res->hr_secondary_localcnt > res->hr_primary_remotecnt &&
+		    res->hr_secondary_remotecnt == res->hr_primary_localcnt) {
+			/* Primary is out-of-date, sync from secondary. */
+			nv_add_uint8(nvout, HAST_SYNCSRC_SECONDARY, "syncsrc");
+		} else {
+			/*
+			 * Secondary is out-of-date or counts match.
+			 * Sync from primary.
+			 */
+			nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc");
+		}
+	} else if (res->hr_secondary_localcnt > res->hr_primary_remotecnt &&
+	    res->hr_primary_localcnt > res->hr_secondary_remotecnt) {
+		/*
+		 * Not good, we have split-brain condition.  Report it to the
+		 * primary (no activemap is sent) and exit once the reply is
+		 * out.
+		 */
+		pjdlog_error("Split-brain detected, exiting.");
+		nv_add_string(nvout, "Split-brain condition!", "errmsg");
+		free(map);
+		map = NULL;
+		mapsize = 0;
+		splitbrain = true;
+	} else /* if (res->hr_secondary_localcnt < res->hr_primary_remotecnt ||
+	    res->hr_primary_localcnt < res->hr_secondary_remotecnt) */ {
+		/*
+		 * This should never happen in practice, but we will perform
+		 * full synchronization.
+		 */
+		assert(res->hr_secondary_localcnt < res->hr_primary_remotecnt ||
+		    res->hr_primary_localcnt < res->hr_secondary_remotecnt);
+		memset(map, 0xff, mapsize);
+		if (res->hr_secondary_localcnt > res->hr_primary_remotecnt) {
+			/* In this one of five cases sync from secondary. */
+			nv_add_uint8(nvout, HAST_SYNCSRC_SECONDARY, "syncsrc");
+		} else {
+			/* For the rest four cases sync from primary. */
+			nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc");
+		}
+		pjdlog_warning("This should never happen, asking for full "
+		    "synchronization (primary(local=%ju, remote=%ju), "
+		    "secondary(local=%ju, remote=%ju)).",
+		    (uintmax_t)res->hr_primary_localcnt,
+		    (uintmax_t)res->hr_primary_remotecnt,
+		    (uintmax_t)res->hr_secondary_localcnt,
+		    (uintmax_t)res->hr_secondary_remotecnt);
+	}
+	if (hast_proto_send(res, res->hr_remotein, nvout, map, mapsize) < 0) {
+		pjdlog_errno(LOG_WARNING, "Unable to send activemap to %s",
+		    res->hr_remoteaddr);
+		nv_free(nvout);
+		exit(EX_TEMPFAIL);
+	}
+	/*
+	 * The reply is out; neither the header nor the map is needed anymore
+	 * (map is already NULL in the split-brain case).
+	 */
+	free(map);
+	nv_free(nvout);
+	if (splitbrain) {
+		/* Exit on split-brain. */
+		exit(EX_CONFIG);
+	}
+}
+
+/*
+ * Start a worker process for a resource in the secondary role.
+ *
+ * The parent closes its copies of the remote connections, records the
+ * worker's pid and returns.  The child finishes initialization, starts the
+ * recv, disk and send threads and then runs ctrl_thread() itself.
+ */
+void
+hastd_secondary(struct hast_resource *res, struct nv *nvin)
+{
+	static void *(* const workers[])(void *) = {
+		recv_thread,
+		disk_thread,
+		send_thread
+	};
+	pthread_t td;
+	pid_t pid;
+	size_t ii;
+	int error;
+
+	/* The parent and the worker talk to each other over a socketpair. */
+	if (proto_client("socketpair://", &res->hr_ctrl) < 0) {
+		KEEP_ERRNO((void)pidfile_remove(pfh));
+		pjdlog_exit(EX_OSERR,
+		    "Unable to create control sockets between parent and child");
+	}
+
+	pid = fork();
+	if (pid < 0) {
+		KEEP_ERRNO((void)pidfile_remove(pfh));
+		pjdlog_exit(EX_OSERR, "Unable to fork");
+	}
+	if (pid != 0) {
+		/* Parent: the connections to the primary belong to the child. */
+		proto_close(res->hr_remotein);
+		res->hr_remotein = NULL;
+		proto_close(res->hr_remoteout);
+		res->hr_remoteout = NULL;
+		res->hr_workerpid = pid;
+		return;
+	}
+
+	/* Child from here on. */
+	(void)pidfile_close(pfh);
+
+	setproctitle("%s (secondary)", res->hr_name);
+
+	init_local(res);
+	init_remote(res, nvin);
+	init_environment();
+
+	/* Start the worker threads in recv, disk, send order. */
+	for (ii = 0; ii < sizeof(workers) / sizeof(workers[0]); ii++) {
+		error = pthread_create(&td, NULL, workers[ii], res);
+		assert(error == 0);
+	}
+	(void)ctrl_thread(res);
+}
+
+/*
+ * Log a message about a request: the caller's formatted prefix followed by a
+ * short description of the request (command, and offset/length where they
+ * apply).  The description is left out when the prefix alone does not fit in
+ * the buffer or could not be formatted.
+ */
+static void
+reqlog(int loglevel, int debuglevel, int error, struct hio *hio, const char *fmt, ...)
+{
+	char msg[1024];
+	char *tail;
+	size_t room;
+	va_list ap;
+	int len;
+
+	va_start(ap, fmt);
+	len = vsnprintf(msg, sizeof(msg), fmt, ap);
+	va_end(ap);
+	if ((size_t)len >= sizeof(msg)) {
+		/* Prefix truncated (or vsnprintf failed): nothing to append. */
+		pjdlog_common(loglevel, debuglevel, error, "%s", msg);
+		return;
+	}
+
+	tail = msg + len;
+	room = sizeof(msg) - len;
+	if (hio->hio_cmd == HIO_READ) {
+		(void)snprintf(tail, room, "READ(%ju, %ju).",
+		    (uintmax_t)hio->hio_offset, (uintmax_t)hio->hio_length);
+	} else if (hio->hio_cmd == HIO_DELETE) {
+		(void)snprintf(tail, room, "DELETE(%ju, %ju).",
+		    (uintmax_t)hio->hio_offset, (uintmax_t)hio->hio_length);
+	} else if (hio->hio_cmd == HIO_FLUSH) {
+		(void)snprintf(tail, room, "FLUSH.");
+	} else if (hio->hio_cmd == HIO_WRITE) {
+		(void)snprintf(tail, room, "WRITE(%ju, %ju).",
+		    (uintmax_t)hio->hio_offset, (uintmax_t)hio->hio_length);
+	} else {
+		(void)snprintf(tail, room, "UNKNOWN(%u).",
+		    (unsigned int)hio->hio_cmd);
+	}
+	pjdlog_common(loglevel, debuglevel, error, "%s", msg);
+}
+
+/*
+ * Decode and validate the request header stored in hio->hio_nv.
+ *
+ * Fills in hio_cmd and, for READ, WRITE and DELETE, hio_offset and
+ * hio_length.  Offset and length must be multiples of the local sector size,
+ * the length must be non-zero and at most MAXPHYS, and the whole range must
+ * lie within the resource's data size.  FLUSH carries no range.
+ *
+ * Returns 0 when the request is valid, or EINVAL otherwise; the same value is
+ * stored in hio->hio_error.
+ */
+static int
+requnpack(struct hast_resource *res, struct hio *hio)
+{
+
+	hio->hio_cmd = nv_get_uint8(hio->hio_nv, "cmd");
+	if (hio->hio_cmd == 0) {
+		pjdlog_error("Header contains no 'cmd' field.");
+		hio->hio_error = EINVAL;
+		goto end;
+	}
+	switch (hio->hio_cmd) {
+	case HIO_FLUSH:
+		/*
+		 * FLUSH is serviced by disk_thread() and answered by
+		 * send_thread(); it has no offset or length to validate.
+		 */
+		break;
+	case HIO_READ:
+	case HIO_WRITE:
+	case HIO_DELETE:
+		hio->hio_offset = nv_get_uint64(hio->hio_nv, "offset");
+		if (nv_error(hio->hio_nv) != 0) {
+			pjdlog_error("Header is missing 'offset' field.");
+			hio->hio_error = EINVAL;
+			goto end;
+		}
+		hio->hio_length = nv_get_uint64(hio->hio_nv, "length");
+		if (nv_error(hio->hio_nv) != 0) {
+			pjdlog_error("Header is missing 'length' field.");
+			hio->hio_error = EINVAL;
+			goto end;
+		}
+		if (hio->hio_length == 0) {
+			pjdlog_error("Data length is zero.");
+			hio->hio_error = EINVAL;
+			goto end;
+		}
+		if (hio->hio_length > MAXPHYS) {
+			pjdlog_error("Data length is too large (%ju > %ju).",
+			    (uintmax_t)hio->hio_length, (uintmax_t)MAXPHYS);
+			hio->hio_error = EINVAL;
+			goto end;
+		}
+		if ((hio->hio_offset % res->hr_local_sectorsize) != 0) {
+			pjdlog_error("Offset %ju is not multiple of sector size.",
+			    (uintmax_t)hio->hio_offset);
+			hio->hio_error = EINVAL;
+			goto end;
+		}
+		if ((hio->hio_length % res->hr_local_sectorsize) != 0) {
+			pjdlog_error("Length %ju is not multiple of sector size.",
+			    (uintmax_t)hio->hio_length);
+			hio->hio_error = EINVAL;
+			goto end;
+		}
+		/*
+		 * The offset comes from the network, so offset + length may
+		 * wrap around; compare against the remaining space instead of
+		 * the sum.
+		 */
+		if (hio->hio_offset > (uint64_t)res->hr_datasize ||
+		    hio->hio_length >
+		    (uint64_t)res->hr_datasize - hio->hio_offset) {
+			pjdlog_error("Data offset is too large (%ju > %ju).",
+			    (uintmax_t)(hio->hio_offset + hio->hio_length),
+			    (uintmax_t)res->hr_datasize);
+			hio->hio_error = EINVAL;
+			goto end;
+		}
+		break;
+	default:
+		pjdlog_error("Header contains invalid 'cmd' (%hhu).",
+		    hio->hio_cmd);
+		hio->hio_error = EINVAL;
+		goto end;
+	}
+	hio->hio_error = 0;
+end:
+	return (hio->hio_error);
+}
+
+/*
+ * Thread receives requests from the primary node.
+ *
+ * Each iteration takes a request structure from the free list, receives and
+ * validates the request header and, for WRITE, the data.  Valid requests are
+ * queued for disk_thread(); requests rejected by requnpack() are queued
+ * directly for send_thread() so that the error is reported back.
+ * The loop never terminates; receive failures end in pjdlog_exit().
+ */
+static void *
+recv_thread(void *arg)
+{
+ struct hast_resource *res = arg;
+ struct hio *hio;
+ bool wakeup;
+
+ for (;;) {
+ pjdlog_debug(2, "recv: Taking free request.");
+ /* Wait until send_thread() returns a request to the free list. */
+ mtx_lock(&hio_free_list_lock);
+ while ((hio = TAILQ_FIRST(&hio_free_list)) == NULL) {
+ pjdlog_debug(2, "recv: No free requests, waiting.");
+ cv_wait(&hio_free_list_cond, &hio_free_list_lock);
+ }
+ TAILQ_REMOVE(&hio_free_list, hio, hio_next);
+ mtx_unlock(&hio_free_list_lock);
+ pjdlog_debug(2, "recv: (%p) Got request.", hio);
+ /* The received header is stored in hio_nv. */
+ if (hast_proto_recv_hdr(res->hr_remotein, &hio->hio_nv) < 0) {
+ pjdlog_exit(EX_TEMPFAIL,
+ "Unable to receive request header");
+ }
+ /* An invalid request skips the disk and goes straight to send. */
+ if (requnpack(res, hio) != 0)
+ goto send_queue;
+ reqlog(LOG_DEBUG, 2, -1, hio,
+ "recv: (%p) Got request header: ", hio);
+ /* Only WRITE requests are followed by data. */
+ if (hio->hio_cmd == HIO_WRITE) {
+ if (hast_proto_recv_data(res, res->hr_remotein,
+ hio->hio_nv, hio->hio_data, MAXPHYS) < 0) {
+ pjdlog_exit(EX_TEMPFAIL,
+ "Unable to receive reply data");
+ }
+ }
+ pjdlog_debug(2, "recv: (%p) Moving request to the disk queue.",
+ hio);
+ /*
+ * The consumer waits only while its list is empty, so a signal is
+ * needed only when the list was empty before this insertion.
+ */
+ mtx_lock(&hio_disk_list_lock);
+ wakeup = TAILQ_EMPTY(&hio_disk_list);
+ TAILQ_INSERT_TAIL(&hio_disk_list, hio, hio_next);
+ mtx_unlock(&hio_disk_list_lock);
+ if (wakeup)
+ cv_signal(&hio_disk_list_cond);
+ continue;
+send_queue:
+ pjdlog_debug(2, "recv: (%p) Moving request to the send queue.",
+ hio);
+ mtx_lock(&hio_send_list_lock);
+ wakeup = TAILQ_EMPTY(&hio_send_list);
+ TAILQ_INSERT_TAIL(&hio_send_list, hio, hio_next);
+ mtx_unlock(&hio_send_list_lock);
+ if (wakeup)
+ cv_signal(&hio_send_list_cond);
+ }
+ /* NOTREACHED */
+ return (NULL);
+}
+
+/*
+ * Thread reads from or writes to local component and also handles DELETE and
+ * FLUSH requests.
+ *
+ * Requests are taken from the disk list, executed against res->hr_localfd
+ * and passed on to the send list with hio_error set to 0 or an errno value.
+ */
+static void *
+disk_thread(void *arg)
+{
+ struct hast_resource *res = arg;
+ struct hio *hio;
+ ssize_t ret;
+ bool clear_activemap, wakeup;
+
+ clear_activemap = true;
+
+ for (;;) {
+ pjdlog_debug(2, "disk: Taking request.");
+ mtx_lock(&hio_disk_list_lock);
+ while ((hio = TAILQ_FIRST(&hio_disk_list)) == NULL) {
+ pjdlog_debug(2, "disk: No requests, waiting.");
+ cv_wait(&hio_disk_list_cond, &hio_disk_list_lock);
+ }
+ TAILQ_REMOVE(&hio_disk_list, hio, hio_next);
+ mtx_unlock(&hio_disk_list_lock);
+ /*
+ * This "while" acts as an "if" that can be left with "break".
+ * On failure clear_activemap stays true, so clearing is retried
+ * when the next request arrives.
+ */
+ while (clear_activemap) {
+ unsigned char *map;
+ size_t mapsize;
+
+ /*
+ * When first request is received, it means that primary
+ * already received our activemap, merged it and stored
+ * locally. We can now safely clear our activemap.
+ */
+ mapsize =
+ activemap_calc_ondisk_size(res->hr_local_mediasize -
+ METADATA_SIZE, res->hr_extentsize,
+ res->hr_local_sectorsize);
+ map = calloc(1, mapsize);
+ if (map == NULL) {
+ pjdlog_warning("Unable to allocate memory to clear local activemap.");
+ break;
+ }
+ /* Overwrite the on-disk activemap at METADATA_SIZE with zeros. */
+ if (pwrite(res->hr_localfd, map, mapsize,
+ METADATA_SIZE) != (ssize_t)mapsize) {
+ pjdlog_errno(LOG_WARNING,
+ "Unable to store cleared activemap");
+ free(map);
+ break;
+ }
+ free(map);
+ clear_activemap = false;
+ pjdlog_debug(1, "Local activemap cleared.");
+ }
+ reqlog(LOG_DEBUG, 2, -1, hio, "disk: (%p) Got request: ", hio);
+ /* Handle the actual request. */
+ /* Request offsets are shifted by res->hr_localoff. */
+ switch (hio->hio_cmd) {
+ case HIO_READ:
+ ret = pread(res->hr_localfd, hio->hio_data,
+ hio->hio_length,
+ hio->hio_offset + res->hr_localoff);
+ /* A short read is reported as EIO. */
+ if (ret < 0)
+ hio->hio_error = errno;
+ else if (ret != (int64_t)hio->hio_length)
+ hio->hio_error = EIO;
+ else
+ hio->hio_error = 0;
+ break;
+ case HIO_WRITE:
+ ret = pwrite(res->hr_localfd, hio->hio_data,
+ hio->hio_length,
+ hio->hio_offset + res->hr_localoff);
+ /* A short write is reported as EIO. */
+ if (ret < 0)
+ hio->hio_error = errno;
+ else if (ret != (int64_t)hio->hio_length)
+ hio->hio_error = EIO;
+ else
+ hio->hio_error = 0;
+ break;
+ case HIO_DELETE:
+ ret = g_delete(res->hr_localfd,
+ hio->hio_offset + res->hr_localoff,
+ hio->hio_length);
+ if (ret < 0)
+ hio->hio_error = errno;
+ else
+ hio->hio_error = 0;
+ break;
+ case HIO_FLUSH:
+ ret = g_flush(res->hr_localfd);
+ if (ret < 0)
+ hio->hio_error = errno;
+ else
+ hio->hio_error = 0;
+ break;
+ }
+ if (hio->hio_error != 0) {
+ reqlog(LOG_ERR, 0, hio->hio_error, hio,
+ "Request failed: ");
+ }
+ pjdlog_debug(2, "disk: (%p) Moving request to the send queue.",
+ hio);
+ /* Signal the send thread only if its list was empty before. */
+ mtx_lock(&hio_send_list_lock);
+ wakeup = TAILQ_EMPTY(&hio_send_list);
+ TAILQ_INSERT_TAIL(&hio_send_list, hio, hio_next);
+ mtx_unlock(&hio_send_list_lock);
+ if (wakeup)
+ cv_signal(&hio_send_list_cond);
+ }
+ /* NOTREACHED */
+ return (NULL);
+}
+
+/*
+ * Thread sends replies for processed requests back to the primary node.
+ *
+ * Each reply carries the request's sequence number, the "error" field when
+ * the request failed and, for a successful READ, the data that was read.
+ * Afterwards the request header is freed and the request structure goes back
+ * to the free list.
+ */
+static void *
+send_thread(void *arg)
+{
+	struct hast_resource *res = arg;
+	struct nv *nvout;
+	struct hio *hio;
+	void *data;
+	size_t length;
+	bool wakeup;
+
+	for (;;) {
+		pjdlog_debug(2, "send: Taking request.");
+		mtx_lock(&hio_send_list_lock);
+		while ((hio = TAILQ_FIRST(&hio_send_list)) == NULL) {
+			pjdlog_debug(2, "send: No requests, waiting.");
+			cv_wait(&hio_send_list_cond, &hio_send_list_lock);
+		}
+		TAILQ_REMOVE(&hio_send_list, hio, hio_next);
+		mtx_unlock(&hio_send_list_lock);
+		reqlog(LOG_DEBUG, 2, -1, hio, "send: (%p) Got request: ", hio);
+		nvout = nv_alloc();
+		/* Copy sequence number. */
+		nv_add_uint64(nvout, nv_get_uint64(hio->hio_nv, "seq"), "seq");
+		switch (hio->hio_cmd) {
+		case HIO_READ:
+			if (hio->hio_error == 0) {
+				data = hio->hio_data;
+				length = hio->hio_length;
+				break;
+			}
+			/*
+			 * We send no data in case of an error.
+			 */
+			/* FALLTHROUGH */
+		case HIO_DELETE:
+		case HIO_FLUSH:
+		case HIO_WRITE:
+			data = NULL;
+			length = 0;
+			break;
+		default:
+			/*
+			 * recv_thread() queues requests rejected by
+			 * requnpack() (missing or unknown 'cmd') straight to
+			 * this list with hio_error set.  Answer those with
+			 * the error alone; an unknown command without an
+			 * error is a bug in this process.
+			 */
+			if (hio->hio_error == 0)
+				abort();
+			data = NULL;
+			length = 0;
+			break;
+		}
+		if (hio->hio_error != 0)
+			nv_add_int16(nvout, hio->hio_error, "error");
+		if (hast_proto_send(res, res->hr_remoteout, nvout, data,
+		    length) < 0) {
+			pjdlog_exit(EX_TEMPFAIL, "Unable to send reply.");
+		}
+		nv_free(nvout);
+		pjdlog_debug(2, "send: (%p) Moving request to the free queue.",
+		    hio);
+		nv_free(hio->hio_nv);
+		hio->hio_error = 0;
+		/* Signal recv_thread() only if the free list was empty before. */
+		mtx_lock(&hio_free_list_lock);
+		wakeup = TAILQ_EMPTY(&hio_free_list);
+		TAILQ_INSERT_TAIL(&hio_free_list, hio, hio_next);
+		mtx_unlock(&hio_free_list_lock);
+		if (wakeup)
+			cv_signal(&hio_free_list_cond);
+	}
+	/* NOTREACHED */
+	return (NULL);
+}
diff --git a/sbin/hastd/subr.c b/sbin/hastd/subr.c
new file mode 100644
index 0000000..16ea93f
--- /dev/null
+++ b/sbin/hastd/subr.c
@@ -0,0 +1,118 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/disk.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+
+#include <pjdlog.h>
+
+#include "hast.h"
+#include "subr.h"
+
+/*
+ * Open the local provider (unless res->hr_localfd is already set) and fill in
+ * res->hr_local_mediasize and res->hr_local_sectorsize.
+ *
+ * Character devices are queried with the DIOCGMEDIASIZE and DIOCGSECTORSIZE
+ * ioctls; regular files use st_size and a fixed 512-byte sector.  Any other
+ * file type is rejected with errno set to EFTYPE.
+ *
+ * Returns 0 on success and -1 on failure; errno of the failing call is
+ * preserved across logging by KEEP_ERRNO().
+ */
+int
+provinfo(struct hast_resource *res, bool dowrite)
+{
+	struct stat sb;
+	int flags;
+
+	assert(res->hr_localpath != NULL && res->hr_localpath[0] != '\0');
+
+	if (res->hr_localfd == -1) {
+		flags = dowrite ? O_RDWR : O_RDONLY;
+		res->hr_localfd = open(res->hr_localpath, flags);
+		if (res->hr_localfd < 0) {
+			KEEP_ERRNO(pjdlog_errno(LOG_ERR, "Unable to open %s",
+			    res->hr_localpath));
+			return (-1);
+		}
+	}
+	if (fstat(res->hr_localfd, &sb) < 0) {
+		KEEP_ERRNO(pjdlog_errno(LOG_ERR, "Unable to stat %s",
+		    res->hr_localpath));
+		return (-1);
+	}
+
+	if (S_ISREG(sb.st_mode)) {
+		/* Regular files get a hardcoded sector size of 512 bytes. */
+		res->hr_local_mediasize = sb.st_size;
+		res->hr_local_sectorsize = 512;
+		return (0);
+	}
+	if (!S_ISCHR(sb.st_mode)) {
+		/* We support no other file types. */
+		pjdlog_error("%s is neither GEOM provider nor regular file.",
+		    res->hr_localpath);
+		errno = EFTYPE;
+		return (-1);
+	}
+
+	/* A character device is most likely a GEOM provider. */
+	if (ioctl(res->hr_localfd, DIOCGMEDIASIZE,
+	    &res->hr_local_mediasize) < 0) {
+		KEEP_ERRNO(pjdlog_errno(LOG_ERR,
+		    "Unable obtain provider %s mediasize",
+		    res->hr_localpath));
+		return (-1);
+	}
+	if (ioctl(res->hr_localfd, DIOCGSECTORSIZE,
+	    &res->hr_local_sectorsize) < 0) {
+		KEEP_ERRNO(pjdlog_errno(LOG_ERR,
+		    "Unable obtain provider %s sectorsize",
+		    res->hr_localpath));
+		return (-1);
+	}
+	return (0);
+}
+
+/*
+ * Map a HAST_ROLE_* value to its lower-case name; any other value yields
+ * "unknown".  The returned string is a literal and must not be freed.
+ */
+const char *
+role2str(int role)
+{
+
+	if (role == HAST_ROLE_INIT)
+		return ("init");
+	if (role == HAST_ROLE_PRIMARY)
+		return ("primary");
+	if (role == HAST_ROLE_SECONDARY)
+		return ("secondary");
+	return ("unknown");
+}
diff --git a/sbin/hastd/subr.h b/sbin/hastd/subr.h
new file mode 100644
index 0000000..c486f5c
--- /dev/null
+++ b/sbin/hastd/subr.h
@@ -0,0 +1,51 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SUBR_H_
+#define _SUBR_H_
+
+#include <sys/types.h>
+#include <stdbool.h>
+
+#include "hast.h"
+
+/*
+ * Execute the statement "work" and restore errno to the value it had before,
+ * so that logging or cleanup done on an error path does not clobber the
+ * original error code.  Wrapped in do { } while (0) so that it behaves as a
+ * single statement.  The user must have errno declared (this header does not
+ * include <errno.h> itself).
+ */
+#define KEEP_ERRNO(work) do { \
+ int _rerrno; \
+ \
+ _rerrno = errno; \
+ work; \
+ errno = _rerrno; \
+} while (0)
+
+int provinfo(struct hast_resource *res, bool dowrite);
+const char *role2str(int role);
+
+#endif /* !_SUBR_H_ */
diff --git a/sbin/hastd/synch.h b/sbin/hastd/synch.h
new file mode 100644
index 0000000..7269aea
--- /dev/null
+++ b/sbin/hastd/synch.h
@@ -0,0 +1,162 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SYNCH_H_
+#define _SYNCH_H_
+
+#include <assert.h>
+#include <errno.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <time.h>
+
+/* Initialize a mutex with default attributes; failure trips an assertion. */
+static __inline void
+mtx_init(pthread_mutex_t *lock)
+{
+	int error = pthread_mutex_init(lock, NULL);
+
+	assert(error == 0);
+}
+/* Acquire the mutex, blocking if needed; failure trips an assertion. */
+static __inline void
+mtx_lock(pthread_mutex_t *lock)
+{
+	int error = pthread_mutex_lock(lock);
+
+	assert(error == 0);
+}
+/*
+ * Try to acquire the mutex without blocking.  Returns true when the mutex
+ * was acquired and false when it is busy; any other error trips an assertion.
+ */
+static __inline bool
+mtx_trylock(pthread_mutex_t *lock)
+{
+	int error = pthread_mutex_trylock(lock);
+
+	assert(error == 0 || error == EBUSY);
+	return (error == 0);
+}
+/* Release the mutex; failure trips an assertion. */
+static __inline void
+mtx_unlock(pthread_mutex_t *lock)
+{
+	int error = pthread_mutex_unlock(lock);
+
+	assert(error == 0);
+}
+
+/* Initialize a read-write lock with default attributes. */
+static __inline void
+rw_init(pthread_rwlock_t *lock)
+{
+	int error = pthread_rwlock_init(lock, NULL);
+
+	assert(error == 0);
+}
+/* Acquire the read-write lock for reading; failure trips an assertion. */
+static __inline void
+rw_rlock(pthread_rwlock_t *lock)
+{
+	int error = pthread_rwlock_rdlock(lock);
+
+	assert(error == 0);
+}
+/* Acquire the read-write lock for writing; failure trips an assertion. */
+static __inline void
+rw_wlock(pthread_rwlock_t *lock)
+{
+	int error = pthread_rwlock_wrlock(lock);
+
+	assert(error == 0);
+}
+/* Release the read-write lock; failure trips an assertion. */
+static __inline void
+rw_unlock(pthread_rwlock_t *lock)
+{
+	int error = pthread_rwlock_unlock(lock);
+
+	assert(error == 0);
+}
+
+/*
+ * Initialize a condition variable that measures timeouts against
+ * CLOCK_MONOTONIC, matching the clock used by cv_timedwait().
+ * Any failure trips an assertion.
+ */
+static __inline void
+cv_init(pthread_cond_t *cv)
+{
+	pthread_condattr_t attr;
+	int error;
+
+	error = pthread_condattr_init(&attr);
+	assert(error == 0);
+	error = pthread_condattr_setclock(&attr, CLOCK_MONOTONIC);
+	assert(error == 0);
+	error = pthread_cond_init(cv, &attr);
+	assert(error == 0);
+	/* The attribute object is not needed once the condvar exists. */
+	error = pthread_condattr_destroy(&attr);
+	assert(error == 0);
+}
+/*
+ * Wait on the condition variable; "lock" is handed to pthread_cond_wait().
+ * Failure trips an assertion.
+ */
+static __inline void
+cv_wait(pthread_cond_t *cv, pthread_mutex_t *lock)
+{
+	int error = pthread_cond_wait(cv, lock);
+
+	assert(error == 0);
+}
+/*
+ * Wait on the condition variable for at most "timeout" seconds, measured on
+ * CLOCK_MONOTONIC.  A timeout of 0 means wait without a time limit.
+ * Returns true when the wait timed out and false otherwise.
+ */
+static __inline bool
+cv_timedwait(pthread_cond_t *cv, pthread_mutex_t *lock, int timeout)
+{
+	struct timespec deadline;
+	int error;
+
+	if (timeout != 0) {
+		/* Absolute deadline: now plus the requested seconds. */
+		error = clock_gettime(CLOCK_MONOTONIC, &deadline);
+		assert(error == 0);
+		deadline.tv_sec += timeout;
+		error = pthread_cond_timedwait(cv, lock, &deadline);
+		assert(error == 0 || error == ETIMEDOUT);
+		return (error == ETIMEDOUT);
+	}
+
+	cv_wait(cv, lock);
+	return (false);
+}
+/* Wake up one waiter of the condition variable; failure trips an assertion. */
+static __inline void
+cv_signal(pthread_cond_t *cv)
+{
+	int error = pthread_cond_signal(cv);
+
+	assert(error == 0);
+}
+/* Wake up all waiters of the condition variable; failure trips an assertion. */
+static __inline void
+cv_broadcast(pthread_cond_t *cv)
+{
+	int error = pthread_cond_broadcast(cv);
+
+	assert(error == 0);
+}
+#endif /* !_SYNCH_H_ */
diff --git a/sbin/hastd/token.l b/sbin/hastd/token.l
new file mode 100644
index 0000000..7b80384
--- /dev/null
+++ b/sbin/hastd/token.l
@@ -0,0 +1,66 @@
+%{
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include "hast.h"
+
+#include "y.tab.h"
+
+int depth;	/* Brace nesting level: incremented on '{', decremented on '}'. */
+int lineno;	/* Incremented each time the scanner matches a newline. */
+
+#define DP do { } while (0)	/* Run first in every token action; expands to an empty statement (presumably a debug hook). */
+%}
+
+%%
+control { DP; return CONTROL; }
+listen { DP; return LISTEN; }
+port { DP; return PORT; }
+replication { DP; return REPLICATION; }
+resource { DP; return RESOURCE; }
+name { DP; return NAME; }
+local { DP; return LOCAL; }
+remote { DP; return REMOTE; }
+on { DP; return ON; }
+fullsync { DP; return FULLSYNC; }
+memsync { DP; return MEMSYNC; }
+async { DP; return ASYNC; }
+[0-9]+ { DP; yylval.num = atoi(yytext); return NUM; }
+[a-zA-Z0-9\.\-_/\:]+ { DP; yylval.str = strdup(yytext); return STR; }
+\{ { DP; depth++; return OB; }
+\} { DP; depth--; return CB; }
+#.*$ /* ignore comments */;
+\n { lineno++; }
+[ \t]+ /* ignore whitespace */;
+%%
diff --git a/share/examples/Makefile b/share/examples/Makefile
index 315eb91..99d92c0 100644
--- a/share/examples/Makefile
+++ b/share/examples/Makefile
@@ -13,6 +13,7 @@ LDIRS= BSD_daemon \
drivers \
etc \
find_interface \
+ hast \
ibcs2 \
ipfw \
kld \
@@ -69,6 +70,11 @@ XFILES= BSD_daemon/FreeBSD.pfa \
find_interface/Makefile \
find_interface/README \
find_interface/find_interface.c \
+ hast/ucarp.sh \
+ hast/ucarp_down.sh \
+ hast/ucarp_up.sh \
+ hast/vip-down.sh \
+ hast/vip-up.sh \
ibcs2/README \
ibcs2/hello.uu \
ipfw/change_rules.sh \
diff --git a/share/examples/hast/ucarp.sh b/share/examples/hast/ucarp.sh
new file mode 100755
index 0000000..6a02c89
--- /dev/null
+++ b/share/examples/hast/ucarp.sh
@@ -0,0 +1,69 @@
+#!/bin/sh
+#
+# Copyright (c) 2010 The FreeBSD Foundation
+# All rights reserved.
+#
+# This software was developed by Pawel Jakub Dawidek under sponsorship from
+# the FreeBSD Foundation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD$
+
+# Shared IP address, unused for now.
+addr="10.99.0.3"
+# Password for UCARP communication.
+pass="password"
+# First node IP and interface for UCARP communication.
+nodea_srcip="10.99.0.1"
+nodea_ifnet="bge0"
+# Second node IP and interface for UCARP communication.
+nodeb_srcip="10.99.0.2"
+nodeb_ifnet="em3"
+
+export PATH=/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin
+
+vhid="1"
+upscript="/root/hast/sbin/hastd/vip-up.sh"
+downscript="/root/hast/sbin/hastd/vip-down.sh"
+
+# Identify this node: exactly one (interface, address) pair must be configured.
+if ifconfig "${nodea_ifnet}" 2>/dev/null | grep -q "inet ${nodea_srcip} "; then
+	srcip="${nodea_srcip}"
+	ifnet="${nodea_ifnet}"
+	node="node A"
+fi
+if ifconfig "${nodeb_ifnet}" 2>/dev/null | grep -q "inet ${nodeb_srcip} "; then
+	if [ -n "${srcip}" ] || [ -n "${ifnet}" ]; then
+		echo "Unable to determine which node is this (both match)." >&2
+		exit 1
+	fi
+	srcip="${nodeb_srcip}"
+	ifnet="${nodeb_ifnet}"
+	node="node B"
+fi
+if [ -z "${srcip}" ] || [ -z "${ifnet}" ]; then
+	echo "Unable to determine which node is this (none match)." >&2
+	exit 1
+fi
+# Quote every expansion so a password or path with spaces stays one argument.
+ucarp -i "${ifnet}" -s "${srcip}" -v "${vhid}" -a "${addr}" -p "${pass}" -u "${upscript}" -d "${downscript}"
diff --git a/share/examples/hast/ucarp_down.sh b/share/examples/hast/ucarp_down.sh
new file mode 100755
index 0000000..a5b3428
--- /dev/null
+++ b/share/examples/hast/ucarp_down.sh
@@ -0,0 +1,98 @@
+#!/bin/sh
+#
+# Copyright (c) 2010 The FreeBSD Foundation
+# All rights reserved.
+#
+# This software was developed by Pawel Jakub Dawidek under sponsorship from
+# the FreeBSD Foundation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD$
+
+# Resource name as defined in /etc/hast.conf.
+resource="test"
+# Supported file system types: UFS, ZFS
+fstype="UFS"
+# ZFS pool name. Required only when fstype == ZFS.
+pool="test"
+# File system mount point. Required only when fstype == UFS.
+mountpoint="/mnt/test"
+# Name of HAST provider as defined in /etc/hast.conf.
+# Required only when fstype == UFS.
+device="/dev/hast/${resource}"
+
+export PATH=/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin
+
+# Kill UP script if it still runs in the background.
+sig="TERM"
+for i in `jot 30`; do
+	pgid=`pgrep -f ucarp_up.sh | head -1`
+	[ -n "${pgid}" ] || break
+	kill -${sig} -- -${pgid}
+	sig="KILL"
+	sleep 1
+done
+# Look again: the last kill above may have succeeded during the final sleep.
+pgid=`pgrep -f ucarp_up.sh | head -1`
+if [ -n "${pgid}" ]; then
+	logger -p local0.error -t hast "UCARP UP process for resource ${resource} is still running after 30 seconds."
+	exit 1
+fi
+logger -p local0.debug -t hast "UCARP UP is not running."
+
+case "${fstype}" in
+UFS)
+	if mount | egrep -q "^${device} on "; then
+		# Forcibly unmount file system.
+		out=`umount -f "${mountpoint}" 2>&1`
+		if [ $? -ne 0 ]; then
+			logger -p local0.error -t hast "Unable to unmount file system for resource ${resource}: ${out}."
+			exit 1
+		fi
+		logger -p local0.debug -t hast "File system for resource ${resource} unmounted."
+	fi
+	;;
+ZFS)
+	if zpool list | egrep -q "^${pool} "; then
+		# Forcibly export ZFS pool.
+		out=`zpool export -f "${pool}" 2>&1`
+		if [ $? -ne 0 ]; then
+			logger -p local0.error -t hast "Unable to export pool for resource ${resource}: ${out}."
+			exit 1
+		fi
+		logger -p local0.debug -t hast "ZFS pool for resource ${resource} exported."
+	fi
+	;;
+esac
+
+# Change role to secondary for our resource.
+out=`hastctl role secondary "${resource}" 2>&1`
+if [ $? -ne 0 ]; then
+	logger -p local0.error -t hast "Unable to change role to secondary for resource ${resource}: ${out}."
+	exit 1
+fi
+logger -p local0.debug -t hast "Role for resource ${resource} changed to secondary."
+
+logger -p local0.info -t hast "Successfully switched to secondary for resource ${resource}."
+
+exit 0
diff --git a/share/examples/hast/ucarp_up.sh b/share/examples/hast/ucarp_up.sh
new file mode 100755
index 0000000..9e56040
--- /dev/null
+++ b/share/examples/hast/ucarp_up.sh
@@ -0,0 +1,105 @@
+#!/bin/sh
+#
+# Copyright (c) 2010 The FreeBSD Foundation
+# All rights reserved.
+#
+# This software was developed by Pawel Jakub Dawidek under sponsorship from
+# the FreeBSD Foundation.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD$
+
+# Resource name as defined in /etc/hast.conf.
+resource="test"
+# Supported file system types: UFS, ZFS
+fstype="UFS"
+# ZFS pool name. Required only when fstype == ZFS.
+pool="test"
+# File system mount point. Required only when fstype == UFS.
+mountpoint="/mnt/test"
+# Name of HAST provider as defined in /etc/hast.conf.
+device="/dev/hast/${resource}"
+
+export PATH=/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin
+
+# If there is a secondary worker process, it means that the remote primary
+# process is still running. We have to wait for it to terminate.
+for i in `jot 30`; do
+	pgrep -f "hastd: ${resource} \(secondary\)" >/dev/null 2>&1 || break
+	sleep 1
+done
+if pgrep -f "hastd: ${resource} \(secondary\)" >/dev/null 2>&1; then
+	logger -p local0.error -t hast "Secondary process for resource ${resource} is still running after 30 seconds."
+	exit 1
+fi
+logger -p local0.debug -t hast "Secondary process is not running."
+
+# Change role to primary for our resource.
+out=`hastctl role primary "${resource}" 2>&1`
+if [ $? -ne 0 ]; then
+	logger -p local0.error -t hast "Unable to change role to primary for resource ${resource}: ${out}."
+	exit 1
+fi
+# Wait a few seconds (50 polls, 0.1s apart) for the provider to appear.
+for i in `jot 50`; do
+	[ -c "${device}" ] && break
+	sleep 0.1
+done
+if [ ! -c "${device}" ]; then
+	logger -p local0.error -t hast "Device ${device} didn't appear."
+	exit 1
+fi
+logger -p local0.debug -t hast "Role for resource ${resource} changed to primary."
+
+case "${fstype}" in
+UFS)
+	# Check the file system.
+	fsck -y -t ufs "${device}" >/dev/null 2>&1
+	if [ $? -ne 0 ]; then
+		logger -p local0.error -t hast "File system check for resource ${resource} failed."
+		exit 1
+	fi
+	logger -p local0.debug -t hast "File system check for resource ${resource} finished."
+	# Mount the file system.
+	out=`mount -t ufs "${device}" "${mountpoint}" 2>&1`
+	if [ $? -ne 0 ]; then
+		logger -p local0.error -t hast "File system mount for resource ${resource} failed: ${out}."
+		exit 1
+	fi
+	logger -p local0.debug -t hast "File system for resource ${resource} mounted."
+	;;
+ZFS)
+	# Import ZFS pool. Do it forcibly as it remembers hostid of
+	# the other cluster node.
+	out=`zpool import -f "${pool}" 2>&1`
+	if [ $? -ne 0 ]; then
+		logger -p local0.error -t hast "ZFS pool import for resource ${resource} failed: ${out}."
+		exit 1
+	fi
+	logger -p local0.debug -t hast "ZFS pool for resource ${resource} imported."
+	;;
+esac
+
+logger -p local0.info -t hast "Successfully switched to primary for resource ${resource}."
+
+exit 0
diff --git a/share/examples/hast/vip-down.sh b/share/examples/hast/vip-down.sh
new file mode 100755
index 0000000..5e47609
--- /dev/null
+++ b/share/examples/hast/vip-down.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+# $FreeBSD$
+
+/root/hast/sbin/hastd/ucarp_down.sh	# Run the switch-to-secondary script in the foreground.
+exit 0	# Always exit 0, whatever ucarp_down.sh returned.
diff --git a/share/examples/hast/vip-up.sh b/share/examples/hast/vip-up.sh
new file mode 100755
index 0000000..61dabe9
--- /dev/null
+++ b/share/examples/hast/vip-up.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+# $FreeBSD$
+
+set -m	# Job control on: the background job below gets its own process group.
+/root/hast/sbin/hastd/ucarp_up.sh &	# Run in background; ucarp_down.sh signals its process group (kill -- -pgid) to stop it.
+set +m	# Job control back off.
+exit 0
diff --git a/share/man/man5/rc.conf.5 b/share/man/man5/rc.conf.5
index f8d265b..96f64d3 100644
--- a/share/man/man5/rc.conf.5
+++ b/share/man/man5/rc.conf.5
@@ -24,7 +24,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd November 11, 2009
+.Dd February 12, 2010
.Dt RC.CONF 5
.Os
.Sh NAME
@@ -1746,6 +1746,27 @@ is set to
.Dq Li YES ,
these are the flags to pass to
.Xr inetd 8 .
+.It Va hastd_enable
+.Pq Vt bool
+If set to
+.Dq Li YES ,
+run the
+.Xr hastd 8
+daemon.
+.It Va hastd_program
+.Pq Vt str
+Path to
+.Xr hastd 8
+(default
+.Pa /sbin/hastd ) .
+.It Va hastd_flags
+.Pq Vt str
+If
+.Va hastd_enable
+is set to
+.Dq Li YES ,
+these are the flags to pass to
+.Xr hastd 8 .
.It Va named_enable
.Pq Vt bool
If set to
diff --git a/sys/geom/gate/g_gate.c b/sys/geom/gate/g_gate.c
index 26df0f4..952e856 100644
--- a/sys/geom/gate/g_gate.c
+++ b/sys/geom/gate/g_gate.c
@@ -1,7 +1,11 @@
/*-
* Copyright (c) 2004-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
* All rights reserved.
*
+ * Portions of this software were developed by Pawel Jakub Dawidek
+ * under sponsorship from the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -53,9 +57,14 @@ static MALLOC_DEFINE(M_GATE, "gg_data", "GEOM Gate Data");
SYSCTL_DECL(_kern_geom);
SYSCTL_NODE(_kern_geom, OID_AUTO, gate, CTLFLAG_RW, 0, "GEOM_GATE stuff");
-static u_int g_gate_debug = 0;
-SYSCTL_UINT(_kern_geom_gate, OID_AUTO, debug, CTLFLAG_RW, &g_gate_debug, 0,
+static int g_gate_debug = 0;
+TUNABLE_INT("kern.geom.gate.debug", &g_gate_debug);
+SYSCTL_INT(_kern_geom_gate, OID_AUTO, debug, CTLFLAG_RW, &g_gate_debug, 0,
"Debug level");
+static u_int g_gate_maxunits = 256;
+TUNABLE_INT("kern.geom.gate.maxunits", &g_gate_maxunits);
+SYSCTL_UINT(_kern_geom_gate, OID_AUTO, maxunits, CTLFLAG_RDTUN,
+ &g_gate_maxunits, 0, "Maximum number of ggate devices");
struct g_class g_gate_class = {
.name = G_GATE_CLASS_NAME,
@@ -71,10 +80,9 @@ static struct cdevsw g_gate_cdevsw = {
};
-static LIST_HEAD(, g_gate_softc) g_gate_list =
- LIST_HEAD_INITIALIZER(g_gate_list);
-static struct mtx g_gate_list_mtx;
-
+static struct g_gate_softc **g_gate_units;
+static u_int g_gate_nunits;
+static struct mtx g_gate_units_lock;
static int
g_gate_destroy(struct g_gate_softc *sc, boolean_t force)
@@ -84,13 +92,13 @@ g_gate_destroy(struct g_gate_softc *sc, boolean_t force)
struct bio *bp;
g_topology_assert();
- mtx_assert(&g_gate_list_mtx, MA_OWNED);
+ mtx_assert(&g_gate_units_lock, MA_OWNED);
pp = sc->sc_provider;
if (!force && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
- mtx_unlock(&g_gate_list_mtx);
+ mtx_unlock(&g_gate_units_lock);
return (EBUSY);
}
- mtx_unlock(&g_gate_list_mtx);
+ mtx_unlock(&g_gate_units_lock);
mtx_lock(&sc->sc_queue_mtx);
if ((sc->sc_flags & G_GATE_FLAG_DESTROY) == 0)
sc->sc_flags |= G_GATE_FLAG_DESTROY;
@@ -125,14 +133,15 @@ g_gate_destroy(struct g_gate_softc *sc, boolean_t force)
}
mtx_unlock(&sc->sc_queue_mtx);
g_topology_unlock();
- mtx_lock(&g_gate_list_mtx);
+ mtx_lock(&g_gate_units_lock);
/* One reference is ours. */
sc->sc_ref--;
- while (sc->sc_ref > 0) {
- msleep(&sc->sc_ref, &g_gate_list_mtx, 0, "gg:destroy", 0);
- }
- LIST_REMOVE(sc, sc_next);
- mtx_unlock(&g_gate_list_mtx);
+ while (sc->sc_ref > 0)
+ msleep(&sc->sc_ref, &g_gate_units_lock, 0, "gg:destroy", 0);
+ g_gate_units[sc->sc_unit] = NULL;
+ KASSERT(g_gate_nunits > 0, ("negative g_gate_nunits?"));
+ g_gate_nunits--;
+ mtx_unlock(&g_gate_units_lock);
mtx_destroy(&sc->sc_queue_mtx);
g_topology_lock();
G_GATE_DEBUG(0, "Device %s destroyed.", gp->name);
@@ -196,7 +205,7 @@ g_gate_start(struct bio *bp)
if (sc->sc_queue_count > sc->sc_queue_size) {
mtx_unlock(&sc->sc_queue_mtx);
G_GATE_LOGREQ(1, bp, "Queue full, request canceled.");
- g_io_deliver(bp, EIO);
+ g_io_deliver(bp, ENOMEM);
return;
}
@@ -211,18 +220,29 @@ g_gate_start(struct bio *bp)
}
static struct g_gate_softc *
-g_gate_hold(u_int unit)
+g_gate_hold(u_int unit, const char *name)
{
- struct g_gate_softc *sc;
-
- mtx_lock(&g_gate_list_mtx);
- LIST_FOREACH(sc, &g_gate_list, sc_next) {
- if (sc->sc_unit == unit)
+ struct g_gate_softc *sc = NULL;
+
+ mtx_lock(&g_gate_units_lock);
+ if (unit >= 0 && unit < g_gate_maxunits)
+ sc = g_gate_units[unit];
+ else if (unit == G_GATE_NAME_GIVEN) {
+ KASSERT(name != NULL, ("name is NULL"));
+ for (unit = 0; unit < g_gate_maxunits; unit++) {
+ if (g_gate_units[unit] == NULL)
+ continue;
+ if (strcmp(name,
+ g_gate_units[unit]->sc_provider->name) != 0) {
+ continue;
+ }
+ sc = g_gate_units[unit];
break;
+ }
}
if (sc != NULL)
sc->sc_ref++;
- mtx_unlock(&g_gate_list_mtx);
+ mtx_unlock(&g_gate_units_lock);
return (sc);
}
@@ -231,40 +251,34 @@ g_gate_release(struct g_gate_softc *sc)
{
g_topology_assert_not();
- mtx_lock(&g_gate_list_mtx);
+ mtx_lock(&g_gate_units_lock);
sc->sc_ref--;
KASSERT(sc->sc_ref >= 0, ("Negative sc_ref for %s.", sc->sc_name));
- if (sc->sc_ref == 0 && (sc->sc_flags & G_GATE_FLAG_DESTROY) != 0) {
+ if (sc->sc_ref == 0 && (sc->sc_flags & G_GATE_FLAG_DESTROY) != 0)
wakeup(&sc->sc_ref);
- mtx_unlock(&g_gate_list_mtx);
- } else {
- mtx_unlock(&g_gate_list_mtx);
- }
+ mtx_unlock(&g_gate_units_lock);
}
static int
-g_gate_getunit(int unit)
+g_gate_getunit(int unit, int *errorp)
{
- struct g_gate_softc *sc;
- mtx_assert(&g_gate_list_mtx, MA_OWNED);
+ mtx_assert(&g_gate_units_lock, MA_OWNED);
if (unit >= 0) {
- LIST_FOREACH(sc, &g_gate_list, sc_next) {
- if (sc->sc_unit == unit)
- return (-1);
- }
+ if (unit >= g_gate_maxunits)
+ *errorp = EINVAL;
+ else if (g_gate_units[unit] == NULL)
+ return (unit);
+ else
+ *errorp = EEXIST;
} else {
- unit = 0;
-once_again:
- LIST_FOREACH(sc, &g_gate_list, sc_next) {
- if (sc->sc_unit == unit) {
- if (++unit > 666)
- return (-1);
- goto once_again;
- }
+ for (unit = 0; unit < g_gate_maxunits; unit++) {
+ if (g_gate_units[unit] == NULL)
+ return (unit);
}
+ *errorp = ENFILE;
}
- return (unit);
+ return (-1);
}
static void
@@ -276,7 +290,7 @@ g_gate_guard(void *arg)
sc = arg;
binuptime(&curtime);
- g_gate_hold(sc->sc_unit);
+ g_gate_hold(sc->sc_unit, NULL);
mtx_lock(&sc->sc_queue_mtx);
TAILQ_FOREACH_SAFE(bp, &sc->sc_inqueue.queue, bio_queue, bp2) {
if (curtime.sec - bp->bio_t0.sec < 5)
@@ -311,7 +325,7 @@ g_gate_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
sc = gp->softc;
if (sc == NULL || pp != NULL || cp != NULL)
return;
- g_gate_hold(sc->sc_unit);
+ g_gate_hold(sc->sc_unit, NULL);
if ((sc->sc_flags & G_GATE_FLAG_READONLY) != 0) {
sbuf_printf(sb, "%s<access>%s</access>\n", indent, "read-only");
} else if ((sc->sc_flags & G_GATE_FLAG_WRITEONLY) != 0) {
@@ -328,6 +342,7 @@ g_gate_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
sbuf_printf(sb, "%s<queue_size>%u</queue_size>\n", indent,
sc->sc_queue_size);
sbuf_printf(sb, "%s<ref>%u</ref>\n", indent, sc->sc_ref);
+ sbuf_printf(sb, "%s<unit>%d</unit>\n", indent, sc->sc_unit);
g_topology_unlock();
g_gate_release(sc);
g_topology_lock();
@@ -339,6 +354,8 @@ g_gate_create(struct g_gate_ctl_create *ggio)
struct g_gate_softc *sc;
struct g_geom *gp;
struct g_provider *pp;
+ char name[NAME_MAX];
+ int error = 0, unit;
if (ggio->gctl_mediasize == 0) {
G_GATE_DEBUG(1, "Invalid media size.");
@@ -357,15 +374,22 @@ g_gate_create(struct g_gate_ctl_create *ggio)
G_GATE_DEBUG(1, "Invalid flags.");
return (EINVAL);
}
- if (ggio->gctl_unit < -1) {
+ if (ggio->gctl_unit != G_GATE_UNIT_AUTO &&
+ ggio->gctl_unit != G_GATE_NAME_GIVEN &&
+ ggio->gctl_unit < 0) {
G_GATE_DEBUG(1, "Invalid unit number.");
return (EINVAL);
}
+ if (ggio->gctl_unit == G_GATE_NAME_GIVEN &&
+ ggio->gctl_name[0] == '\0') {
+ G_GATE_DEBUG(1, "No device name.");
+ return (EINVAL);
+ }
sc = malloc(sizeof(*sc), M_GATE, M_WAITOK | M_ZERO);
sc->sc_flags = (ggio->gctl_flags & G_GATE_USERFLAGS);
strlcpy(sc->sc_info, ggio->gctl_info, sizeof(sc->sc_info));
- sc->sc_seq = 0;
+ sc->sc_seq = 1;
bioq_init(&sc->sc_inqueue);
bioq_init(&sc->sc_outqueue);
mtx_init(&sc->sc_queue_mtx, "gg:queue", NULL, MTX_DEF);
@@ -375,26 +399,44 @@ g_gate_create(struct g_gate_ctl_create *ggio)
sc->sc_queue_size = G_GATE_MAX_QUEUE_SIZE;
sc->sc_timeout = ggio->gctl_timeout;
callout_init(&sc->sc_callout, CALLOUT_MPSAFE);
- mtx_lock(&g_gate_list_mtx);
- ggio->gctl_unit = g_gate_getunit(ggio->gctl_unit);
- if (ggio->gctl_unit == -1) {
- mtx_unlock(&g_gate_list_mtx);
+ mtx_lock(&g_gate_units_lock);
+ sc->sc_unit = g_gate_getunit(ggio->gctl_unit, &error);
+ if (sc->sc_unit < 0) {
+ mtx_unlock(&g_gate_units_lock);
mtx_destroy(&sc->sc_queue_mtx);
free(sc, M_GATE);
- return (EBUSY);
+ return (error);
+ }
+ if (ggio->gctl_unit == G_GATE_NAME_GIVEN)
+ snprintf(name, sizeof(name), "%s", ggio->gctl_name);
+ else {
+ snprintf(name, sizeof(name), "%s%d", G_GATE_PROVIDER_NAME,
+ sc->sc_unit);
}
- sc->sc_unit = ggio->gctl_unit;
- LIST_INSERT_HEAD(&g_gate_list, sc, sc_next);
- mtx_unlock(&g_gate_list_mtx);
+ /* Check for name collision. */
+ for (unit = 0; unit < g_gate_maxunits; unit++) {
+ if (g_gate_units[unit] == NULL)
+ continue;
+ if (strcmp(name, g_gate_units[unit]->sc_provider->name) != 0)
+ continue;
+ mtx_unlock(&g_gate_units_lock);
+ mtx_destroy(&sc->sc_queue_mtx);
+ free(sc, M_GATE);
+ return (EEXIST);
+ }
+ g_gate_units[sc->sc_unit] = sc;
+ g_gate_nunits++;
+ mtx_unlock(&g_gate_units_lock);
+
+ ggio->gctl_unit = sc->sc_unit;
g_topology_lock();
- gp = g_new_geomf(&g_gate_class, "%s%d", G_GATE_PROVIDER_NAME,
- sc->sc_unit);
+ gp = g_new_geomf(&g_gate_class, "%s", name);
gp->start = g_gate_start;
gp->access = g_gate_access;
gp->dumpconf = g_gate_dumpconf;
gp->softc = sc;
- pp = g_new_providerf(gp, "%s%d", G_GATE_PROVIDER_NAME, sc->sc_unit);
+ pp = g_new_providerf(gp, "%s", name);
pp->mediasize = ggio->gctl_mediasize;
pp->sectorsize = ggio->gctl_sectorsize;
sc->sc_provider = pp;
@@ -446,11 +488,11 @@ g_gate_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct threa
struct g_gate_ctl_destroy *ggio = (void *)addr;
G_GATE_CHECK_VERSION(ggio);
- sc = g_gate_hold(ggio->gctl_unit);
+ sc = g_gate_hold(ggio->gctl_unit, ggio->gctl_name);
if (sc == NULL)
return (ENXIO);
g_topology_lock();
- mtx_lock(&g_gate_list_mtx);
+ mtx_lock(&g_gate_units_lock);
error = g_gate_destroy(sc, ggio->gctl_force);
g_topology_unlock();
if (error != 0)
@@ -463,7 +505,7 @@ g_gate_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct threa
struct bio *tbp, *lbp;
G_GATE_CHECK_VERSION(ggio);
- sc = g_gate_hold(ggio->gctl_unit);
+ sc = g_gate_hold(ggio->gctl_unit, ggio->gctl_name);
if (sc == NULL)
return (ENXIO);
lbp = NULL;
@@ -491,6 +533,8 @@ g_gate_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct threa
break;
}
}
+ if (ggio->gctl_unit == G_GATE_NAME_GIVEN)
+ ggio->gctl_unit = sc->sc_unit;
mtx_unlock(&sc->sc_queue_mtx);
g_gate_release(sc);
return (error);
@@ -500,7 +544,7 @@ g_gate_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct threa
struct g_gate_ctl_io *ggio = (void *)addr;
G_GATE_CHECK_VERSION(ggio);
- sc = g_gate_hold(ggio->gctl_unit);
+ sc = g_gate_hold(ggio->gctl_unit, NULL);
if (sc == NULL)
return (ENXIO);
error = 0;
@@ -561,7 +605,7 @@ start_end:
struct g_gate_ctl_io *ggio = (void *)addr;
G_GATE_CHECK_VERSION(ggio);
- sc = g_gate_hold(ggio->gctl_unit);
+ sc = g_gate_hold(ggio->gctl_unit, NULL);
if (sc == NULL)
return (ENOENT);
error = 0;
@@ -631,20 +675,24 @@ g_gate_modevent(module_t mod, int type, void *data)
switch (type) {
case MOD_LOAD:
- mtx_init(&g_gate_list_mtx, "gg_list_lock", NULL, MTX_DEF);
+ mtx_init(&g_gate_units_lock, "gg_units_lock", NULL, MTX_DEF);
+ g_gate_units = malloc(g_gate_maxunits * sizeof(g_gate_units[0]),
+ M_GATE, M_WAITOK | M_ZERO);
+ g_gate_nunits = 0;
g_gate_device();
break;
case MOD_UNLOAD:
- mtx_lock(&g_gate_list_mtx);
- if (!LIST_EMPTY(&g_gate_list)) {
- mtx_unlock(&g_gate_list_mtx);
+ mtx_lock(&g_gate_units_lock);
+ if (g_gate_nunits > 0) {
+ mtx_unlock(&g_gate_units_lock);
error = EBUSY;
break;
}
- mtx_unlock(&g_gate_list_mtx);
- mtx_destroy(&g_gate_list_mtx);
+ mtx_unlock(&g_gate_units_lock);
+ mtx_destroy(&g_gate_units_lock);
if (status_dev != 0)
destroy_dev(status_dev);
+ free(g_gate_units, M_GATE);
break;
default:
return (EOPNOTSUPP);
diff --git a/sys/geom/gate/g_gate.h b/sys/geom/gate/g_gate.h
index cd2564d..4f41348 100644
--- a/sys/geom/gate/g_gate.h
+++ b/sys/geom/gate/g_gate.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2004-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * Copyright (c) 2004-2009 Pawel Jakub Dawidek <pjd@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -41,7 +41,7 @@
#define G_GATE_MOD_NAME "ggate"
#define G_GATE_CTL_NAME "ggctl"
-#define G_GATE_VERSION 1
+#define G_GATE_VERSION 2
/*
* Maximum number of request that can be stored in
@@ -54,6 +54,15 @@
#define G_GATE_FLAG_DESTROY 0x1000
#define G_GATE_USERFLAGS (G_GATE_FLAG_READONLY | G_GATE_FLAG_WRITEONLY)
+/*
+ * Pick unit number automatically in /dev/ggate<unit>.
+ */
+#define G_GATE_UNIT_AUTO (-1)
+/*
+ * Full provider name is given, so don't use ggate<unit>.
+ */
+#define G_GATE_NAME_GIVEN (-2)
+
#define G_GATE_CMD_CREATE _IOWR('m', 0, struct g_gate_ctl_create)
#define G_GATE_CMD_DESTROY _IOWR('m', 1, struct g_gate_ctl_destroy)
#define G_GATE_CMD_CANCEL _IOWR('m', 2, struct g_gate_ctl_cancel)
@@ -120,20 +129,23 @@ struct g_gate_ctl_create {
u_int gctl_flags;
u_int gctl_maxcount;
u_int gctl_timeout;
+ char gctl_name[NAME_MAX];
char gctl_info[G_GATE_INFOSIZE];
- int gctl_unit; /* out */
+ int gctl_unit; /* in/out */
};
struct g_gate_ctl_destroy {
u_int gctl_version;
int gctl_unit;
int gctl_force;
+ char gctl_name[NAME_MAX];
};
struct g_gate_ctl_cancel {
u_int gctl_version;
int gctl_unit;
uintptr_t gctl_seq;
+ char gctl_name[NAME_MAX];
};
struct g_gate_ctl_io {
OpenPOWER on IntegriCloud