diff options
56 files changed, 11572 insertions, 78 deletions
diff --git a/etc/defaults/rc.conf b/etc/defaults/rc.conf index 7d0a7d2..35b3a7b 100644 --- a/etc/defaults/rc.conf +++ b/etc/defaults/rc.conf @@ -260,6 +260,9 @@ syslogd_flags="-s" # Flags to syslogd (if enabled). inetd_enable="NO" # Run the network daemon dispatcher (YES/NO). inetd_program="/usr/sbin/inetd" # path to inetd, if you want a different one. inetd_flags="-wW -C 60" # Optional flags to inetd +hastd_enable="NO" # Run the HAST daemon (YES/NO). +hastd_program="/sbin/hastd" # path to hastd, if you want a different one. +hastd_flags="" # Optional flags to hastd. # # named. It may be possible to run named in a sandbox, man security for # details. diff --git a/etc/rc.d/Makefile b/etc/rc.d/Makefile index d0e24b3..17f7634 100755 --- a/etc/rc.d/Makefile +++ b/etc/rc.d/Makefile @@ -12,7 +12,7 @@ FILES= DAEMON FILESYSTEMS LOGIN NETWORKING SERVERS \ encswap \ faith fsck ftp-proxy ftpd \ gbde geli geli2 gssd \ - hcsecd \ + hastd hcsecd \ hostapd hostid hostid_save hostname \ inetd initrandom \ ip6addrctl ipfilter ipfs ipfw ipmon \ diff --git a/etc/rc.d/hastd b/etc/rc.d/hastd new file mode 100644 index 0000000..3014caf --- /dev/null +++ b/etc/rc.d/hastd @@ -0,0 +1,31 @@ +#!/bin/sh +# +# $FreeBSD$ +# + +# PROVIDE: hastd +# REQUIRE: NETWORKING syslogd +# BEFORE: DAEMON + +. 
/etc/rc.subr + +name="hastd" +rcvar=`set_rcvar` +pidfile="/var/run/${name}.pid" +command="/sbin/${name}" +hastctl="/sbin/hastctl" +required_files="/etc/hast.conf" +stop_precmd="hastd_stop_precmd" +required_modules="geom_gate:g_gate" + +sockfile="/var/run/syslogd.sockets" +evalargs="rc_flags=\"\`set_socketlist\` \$rc_flags\"" +altlog_proglist="named" + +hastd_stop_precmd() +{ + ${hastctl} role init all +} + +load_rc_config $name +run_rc_command "$1" diff --git a/sbin/Makefile b/sbin/Makefile index 8ece390..72f4bff 100644 --- a/sbin/Makefile +++ b/sbin/Makefile @@ -36,6 +36,8 @@ SUBDIR= adjkerntz \ ggate \ growfs \ gvinum \ + hastctl \ + hastd \ ifconfig \ init \ ${_ipf} \ diff --git a/sbin/ggate/ggatec/ggatec.c b/sbin/ggate/ggatec/ggatec.c index e421614..660bd8a 100644 --- a/sbin/ggate/ggatec/ggatec.c +++ b/sbin/ggate/ggatec/ggatec.c @@ -59,7 +59,7 @@ enum { UNSET, CREATE, DESTROY, LIST, RESCUE } action = UNSET; static const char *path = NULL; static const char *host = NULL; -static int unit = -1; +static int unit = G_GATE_UNIT_AUTO; static unsigned flags = 0; static int force = 0; static unsigned queue_size = G_GATE_QUEUE_SIZE; diff --git a/sbin/ggate/ggatel/ggatel.c b/sbin/ggate/ggatel/ggatel.c index 03979c3..6a3f26e 100644 --- a/sbin/ggate/ggatel/ggatel.c +++ b/sbin/ggate/ggatel/ggatel.c @@ -50,7 +50,7 @@ enum { UNSET, CREATE, DESTROY, LIST, RESCUE } action = UNSET; static const char *path = NULL; -static int unit = -1; +static int unit = G_GATE_UNIT_AUTO; static unsigned flags = 0; static int force = 0; static unsigned queue_size = G_GATE_QUEUE_SIZE; diff --git a/sbin/hastctl/Makefile b/sbin/hastctl/Makefile new file mode 100644 index 0000000..43c8c20 --- /dev/null +++ b/sbin/hastctl/Makefile @@ -0,0 +1,36 @@ +# $FreeBSD$ + +.include <bsd.own.mk> + +.PATH: ${.CURDIR}/../hastd + +PROG= hastctl +SRCS= activemap.c +SRCS+= ebuf.c +SRCS+= hast_proto.c hastctl.c +SRCS+= metadata.c +SRCS+= nv.c +SRCS+= parse.y pjdlog.c +SRCS+= proto.c proto_common.c proto_tcp4.c 
proto_uds.c +SRCS+= token.l +SRCS+= subr.c +SRCS+= y.tab.h +WARNS?= 6 +MAN= hastctl.8 + +CFLAGS+=-I${.CURDIR}/../hastd +CFLAGS+=-DINET +.if ${MK_INET6_SUPPORT} != "no" +CFLAGS+=-DINET6 +.endif +# This is needed to have WARNS > 1. +CFLAGS+=-DYY_NO_UNPUT + +DPADD= ${LIBCRYPTO} ${LIBL} +LDADD= -lcrypto -ll + +YFLAGS+=-v + +CLEANFILES=y.tab.c y.tab.h y.output + +.include <bsd.prog.mk> diff --git a/sbin/hastctl/hastctl.8 b/sbin/hastctl/hastctl.8 new file mode 100644 index 0000000..bf03c2e --- /dev/null +++ b/sbin/hastctl/hastctl.8 @@ -0,0 +1,217 @@ +.\" Copyright (c) 2010 The FreeBSD Foundation +.\" All rights reserved. +.\" +.\" This software was developed by Pawel Jakub Dawidek under sponsorship from +.\" the FreeBSD Foundation. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd February 1, 2010 +.Dt HASTCTL 8 +.Os +.Sh NAME +.Nm hastctl +.Nd "Highly Available Storage control utility" +.Sh SYNOPSIS +.Nm +.Cm create +.Op Fl d +.Op Fl c Ar config +.Op Fl e Ar extentsize +.Op Fl k Ar keepdirty +.Op Fl m Ar mediasize +.Ar name ... +.Nm +.Cm role +.Op Fl d +.Op Fl c Ar config +.Aq init | primary | secondary +.Ar all | name ... +.Nm +.Cm status +.Op Fl d +.Op Fl c Ar config +.Op Ar all | name ... +.Nm +.Cm dump +.Op Fl d +.Op Fl c Ar config +.Op Ar all | name ... +.Sh DESCRIPTION +The +.Nm +utility is used to control the behaviour of the +.Xr hastd 8 +daemon. +.Pp +This utility should be used by HA software like +.Nm heartbeat +or +.Nm ucarp +to setup HAST resources role when changing from primary mode to +secondary or vice versa. +Be aware that if a file system like UFS exists on HAST provider and +primary node dies, file system has to be checked for inconsistencies +with the +.Xr fsck 8 +utility after switching secondary node to primary role. +.Pp +The first argument to +.Nm +indicates an action to be performed: +.Bl -tag -width ".Cm create" +.It Cm create +Initialize local provider configured for the given resource. +Additional options include: +.Bl -tag -width ".Fl e Ar extentsize" +.It Fl e Ar extentsize +Size of an extent. +Extent is a block which is used for synchronization. +.Nm +maintains a map of dirty extents and extent is the smallest region that +can be marked as dirty. 
+If any part of an extent is modified, entire extent will be synchronized +when nodes connect. +If extent size is too small, there will be too much disk activity +related to dirty map updates, which will degrade performance of the +given resource. +If extent size is too large, synchronization, even in case of short +outage, can take a long time increasing the risk of losing up-to-date +node before synchronization process is completed. +The default extent size is +.Va 2MB . +.It Fl k Ar keepdirty +Maximum number of dirty extents to keep dirty all the time. +Most recently used extents are kept dirty to reduce number of metadata +updates. +The default number of most recently used extents which will be kept +dirty is +.Va 64 . +.It Fl m Ar mediasize +Size of the smaller provider used as backend storage on both nodes. +This option can be omitted if node providers have the same size on both +sides. +.El +.It Cm role +Change role of the given resource. +The role can be one of: +.Bl -tag -width ".Cm secondary" +.It Cm init +Resource is turned off. +.It Cm primary +Local +.Xr hastd 8 +daemon will act as primary node for the given resource. +System on which resource role is set to primary can use +.Pa /dev/hast/<name> +GEOM provider. +.It Cm secondary +Local +.Xr hastd 8 +daemon will act as secondary node for the given resource - it will wait +for connection from the primary node and will handle I/O requests +received from it. +GEOM provider +.Pa /dev/hast/<name> +will not be created on secondary node. +.El +.It Cm status +Present status of the configured resources. +.It Cm dump +Dump metadata stored on local component for the configured resources. +.El +.Pp +In addition, every subcommand can be followed by the following options: +.Bl -tag -width ".Fl c Ar config" +.It Fl c Ar config +Specify alternative location of the configuration file. +The default location is +.Pa /etc/hast.conf . +.It Fl d +Print debugging information. 
+This option can be specified multiple times to raise the verbosity +level. +.El +.Sh EXIT STATUS +Exit status is 0 on success, or one of the values described in +.Xr sysexits 3 +on failure. +.Sh EXAMPLES +Initialize HAST provider, create file system on it and mount it. +.Bd -literal -offset indent +nodeB# hastctl create shared +nodeB# hastd +nodeB# hastctl role secondary shared + +nodeA# hastctl create shared +nodeA# hastd +nodeA# hastctl role primary shared +nodeA# newfs -U /dev/hast/shared +nodeA# mount -o noatime /dev/hast/shared /shared +nodeA# application_start +.Ed +.Pp +Switch roles for the +.Nm shared +HAST resource. +.Bd -literal -offset indent +nodeA# application_stop +nodeA# umount -f /shared +nodeA# hastctl role secondary shared + +nodeB# hastctl role primary shared +nodeB# fsck -t ufs /dev/hast/shared +nodeB# mount -o noatime /dev/hast/shared /shared +nodeB# application_start +.Ed +.Sh FILES +.Bl -tag -width ".Pa /var/run/hastctl" -compact +.It Pa /etc/hast.conf +Configuration file for +.Nm +and +.Xr hastd 8 . +.It Pa /var/run/hastctl +Control socket used by +.Nm +to communicate with the +.Xr hastd 8 +daemon. +.El +.Sh SEE ALSO +.Xr sysexits 3 , +.Xr geom 4 , +.Xr hast.conf 5 , +.Xr fsck 8 , +.Xr ggatec 8 , +.Xr ggatel 8 , +.Xr hastd 8 , +.Xr mount 8 , +.Xr newfs 8 . +.Sh AUTHORS +The +.Nm +was developed by +.An Pawel Jakub Dawidek Aq pjd@FreeBSD.org +under sponsorship of the FreeBSD Foundation. diff --git a/sbin/hastctl/hastctl.c b/sbin/hastctl/hastctl.c new file mode 100644 index 0000000..8499528 --- /dev/null +++ b/sbin/hastctl/hastctl.c @@ -0,0 +1,526 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/disk.h> +#include <sys/ioctl.h> +#include <sys/stat.h> +#include <sys/sysctl.h> + +#include <assert.h> +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <inttypes.h> +#include <limits.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sysexits.h> +#include <unistd.h> + +#include <activemap.h> + +#include "hast.h" +#include "hast_proto.h" +#include "metadata.h" +#include "nv.h" +#include "pjdlog.h" +#include "proto.h" +#include "subr.h" + +/* Path to configuration file. */ +static const char *cfgpath = HAST_CONFIG; +/* Hastd configuration. */ +static struct hastd_config *cfg; +/* Control connection. 
*/ +static struct proto_conn *controlconn; + +enum { + CMD_INVALID, + CMD_CREATE, + CMD_ROLE, + CMD_STATUS, + CMD_DUMP +}; + +static __dead2 void +usage(void) +{ + + fprintf(stderr, + "usage: %s create [-d] [-c config] [-e extentsize] [-k keepdirty]\n" + "\t\t[-m mediasize] name ...\n", + getprogname()); + fprintf(stderr, + " %s role [-d] [-c config] <init | primary | secondary> all | name ...\n", + getprogname()); + fprintf(stderr, + " %s status [-d] [-c config] [all | name ...]\n", + getprogname()); + fprintf(stderr, + " %s dump [-d] [-c config] [all | name ...]\n", + getprogname()); + exit(EX_USAGE); +} + +static int +create_one(struct hast_resource *res, intmax_t mediasize, intmax_t extentsize, + intmax_t keepdirty) +{ + unsigned char *buf; + size_t mapsize; + int ec; + + ec = 0; + pjdlog_prefix_set("[%s] ", res->hr_name); + + if (provinfo(res, true) < 0) { + ec = EX_NOINPUT; + goto end; + } + if (mediasize == 0) + mediasize = res->hr_local_mediasize; + else if (mediasize > res->hr_local_mediasize) { + pjdlog_error("Provided mediasize is larger than provider %s size.", + res->hr_localpath); + ec = EX_DATAERR; + goto end; + } + if (!powerof2(res->hr_local_sectorsize)) { + pjdlog_error("Sector size of provider %s is not power of 2 (%u).", + res->hr_localpath, res->hr_local_sectorsize); + ec = EX_DATAERR; + goto end; + } + if (extentsize == 0) + extentsize = HAST_EXTENTSIZE; + if (extentsize < res->hr_local_sectorsize) { + pjdlog_error("Extent size (%jd) is less than sector size (%u).", + (intmax_t)extentsize, res->hr_local_sectorsize); + ec = EX_DATAERR; + goto end; + } + if ((extentsize % res->hr_local_sectorsize) != 0) { + pjdlog_error("Extent size (%jd) is not multiple of sector size (%u).", + (intmax_t)extentsize, res->hr_local_sectorsize); + ec = EX_DATAERR; + goto end; + } + mapsize = activemap_calc_ondisk_size(mediasize - METADATA_SIZE, + extentsize, res->hr_local_sectorsize); + if (keepdirty == 0) + keepdirty = HAST_KEEPDIRTY; + res->hr_datasize = 
mediasize - METADATA_SIZE - mapsize; + res->hr_extentsize = extentsize; + res->hr_keepdirty = keepdirty; + + res->hr_localoff = METADATA_SIZE + mapsize; + + if (metadata_write(res) < 0) { + ec = EX_IOERR; + goto end; + } + buf = calloc(1, mapsize); + if (buf == NULL) { + pjdlog_error("Unable to allocate %zu bytes of memory for initial bitmap.", + mapsize); + ec = EX_TEMPFAIL; + goto end; + } + if (pwrite(res->hr_localfd, buf, mapsize, METADATA_SIZE) != + (ssize_t)mapsize) { + pjdlog_errno(LOG_ERR, "Unable to store initial bitmap on %s", + res->hr_localpath); + free(buf); + ec = EX_IOERR; + goto end; + } + free(buf); +end: + if (res->hr_localfd >= 0) + close(res->hr_localfd); + pjdlog_prefix_set("%s", ""); + return (ec); +} + +static void +control_create(int argc, char *argv[], intmax_t mediasize, intmax_t extentsize, + intmax_t keepdirty) +{ + struct hast_resource *res; + int ec, ii, ret; + + /* Initialize the given resources. */ + if (argc < 1) + usage(); + ec = 0; + for (ii = 0; ii < argc; ii++) { + TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) { + if (strcmp(argv[ii], res->hr_name) == 0) + break; + } + if (res == NULL) { + pjdlog_error("Unknown resource %s.", argv[ii]); + if (ec == 0) + ec = EX_DATAERR; + continue; + } + ret = create_one(res, mediasize, extentsize, keepdirty); + if (ret != 0 && ec == 0) + ec = ret; + } + exit(ec); +} + +static int +dump_one(struct hast_resource *res) +{ + int ret; + + ret = metadata_read(res, false); + if (ret != 0) + return (ret); + + printf("resource: %s\n", res->hr_name); + printf(" datasize: %ju\n", (uintmax_t)res->hr_datasize); + printf(" extentsize: %d\n", res->hr_extentsize); + printf(" keepdirty: %d\n", res->hr_keepdirty); + printf(" localoff: %ju\n", (uintmax_t)res->hr_localoff); + printf(" resuid: %ju\n", (uintmax_t)res->hr_resuid); + printf(" localcnt: %ju\n", (uintmax_t)res->hr_primary_localcnt); + printf(" remotecnt: %ju\n", (uintmax_t)res->hr_primary_remotecnt); + printf(" prevrole: %s\n", 
role2str(res->hr_previous_role)); + + return (0); +} + +static void +control_dump(int argc, char *argv[]) +{ + struct hast_resource *res; + int ec, ret; + + /* Dump metadata of the given resource(s). */ + + ec = 0; + if (argc == 0 || (argc == 1 && strcmp(argv[0], "all") == 0)) { + TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) { + ret = dump_one(res); + if (ret != 0 && ec == 0) + ec = ret; + } + } else { + int ii; + + for (ii = 0; ii < argc; ii++) { + TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) { + if (strcmp(argv[ii], res->hr_name) == 0) + break; + } + if (res == NULL) { + pjdlog_error("Unknown resource %s.", argv[ii]); + if (ec == 0) + ec = EX_DATAERR; + continue; + } + ret = dump_one(res); + if (ret != 0 && ec == 0) + ec = ret; + } + } + exit(ec); +} + +static int +control_set_role(struct nv *nv, const char *newrole) +{ + const char *res, *oldrole; + unsigned int ii; + int error, ret; + + ret = 0; + + for (ii = 0; ; ii++) { + res = nv_get_string(nv, "resource%u", ii); + if (res == NULL) + break; + pjdlog_prefix_set("[%s] ", res); + error = nv_get_int16(nv, "error%u", ii); + if (error != 0) { + if (ret == 0) + ret = error; + pjdlog_warning("Received error %d from hastd.", error); + continue; + } + oldrole = nv_get_string(nv, "role%u", ii); + if (strcmp(oldrole, newrole) == 0) + pjdlog_debug(2, "Role unchanged (%s).", oldrole); + else { + pjdlog_debug(1, "Role changed from %s to %s.", oldrole, + newrole); + } + } + pjdlog_prefix_set("%s", ""); + return (ret); +} + +static int +control_status(struct nv *nv) +{ + unsigned int ii; + const char *str; + int error, ret; + + ret = 0; + + for (ii = 0; ; ii++) { + str = nv_get_string(nv, "resource%u", ii); + if (str == NULL) + break; + printf("%s:\n", str); + error = nv_get_int16(nv, "error%u", ii); + if (error != 0) { + if (ret == 0) + ret = error; + printf(" error: %d\n", error); + continue; + } + printf(" role: %s\n", nv_get_string(nv, "role%u", ii)); + printf(" provname: %s\n", + nv_get_string(nv, "provname%u", 
ii)); + printf(" localpath: %s\n", + nv_get_string(nv, "localpath%u", ii)); + printf(" extentsize: %u\n", + (unsigned int)nv_get_uint32(nv, "extentsize%u", ii)); + printf(" keepdirty: %u\n", + (unsigned int)nv_get_uint32(nv, "keepdirty%u", ii)); + printf(" remoteaddr: %s\n", + nv_get_string(nv, "remoteaddr%u", ii)); + printf(" replication: %s\n", + nv_get_string(nv, "replication%u", ii)); + str = nv_get_string(nv, "status%u", ii); + if (str != NULL) + printf(" status: %s\n", str); + printf(" dirty: %ju bytes\n", + (uintmax_t)nv_get_uint64(nv, "dirty%u", ii)); + } + return (ret); +} + +static int +numfromstr(const char *str, intmax_t *nump) +{ + intmax_t num; + char *suffix; + int rerrno; + + rerrno = errno; + errno = 0; + num = strtoimax(str, &suffix, 0); + if (errno == 0 && *suffix != '\0') + errno = EINVAL; + if (errno != 0) + return (-1); + *nump = num; + errno = rerrno; + return (0); +} + +int +main(int argc, char *argv[]) +{ + struct nv *nv; + intmax_t mediasize, extentsize, keepdirty; + int cmd, debug, error, ii; + const char *optstr; + + debug = 0; + mediasize = extentsize = keepdirty = 0; + + if (argc == 1) + usage(); + + if (strcmp(argv[1], "create") == 0) { + cmd = CMD_CREATE; + optstr = "c:de:k:m:h"; + } else if (strcmp(argv[1], "role") == 0) { + cmd = CMD_ROLE; + optstr = "c:dh"; + } else if (strcmp(argv[1], "status") == 0) { + cmd = CMD_STATUS; + optstr = "c:dh"; + } else if (strcmp(argv[1], "dump") == 0) { + cmd = CMD_DUMP; + optstr = "c:dh"; + } else + usage(); + + argc--; + argv++; + + for (;;) { + int ch; + + ch = getopt(argc, argv, optstr); + if (ch == -1) + break; + switch (ch) { + case 'c': + cfgpath = optarg; + break; + case 'd': + debug++; + break; + case 'e': + if (numfromstr(optarg, &extentsize) < 0) + err(1, "Invalid extentsize"); + break; + case 'k': + if (numfromstr(optarg, &keepdirty) < 0) + err(1, "Invalid keepdirty"); + break; + case 'm': + if (numfromstr(optarg, &mediasize) < 0) + err(1, "Invalid mediasize"); + break; + case 'h': + 
default: + usage(); + } + } + argc -= optind; + argv += optind; + + switch (cmd) { + case CMD_CREATE: + case CMD_ROLE: + if (argc == 0) + usage(); + break; + } + + pjdlog_debug_set(debug); + + cfg = yy_config_parse(cfgpath); + assert(cfg != NULL); + + switch (cmd) { + case CMD_CREATE: + control_create(argc, argv, mediasize, extentsize, keepdirty); + /* NOTREACHED */ + assert(!"What are we doing here?!"); + break; + case CMD_DUMP: + /* Dump metadata from local component of the given resource. */ + control_dump(argc, argv); + /* NOTREACHED */ + assert(!"What are we doing here?!"); + break; + case CMD_ROLE: + /* Change role for the given resources. */ + if (argc < 2) + usage(); + nv = nv_alloc(); + nv_add_uint8(nv, HASTCTL_CMD_SETROLE, "cmd"); + if (strcmp(argv[0], "init") == 0) + nv_add_uint8(nv, HAST_ROLE_INIT, "role"); + else if (strcmp(argv[0], "primary") == 0) + nv_add_uint8(nv, HAST_ROLE_PRIMARY, "role"); + else if (strcmp(argv[0], "secondary") == 0) + nv_add_uint8(nv, HAST_ROLE_SECONDARY, "role"); + else + usage(); + for (ii = 0; ii < argc - 1; ii++) + nv_add_string(nv, argv[ii + 1], "resource%d", ii); + break; + case CMD_STATUS: + /* Obtain status of the given resources. */ + nv = nv_alloc(); + nv_add_uint8(nv, HASTCTL_CMD_STATUS, "cmd"); + if (argc == 0) + nv_add_string(nv, "all", "resource%d", 0); + else { + for (ii = 0; ii < argc; ii++) + nv_add_string(nv, argv[ii], "resource%d", ii); + } + break; + default: + assert(!"Impossible role!"); + } + + /* Setup control connection... */ + if (proto_client(cfg->hc_controladdr, &controlconn) < 0) { + pjdlog_exit(EX_OSERR, + "Unable to setup control connection to %s", + cfg->hc_controladdr); + } + /* ...and connect to hastd. */ + if (proto_connect(controlconn) < 0) { + pjdlog_exit(EX_OSERR, "Unable to connect to hastd via %s", + cfg->hc_controladdr); + } + /* Send the command to the server... 
*/ + if (hast_proto_send(NULL, controlconn, nv, NULL, 0) < 0) { + pjdlog_exit(EX_UNAVAILABLE, + "Unable to send command to hastd via %s", + cfg->hc_controladdr); + } + nv_free(nv); + /* ...and receive reply. */ + if (hast_proto_recv(NULL, controlconn, &nv, NULL, 0) < 0) { + pjdlog_exit(EX_UNAVAILABLE, + "cannot receive reply from hastd via %s", + cfg->hc_controladdr); + } + + error = nv_get_int16(nv, "error"); + if (error != 0) { + pjdlog_exitx(EX_SOFTWARE, "Error %d received from hastd.", + error); + } + nv_set_error(nv, 0); + + switch (cmd) { + case CMD_ROLE: + error = control_set_role(nv, argv[0]); + break; + case CMD_STATUS: + error = control_status(nv); + break; + default: + assert(!"Impossible role!"); + } + + exit(error); +} diff --git a/sbin/hastd/Makefile b/sbin/hastd/Makefile new file mode 100644 index 0000000..16a0b8f --- /dev/null +++ b/sbin/hastd/Makefile @@ -0,0 +1,37 @@ +# $FreeBSD$ + +.include <bsd.own.mk> + +PROG= hastd +SRCS= activemap.c +SRCS+= control.c +SRCS+= ebuf.c +SRCS+= hast_proto.c hastd.c hooks.c +SRCS+= metadata.c +SRCS+= nv.c +SRCS+= secondary.c +SRCS+= parse.y pjdlog.c primary.c +SRCS+= proto.c proto_common.c proto_socketpair.c proto_tcp4.c proto_uds.c +SRCS+= rangelock.c +SRCS+= subr.c +SRCS+= token.l +SRCS+= y.tab.h +WARNS?= 6 +MAN= hastd.8 hast.conf.5 + +CFLAGS+=-I${.CURDIR} +CFLAGS+=-DINET +.if ${MK_INET6_SUPPORT} != "no" +CFLAGS+=-DINET6 +.endif +# This is needed to have WARNS > 1. +CFLAGS+=-DYY_NO_UNPUT + +DPADD= ${LIBCRYPTO} ${LIBGEOM} ${LIBL} ${LIBPTHREAD} ${LIBUTIL} +LDADD= -lcrypto -lgeom -ll -lpthread -lutil + +YFLAGS+=-v + +CLEANFILES=y.tab.c y.tab.h y.output + +.include <bsd.prog.mk> diff --git a/sbin/hastd/activemap.c b/sbin/hastd/activemap.c new file mode 100644 index 0000000..10eb641 --- /dev/null +++ b/sbin/hastd/activemap.c @@ -0,0 +1,691 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. 
+ * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> /* powerof2() */ +#include <sys/queue.h> + +#include <assert.h> +#include <bitstring.h> +#include <errno.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <activemap.h> + +#define ACTIVEMAP_MAGIC 0xac71e4 +struct activemap { + int am_magic; /* Magic value. */ + off_t am_mediasize; /* Media size in bytes. */ + uint32_t am_extentsize; /* Extent size in bytes, + must be power of 2. 
*/ + uint8_t am_extentshift;/* 2 ^ extentbits == extentsize */ + int am_nextents; /* Number of extents. */ + size_t am_mapsize; /* Bitmap size in bytes. */ + uint16_t *am_memtab; /* An array that holds number of pending + writes per extent. */ + bitstr_t *am_diskmap; /* On-disk bitmap of dirty extents. */ + bitstr_t *am_memmap; /* In-memory bitmap of dirty extents. */ + size_t am_diskmapsize; /* Map size rounded up to sector size. */ + uint64_t am_ndirty; /* Number of dirty regions. */ + bitstr_t *am_syncmap; /* Bitmap of extents to sync. */ + off_t am_syncoff; /* Next synchronization offset. */ + TAILQ_HEAD(skeepdirty, keepdirty) am_keepdirty; /* List of extents that + we keep dirty to reduce bitmap + updates. */ + int am_nkeepdirty; /* Number of am_keepdirty elements. */ + int am_nkeepdirty_limit; /* Maximum number of am_keepdirty + elements. */ +}; + +struct keepdirty { + int kd_extent; + TAILQ_ENTRY(keepdirty) kd_next; +}; + +/* + * Helper function taken from sys/systm.h to calculate extentshift. + */ +static uint32_t +bitcount32(uint32_t x) +{ + + x = (x & 0x55555555) + ((x & 0xaaaaaaaa) >> 1); + x = (x & 0x33333333) + ((x & 0xcccccccc) >> 2); + x = (x + (x >> 4)) & 0x0f0f0f0f; + x = (x + (x >> 8)); + x = (x + (x >> 16)) & 0x000000ff; + return (x); +} + +static __inline int +off2ext(const struct activemap *amp, off_t offset) +{ + int extent; + + assert(offset >= 0 && offset < amp->am_mediasize); + extent = (offset >> amp->am_extentshift); + assert(extent >= 0 && extent < amp->am_nextents); + return (extent); +} + +static __inline off_t +ext2off(const struct activemap *amp, int extent) +{ + off_t offset; + + assert(extent >= 0 && extent < amp->am_nextents); + offset = ((off_t)extent << amp->am_extentshift); + assert(offset >= 0 && offset < amp->am_mediasize); + return (offset); +} + +/* + * Function calculates number of requests needed to synchronize the given + * extent. 
+ */ +static __inline int +ext2reqs(const struct activemap *amp, int ext) +{ + off_t left; + + if (ext < amp->am_nextents - 1) + return (((amp->am_extentsize - 1) / MAXPHYS) + 1); + + assert(ext == amp->am_nextents - 1); + left = amp->am_mediasize % amp->am_extentsize; + if (left == 0) + left = amp->am_extentsize; + return (((left - 1) / MAXPHYS) + 1); +} + +/* + * Initialize activemap structure and allocate memory for internal needs. + * Function returns 0 on success and -1 if any of the allocations failed. + */ +int +activemap_init(struct activemap **ampp, uint64_t mediasize, uint32_t extentsize, + uint32_t sectorsize, uint32_t keepdirty) +{ + struct activemap *amp; + + assert(ampp != NULL); + assert(mediasize > 0); + assert(extentsize > 0); + assert(powerof2(extentsize)); + assert(sectorsize > 0); + assert(powerof2(sectorsize)); + assert(keepdirty > 0); + + amp = malloc(sizeof(*amp)); + if (amp == NULL) + return (-1); + + amp->am_mediasize = mediasize; + amp->am_nkeepdirty_limit = keepdirty; + amp->am_extentsize = extentsize; + amp->am_extentshift = bitcount32(extentsize - 1); + amp->am_nextents = ((mediasize - 1) / extentsize) + 1; + amp->am_mapsize = sizeof(bitstr_t) * bitstr_size(amp->am_nextents); + amp->am_diskmapsize = roundup2(amp->am_mapsize, sectorsize); + amp->am_ndirty = 0; + amp->am_syncoff = -2; + TAILQ_INIT(&amp->am_keepdirty); + amp->am_nkeepdirty = 0; + + amp->am_memtab = calloc(amp->am_nextents, sizeof(amp->am_memtab[0])); + amp->am_diskmap = calloc(1, amp->am_diskmapsize); + amp->am_memmap = bit_alloc(amp->am_nextents); + amp->am_syncmap = bit_alloc(amp->am_nextents); + + /* + * Check to see if any of the allocations above failed. 
+ */ + if (amp->am_memtab == NULL || amp->am_diskmap == NULL || + amp->am_memmap == NULL || amp->am_syncmap == NULL) { + if (amp->am_memtab != NULL) + free(amp->am_memtab); + if (amp->am_diskmap != NULL) + free(amp->am_diskmap); + if (amp->am_memmap != NULL) + free(amp->am_memmap); + if (amp->am_syncmap != NULL) + free(amp->am_syncmap); + amp->am_magic = 0; + free(amp); + errno = ENOMEM; + return (-1); + } + + amp->am_magic = ACTIVEMAP_MAGIC; + *ampp = amp; + + return (0); +} + +static struct keepdirty * +keepdirty_find(struct activemap *amp, int extent) +{ + struct keepdirty *kd; + + TAILQ_FOREACH(kd, &amp->am_keepdirty, kd_next) { + if (kd->kd_extent == extent) + break; + } + return (kd); +} + +static void +keepdirty_add(struct activemap *amp, int extent) +{ + struct keepdirty *kd; + + kd = keepdirty_find(amp, extent); + if (kd != NULL) { + /* + * Only move element at the beginning. + */ + TAILQ_REMOVE(&amp->am_keepdirty, kd, kd_next); + TAILQ_INSERT_HEAD(&amp->am_keepdirty, kd, kd_next); + return; + } + /* + * Add new element, but first remove the most unused one if + * we have too many. + */ + if (amp->am_nkeepdirty >= amp->am_nkeepdirty_limit) { + kd = TAILQ_LAST(&amp->am_keepdirty, skeepdirty); + assert(kd != NULL); + TAILQ_REMOVE(&amp->am_keepdirty, kd, kd_next); + amp->am_nkeepdirty--; + assert(amp->am_nkeepdirty > 0); + } + if (kd == NULL) + kd = malloc(sizeof(*kd)); + /* We can ignore allocation failure. 
*/ + if (kd != NULL) { + kd->kd_extent = extent; + amp->am_nkeepdirty++; + TAILQ_INSERT_HEAD(&amp->am_keepdirty, kd, kd_next); + } +} + +static void +keepdirty_fill(struct activemap *amp) +{ + struct keepdirty *kd; + + TAILQ_FOREACH(kd, &amp->am_keepdirty, kd_next) + bit_set(amp->am_diskmap, kd->kd_extent); +} + +static void +keepdirty_free(struct activemap *amp) +{ + struct keepdirty *kd; + + while ((kd = TAILQ_FIRST(&amp->am_keepdirty)) != NULL) { + TAILQ_REMOVE(&amp->am_keepdirty, kd, kd_next); + amp->am_nkeepdirty--; + free(kd); + } + assert(amp->am_nkeepdirty == 0); +} + +/* + * Function frees resources allocated by activemap_init() function. + */ +void +activemap_free(struct activemap *amp) +{ + + assert(amp->am_magic == ACTIVEMAP_MAGIC); + + amp->am_magic = 0; + + keepdirty_free(amp); + free(amp->am_memtab); + free(amp->am_diskmap); + free(amp->am_memmap); + free(amp->am_syncmap); +} + +/* + * Function should be called before we handle write requests. It updates + * internal structures and returns true if on-disk metadata should be updated. + */ +bool +activemap_write_start(struct activemap *amp, off_t offset, off_t length) +{ + bool modified; + off_t end; + int ext; + + assert(amp->am_magic == ACTIVEMAP_MAGIC); + assert(length > 0); + + modified = false; + end = offset + length - 1; + + for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) { + /* + * If the number of pending writes is increased from 0, + * we have to mark the extent as dirty also in on-disk bitmap. + * By returning true we inform the caller that on-disk bitmap + * was modified and has to be flushed to disk. + */ + if (amp->am_memtab[ext]++ == 0) { + assert(!bit_test(amp->am_memmap, ext)); + bit_set(amp->am_memmap, ext); + amp->am_ndirty++; + modified = true; + } + keepdirty_add(amp, ext); + } + + return (modified); +} + +/* + * Function should be called after receiving write confirmation. It updates + * internal structures and returns true if on-disk metadata should be updated. 
+ */ +bool +activemap_write_complete(struct activemap *amp, off_t offset, off_t length) +{ + bool modified; + off_t end; + int ext; + + assert(amp->am_magic == ACTIVEMAP_MAGIC); + assert(length > 0); + + modified = false; + end = offset + length - 1; + + for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) { + /* + * If the number of pending writes goes down to 0, we have to + * mark the extent as clean also in on-disk bitmap. + * By returning true we inform the caller that on-disk bitmap + * was modified and has to be flushed to disk. + */ + assert(amp->am_memtab[ext] > 0); + assert(bit_test(amp->am_memmap, ext)); + if (--amp->am_memtab[ext] == 0) { + bit_clear(amp->am_memmap, ext); + amp->am_ndirty--; + modified = true; + } + } + + return (modified); +} + +/* + * Function should be called after finishing synchronization of one extent. + * It returns true if on-disk metadata should be updated. + */ +bool +activemap_extent_complete(struct activemap *amp, int extent) +{ + bool modified; + int reqs; + + assert(amp->am_magic == ACTIVEMAP_MAGIC); + assert(extent >= 0 && extent < amp->am_nextents); + + modified = false; + + reqs = ext2reqs(amp, extent); + assert(amp->am_memtab[extent] >= reqs); + amp->am_memtab[extent] -= reqs; + assert(bit_test(amp->am_memmap, extent)); + if (amp->am_memtab[extent] == 0) { + bit_clear(amp->am_memmap, extent); + amp->am_ndirty--; + modified = true; + } + + return (modified); +} + +/* + * Function returns number of dirty regions. + */ +uint64_t +activemap_ndirty(const struct activemap *amp) +{ + + assert(amp->am_magic == ACTIVEMAP_MAGIC); + + return (amp->am_ndirty); +} + +/* + * Function compare on-disk bitmap and in-memory bitmap and returns true if + * they differ and should be flushed to the disk. 
+ */ +bool +activemap_differ(const struct activemap *amp) +{ + + assert(amp->am_magic == ACTIVEMAP_MAGIC); + + return (memcmp(amp->am_diskmap, amp->am_memmap, + amp->am_mapsize) != 0); +} + +/* + * Function returns number of bytes used by bitmap. + */ +size_t +activemap_size(const struct activemap *amp) +{ + + assert(amp->am_magic == ACTIVEMAP_MAGIC); + + return (amp->am_mapsize); +} + +/* + * Function returns number of bytes needed for storing on-disk bitmap. + * This is the same as activemap_size(), but rounded up to sector size. + */ +size_t +activemap_ondisk_size(const struct activemap *amp) +{ + + assert(amp->am_magic == ACTIVEMAP_MAGIC); + + return (amp->am_diskmapsize); +} + +/* + * Function copies the given buffer read from disk to the internal bitmap. + */ +void +activemap_copyin(struct activemap *amp, const unsigned char *buf, size_t size) +{ + int ext; + + assert(amp->am_magic == ACTIVEMAP_MAGIC); + assert(size >= amp->am_mapsize); + + memcpy(amp->am_diskmap, buf, amp->am_mapsize); + memcpy(amp->am_memmap, buf, amp->am_mapsize); + memcpy(amp->am_syncmap, buf, amp->am_mapsize); + + bit_ffs(amp->am_memmap, amp->am_nextents, &ext); + if (ext == -1) { + /* There are no dirty extents, so we can leave now. */ + return; + } + /* + * Set synchronization offset to the first dirty extent. + */ + activemap_sync_rewind(amp); + /* + * We have dirty extents and we want them to stay that way until + * we synchronize, so we set number of pending writes to number + * of requests needed to synchronize one extent. + */ + amp->am_ndirty = 0; + for (; ext < amp->am_nextents; ext++) { + if (bit_test(amp->am_memmap, ext)) { + amp->am_memtab[ext] = ext2reqs(amp, ext); + amp->am_ndirty++; + } + } +} + +/* + * Function merges the given bitmap with existing one.
+ */ +void +activemap_merge(struct activemap *amp, const unsigned char *buf, size_t size) +{ + bitstr_t *remmap = __DECONST(bitstr_t *, buf); + int ext; + + assert(amp->am_magic == ACTIVEMAP_MAGIC); + assert(size >= amp->am_mapsize); + + bit_ffs(remmap, amp->am_nextents, &ext); + if (ext == -1) { + /* There are no dirty extents, so we can leave now. */ + return; + } + /* + * We have dirty extents and we want them to stay that way until + * we synchronize, so we set number of pending writes to number + * of requests needed to synchronize one extent. + */ + for (; ext < amp->am_nextents; ext++) { + /* Local extent already dirty. */ + if (bit_test(amp->am_syncmap, ext)) + continue; + /* Remote extent isn't dirty. */ + if (!bit_test(remmap, ext)) + continue; + bit_set(amp->am_syncmap, ext); + bit_set(amp->am_memmap, ext); + bit_set(amp->am_diskmap, ext); + if (amp->am_memtab[ext] == 0) + amp->am_ndirty++; + amp->am_memtab[ext] = ext2reqs(amp, ext); + } + /* + * Set synchronization offset to the first dirty extent. + */ + activemap_sync_rewind(amp); +} + +/* + * Function returns pointer to internal bitmap that should be written to disk. + */ +const unsigned char * +activemap_bitmap(struct activemap *amp, size_t *sizep) +{ + + assert(amp->am_magic == ACTIVEMAP_MAGIC); + + if (sizep != NULL) + *sizep = amp->am_diskmapsize; + memcpy(amp->am_diskmap, amp->am_memmap, amp->am_mapsize); + keepdirty_fill(amp); + return ((const unsigned char *)amp->am_diskmap); +} + +/* + * Function calculates size needed to store bitmap on disk. 
+ */ +size_t +activemap_calc_ondisk_size(uint64_t mediasize, uint32_t extentsize, + uint32_t sectorsize) +{ + uint64_t nextents, mapsize; + + assert(mediasize > 0); + assert(extentsize > 0); + assert(powerof2(extentsize)); + assert(sectorsize > 0); + assert(powerof2(sectorsize)); + + nextents = ((mediasize - 1) / extentsize) + 1; + mapsize = sizeof(bitstr_t) * bitstr_size(nextents); + return (roundup2(mapsize, sectorsize)); +} + +/* + * Set synchronization offset to the first dirty extent. + */ +void +activemap_sync_rewind(struct activemap *amp) +{ + int ext; + + assert(amp->am_magic == ACTIVEMAP_MAGIC); + + bit_ffs(amp->am_syncmap, amp->am_nextents, &ext); + if (ext == -1) { + /* There are no extents to synchronize. */ + amp->am_syncoff = -2; + return; + } + /* + * Mark that we want to start synchronization from the beginning. + */ + amp->am_syncoff = -1; +} + +/* + * Return next offset of where we should synchronize. + */ +off_t +activemap_sync_offset(struct activemap *amp, off_t *lengthp, int *syncextp) +{ + off_t syncoff, left; + int ext; + + assert(amp->am_magic == ACTIVEMAP_MAGIC); + assert(lengthp != NULL); + assert(syncextp != NULL); + + *syncextp = -1; + + if (amp->am_syncoff == -2) + return (-1); + + if (amp->am_syncoff >= 0 && + (amp->am_syncoff + MAXPHYS >= amp->am_mediasize || + off2ext(amp, amp->am_syncoff) != + off2ext(amp, amp->am_syncoff + MAXPHYS))) { + /* + * We are about to change extent, so mark previous one as clean. + */ + ext = off2ext(amp, amp->am_syncoff); + bit_clear(amp->am_syncmap, ext); + *syncextp = ext; + amp->am_syncoff = -1; + } + + if (amp->am_syncoff == -1) { + /* + * Let's find first extent to synchronize. + */ + bit_ffs(amp->am_syncmap, amp->am_nextents, &ext); + if (ext == -1) { + amp->am_syncoff = -2; + return (-1); + } + amp->am_syncoff = ext2off(amp, ext); + } else { + /* + * We don't change extent, so just increase offset.
+ */ + amp->am_syncoff += MAXPHYS; + if (amp->am_syncoff >= amp->am_mediasize) { + amp->am_syncoff = -2; + return (-1); + } + } + + syncoff = amp->am_syncoff; + left = ext2off(amp, off2ext(amp, syncoff)) + + amp->am_extentsize - syncoff; + if (syncoff + left > amp->am_mediasize) + left = amp->am_mediasize - syncoff; + if (left > MAXPHYS) + left = MAXPHYS; + + assert(left >= 0 && left <= MAXPHYS); + assert(syncoff >= 0 && syncoff < amp->am_mediasize); + assert(syncoff + left >= 0 && syncoff + left <= amp->am_mediasize); + + *lengthp = left; + return (syncoff); +} + +/* + * Mark extent(s) containing the given region for synchronization. + * Most likely one of the components is unavailable. + */ +bool +activemap_need_sync(struct activemap *amp, off_t offset, off_t length) +{ + bool modified; + off_t end; + int ext; + + assert(amp->am_magic == ACTIVEMAP_MAGIC); + + modified = false; + end = offset + length - 1; + + for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) { + if (bit_test(amp->am_syncmap, ext)) { + /* Already marked for synchronization. */ + assert(bit_test(amp->am_memmap, ext)); + continue; + } + bit_set(amp->am_syncmap, ext); + if (!bit_test(amp->am_memmap, ext)) { + bit_set(amp->am_memmap, ext); + amp->am_ndirty++; + } + amp->am_memtab[ext] += ext2reqs(amp, ext); + modified = true; + } + + return (modified); +} + +void +activemap_dump(const struct activemap *amp) +{ + int bit; + + printf("M: "); + for (bit = 0; bit < amp->am_nextents; bit++) + printf("%d", bit_test(amp->am_memmap, bit) ? 1 : 0); + printf("\n"); + printf("D: "); + for (bit = 0; bit < amp->am_nextents; bit++) + printf("%d", bit_test(amp->am_diskmap, bit) ? 1 : 0); + printf("\n"); + printf("S: "); + for (bit = 0; bit < amp->am_nextents; bit++) + printf("%d", bit_test(amp->am_syncmap, bit) ? 
1 : 0); + printf("\n"); +} diff --git a/sbin/hastd/activemap.h b/sbin/hastd/activemap.h new file mode 100644 index 0000000..42f0221 --- /dev/null +++ b/sbin/hastd/activemap.h @@ -0,0 +1,69 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _ACTIVEMAP_H_ +#define _ACTIVEMAP_H_ + +#include <stdbool.h> +#include <stdint.h> + +struct activemap; + +int activemap_init(struct activemap **ampp, uint64_t mediasize, + uint32_t extentsize, uint32_t sectorsize, uint32_t keepdirty); +void activemap_free(struct activemap *amp); + +bool activemap_write_start(struct activemap *amp, off_t offset, off_t length); +bool activemap_write_complete(struct activemap *amp, off_t offset, + off_t length); +bool activemap_extent_complete(struct activemap *amp, int extent); +uint64_t activemap_ndirty(const struct activemap *amp); + +bool activemap_differ(const struct activemap *amp); +size_t activemap_size(const struct activemap *amp); +size_t activemap_ondisk_size(const struct activemap *amp); +void activemap_copyin(struct activemap *amp, const unsigned char *buf, + size_t size); +void activemap_merge(struct activemap *amp, const unsigned char *buf, + size_t size); +const unsigned char *activemap_bitmap(struct activemap *amp, size_t *sizep); + +size_t activemap_calc_ondisk_size(uint64_t mediasize, uint32_t extentsize, + uint32_t sectorsize); + +void activemap_sync_rewind(struct activemap *amp); +off_t activemap_sync_offset(struct activemap *amp, off_t *lengthp, + int *syncextp); +bool activemap_need_sync(struct activemap *amp, off_t offset, off_t length); + +void activemap_dump(const struct activemap *amp); + +#endif /* !_ACTIVEMAP_H_ */ diff --git a/sbin/hastd/control.c b/sbin/hastd/control.c new file mode 100644 index 0000000..0ad39b4 --- /dev/null +++ b/sbin/hastd/control.c @@ -0,0 +1,426 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/wait.h> +#include <signal.h> + +#include <assert.h> +#include <errno.h> +#include <pthread.h> +#include <stdio.h> +#include <string.h> + +#include "hast.h" +#include "hastd.h" +#include "hast_proto.h" +#include "nv.h" +#include "pjdlog.h" +#include "proto.h" +#include "subr.h" + +#include "control.h" + +static void +control_set_role(struct hastd_config *cfg, struct nv *nvout, uint8_t role, + struct hast_resource *res, const char *name, unsigned int no) +{ + + assert(cfg != NULL); + assert(nvout != NULL); + assert(name != NULL); + + /* Name is always needed. 
*/ + nv_add_string(nvout, name, "resource%u", no); + + if (res == NULL) { + TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) { + if (strcmp(res->hr_name, name) == 0) + break; + } + if (res == NULL) { + nv_add_int16(nvout, EHAST_NOENTRY, "error%u", no); + return; + } + } + assert(res != NULL); + + /* Send previous role back. */ + nv_add_string(nvout, role2str(res->hr_role), "role%u", no); + + /* Nothing changed, return here. */ + if (role == res->hr_role) + return; + + pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role)); + pjdlog_info("Role changed to %s.", role2str(role)); + + /* Change role to the new one. */ + res->hr_role = role; + pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role)); + + /* + * If previous role was primary or secondary we have to kill process + * doing that work. + */ + if (res->hr_workerpid != 0) { + if (kill(res->hr_workerpid, SIGTERM) < 0) { + pjdlog_errno(LOG_WARNING, + "Unable to kill worker process %u", + (unsigned int)res->hr_workerpid); + } else if (waitpid(res->hr_workerpid, NULL, 0) != + res->hr_workerpid) { + pjdlog_errno(LOG_WARNING, + "Error while waiting for worker process %u", + (unsigned int)res->hr_workerpid); + } else { + pjdlog_debug(1, "Worker process %u stopped.", + (unsigned int)res->hr_workerpid); + } + res->hr_workerpid = 0; + } + + /* Start worker process if we are changing to primary. */ + if (role == HAST_ROLE_PRIMARY) + hastd_primary(res); + pjdlog_prefix_set("%s", ""); +} + +static void +control_status_worker(struct hast_resource *res, struct nv *nvout, + unsigned int no) +{ + struct nv *cnvin, *cnvout; + const char *str; + int error; + + cnvin = cnvout = NULL; + error = 0; + + /* + * Prepare and send command to worker process. 
+ */ + cnvout = nv_alloc(); + nv_add_uint8(cnvout, HASTCTL_STATUS, "cmd"); + error = nv_error(cnvout); + if (error != 0) { + /* LOG */ + goto end; + } + if (hast_proto_send(res, res->hr_ctrl, cnvout, NULL, 0) < 0) { + error = errno; + /* LOG */ + goto end; + } + + /* + * Receive response. + */ + if (hast_proto_recv_hdr(res->hr_ctrl, &cnvin) < 0) { + error = errno; + /* LOG */ + goto end; + } + + error = nv_get_int64(cnvin, "error"); + if (error != 0) + goto end; + + if ((str = nv_get_string(cnvin, "status")) == NULL) { + error = ENOENT; + /* LOG */ + goto end; + } + nv_add_string(nvout, str, "status%u", no); + nv_add_uint64(nvout, nv_get_uint64(cnvin, "dirty"), "dirty%u", no); + nv_add_uint32(nvout, nv_get_uint32(cnvin, "extentsize"), + "extentsize%u", no); + nv_add_uint32(nvout, nv_get_uint32(cnvin, "keepdirty"), + "keepdirty%u", no); +end: + if (cnvin != NULL) + nv_free(cnvin); + if (cnvout != NULL) + nv_free(cnvout); + if (error != 0) + nv_add_int16(nvout, error, "error"); +} + +static void +control_status(struct hastd_config *cfg, struct nv *nvout, + struct hast_resource *res, const char *name, unsigned int no) +{ + + assert(cfg != NULL); + assert(nvout != NULL); + assert(name != NULL); + + /* Name is always needed. 
*/ + nv_add_string(nvout, name, "resource%u", no); + + if (res == NULL) { + TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) { + if (strcmp(res->hr_name, name) == 0) + break; + } + if (res == NULL) { + nv_add_int16(nvout, EHAST_NOENTRY, "error%u", no); + return; + } + } + assert(res != NULL); + nv_add_string(nvout, res->hr_provname, "provname%u", no); + nv_add_string(nvout, res->hr_localpath, "localpath%u", no); + nv_add_string(nvout, res->hr_remoteaddr, "remoteaddr%u", no); + switch (res->hr_replication) { + case HAST_REPLICATION_FULLSYNC: + nv_add_string(nvout, "fullsync", "replication%u", no); + break; + case HAST_REPLICATION_MEMSYNC: + nv_add_string(nvout, "memsync", "replication%u", no); + break; + case HAST_REPLICATION_ASYNC: + nv_add_string(nvout, "async", "replication%u", no); + break; + default: + nv_add_string(nvout, "unknown", "replication%u", no); + break; + } + nv_add_string(nvout, role2str(res->hr_role), "role%u", no); + + switch (res->hr_role) { + case HAST_ROLE_PRIMARY: + assert(res->hr_workerpid != 0); + /* FALLTHROUGH */ + case HAST_ROLE_SECONDARY: + if (res->hr_workerpid != 0) + break; + /* FALLTHROUGH */ + default: + return; + } + + /* + * If we are here, it means that we have a worker process, which we + * want to ask some questions. + */ + control_status_worker(res, nvout, no); +} + +void +control_handle(struct hastd_config *cfg) +{ + struct proto_conn *conn; + struct nv *nvin, *nvout; + unsigned int ii; + const char *str; + uint8_t cmd, role; + int error; + + if (proto_accept(cfg->hc_controlconn, &conn) < 0) { + pjdlog_errno(LOG_ERR, "Unable to accept control connection"); + return; + } + + nvin = nvout = NULL; + role = HAST_ROLE_UNDEF; + + if (hast_proto_recv_hdr(conn, &nvin) < 0) { + pjdlog_errno(LOG_ERR, "Unable to receive control header"); + nvin = NULL; + goto close; + } + + /* Obtain command code. 0 means that nv_get_uint8() failed. 
*/ + cmd = nv_get_uint8(nvin, "cmd"); + if (cmd == 0) { + pjdlog_error("Control header is missing 'cmd' field."); + error = EHAST_INVALID; + goto close; + } + + /* Allocate outgoing nv structure. */ + nvout = nv_alloc(); + if (nvout == NULL) { + pjdlog_error("Unable to allocate header for control response."); + error = EHAST_NOMEMORY; + goto close; + } + + error = 0; + + str = nv_get_string(nvin, "resource0"); + if (str == NULL) { + pjdlog_error("Control header is missing 'resource0' field."); + error = EHAST_INVALID; + goto fail; + } + if (cmd == HASTCTL_SET_ROLE) { + role = nv_get_uint8(nvin, "role"); + switch (role) { + case HAST_ROLE_INIT: /* Is that valid to set, hmm? */ + case HAST_ROLE_PRIMARY: + case HAST_ROLE_SECONDARY: + break; + default: + pjdlog_error("Invalid role received (%hhu).", role); + error = EHAST_INVALID; + goto fail; + } + } + if (strcmp(str, "all") == 0) { + struct hast_resource *res; + + /* All configured resources. */ + + ii = 0; + TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) { + switch (cmd) { + case HASTCTL_SET_ROLE: + control_set_role(cfg, nvout, role, res, + res->hr_name, ii++); + break; + case HASTCTL_STATUS: + control_status(cfg, nvout, res, res->hr_name, + ii++); + break; + default: + pjdlog_error("Invalid command received (%hhu).", + cmd); + error = EHAST_UNIMPLEMENTED; + goto fail; + } + } + } else { + /* Only selected resources. 
*/ + + for (ii = 0; ; ii++) { + str = nv_get_string(nvin, "resource%u", ii); + if (str == NULL) + break; + switch (cmd) { + case HASTCTL_SET_ROLE: + control_set_role(cfg, nvout, role, NULL, str, + ii); + break; + case HASTCTL_STATUS: + control_status(cfg, nvout, NULL, str, ii); + break; + default: + pjdlog_error("Invalid command received (%hhu).", + cmd); + error = EHAST_UNIMPLEMENTED; + goto fail; + } + } + } + if (nv_error(nvout) != 0) + goto close; +fail: + if (error != 0) + nv_add_int16(nvout, error, "error"); + + if (hast_proto_send(NULL, conn, nvout, NULL, 0) < 0) + pjdlog_errno(LOG_ERR, "Unable to send control response"); +close: + if (nvin != NULL) + nv_free(nvin); + if (nvout != NULL) + nv_free(nvout); + proto_close(conn); +} + +/* + * Thread handles control requests from the parent. + */ +void * +ctrl_thread(void *arg) +{ + struct hast_resource *res = arg; + struct nv *nvin, *nvout; + uint8_t cmd; + + for (;;) { + if (hast_proto_recv_hdr(res->hr_ctrl, &nvin) < 0) { + if (sigexit_received) + pthread_exit(NULL); + pjdlog_errno(LOG_ERR, + "Unable to receive control message"); + continue; + } + cmd = nv_get_uint8(nvin, "cmd"); + if (cmd == 0) { + pjdlog_error("Control message is missing 'cmd' field."); + nv_free(nvin); + continue; + } + nv_free(nvin); + nvout = nv_alloc(); + switch (cmd) { + case HASTCTL_STATUS: + if (res->hr_remotein != NULL && + res->hr_remoteout != NULL) { + nv_add_string(nvout, "complete", "status"); + } else { + nv_add_string(nvout, "degraded", "status"); + } + nv_add_uint32(nvout, (uint32_t)res->hr_extentsize, + "extentsize"); + if (res->hr_role == HAST_ROLE_PRIMARY) { + nv_add_uint32(nvout, + (uint32_t)res->hr_keepdirty, "keepdirty"); + nv_add_uint64(nvout, + (uint64_t)(activemap_ndirty(res->hr_amp) * + res->hr_extentsize), "dirty"); + } else { + nv_add_uint32(nvout, (uint32_t)0, "keepdirty"); + nv_add_uint64(nvout, (uint64_t)0, "dirty"); + } + break; + default: + nv_add_int16(nvout, EINVAL, "error"); + break; + } + if (nv_error(nvout) 
!= 0) { + pjdlog_error("Unable to create answer on control message."); + nv_free(nvout); + continue; + } + if (hast_proto_send(NULL, res->hr_ctrl, nvout, NULL, 0) < 0) { + pjdlog_errno(LOG_ERR, + "Unable to send reply to control message"); + } + nv_free(nvout); + } + /* NOTREACHED */ + return (NULL); +} diff --git a/sbin/hastd/control.h b/sbin/hastd/control.h new file mode 100644 index 0000000..15ea290 --- /dev/null +++ b/sbin/hastd/control.h @@ -0,0 +1,44 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _CONTROL_H_ +#define _CONTROL_H_ + +#define HASTCTL_SET_ROLE 1 +#define HASTCTL_STATUS 2 + +struct hastd_config; + +void control_handle(struct hastd_config *cfg); + +void *ctrl_thread(void *arg); + +#endif /* !_CONTROL_H_ */ diff --git a/sbin/hastd/ebuf.c b/sbin/hastd/ebuf.c new file mode 100644 index 0000000..47b7530 --- /dev/null +++ b/sbin/hastd/ebuf.c @@ -0,0 +1,252 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> + +#include <assert.h> +#include <errno.h> +#include <stdbool.h> +#include <stdint.h> +#include <strings.h> +#include <unistd.h> + +#include "ebuf.h" + +#define EBUF_MAGIC 0xeb0f41c +struct ebuf { + /* Magic to assert the caller uses valid structure. */ + int eb_magic; + /* Address where we did the allocation. */ + unsigned char *eb_start; + /* Allocation end address. */ + unsigned char *eb_end; + /* Start of real data. */ + unsigned char *eb_used; + /* Size of real data. */ + size_t eb_size; +}; + +static int ebuf_head_extent(struct ebuf *eb, size_t size); +static int ebuf_tail_extent(struct ebuf *eb, size_t size); + +struct ebuf * +ebuf_alloc(size_t size) +{ + struct ebuf *eb; + int rerrno; + + eb = malloc(sizeof(*eb)); + if (eb == NULL) + return (NULL); + size += PAGE_SIZE; + eb->eb_start = malloc(size); + if (eb->eb_start == NULL) { + rerrno = errno; + free(eb); + errno = rerrno; + return (NULL); + } + eb->eb_end = eb->eb_start + size; + /* + * We set start address for real data not at the first entry, because + * we want to be able to add data at the front. + */ + eb->eb_used = eb->eb_start + PAGE_SIZE / 4; + eb->eb_size = 0; + eb->eb_magic = EBUF_MAGIC; + + return (eb); +} + +void +ebuf_free(struct ebuf *eb) +{ + + assert(eb != NULL && eb->eb_magic == EBUF_MAGIC); + + eb->eb_magic = 0; + + free(eb->eb_start); + free(eb); +} + +int +ebuf_add_head(struct ebuf *eb, const void *data, size_t size) +{ + + assert(eb != NULL && eb->eb_magic == EBUF_MAGIC); + + if (size > (size_t)(eb->eb_used - eb->eb_start)) { + /* + * We can't add more entries at the front, so we have to extend + * our buffer. + */ + if (ebuf_head_extent(eb, size) < 0) + return (-1); + } + assert(size <= (size_t)(eb->eb_used - eb->eb_start)); + + eb->eb_size += size; + eb->eb_used -= size; + /* + * If data is NULL the caller just wants to reserve place. 
+ */ + if (data != NULL) + bcopy(data, eb->eb_used, size); + + return (0); +} + +int +ebuf_add_tail(struct ebuf *eb, const void *data, size_t size) +{ + + assert(eb != NULL && eb->eb_magic == EBUF_MAGIC); + + if (size > (size_t)(eb->eb_end - (eb->eb_used + eb->eb_size))) { + /* + * We can't add more entries at the back, so we have to extend + * our buffer. + */ + if (ebuf_tail_extent(eb, size) < 0) + return (-1); + } + assert(size <= (size_t)(eb->eb_end - (eb->eb_used + eb->eb_size))); + + /* + * If data is NULL the caller just wants to reserve place. + */ + if (data != NULL) + bcopy(data, eb->eb_used + eb->eb_size, size); + eb->eb_size += size; + + return (0); +} + +void +ebuf_del_head(struct ebuf *eb, size_t size) +{ + + assert(eb != NULL && eb->eb_magic == EBUF_MAGIC); + assert(size <= eb->eb_size); + + eb->eb_used += size; + eb->eb_size -= size; +} + +void +ebuf_del_tail(struct ebuf *eb, size_t size) +{ + + assert(eb != NULL && eb->eb_magic == EBUF_MAGIC); + assert(size <= eb->eb_size); + + eb->eb_size -= size; +} + +/* + * Return pointer to the data and data size. + */ +void * +ebuf_data(struct ebuf *eb, size_t *sizep) +{ + + assert(eb != NULL && eb->eb_magic == EBUF_MAGIC); + + if (sizep != NULL) + *sizep = eb->eb_size; + return (eb->eb_size > 0 ? eb->eb_used : NULL); +} + +/* + * Return data size. + */ +size_t +ebuf_size(struct ebuf *eb) +{ + + assert(eb != NULL && eb->eb_magic == EBUF_MAGIC); + + return (eb->eb_size); +} + +/* + * Function adds size + (PAGE_SIZE / 4) bytes at the front of the buffer.
+ */ +static int +ebuf_head_extent(struct ebuf *eb, size_t size) +{ + unsigned char *newstart, *newused; + size_t newsize; + + assert(eb != NULL && eb->eb_magic == EBUF_MAGIC); + + newsize = eb->eb_end - eb->eb_start + (PAGE_SIZE / 4) + size; + + newstart = malloc(newsize); + if (newstart == NULL) + return (-1); + newused = + newstart + (PAGE_SIZE / 4) + size + (eb->eb_used - eb->eb_start); + + bcopy(eb->eb_used, newused, eb->eb_size); + + eb->eb_start = newstart; + eb->eb_used = newused; + eb->eb_end = newstart + newsize; + + return (0); +} + +/* + * Function adds size + ((3 * PAGE_SIZE) / 4) bytes at the back. + */ +static int +ebuf_tail_extent(struct ebuf *eb, size_t size) +{ + unsigned char *newstart; + size_t newsize; + + assert(eb != NULL && eb->eb_magic == EBUF_MAGIC); + + newsize = eb->eb_end - eb->eb_start + size + ((3 * PAGE_SIZE) / 4); + + newstart = realloc(eb->eb_start, newsize); + if (newstart == NULL) + return (-1); + + eb->eb_used = newstart + (eb->eb_used - eb->eb_start); + eb->eb_start = newstart; + eb->eb_end = newstart + newsize; + + return (0); +} diff --git a/sbin/hastd/ebuf.h b/sbin/hastd/ebuf.h new file mode 100644 index 0000000..06275e7 --- /dev/null +++ b/sbin/hastd/ebuf.h @@ -0,0 +1,51 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _EBUF_H_ +#define _EBUF_H_ + +#include <stdlib.h> /* size_t */ + +struct ebuf; + +struct ebuf *ebuf_alloc(size_t size); +void ebuf_free(struct ebuf *eb); + +int ebuf_add_head(struct ebuf *eb, const void *data, size_t size); +int ebuf_add_tail(struct ebuf *eb, const void *data, size_t size); + +void ebuf_del_head(struct ebuf *eb, size_t size); +void ebuf_del_tail(struct ebuf *eb, size_t size); + +void *ebuf_data(struct ebuf *eb, size_t *sizep); +size_t ebuf_size(struct ebuf *eb); + +#endif /* !_EBUF_H_ */ diff --git a/sbin/hastd/hast.conf.5 b/sbin/hastd/hast.conf.5 new file mode 100644 index 0000000..5734ee8 --- /dev/null +++ b/sbin/hastd/hast.conf.5 @@ -0,0 +1,267 @@ +.\" Copyright (c) 2010 The FreeBSD Foundation +.\" All rights reserved. +.\" +.\" This software was developed by Pawel Jakub Dawidek under sponsorship from +.\" the FreeBSD Foundation. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. 
Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd February 1, 2010 +.Dt HAST.CONF 5 +.Os +.Sh NAME +.Nm hast.conf +.Nd configuration file for the +.Xr hastd 8 +daemon and the +.Xr hastctl 8 +utility. +.Sh DESCRIPTION +The +.Nm +file is used by both +.Xr hastd 8 +daemon +and +.Xr hastctl 8 +control utility. +Configuration file is designed in a way that exactly the same file can be +(and should be) used on both HAST nodes. +Every line starting with # is treated as comment and ignored.
+.Sh CONFIGURATION FILE SYNTAX +General syntax of the +.Nm +file is following: +.Bd -literal -offset indent +# Global section +control <addr> +listen <addr> +replication <mode> + +on <node> { + # Node section + control <addr> + listen <addr> +} + +on <node> { + # Node section + control <addr> + listen <addr> +} + +resource <name> { + # Resource section + replication <mode> + name <name> + local <path> + + on <node> { + # Resource-node section + name <name> + # Required + local <path> + # Required + remote <addr> + } + on <node> { + # Resource-node section + name <name> + # Required + local <path> + # Required + remote <addr> + } +} +.Ed +.Pp +Most of the various available configuration parameters are optional. +If parameter is not defined in the particular section, it will be +inherited from the parent section. +For example, if the +.Ic listen +parameter is not defined in the node section, it will be inherited from +the global section. +In case the global section does not define the +.Ic listen +parameter at all, the default value will be used. +.Sh CONFIGURATION FILE DESCRIPTION +The +.Aq node +argument can be replaced either by a full hostname as obtained by +.Xr gethostname 3 , +only first part of the hostname, or by node's UUID as found in the +.Va kern.hostuuid +.Xr sysctl 8 +variable. +.Pp +The following statements are available: +.Bl -tag -width ".Ic xxxx" +.It Ic control Aq addr +.Pp +Address for communication with +.Xr hastctl 8 . +Each of the following examples defines the same control address: +.Bd -literal -offset indent +uds:///var/run/hastctl +unix:///var/run/hastctl +/var/run/hastctl +.Ed +.Pp +The default value is +.Pa uds:///var/run/hastctl . 
+.It Ic listen Aq addr +.Pp +Address to listen on in form of: +.Bd -literal -offset indent +protocol://protocol-specific-address +.Ed +.Pp +Each of the following examples defines the same listen address: +.Bd -literal -offset indent +0.0.0.0 +0.0.0.0:8457 +tcp://0.0.0.0 +tcp://0.0.0.0:8457 +tcp4://0.0.0.0 +tcp4://0.0.0.0:8457 +.Ed +.Pp +The default value is +.Pa tcp4://0.0.0.0:8457 . +.It Ic replication Aq mode +.Pp +Replication mode should be one of the following: +.Bl -tag -width ".Ic xxxx" +.It Ic memsync +.Pp +Report the write operation as completed when local write completes and +when the remote node acknowledges the data receipt, but before it +actually stores the data. +The data on remote node will be stored directly after sending +acknowledgement. +This mode is intended to reduce latency, but still provides a very good +reliability. +The only situation where some small amount of data could be lost is when +the data is stored on primary node and sent to the secondary. +Secondary node then acknowledges data receipt and primary reports +success to an application. +However, it may happen that the secondary goes down before the received +data is really stored locally. +Before secondary node returns, primary node dies entirely. +When the secondary node comes back to life it becomes the new primary. +Unfortunately some small amount of data which was confirmed to be stored +to the application was lost. +The risk of such a situation is very small, which is the reason for this +mode to be the default. +.It Ic fullsync +.Pp +Mark the write operation as completed when local as well as remote +write completes. +This is the safest and the slowest replication mode. +The +.Ic fullsync +replication mode is currently not implemented. +.It Ic async +.Pp +The write operation is reported as complete right after the local write +completes. +This is the fastest and the most dangerous replication mode.
+This mode should be used when replicating to a distant node where +latency is too high for other modes. +The +.Ic async +replication mode is currently not implemented. +.El +.It Ic name Aq name +.Pp +GEOM provider name that will appear as +.Pa /dev/hast/<name> . +If name is not defined, resource name will be used as provider name. +.It Ic local Aq path +.Pp +Path to the local component which will be used as backend provider for +the resource. +This can be either GEOM provider or regular file. +.It Ic remote Aq addr +.Pp +Address of the remote +.Nm hastd +daemon. +Format is the same as for the +.Ic listen +statement. +When operating as a primary node this address will be used to connect to +the secondary node. +When operating as a secondary node only connections from this address +will be accepted. +.El +.Sh EXAMPLES +The example configuration file can look as follows: +.Bd -literal -offset indent +resource shared { + local /dev/da0 + + on hasta { + remote tcp4://10.0.0.2 + } + on hastb { + remote tcp4://10.0.0.1 + } +} +resource tank { + on hasta { + local /dev/mirror/tanka + remote tcp4://10.0.0.2 + } + on hastb { + local /dev/mirror/tankb + remote tcp4://10.0.0.1 + } +} +.Ed +.Sh FILES +.Bl -tag -width ".Pa /var/run/hastctl" -compact +.It Pa /etc/hast.conf +The default +.Nm +configuration file. +.It Pa /var/run/hastctl +Control socket used by the +.Xr hastctl 8 +control utility to communicate with the +.Xr hastd 8 +daemon. +.El +.Sh SEE ALSO +.Xr gethostname 3 , +.Xr geom 4 , +.Xr hastctl 8 , +.Xr hastd 8 . +.Sh AUTHORS +The +.Nm +was written by +.An Pawel Jakub Dawidek Aq pjd@FreeBSD.org +under sponsorship of the FreeBSD Foundation. diff --git a/sbin/hastd/hast.h b/sbin/hastd/hast.h new file mode 100644 index 0000000..c5220b5 --- /dev/null +++ b/sbin/hastd/hast.h @@ -0,0 +1,190 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. 
+ * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _HAST_H_ +#define _HAST_H_ + +#include <sys/queue.h> +#include <sys/socket.h> + +#include <arpa/inet.h> + +#include <netinet/in.h> + +#include <limits.h> +#include <pthread.h> +#include <stdbool.h> +#include <stdint.h> + +#include <activemap.h> + +#include "proto.h" + +#define HAST_PROTO_VERSION 0 + +#define EHAST_OK 0 +#define EHAST_NOENTRY 1 +#define EHAST_INVALID 2 +#define EHAST_NOMEMORY 3 +#define EHAST_UNIMPLEMENTED 4 + +#define HASTCTL_CMD_UNKNOWN 0 +#define HASTCTL_CMD_SETROLE 1 +#define HASTCTL_CMD_STATUS 2 + +#define HAST_ROLE_UNDEF 0 +#define HAST_ROLE_INIT 1 +#define HAST_ROLE_PRIMARY 2 +#define HAST_ROLE_SECONDARY 3 + +#define HAST_SYNCSRC_UNDEF 0 +#define HAST_SYNCSRC_PRIMARY 1 +#define HAST_SYNCSRC_SECONDARY 2 + +#define HIO_UNDEF 0 +#define HIO_READ 1 +#define HIO_WRITE 2 +#define HIO_DELETE 3 +#define HIO_FLUSH 4 + +#define HAST_CONFIG "/etc/hast.conf" +#define HAST_CONTROL "/var/run/hastctl" +#define HASTD_PORT 8457 +#define HASTD_LISTEN "tcp4://0.0.0.0:8457" +#define HASTD_PIDFILE "/var/run/hastd.pid" + +/* Default extent size. */ +#define HAST_EXTENTSIZE 2097152 +/* Default maximum number of extents that are kept dirty. */ +#define HAST_KEEPDIRTY 64 + +#define HAST_ADDRSIZE 1024 +#define HAST_TOKEN_SIZE 16 + +struct hastd_config { + /* Address to communicate with hastctl(8). */ + char hc_controladdr[HAST_ADDRSIZE]; + /* Protocol-specific data. */ + struct proto_conn *hc_controlconn; + /* Address to listen on. */ + char hc_listenaddr[HAST_ADDRSIZE]; + /* Protocol-specific data. */ + struct proto_conn *hc_listenconn; + /* List of resources. */ + TAILQ_HEAD(, hast_resource) hc_resources; +}; + +#define HAST_REPLICATION_FULLSYNC 0 +#define HAST_REPLICATION_MEMSYNC 1 +#define HAST_REPLICATION_ASYNC 2 + +/* + * Structure that describes single resource. + */ +struct hast_resource { + /* Resource name. */ + char hr_name[NAME_MAX]; + /* Replication mode (HAST_REPLICATION_*). 
*/ + int hr_replication; + /* Provider name that will appear in /dev/hast/. */ + char hr_provname[NAME_MAX]; + /* Synchronization extent size. */ + int hr_extentsize; + /* Maximum number of extents that are kept dirty. */ + int hr_keepdirty; + + /* Path to local component. */ + char hr_localpath[PATH_MAX]; + /* Descriptor to access local component. */ + int hr_localfd; + /* Offset into local component. */ + off_t hr_localoff; + /* Size of usable space. */ + off_t hr_datasize; + /* Size of entire local provider. */ + off_t hr_local_mediasize; + /* Sector size of local provider. */ + unsigned int hr_local_sectorsize; + + /* Descriptor for /dev/ggctl communication. */ + int hr_ggatefd; + /* Unit number for ggate communication. */ + int hr_ggateunit; + + /* Address of the remote component. */ + char hr_remoteaddr[HAST_ADDRSIZE]; + /* Connection for incoming data. */ + struct proto_conn *hr_remotein; + /* Connection for outgoing data. */ + struct proto_conn *hr_remoteout; + /* Token to verify both in and out connection are coming from + the same node (not necessarily from the same address). */ + unsigned char hr_token[HAST_TOKEN_SIZE]; + + /* Resource unique identifier. */ + uint64_t hr_resuid; + /* Primary's local modification count. */ + uint64_t hr_primary_localcnt; + /* Primary's remote modification count. */ + uint64_t hr_primary_remotecnt; + /* Secondary's local modification count. */ + uint64_t hr_secondary_localcnt; + /* Secondary's remote modification count. */ + uint64_t hr_secondary_remotecnt; + /* Synchronization source. */ + uint8_t hr_syncsrc; + + /* Resource role: HAST_ROLE_{INIT,PRIMARY,SECONDARY}. */ + int hr_role; + /* Previous resource role: HAST_ROLE_{INIT,PRIMARY,SECONDARY}. */ + int hr_previous_role; + /* PID of child worker process. 0 - no child. */ + pid_t hr_workerpid; + /* Control connection between parent and child. */ + struct proto_conn *hr_ctrl; + + /* Activemap structure. 
*/ + struct activemap *hr_amp; + /* Locked used to synchronize access to hr_amp. */ + pthread_mutex_t hr_amp_lock; + + /* Next resource. */ + TAILQ_ENTRY(hast_resource) hr_next; +}; + +struct hastd_config *yy_config_parse(const char *config); +void yy_config_free(struct hastd_config *config); + +void yyerror(const char *); +int yylex(void); +int yyparse(void); + +#endif /* !_HAST_H_ */ diff --git a/sbin/hastd/hast_proto.c b/sbin/hastd/hast_proto.c new file mode 100644 index 0000000..6e66006 --- /dev/null +++ b/sbin/hastd/hast_proto.c @@ -0,0 +1,401 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/endian.h> + +#include <assert.h> +#include <errno.h> +#include <string.h> +#include <strings.h> + +#include <openssl/sha.h> + +#include <hast.h> +#include <ebuf.h> +#include <nv.h> +#include <pjdlog.h> +#include <proto.h> + +#include "hast_proto.h" + +struct hast_main_header { + /* Protocol version. */ + uint8_t version; + /* Size of nv headers. */ + uint32_t size; +} __packed; + +typedef int hps_send_t(struct hast_resource *, struct nv *nv, void **, size_t *, bool *); +typedef int hps_recv_t(struct hast_resource *, struct nv *nv, void **, size_t *, bool *); + +struct hast_pipe_stage { + const char *hps_name; + hps_send_t *hps_send; + hps_recv_t *hps_recv; +}; + +static int compression_send(struct hast_resource *res, struct nv *nv, + void **datap, size_t *sizep, bool *freedatap); +static int compression_recv(struct hast_resource *res, struct nv *nv, + void **datap, size_t *sizep, bool *freedatap); +static int checksum_send(struct hast_resource *res, struct nv *nv, + void **datap, size_t *sizep, bool *freedatap); +static int checksum_recv(struct hast_resource *res, struct nv *nv, + void **datap, size_t *sizep, bool *freedatap); + +static struct hast_pipe_stage pipeline[] = { + { "compression", compression_send, compression_recv }, + { "checksum", checksum_send, checksum_recv } +}; + +static int +compression_send(struct hast_resource *res, struct nv *nv, void **datap, + 
size_t *sizep, bool *freedatap) +{ + unsigned char *newbuf; + + res = res; /* TODO */ + + /* + * TODO: For now we emulate compression. + * At 80% probability we succeed to compress data, which means we + * allocate new buffer, copy the data over set *freedatap to true. + */ + + if (arc4random_uniform(100) < 80) { + uint32_t *origsize; + + /* + * Compression succeeded (but we will grow by 4 bytes, not + * shrink for now). + */ + newbuf = malloc(sizeof(uint32_t) + *sizep); + if (newbuf == NULL) + return (-1); + origsize = (void *)newbuf; + *origsize = htole32((uint32_t)*sizep); + nv_add_string(nv, "null", "compression"); + if (nv_error(nv) != 0) { + free(newbuf); + errno = nv_error(nv); + return (-1); + } + bcopy(*datap, newbuf + sizeof(uint32_t), *sizep); + if (*freedatap) + free(*datap); + *freedatap = true; + *datap = newbuf; + *sizep = sizeof(uint32_t) + *sizep; + } else { + /* + * Compression failed, so we leave everything as it was. + * It is not critical for compression to succeed. + */ + } + + return (0); +} + +static int +compression_recv(struct hast_resource *res, struct nv *nv, void **datap, + size_t *sizep, bool *freedatap) +{ + unsigned char *newbuf; + const char *algo; + size_t origsize; + + res = res; /* TODO */ + + /* + * TODO: For now we emulate compression. + */ + + algo = nv_get_string(nv, "compression"); + if (algo == NULL) + return (0); /* No compression. */ + if (strcmp(algo, "null") != 0) { + pjdlog_error("Unknown compression algorithm '%s'.", algo); + return (-1); /* Unknown compression algorithm. 
*/ + } + + origsize = le32toh(*(uint32_t *)*datap); + newbuf = malloc(origsize); + if (newbuf == NULL) + return (-1); + bcopy((unsigned char *)*datap + sizeof(uint32_t), newbuf, origsize); + if (*freedatap) + free(*datap); + *freedatap = true; + *datap = newbuf; + *sizep = origsize; + + return (0); +} + +static int +checksum_send(struct hast_resource *res, struct nv *nv, void **datap, + size_t *sizep, bool *freedatap __unused) +{ + unsigned char hash[SHA256_DIGEST_LENGTH]; + SHA256_CTX ctx; + + res = res; /* TODO */ + + SHA256_Init(&ctx); + SHA256_Update(&ctx, *datap, *sizep); + SHA256_Final(hash, &ctx); + + nv_add_string(nv, "sha256", "checksum"); + nv_add_uint8_array(nv, hash, sizeof(hash), "hash"); + + return (0); +} + +static int +checksum_recv(struct hast_resource *res, struct nv *nv, void **datap, + size_t *sizep, bool *freedatap __unused) +{ + unsigned char chash[SHA256_DIGEST_LENGTH]; + const unsigned char *rhash; + SHA256_CTX ctx; + const char *algo; + size_t size; + + res = res; /* TODO */ + + algo = nv_get_string(nv, "checksum"); + if (algo == NULL) + return (0); /* No checksum. */ + if (strcmp(algo, "sha256") != 0) { + pjdlog_error("Unknown checksum algorithm '%s'.", algo); + return (-1); /* Unknown checksum algorithm. */ + } + rhash = nv_get_uint8_array(nv, &size, "hash"); + if (rhash == NULL) { + pjdlog_error("Checksum algorithm is present, but hash is missing."); + return (-1); /* Hash not found. */ + } + if (size != sizeof(chash)) { + pjdlog_error("Invalid hash size (%zu) for %s, should be %zu.", + size, algo, sizeof(chash)); + return (-1); /* Different hash size. */ + } + + SHA256_Init(&ctx); + SHA256_Update(&ctx, *datap, *sizep); + SHA256_Final(chash, &ctx); + + if (bcmp(rhash, chash, sizeof(chash)) != 0) { + pjdlog_error("Hash mismatch."); + return (-1); /* Hash mismatch. */ + } + + return (0); +} + +/* + * Send the given nv structure via conn. + * We keep headers in nv structure and pass data in separate argument. 
+ * There can be no data at all (data is NULL then). + */ +int +hast_proto_send(struct hast_resource *res, struct proto_conn *conn, + struct nv *nv, const void *data, size_t size) +{ + struct hast_main_header hdr; + struct ebuf *eb; + bool freedata; + void *dptr, *hptr; + size_t hsize; + int ret; + + dptr = (void *)(uintptr_t)data; + freedata = false; + ret = -1; + + if (data != NULL) { +if (false) { + unsigned int ii; + + for (ii = 0; ii < sizeof(pipeline) / sizeof(pipeline[0]); + ii++) { + ret = pipeline[ii].hps_send(res, nv, &dptr, &size, + &freedata); + if (ret == -1) + goto end; + } + ret = -1; +} + nv_add_uint32(nv, size, "size"); + if (nv_error(nv) != 0) { + errno = nv_error(nv); + goto end; + } + } + + eb = nv_hton(nv); + if (eb == NULL) + goto end; + + hdr.version = HAST_PROTO_VERSION; + hdr.size = htole32((uint32_t)ebuf_size(eb)); + if (ebuf_add_head(eb, &hdr, sizeof(hdr)) < 0) + goto end; + + hptr = ebuf_data(eb, &hsize); + if (proto_send(conn, hptr, hsize) < 0) + goto end; + if (data != NULL && proto_send(conn, dptr, size) < 0) + goto end; + + ret = 0; +end: + if (freedata) + free(dptr); + return (ret); +} + +int +hast_proto_recv_hdr(struct proto_conn *conn, struct nv **nvp) +{ + struct hast_main_header hdr; + struct nv *nv; + struct ebuf *eb; + void *hptr; + + eb = NULL; + nv = NULL; + + if (proto_recv(conn, &hdr, sizeof(hdr)) < 0) + goto fail; + + if (hdr.version != HAST_PROTO_VERSION) { + errno = ERPCMISMATCH; + goto fail; + } + + hdr.size = le32toh(hdr.size); + + eb = ebuf_alloc(hdr.size); + if (eb == NULL) + goto fail; + if (ebuf_add_tail(eb, NULL, hdr.size) < 0) + goto fail; + hptr = ebuf_data(eb, NULL); + assert(hptr != NULL); + if (proto_recv(conn, hptr, hdr.size) < 0) + goto fail; + nv = nv_ntoh(eb); + if (nv == NULL) + goto fail; + + *nvp = nv; + return (0); +fail: + if (nv != NULL) + nv_free(nv); + else if (eb != NULL) + ebuf_free(eb); + return (-1); +} + +int +hast_proto_recv_data(struct hast_resource *res, struct proto_conn *conn, + struct 
nv *nv, void *data, size_t size) +{ + unsigned int ii; + bool freedata; + size_t dsize; + void *dptr; + int ret; + + assert(data != NULL); + assert(size > 0); + + ret = -1; + freedata = false; + dptr = data; + + dsize = nv_get_uint32(nv, "size"); + if (dsize == 0) + (void)nv_set_error(nv, 0); + else { + if (proto_recv(conn, data, dsize) < 0) + goto end; +if (false) { + for (ii = sizeof(pipeline) / sizeof(pipeline[0]); ii > 0; + ii--) { + assert(!"to be verified"); + ret = pipeline[ii - 1].hps_recv(res, nv, &dptr, + &dsize, &freedata); + if (ret == -1) + goto end; + } + ret = -1; + if (dsize < size) + goto end; + /* TODO: 'size' doesn't seem right here. It is maximum data size. */ + if (dptr != data) + bcopy(dptr, data, dsize); +} + } + + ret = 0; +end: +if (ret < 0) printf("%s:%u %s\n", __func__, __LINE__, strerror(errno)); + if (freedata) + free(dptr); + return (ret); +} + +int +hast_proto_recv(struct hast_resource *res, struct proto_conn *conn, + struct nv **nvp, void *data, size_t size) +{ + struct nv *nv; + size_t dsize; + int ret; + + ret = hast_proto_recv_hdr(conn, &nv); + if (ret < 0) + return (ret); + dsize = nv_get_uint32(nv, "size"); + if (dsize == 0) + (void)nv_set_error(nv, 0); + else + ret = hast_proto_recv_data(res, conn, nv, data, size); + if (ret < 0) + nv_free(nv); + else + *nvp = nv; + return (ret); +} diff --git a/sbin/hastd/hast_proto.h b/sbin/hastd/hast_proto.h new file mode 100644 index 0000000..3894e38 --- /dev/null +++ b/sbin/hastd/hast_proto.h @@ -0,0 +1,48 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _HAST_PROTO_H_ +#define _HAST_PROTO_H_ + +#include <stdlib.h> /* size_t */ + +#include <nv.h> +#include <proto.h> + +int hast_proto_send(struct hast_resource *res, struct proto_conn *conn, + struct nv *nv, const void *data, size_t size); +int hast_proto_recv(struct hast_resource *res, struct proto_conn *conn, + struct nv **nvp, void *data, size_t size); +int hast_proto_recv_hdr(struct proto_conn *conn, struct nv **nvp); +int hast_proto_recv_data(struct hast_resource *res, struct proto_conn *conn, + struct nv *nv, void *data, size_t size); + +#endif /* !_HAST_PROTO_H_ */ diff --git a/sbin/hastd/hastd.8 b/sbin/hastd/hastd.8 new file mode 100644 index 0000000..276b3d3 --- /dev/null +++ b/sbin/hastd/hastd.8 @@ -0,0 +1,232 @@ +.\" Copyright (c) 2010 The FreeBSD Foundation +.\" All rights reserved. +.\" +.\" This software was developed by Pawel Jakub Dawidek under sponsorship from +.\" the FreeBSD Foundation. 
+.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd February 1, 2010 +.Dt HASTD 8 +.Os +.Sh NAME +.Nm hastd +.Nd "Highly Available Storage daemon" +.Sh SYNOPSIS +.Nm +.Op Fl dFh +.Op Fl c Ar config +.Op Fl P Ar pidfile +.Sh DESCRIPTION +The +.Nm +daemon is responsible for managing highly available GEOM providers. +.Pp +.Nm +allows to transparently store data on two physically separated machines +connected over the TCP/IP network. +Only one machine (cluster node) can actively use storage provided by +.Nm . +This machine is called primary. +The +.Nm +daemon operates on block level, which makes it transparent for file +systems and applications. 
+.Pp +There is one main +.Nm +daemon which starts new worker process as soon as a role for the given +resource is changed to primary or as soon as a role for the given +resource is changed to secondary and remote (primary) node will +successfully connect to it. +Every worker process gets a new process title (see +.Xr setproctitle 3 ) , +which describes its role and resource it controls. +The exact format is: +.Bd -literal -offset indent +hastd: <resource name> (<role>) +.Ed +.Pp +When (and only when) +.Nm +operates in primary role for the given resource, corresponding +.Pa /dev/hast/<name> +disk-like device (GEOM provider) is created. +File systems and applications can use this provider to send I/O +requests to. +Every write, delete and flush operation +.Dv ( BIO_WRITE , BIO_DELETE , BIO_FLUSH ) +is sent to local component and synchronously replicated +to the remote (secondary) node if it is available. +Read operations +.Dv ( BIO_READ ) +are handled locally unless I/O error occurs or local version of the data +is not up-to-date yet (synchronization is in progress). +.Pp +The +.Nm +daemon uses the GEOM Gate class to receive I/O requests from the +in-kernel GEOM infrastructure. +The +.Nm geom_gate.ko +module is loaded automatically if the kernel was not compiled with the +following option: +.Bd -ragged -offset indent +.Cd "options GEOM_GATE" +.Ed +.Pp +The connection between two +.Nm +daemons is always initiated from the one running as primary to the one +running as secondary. +When primary +.Nm +is unable to connect or connection fails, it will try to re-establish +connection every few seconds. +Once connection is established, primary +.Nm +will synchronize every extent that was modified during connection outage +to the secondary +.Nm . +.Pp +It is possible that in case of connection outage between the nodes +.Nm +primary role for the given resource will be configured on both nodes. +This in turn leads to incompatible data modifications.
+Such condition is called split-brain and cannot be automatically +resolved by the +.Nm +daemon as this will lead most likely to data corruption or loss of +important changes. +Even though it cannot be fixed by +.Nm +itself, it will be detected and further connection between independently +modified nodes will not be possible. +Once this situation is manually resolved by an administrator, resource +on one of the nodes can be initialized (erasing local data), which makes +connection to the remote node possible again. +Connection of freshly initialized component will trigger full resource +synchronization. +.Pp +The +.Nm +daemon itself never picks its role up automatically. +The role has to be configured with the +.Xr hastctl 8 +control utility by additional software like +.Nm ucarp +or +.Nm heartbeat +that can reliably manage role separation and switch secondary node to +primary role in case of original primary failure. +.Pp +The +.Nm +daemon can be started with the following command line arguments: +.Bl -tag -width ".Fl P Ar pidfile" +.It Fl c Ar config +Specify alternative location of the configuration file. +The default location is +.Pa /etc/hast.conf . +.It Fl d +Print or log debugging information. +This option can be specified multiple times to raise the verbosity +level. +.It Fl F +Start the +.Nm +daemon in the foreground. +By default +.Nm +starts in the background. +.It Fl h +Print the +.Nm +usage message. +.It Fl P Ar pidfile +Specify alternative location of a file where main process PID will be +stored. +The default location is +.Pa /var/run/hastd.pid . +.El +.Sh EXIT STATUS +Exit status is 0 on success, or one of the values described in +.Xr sysexits 3 +on failure. +.Sh EXAMPLES +Launch +.Nm +on both nodes. +Set role for resource +.Nm shared +to primary on +.Nm nodeA +and to secondary on +.Nm nodeB . +Create file system on +.Pa /dev/hast/shared +provider and mount it.
+.Bd -literal -offset indent +nodeB# hastd +nodeB# hastctl role secondary shared + +nodeA# hastd +nodeA# hastctl role primary shared +nodeA# newfs -U /dev/hast/shared +nodeA# mount -o noatime /dev/hast/shared /shared +.Ed +.Sh FILES +.Bl -tag -width ".Pa /var/run/hastctl" -compact +.It Pa /etc/hast.conf +The configuration file for +.Nm +and +.Xr hastctl 8 . +.It Pa /var/run/hastctl +Control socket used by the +.Xr hastctl 8 +control utility to communicate with +.Nm . +.It Pa /var/run/hastd.pid +The default location of the +.Nm +PID file. +.El +.Sh SEE ALSO +.Xr sysexits 3 , +.Xr geom 4 , +.Xr hast.conf 5 , +.Xr ggatec 8 , +.Xr ggated 8 , +.Xr ggatel 8 , +.Xr hastctl 8 , +.Xr mount 8 , +.Xr newfs 8 , +.Xr g_bio 9 . +.Sh AUTHORS +The +.Nm +was developed by +.An Pawel Jakub Dawidek Aq pjd@FreeBSD.org +under sponsorship of the FreeBSD Foundation. diff --git a/sbin/hastd/hastd.c b/sbin/hastd/hastd.c new file mode 100644 index 0000000..19f0893 --- /dev/null +++ b/sbin/hastd/hastd.c @@ -0,0 +1,522 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/linker.h> +#include <sys/module.h> +#include <sys/wait.h> + +#include <assert.h> +#include <err.h> +#include <errno.h> +#include <libutil.h> +#include <signal.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sysexits.h> +#include <unistd.h> + +#include <activemap.h> +#include <pjdlog.h> + +#include "control.h" +#include "hast.h" +#include "hast_proto.h" +#include "hastd.h" +#include "subr.h" + +/* Path to configuration file. */ +static const char *cfgpath = HAST_CONFIG; +/* Hastd configuration. */ +static struct hastd_config *cfg; +/* Was SIGCHLD signal received? */ +static bool sigchld_received = false; +/* Was SIGHUP signal received? */ +static bool sighup_received = false; +/* Was SIGINT or SIGTERM signal received? */ +bool sigexit_received = false; +/* PID file handle. */ +struct pidfh *pfh; + +static void +usage(void) +{ + + errx(EX_USAGE, "[-dFh] [-c config] [-P pidfile]"); +} + +static void +sighandler(int sig) +{ + + switch (sig) { + case SIGCHLD: + sigchld_received = true; + break; + case SIGHUP: + sighup_received = true; + break; + default: + assert(!"invalid condition"); + } +} + +static void +g_gate_load(void) +{ + + if (modfind("g_gate") == -1) { + /* Not present in kernel, try loading it. 
*/
+		if (kldload("geom_gate") == -1 || modfind("g_gate") == -1) {
+			if (errno != EEXIST) {
+				pjdlog_exit(EX_OSERR,
+				    "Unable to load geom_gate module");
+			}
+		}
+	}
+}
+
+/*
+ * Reap every worker process that has exited, log how it ended and, for
+ * resources in primary role, start a replacement worker.
+ */
+static void
+child_exit(void)
+{
+	struct hast_resource *res;
+	int status;
+	pid_t pid;
+
+	while ((pid = wait3(&status, WNOHANG, NULL)) > 0) {
+		/* Find resource related to the process that just exited. */
+		TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
+			if (pid == res->hr_workerpid)
+				break;
+		}
+		if (res == NULL) {
+			/*
+			 * This can happen when new connection arrives and we
+			 * cancel child responsible for the old one.
+			 */
+			continue;
+		}
+		pjdlog_prefix_set("[%s] (%s) ", res->hr_name,
+		    role2str(res->hr_role));
+		/*
+		 * WEXITSTATUS() is meaningful only when WIFEXITED() is true;
+		 * a worker killed by a signal must not be reported as a
+		 * graceful exit.
+		 */
+		if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
+			pjdlog_debug(1,
+			    "Worker process exited gracefully (pid=%u).",
+			    (unsigned int)pid);
+		} else if (WIFSIGNALED(status)) {
+			pjdlog_error("Worker process killed (pid=%u, signal=%d).",
+			    (unsigned int)pid, WTERMSIG(status));
+		} else {
+			pjdlog_error("Worker process failed (pid=%u, status=%d).",
+			    (unsigned int)pid, WEXITSTATUS(status));
+		}
+		res->hr_workerpid = 0;
+		if (res->hr_role == HAST_ROLE_PRIMARY) {
+			sleep(1);
+			pjdlog_info("Restarting worker process.");
+			hastd_primary(res);
+		}
+		pjdlog_prefix_set("%s", "");
+	}
+}
+
+/*
+ * SIGHUP handler body; configuration reload is not implemented yet, so
+ * only a warning is logged.
+ */
+static void
+hastd_reload(void)
+{
+
+	/* TODO */
+	pjdlog_warning("Configuration reload is not implemented.");
+}
+
+/*
+ * Accept one connection on the listen socket and validate it: the peer
+ * address must match a configured resource, the header must name a resource
+ * we act as secondary for and the token (if any) must match.
+ * A connection without a token is the initial (incoming) one; a token is
+ * generated and sent back.  A connection with a token is the second
+ * (outgoing) one and hands the resource over to hastd_secondary().
+ * On any failure the connection is closed, after trying to send an error
+ * header once we are far enough to do so.
+ */
+static void
+listen_accept(void)
+{
+	struct hast_resource *res;
+	struct proto_conn *conn;
+	struct nv *nvin, *nvout, *nverr;
+	const char *resname;
+	const unsigned char *token;
+	char laddr[256], raddr[256];
+	size_t size;
+	pid_t pid;
+	int status;
+
+	proto_local_address(cfg->hc_listenconn, laddr, sizeof(laddr));
+	pjdlog_debug(1, "Accepting connection to %s.", laddr);
+
+	if (proto_accept(cfg->hc_listenconn, &conn) < 0) {
+		pjdlog_errno(LOG_ERR, "Unable to accept connection %s", laddr);
+		return;
+	}
+
+	proto_local_address(conn, laddr, sizeof(laddr));
+	proto_remote_address(conn, raddr, sizeof(raddr));
+	/* The connection comes from the remote address to the local one. */
+	pjdlog_info("Connection from %s to %s.", raddr, laddr);
+
+	nvin = nvout = nverr = NULL;
+
+	/*
+	 * Before receiving any data see if remote host has access to any
+	 * resource.
+	 */
+	TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
+		if (proto_address_match(conn, res->hr_remoteaddr))
+			break;
+	}
+	if (res == NULL) {
+		pjdlog_error("Client %s isn't known.", raddr);
+		goto close;
+	}
+	/* Ok, remote host can access at least one resource. */
+
+	if (hast_proto_recv_hdr(conn, &nvin) < 0) {
+		pjdlog_errno(LOG_ERR, "Unable to receive header from %s",
+		    raddr);
+		goto close;
+	}
+
+	resname = nv_get_string(nvin, "resource");
+	if (resname == NULL) {
+		pjdlog_error("No 'resource' field in the header received from %s.",
+		    raddr);
+		goto close;
+	}
+	pjdlog_debug(2, "%s: resource=%s", raddr, resname);
+	token = nv_get_uint8_array(nvin, &size, "token");
+	/*
+	 * NULL token means that this is first connection.
+	 */
+	if (token != NULL && size != sizeof(res->hr_token)) {
+		pjdlog_error("Received token of invalid size from %s (expected %zu, got %zu).",
+		    raddr, sizeof(res->hr_token), size);
+		goto close;
+	}
+
+	/*
+	 * From now on we want to send errors to the remote node.
+	 */
+	nverr = nv_alloc();
+
+	/* Find resource related to this connection. */
+	TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
+		if (strcmp(resname, res->hr_name) == 0)
+			break;
+	}
+	/* Have we found the resource? */
+	if (res == NULL) {
+		pjdlog_error("No resource '%s' as requested by %s.",
+		    resname, raddr);
+		nv_add_stringf(nverr, "errmsg", "Resource not configured.");
+		goto fail;
+	}
+
+	/* Now that we know resource name setup log prefix. */
+	pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role));
+
+	/* Does the remote host have access to this resource? */
+	if (!proto_address_match(conn, res->hr_remoteaddr)) {
+		pjdlog_error("Client %s has no access to the resource.", raddr);
+		nv_add_stringf(nverr, "errmsg", "No access to the resource.");
+		goto fail;
+	}
+	/* Is the resource marked as secondary? */
+	if (res->hr_role != HAST_ROLE_SECONDARY) {
+		pjdlog_error("We act as %s for the resource and not as %s as requested by %s.",
+		    role2str(res->hr_role), role2str(HAST_ROLE_SECONDARY),
+		    raddr);
+		nv_add_stringf(nverr, "errmsg",
+		    "Remote node acts as %s for the resource and not as %s.",
+		    role2str(res->hr_role), role2str(HAST_ROLE_SECONDARY));
+		goto fail;
+	}
+	/* Does token (if exists) match? */
+	if (token != NULL && memcmp(token, res->hr_token,
+	    sizeof(res->hr_token)) != 0) {
+		pjdlog_error("Token received from %s doesn't match.", raddr);
+		nv_add_stringf(nverr, "errmsg", "Token doesn't match.");
+		goto fail;
+	}
+	/*
+	 * If there is no token, but we have half-open connection
+	 * (only remotein) or full connection (worker process is running)
+	 * we have to cancel those and accept the new connection.
+	 */
+	if (token == NULL) {
+		assert(res->hr_remoteout == NULL);
+		pjdlog_debug(1, "Initial connection from %s.", raddr);
+		if (res->hr_workerpid != 0) {
+			assert(res->hr_remotein == NULL);
+			pjdlog_debug(1,
+			    "Worker process exists (pid=%u), stopping it.",
+			    (unsigned int)res->hr_workerpid);
+			/* Stop child process. */
+			if (kill(res->hr_workerpid, SIGINT) < 0) {
+				pjdlog_errno(LOG_ERR,
+				    "Unable to stop worker process (pid=%u)",
+				    (unsigned int)res->hr_workerpid);
+				/*
+				 * Other than logging the problem we
+				 * ignore it - nothing smart to do.
+				 */
+			}
+			/* Wait for it to exit. */
+			else if ((pid = waitpid(res->hr_workerpid,
+			    &status, 0)) != res->hr_workerpid) {
+				pjdlog_errno(LOG_ERR,
+				    "Waiting for worker process (pid=%u) failed",
+				    (unsigned int)res->hr_workerpid);
+				/* See above. */
+			} else if (status != 0) {
+				pjdlog_error("Worker process (pid=%u) exited ungracefully: status=%d.",
+				    (unsigned int)res->hr_workerpid, status);
+				/* See above. */
+			} else {
+				pjdlog_debug(1,
+				    "Worker process (pid=%u) exited gracefully.",
+				    (unsigned int)res->hr_workerpid);
+			}
+			res->hr_workerpid = 0;
+		} else if (res->hr_remotein != NULL) {
+			char oaddr[256];
+
+			/*
+			 * Report the peer of the connection being canceled,
+			 * not of the one that has just arrived.
+			 */
+			proto_remote_address(res->hr_remotein, oaddr,
+			    sizeof(oaddr));
+			pjdlog_debug(1,
+			    "Canceling half-open connection from %s on connection from %s.",
+			    oaddr, raddr);
+			proto_close(res->hr_remotein);
+			res->hr_remotein = NULL;
+		}
+	}
+
+	/*
+	 * Checks and cleanups are done.
+	 */
+
+	if (token == NULL) {
+		arc4random_buf(res->hr_token, sizeof(res->hr_token));
+		nvout = nv_alloc();
+		nv_add_uint8_array(nvout, res->hr_token,
+		    sizeof(res->hr_token), "token");
+		if (nv_error(nvout) != 0) {
+			pjdlog_common(LOG_ERR, 0, nv_error(nvout),
+			    "Unable to prepare return header for %s", raddr);
+			nv_add_stringf(nverr, "errmsg",
+			    "Remote node was unable to prepare return header: %s.",
+			    strerror(nv_error(nvout)));
+			goto fail;
+		}
+		if (hast_proto_send(NULL, conn, nvout, NULL, 0) < 0) {
+			int error = errno;
+
+			pjdlog_errno(LOG_ERR, "Unable to send response to %s",
+			    raddr);
+			nv_add_stringf(nverr, "errmsg",
+			    "Remote node was unable to send response: %s.",
+			    strerror(error));
+			goto fail;
+		}
+		res->hr_remotein = conn;
+		pjdlog_debug(1, "Incoming connection from %s configured.",
+		    raddr);
+	} else {
+		res->hr_remoteout = conn;
+		pjdlog_debug(1, "Outgoing connection to %s configured.", raddr);
+		hastd_secondary(res, nvin);
+	}
+	nv_free(nvin);
+	nv_free(nvout);
+	nv_free(nverr);
+	pjdlog_prefix_set("%s", "");
+	return;
+fail:
+	if (nv_error(nverr) != 0) {
+		pjdlog_common(LOG_ERR, 0, nv_error(nverr),
+		    "Unable to prepare error header for %s", raddr);
+		goto close;
+	}
+	if (hast_proto_send(NULL, conn, nverr, NULL, 0) < 0) {
+		pjdlog_errno(LOG_ERR, "Unable to send error to %s", raddr);
+		goto close;
+	}
+close:
+	if (nvin != NULL)
+		nv_free(nvin);
+	if (nvout != NULL)
+		nv_free(nvout);
+	if (nverr != NULL)
+		nv_free(nverr);
+	proto_close(conn);
+	pjdlog_prefix_set("%s", "");
+}
+
+static
void +main_loop(void) +{ + fd_set rfds, wfds; + int fd, maxfd, ret; + + for (;;) { + if (sigchld_received) { + sigchld_received = false; + child_exit(); + } + if (sighup_received) { + sighup_received = false; + hastd_reload(); + } + + maxfd = 0; + FD_ZERO(&rfds); + FD_ZERO(&wfds); + + /* Setup descriptors for select(2). */ +#define SETUP_FD(conn) do { \ + fd = proto_descriptor(conn); \ + if (fd >= 0) { \ + maxfd = fd > maxfd ? fd : maxfd; \ + FD_SET(fd, &rfds); \ + FD_SET(fd, &wfds); \ + } \ +} while (0) + SETUP_FD(cfg->hc_controlconn); + SETUP_FD(cfg->hc_listenconn); +#undef SETUP_FD + + ret = select(maxfd + 1, &rfds, &wfds, NULL, NULL); + if (ret == -1) { + if (errno == EINTR) + continue; + KEEP_ERRNO((void)pidfile_remove(pfh)); + pjdlog_exit(EX_OSERR, "select() failed"); + } + +#define ISSET_FD(conn) \ + (FD_ISSET((fd = proto_descriptor(conn)), &rfds) || FD_ISSET(fd, &wfds)) + if (ISSET_FD(cfg->hc_controlconn)) + control_handle(cfg); + if (ISSET_FD(cfg->hc_listenconn)) + listen_accept(); +#undef ISSET_FD + } +} + +int +main(int argc, char *argv[]) +{ + const char *pidfile; + pid_t otherpid; + bool foreground; + int debuglevel; + + g_gate_load(); + + foreground = false; + debuglevel = 0; + pidfile = HASTD_PIDFILE; + + for (;;) { + int ch; + + ch = getopt(argc, argv, "c:dFhP:"); + if (ch == -1) + break; + switch (ch) { + case 'c': + cfgpath = optarg; + break; + case 'd': + debuglevel++; + break; + case 'F': + foreground = true; + break; + case 'P': + pidfile = optarg; + break; + case 'h': + default: + usage(); + } + } + argc -= optind; + argv += optind; + + pjdlog_debug_set(debuglevel); + + pfh = pidfile_open(pidfile, 0600, &otherpid); + if (pfh == NULL) { + if (errno == EEXIST) { + pjdlog_exitx(EX_TEMPFAIL, + "Another hastd is already running, pid: %jd.", + (intmax_t)otherpid); + } + /* If we cannot create pidfile from other reasons, only warn. 
*/ + pjdlog_errno(LOG_WARNING, "Cannot open or create pidfile"); + } + + cfg = yy_config_parse(cfgpath); + assert(cfg != NULL); + + signal(SIGHUP, sighandler); + signal(SIGCHLD, sighandler); + + /* Listen on control address. */ + if (proto_server(cfg->hc_controladdr, &cfg->hc_controlconn) < 0) { + KEEP_ERRNO((void)pidfile_remove(pfh)); + pjdlog_exit(EX_OSERR, "Unable to listen on control address %s", + cfg->hc_controladdr); + } + /* Listen for remote connections. */ + if (proto_server(cfg->hc_listenaddr, &cfg->hc_listenconn) < 0) { + KEEP_ERRNO((void)pidfile_remove(pfh)); + pjdlog_exit(EX_OSERR, "Unable to listen on address %s", + cfg->hc_listenaddr); + } + + if (!foreground) { + if (daemon(0, 0) < 0) { + KEEP_ERRNO((void)pidfile_remove(pfh)); + pjdlog_exit(EX_OSERR, "Unable to daemonize"); + } + + /* Start logging to syslog. */ + pjdlog_mode_set(PJDLOG_MODE_SYSLOG); + + /* Write PID to a file. */ + if (pidfile_write(pfh) < 0) { + pjdlog_errno(LOG_WARNING, + "Unable to write PID to a file"); + } + } + + main_loop(); + + exit(0); +} diff --git a/sbin/hastd/hastd.h b/sbin/hastd/hastd.h new file mode 100644 index 0000000..199de8c --- /dev/null +++ b/sbin/hastd/hastd.h @@ -0,0 +1,48 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _HASTD_H_ +#define _HASTD_H_ + +#include <sys/param.h> +#include <libutil.h> + +#include <nv.h> + +#include "hast.h" + +extern bool sigexit_received; +extern struct pidfh *pfh; + +void hastd_primary(struct hast_resource *res); +void hastd_secondary(struct hast_resource *res, struct nv *nvin); + +#endif /* !_HASTD_H_ */ diff --git a/sbin/hastd/hooks.c b/sbin/hastd/hooks.c new file mode 100644 index 0000000..1fdeb75 --- /dev/null +++ b/sbin/hastd/hooks.c @@ -0,0 +1,148 @@ +/*- + * Copyright (c) 2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sysexits.h>
+#include <syslog.h>
+#include <libgen.h>
+#include <paths.h>
+
+#include <pjdlog.h>
+
+#include "hooks.h"
+
+/*
+ * Prepare descriptors of a freshly forked hook process: close everything
+ * and point stdin, stdout and stderr at /dev/null.
+ * Failures are only logged; the hook is executed anyway.
+ */
+static void
+descriptors(void)
+{
+	long maxfd;
+	int fd;
+
+	/*
+	 * Close all descriptors.
+	 */
+	maxfd = sysconf(_SC_OPEN_MAX);
+	if (maxfd < 0) {
+		pjdlog_errno(LOG_WARNING, "sysconf(_SC_OPEN_MAX) failed");
+		maxfd = 1024;
+	}
+	for (fd = 0; fd <= maxfd; fd++)
+		close(fd);
+	/*
+	 * Redirect stdin, stdout and stderr to /dev/null.
+	 */
+	fd = open(_PATH_DEVNULL, O_RDONLY);
+	if (fd < 0) {
+		pjdlog_errno(LOG_WARNING, "Unable to open %s for reading",
+		    _PATH_DEVNULL);
+	} else if (fd != STDIN_FILENO) {
+		if (dup2(fd, STDIN_FILENO) < 0) {
+			pjdlog_errno(LOG_WARNING,
+			    "Unable to duplicate descriptor for stdin");
+		}
+		close(fd);
+	}
+	fd = open(_PATH_DEVNULL, O_WRONLY);
+	if (fd < 0) {
+		pjdlog_errno(LOG_WARNING, "Unable to open %s for writing",
+		    _PATH_DEVNULL);
+	} else {
+		if (fd != STDOUT_FILENO && dup2(fd, STDOUT_FILENO) < 0) {
+			pjdlog_errno(LOG_WARNING,
+			    "Unable to duplicate descriptor for stdout");
+		}
+		if (fd != STDERR_FILENO && dup2(fd, STDERR_FILENO) < 0) {
+			pjdlog_errno(LOG_WARNING,
+			    "Unable to duplicate descriptor for stderr");
+		}
+		if (fd != STDOUT_FILENO && fd != STDERR_FILENO)
+			close(fd);
+	}
+}
+
+/*
+ * Variadic front-end for hook_execv().
+ * The argument list has to be terminated with NULL.
+ */
+int
+hook_exec(const char *path, ...)
+{
+	va_list ap;
+	int ret;
+
+	va_start(ap, path);
+	ret = hook_execv(path, ap);
+	va_end(ap);
+	return (ret);
+}
+
+/*
+ * Execute the hook at 'path' with the NULL-terminated arguments from 'ap'
+ * and wait for it to finish.
+ * Returns 0 when no hook is configured (NULL or empty path), the exit code
+ * of the hook when it exited normally, and -1 when it could not be forked,
+ * could not be waited for or did not exit normally.
+ */
+int
+hook_execv(const char *path, va_list ap)
+{
+	char *args[64];
+	unsigned int ii;
+	pid_t pid, wpid;
+	int status;
+
+	if (path == NULL || path[0] == '\0')
+		return (0);
+
+	memset(args, 0, sizeof(args));
+	args[0] = basename(path);
+	for (ii = 1; ii < sizeof(args) / sizeof(args[0]); ii++) {
+		args[ii] = va_arg(ap, char *);
+		if (args[ii] == NULL)
+			break;
+	}
+	/* The last slot has to stay NULL to terminate the argv array. */
+	assert(ii < sizeof(args) / sizeof(args[0]));
+
+	pid = fork();
+	switch (pid) {
+	case -1:	/* Error. */
+		pjdlog_errno(LOG_ERR, "Unable to fork %s", path);
+		return (-1);
+	case 0:		/* Child. */
+		descriptors();
+		execv(path, args);
+		pjdlog_errno(LOG_ERR, "Unable to execute %s", path);
+		exit(EX_SOFTWARE);
+	default:	/* Parent. */
+		break;
+	}
+
+	/* A signal (e.g. SIGCHLD, SIGHUP) may interrupt the wait; retry. */
+	do {
+		wpid = waitpid(pid, &status, 0);
+	} while (wpid == -1 && errno == EINTR);
+	if (wpid != pid) {
+		pjdlog_errno(LOG_ERR, "Unable to wait for %s", path);
+		return (-1);
+	}
+	/* WEXITSTATUS() is valid only for a process that exited normally. */
+	if (!WIFEXITED(status)) {
+		pjdlog_error("Hook %s terminated abnormally.", path);
+		return (-1);
+	}
+
+	return (WEXITSTATUS(status));
+}
diff --git a/sbin/hastd/hooks.h b/sbin/hastd/hooks.h
new file mode 100644
index 0000000..799b781
--- /dev/null
+++ b/sbin/hastd/hooks.h
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _HOOKS_H_ +#define _HOOKS_H_ + +#include <stdarg.h> + +int hook_exec(const char *path, ...); +int hook_execv(const char *path, va_list ap); + +#endif /* !_HOOKS_H_ */ diff --git a/sbin/hastd/metadata.c b/sbin/hastd/metadata.c new file mode 100644 index 0000000..9bca66b --- /dev/null +++ b/sbin/hastd/metadata.c @@ -0,0 +1,222 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. 
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include <ebuf.h>
+#include <nv.h>
+#include <pjdlog.h>
+#include <subr.h>
+
+#include "metadata.h"
+
+/*
+ * Read on-disk metadata of the given resource into 'res'.
+ * On the first call for a resource the local provider is opened through
+ * provinfo() and, when 'openrw' is true, locked exclusively.
+ * Returns 0 on success.  On failure returns -1 with errno set and closes
+ * the provider again if it was opened by this call.
+ */
+int
+metadata_read(struct hast_resource *res, bool openrw)
+{
+	unsigned char *buf;
+	struct ebuf *eb;
+	struct nv *nv;
+	ssize_t done;
+	const char *str;
+	int rerrno;
+	bool opened_here;
+
+	opened_here = false;
+	rerrno = 0;
+
+	/*
+	 * Is this first metadata_read() call for this resource?
+	 */
+	if (res->hr_localfd == -1) {
+		if (provinfo(res, openrw) < 0) {
+			rerrno = errno;
+			goto fail;
+		}
+		opened_here = true;
+		pjdlog_debug(1, "Obtained info about %s.", res->hr_localpath);
+		if (openrw) {
+			if (flock(res->hr_localfd, LOCK_EX | LOCK_NB) < 0) {
+				rerrno = errno;
+				if (errno == EOPNOTSUPP) {
+					pjdlog_warning("Unable to lock %s (operation not supported), but continuing.",
+					    res->hr_localpath);
+				} else {
+					pjdlog_errno(LOG_ERR,
+					    "Unable to lock %s",
+					    res->hr_localpath);
+					goto fail;
+				}
+			}
+			pjdlog_debug(1, "Locked %s.", res->hr_localpath);
+		}
+	}
+
+	eb = ebuf_alloc(METADATA_SIZE);
+	if (eb == NULL) {
+		rerrno = errno;
+		pjdlog_errno(LOG_ERR,
+		    "Unable to allocate memory to read metadata");
+		goto fail;
+	}
+	if (ebuf_add_tail(eb, NULL, METADATA_SIZE) < 0) {
+		rerrno = errno;
+		pjdlog_errno(LOG_ERR,
+		    "Unable to allocate memory to read metadata");
+		ebuf_free(eb);
+		goto fail;
+	}
+	buf = ebuf_data(eb, NULL);
+	assert(buf != NULL);
+	done = pread(res->hr_localfd, buf, METADATA_SIZE, 0);
+	if (done < 0 || done != METADATA_SIZE) {
+		/* A short read does not set errno; report an I/O error. */
+		if (done >= 0)
+			errno = EIO;
+		rerrno = errno;
+		pjdlog_errno(LOG_ERR, "Unable to read metadata");
+		ebuf_free(eb);
+		goto fail;
+	}
+	nv = nv_ntoh(eb);
+	if (nv == NULL) {
+		rerrno = errno;
+		pjdlog_errno(LOG_ERR, "Metadata read from %s is invalid",
+		    res->hr_localpath);
+		ebuf_free(eb);
+		goto fail;
+	}
+	/* From now on 'eb' is owned by 'nv' and released by nv_free(). */
+
+	str = nv_get_string(nv, "resource");
+	if (str == NULL || strcmp(str, res->hr_name) != 0) {
+		pjdlog_error("Provider %s is not part of resource %s.",
+		    res->hr_localpath, res->hr_name);
+		/* Never return failure with errno left at 0. */
+		rerrno = EINVAL;
+		nv_free(nv);
+		goto fail;
+	}
+
+	res->hr_datasize = nv_get_uint64(nv, "datasize");
+	res->hr_extentsize = (int)nv_get_uint32(nv, "extentsize");
+	res->hr_keepdirty = (int)nv_get_uint32(nv, "keepdirty");
+	res->hr_localoff = nv_get_uint64(nv, "offset");
+	res->hr_resuid = nv_get_uint64(nv, "resuid");
+	if (res->hr_role != HAST_ROLE_PRIMARY) {
+		/* Secondary or init role. */
+		res->hr_secondary_localcnt = nv_get_uint64(nv, "localcnt");
+		res->hr_secondary_remotecnt = nv_get_uint64(nv, "remotecnt");
+	}
+	if (res->hr_role != HAST_ROLE_SECONDARY) {
+		/* Primary or init role. */
+		res->hr_primary_localcnt = nv_get_uint64(nv, "localcnt");
+		res->hr_primary_remotecnt = nv_get_uint64(nv, "remotecnt");
+	}
+	str = nv_get_string(nv, "prevrole");
+	if (str != NULL) {
+		if (strcmp(str, "primary") == 0)
+			res->hr_previous_role = HAST_ROLE_PRIMARY;
+		else if (strcmp(str, "secondary") == 0)
+			res->hr_previous_role = HAST_ROLE_SECONDARY;
+	}
+
+	if (nv_error(nv) != 0) {
+		errno = rerrno = nv_error(nv);
+		pjdlog_errno(LOG_ERR, "Unable to read metadata from %s",
+		    res->hr_localpath);
+		nv_free(nv);
+		goto fail;
+	}
+	/* Everything was copied into 'res'; release the parsed metadata. */
+	nv_free(nv);
+	return (0);
+fail:
+	if (opened_here) {
+		close(res->hr_localfd);
+		res->hr_localfd = -1;
+	}
+	errno = rerrno;
+	return (-1);
+}
+
+/*
+ * Serialize metadata of the given resource and write it as one
+ * METADATA_SIZE block at the beginning of the local provider.
+ * Returns 0 on success and -1 on failure.
+ */
+int
+metadata_write(struct hast_resource *res)
+{
+	struct ebuf *eb;
+	struct nv *nv;
+	unsigned char *buf, *ptr;
+	size_t size;
+	ssize_t done;
+	int ret;
+
+	buf = calloc(1, METADATA_SIZE);
+	if (buf == NULL) {
+		pjdlog_error("Unable to allocate %zu bytes for metadata.",
+		    (size_t)METADATA_SIZE);
+		return (-1);
+	}
+
+	ret = -1;
+
+	nv = nv_alloc();
+	nv_add_string(nv, res->hr_name, "resource");
+	nv_add_uint64(nv, (uint64_t)res->hr_datasize, "datasize");
+	nv_add_uint32(nv, (uint32_t)res->hr_extentsize, "extentsize");
+	nv_add_uint32(nv, (uint32_t)res->hr_keepdirty, "keepdirty");
+	nv_add_uint64(nv, (uint64_t)res->hr_localoff, "offset");
+	nv_add_uint64(nv, res->hr_resuid, "resuid");
+	if (res->hr_role == HAST_ROLE_PRIMARY ||
+	    res->hr_role == HAST_ROLE_INIT) {
+		nv_add_uint64(nv, res->hr_primary_localcnt, "localcnt");
+		nv_add_uint64(nv, res->hr_primary_remotecnt, "remotecnt");
+	} else /* if (res->hr_role == HAST_ROLE_SECONDARY) */ {
+		assert(res->hr_role == HAST_ROLE_SECONDARY);
+		nv_add_uint64(nv, res->hr_secondary_localcnt, "localcnt");
+		nv_add_uint64(nv, res->hr_secondary_remotecnt, "remotecnt");
+	}
+	nv_add_string(nv, role2str(res->hr_role), "prevrole");
+	if (nv_error(nv) != 0) {
+		pjdlog_error("Unable to create metadata.");
+		goto end;
+	}
+	res->hr_previous_role = res->hr_role;
+	/* The returned ebuf stays owned by 'nv'; nv_free() releases it. */
+	eb = nv_hton(nv);
+	assert(eb != NULL);
+	ptr = ebuf_data(eb, &size);
+	assert(ptr != NULL);
+	assert(size < METADATA_SIZE);
+	bcopy(ptr, buf, size);
+	done = pwrite(res->hr_localfd, buf, METADATA_SIZE, 0);
+	if (done < 0 || done != METADATA_SIZE) {
+		pjdlog_errno(LOG_ERR, "Unable to write metadata");
+		goto end;
+	}
+	ret = 0;
+end:
+	/* Release the buffers on success as well as on failure. */
+	free(buf);
+	nv_free(nv);
+	return (ret);
+}
diff --git a/sbin/hastd/metadata.h b/sbin/hastd/metadata.h
new file mode 100644
index 0000000..83d35f4
--- /dev/null
+++ b/sbin/hastd/metadata.h
@@ -0,0 +1,48 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _METADATA_H_ +#define _METADATA_H_ + +#include <stdbool.h> + +#include <hast.h> + +/* + * Maximum size of metadata. + * XXX: We should take sector size into account. + */ +#define METADATA_SIZE 4096 + +int metadata_read(struct hast_resource *res, bool openrw); +int metadata_write(struct hast_resource *res); + +#endif /* !_METADATA_H_ */ diff --git a/sbin/hastd/nv.c b/sbin/hastd/nv.c new file mode 100644 index 0000000..0b4e362 --- /dev/null +++ b/sbin/hastd/nv.c @@ -0,0 +1,882 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/endian.h> + +#include <assert.h> +#include <bitstring.h> +#include <errno.h> +#include <stdarg.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <ebuf.h> +#include <nv.h> + +#define NV_MAGIC 0xaea1e +struct nv { + int nv_magic; + int nv_error; + struct ebuf *nv_ebuf; +}; + +struct nvhdr { + uint8_t nvh_type; + uint8_t nvh_namesize; + uint32_t nvh_dsize; + char nvh_name[0]; +} __packed; +#define NVH_DATA(nvh) ((unsigned char *)nvh + NVH_HSIZE(nvh)) +#define NVH_HSIZE(nvh) \ + (sizeof(struct nvhdr) + roundup2((nvh)->nvh_namesize, 8)) +#define NVH_DSIZE(nvh) \ + (((nvh)->nvh_type & NV_ORDER_MASK) == NV_ORDER_HOST ? 
\ + (nvh)->nvh_dsize : \ + le32toh((nvh)->nvh_dsize)) +#define NVH_SIZE(nvh) (NVH_HSIZE(nvh) + roundup2(NVH_DSIZE(nvh), 8)) + +#define NV_CHECK(nv) do { \ + assert((nv) != NULL); \ + assert((nv)->nv_magic == NV_MAGIC); \ +} while (0) + +static void nv_add(struct nv *nv, const unsigned char *value, size_t vsize, + int type, const char *name); +static void nv_addv(struct nv *nv, const unsigned char *value, size_t vsize, + int type, const char *namefmt, va_list nameap); +static struct nvhdr *nv_find(struct nv *nv, int type, const char *namefmt, + va_list nameap); +static void nv_swap(struct nvhdr *nvh, bool tohost); + +/* + * Allocate and initialize new nv structure. + * Return NULL in case of malloc(3) failure. + */ +struct nv * +nv_alloc(void) +{ + struct nv *nv; + + nv = malloc(sizeof(*nv)); + if (nv == NULL) + return (NULL); + nv->nv_ebuf = ebuf_alloc(0); + if (nv->nv_ebuf == NULL) { + free(nv); + return (NULL); + } + nv->nv_error = 0; + nv->nv_magic = NV_MAGIC; + return (nv); +} + +/* + * Free the given nv structure. + */ +void +nv_free(struct nv *nv) +{ + + if (nv == NULL) + return; + + NV_CHECK(nv); + + nv->nv_magic = 0; + ebuf_free(nv->nv_ebuf); + free(nv); +} + +/* + * Return error for the given nv structure. + */ +int +nv_error(const struct nv *nv) +{ + + if (nv == NULL) + return (ENOMEM); + + NV_CHECK(nv); + + return (nv->nv_error); +} + +/* + * Set error for the given nv structure and return previous error. + */ +int +nv_set_error(struct nv *nv, int error) +{ + int preverr; + + if (nv == NULL) + return (ENOMEM); + + NV_CHECK(nv); + + preverr = nv->nv_error; + nv->nv_error = error; + return (preverr); +} + +/* + * Validate correctness of the entire nv structure and all its elements. + * If extrap is not NULL, store number of extra bytes at the end of the buffer. 
+ */ +int +nv_validate(struct nv *nv, size_t *extrap) +{ + struct nvhdr *nvh; + unsigned char *data, *ptr; + size_t dsize, size, vsize; + int error; + + if (nv == NULL) { + errno = ENOMEM; + return (-1); + } + + NV_CHECK(nv); + assert(nv->nv_error == 0); + + /* TODO: Check that names are unique? */ + + error = 0; + ptr = ebuf_data(nv->nv_ebuf, &size); + while (size > 0) { + /* + * Zeros at the end of the buffer are acceptable. + */ + if (ptr[0] == '\0') + break; + /* + * Minimum size at this point is size of nvhdr structure, one + * character long name plus terminating '\0'. + */ + if (size < sizeof(*nvh) + 2) { + error = EINVAL; + break; + } + nvh = (struct nvhdr *)ptr; + if (size < NVH_HSIZE(nvh)) { + error = EINVAL; + break; + } + if (nvh->nvh_name[nvh->nvh_namesize - 1] != '\0') { + error = EINVAL; + break; + } + if (strlen(nvh->nvh_name) != + (size_t)(nvh->nvh_namesize - 1)) { + error = EINVAL; + break; + } + if ((nvh->nvh_type & NV_TYPE_MASK) < NV_TYPE_FIRST || + (nvh->nvh_type & NV_TYPE_MASK) > NV_TYPE_LAST) { + error = EINVAL; + break; + } + dsize = NVH_DSIZE(nvh); + if (dsize == 0) { + error = EINVAL; + break; + } + if (size < NVH_SIZE(nvh)) { + error = EINVAL; + break; + } + vsize = 0; + switch (nvh->nvh_type & NV_TYPE_MASK) { + case NV_TYPE_INT8: + case NV_TYPE_UINT8: + if (vsize == 0) + vsize = 1; + /* FALLTHOUGH */ + case NV_TYPE_INT16: + case NV_TYPE_UINT16: + if (vsize == 0) + vsize = 2; + /* FALLTHOUGH */ + case NV_TYPE_INT32: + case NV_TYPE_UINT32: + if (vsize == 0) + vsize = 4; + /* FALLTHOUGH */ + case NV_TYPE_INT64: + case NV_TYPE_UINT64: + if (vsize == 0) + vsize = 8; + if (dsize != vsize) { + error = EINVAL; + break; + } + break; + case NV_TYPE_INT8_ARRAY: + case NV_TYPE_UINT8_ARRAY: + break; + case NV_TYPE_INT16_ARRAY: + case NV_TYPE_UINT16_ARRAY: + if (vsize == 0) + vsize = 2; + /* FALLTHOUGH */ + case NV_TYPE_INT32_ARRAY: + case NV_TYPE_UINT32_ARRAY: + if (vsize == 0) + vsize = 4; + /* FALLTHOUGH */ + case NV_TYPE_INT64_ARRAY: + case 
NV_TYPE_UINT64_ARRAY: + if (vsize == 0) + vsize = 8; + if ((dsize % vsize) != 0) { + error = EINVAL; + break; + } + break; + case NV_TYPE_STRING: + data = NVH_DATA(nvh); + if (data[dsize - 1] != '\0') { + error = EINVAL; + break; + } + if (strlen((char *)data) != dsize - 1) { + error = EINVAL; + break; + } + break; + default: + assert(!"invalid condition"); + } + if (error != 0) + break; + ptr += NVH_SIZE(nvh); + size -= NVH_SIZE(nvh); + } + if (error != 0) { + errno = error; + if (nv->nv_error == 0) + nv->nv_error = error; + return (-1); + } + if (extrap != NULL) + *extrap = size; + return (0); +} + +/* + * Convert the given nv structure to network byte order and return ebuf + * structure. + */ +struct ebuf * +nv_hton(struct nv *nv) +{ + struct nvhdr *nvh; + unsigned char *ptr; + size_t size; + + NV_CHECK(nv); + assert(nv->nv_error == 0); + + ptr = ebuf_data(nv->nv_ebuf, &size); + while (size > 0) { + /* + * Minimum size at this point is size of nvhdr structure, + * one character long name plus terminating '\0'. + */ + assert(size >= sizeof(*nvh) + 2); + nvh = (struct nvhdr *)ptr; + assert(NVH_SIZE(nvh) <= size); + nv_swap(nvh, false); + ptr += NVH_SIZE(nvh); + size -= NVH_SIZE(nvh); + } + + return (nv->nv_ebuf); +} + +/* + * Create nv structure based on ebuf received from the network. + */ +struct nv * +nv_ntoh(struct ebuf *eb) +{ + struct nv *nv; + size_t extra; + int rerrno; + + assert(eb != NULL); + + nv = malloc(sizeof(*nv)); + if (nv == NULL) + return (NULL); + nv->nv_error = 0; + nv->nv_ebuf = eb; + nv->nv_magic = NV_MAGIC; + + if (nv_validate(nv, &extra) < 0) { + rerrno = errno; + nv->nv_magic = 0; + free(nv); + errno = rerrno; + return (NULL); + } + /* + * Remove extra zeros at the end of the buffer. + */ + ebuf_del_tail(eb, extra); + + return (nv); +} + +#define NV_DEFINE_ADD(type, TYPE) \ +void \ +nv_add_##type(struct nv *nv, type##_t value, const char *namefmt, ...) 
\ +{ \ + va_list nameap; \ + \ + va_start(nameap, namefmt); \ + nv_addv(nv, (unsigned char *)&value, sizeof(value), \ + NV_TYPE_##TYPE, namefmt, nameap); \ + va_end(nameap); \ +} + +NV_DEFINE_ADD(int8, INT8) +NV_DEFINE_ADD(uint8, UINT8) +NV_DEFINE_ADD(int16, INT16) +NV_DEFINE_ADD(uint16, UINT16) +NV_DEFINE_ADD(int32, INT32) +NV_DEFINE_ADD(uint32, UINT32) +NV_DEFINE_ADD(int64, INT64) +NV_DEFINE_ADD(uint64, UINT64) + +#undef NV_DEFINE_ADD + +#define NV_DEFINE_ADD_ARRAY(type, TYPE) \ +void \ +nv_add_##type##_array(struct nv *nv, const type##_t *value, \ + size_t nsize, const char *namefmt, ...) \ +{ \ + va_list nameap; \ + \ + va_start(nameap, namefmt); \ + nv_addv(nv, (const unsigned char *)value, \ + sizeof(value[0]) * nsize, NV_TYPE_##TYPE##_ARRAY, namefmt, \ + nameap); \ + va_end(nameap); \ +} + +NV_DEFINE_ADD_ARRAY(int8, INT8) +NV_DEFINE_ADD_ARRAY(uint8, UINT8) +NV_DEFINE_ADD_ARRAY(int16, INT16) +NV_DEFINE_ADD_ARRAY(uint16, UINT16) +NV_DEFINE_ADD_ARRAY(int32, INT32) +NV_DEFINE_ADD_ARRAY(uint32, UINT32) +NV_DEFINE_ADD_ARRAY(int64, INT64) +NV_DEFINE_ADD_ARRAY(uint64, UINT64) + +#undef NV_DEFINE_ADD_ARRAY + +void +nv_add_string(struct nv *nv, const char *value, const char *namefmt, ...) +{ + va_list nameap; + size_t size; + + size = strlen(value) + 1; + + va_start(nameap, namefmt); + nv_addv(nv, (const unsigned char *)value, size, NV_TYPE_STRING, + namefmt, nameap); + va_end(nameap); +} + +void +nv_add_stringf(struct nv *nv, const char *name, const char *valuefmt, ...) 
+{ + va_list valueap; + + va_start(valueap, valuefmt); + nv_add_stringv(nv, name, valuefmt, valueap); + va_end(valueap); +} + +void +nv_add_stringv(struct nv *nv, const char *name, const char *valuefmt, + va_list valueap) +{ + char *value; + ssize_t size; + + size = vasprintf(&value, valuefmt, valueap); + if (size < 0) { + if (nv->nv_error == 0) + nv->nv_error = ENOMEM; + return; + } + size++; + nv_add(nv, (const unsigned char *)value, size, NV_TYPE_STRING, name); + free(value); +} + +#define NV_DEFINE_GET(type, TYPE) \ +type##_t \ +nv_get_##type(struct nv *nv, const char *namefmt, ...) \ +{ \ + struct nvhdr *nvh; \ + va_list nameap; \ + type##_t value; \ + \ + va_start(nameap, namefmt); \ + nvh = nv_find(nv, NV_TYPE_##TYPE, namefmt, nameap); \ + va_end(nameap); \ + if (nvh == NULL) \ + return (0); \ + assert((nvh->nvh_type & NV_ORDER_MASK) == NV_ORDER_HOST); \ + assert(sizeof(value) == nvh->nvh_dsize); \ + bcopy(NVH_DATA(nvh), &value, sizeof(value)); \ + \ + return (value); \ +} + +NV_DEFINE_GET(int8, INT8) +NV_DEFINE_GET(uint8, UINT8) +NV_DEFINE_GET(int16, INT16) +NV_DEFINE_GET(uint16, UINT16) +NV_DEFINE_GET(int32, INT32) +NV_DEFINE_GET(uint32, UINT32) +NV_DEFINE_GET(int64, INT64) +NV_DEFINE_GET(uint64, UINT64) + +#undef NV_DEFINE_GET + +#define NV_DEFINE_GET_ARRAY(type, TYPE) \ +const type##_t * \ +nv_get_##type##_array(struct nv *nv, size_t *sizep, \ + const char *namefmt, ...) 
\ +{ \ + struct nvhdr *nvh; \ + va_list nameap; \ + \ + va_start(nameap, namefmt); \ + nvh = nv_find(nv, NV_TYPE_##TYPE##_ARRAY, namefmt, nameap); \ + va_end(nameap); \ + if (nvh == NULL) \ + return (NULL); \ + assert((nvh->nvh_type & NV_ORDER_MASK) == NV_ORDER_HOST); \ + assert((nvh->nvh_dsize % sizeof(type##_t)) == 0); \ + if (sizep != NULL) \ + *sizep = nvh->nvh_dsize / sizeof(type##_t); \ + return ((type##_t *)(void *)NVH_DATA(nvh)); \ +} + +NV_DEFINE_GET_ARRAY(int8, INT8) +NV_DEFINE_GET_ARRAY(uint8, UINT8) +NV_DEFINE_GET_ARRAY(int16, INT16) +NV_DEFINE_GET_ARRAY(uint16, UINT16) +NV_DEFINE_GET_ARRAY(int32, INT32) +NV_DEFINE_GET_ARRAY(uint32, UINT32) +NV_DEFINE_GET_ARRAY(int64, INT64) +NV_DEFINE_GET_ARRAY(uint64, UINT64) + +#undef NV_DEFINE_GET_ARRAY + +const char * +nv_get_string(struct nv *nv, const char *namefmt, ...) +{ + struct nvhdr *nvh; + va_list nameap; + char *str; + + va_start(nameap, namefmt); + nvh = nv_find(nv, NV_TYPE_STRING, namefmt, nameap); + va_end(nameap); + if (nvh == NULL) + return (NULL); + assert((nvh->nvh_type & NV_ORDER_MASK) == NV_ORDER_HOST); + assert(nvh->nvh_dsize >= 1); + str = NVH_DATA(nvh); + assert(str[nvh->nvh_dsize - 1] == '\0'); + assert(strlen(str) == nvh->nvh_dsize - 1); + return (str); +} + +/* + * Dump content of the nv structure. 
+ */ +void +nv_dump(struct nv *nv) +{ + struct nvhdr *nvh; + unsigned char *data, *ptr; + size_t dsize, size; + unsigned int ii; + bool swap; + + if (nv_validate(nv, NULL) < 0) { + printf("error: %d\n", errno); + return; + } + + NV_CHECK(nv); + assert(nv->nv_error == 0); + + ptr = ebuf_data(nv->nv_ebuf, &size); + while (size > 0) { + assert(size >= sizeof(*nvh) + 2); + nvh = (struct nvhdr *)ptr; + assert(size >= NVH_SIZE(nvh)); + swap = ((nvh->nvh_type & NV_ORDER_MASK) == NV_ORDER_NETWORK); + dsize = NVH_DSIZE(nvh); + data = NVH_DATA(nvh); + printf(" %s", nvh->nvh_name); + switch (nvh->nvh_type & NV_TYPE_MASK) { + case NV_TYPE_INT8: + printf("(int8): %jd", (intmax_t)(*(int8_t *)data)); + break; + case NV_TYPE_UINT8: + printf("(uint8): %ju", (uintmax_t)(*(uint8_t *)data)); + break; + case NV_TYPE_INT16: + printf("(int16): %jd", swap ? + (intmax_t)le16toh(*(int16_t *)(void *)data) : + (intmax_t)*(int16_t *)(void *)data); + break; + case NV_TYPE_UINT16: + printf("(uint16): %ju", swap ? + (uintmax_t)le16toh(*(uint16_t *)(void *)data) : + (uintmax_t)*(uint16_t *)(void *)data); + break; + case NV_TYPE_INT32: + printf("(int32): %jd", swap ? + (intmax_t)le32toh(*(int32_t *)(void *)data) : + (intmax_t)*(int32_t *)(void *)data); + break; + case NV_TYPE_UINT32: + printf("(uint32): %ju", swap ? + (uintmax_t)le32toh(*(uint32_t *)(void *)data) : + (uintmax_t)*(uint32_t *)(void *)data); + break; + case NV_TYPE_INT64: + printf("(int64): %jd", swap ? + (intmax_t)le64toh(*(int64_t *)(void *)data) : + (intmax_t)*(int64_t *)(void *)data); + break; + case NV_TYPE_UINT64: + printf("(uint64): %ju", swap ? 
+ (uintmax_t)le64toh(*(uint64_t *)(void *)data) : + (uintmax_t)*(uint64_t *)(void *)data); + break; + case NV_TYPE_INT8_ARRAY: + printf("(int8 array):"); + for (ii = 0; ii < dsize; ii++) + printf(" %jd", (intmax_t)((int8_t *)data)[ii]); + break; + case NV_TYPE_UINT8_ARRAY: + printf("(uint8 array):"); + for (ii = 0; ii < dsize; ii++) + printf(" %ju", (uintmax_t)((uint8_t *)data)[ii]); + break; + case NV_TYPE_INT16_ARRAY: + printf("(int16 array):"); + for (ii = 0; ii < dsize / 2; ii++) { + printf(" %jd", swap ? + (intmax_t)le16toh(((int16_t *)(void *)data)[ii]) : + (intmax_t)((int16_t *)(void *)data)[ii]); + } + break; + case NV_TYPE_UINT16_ARRAY: + printf("(uint16 array):"); + for (ii = 0; ii < dsize / 2; ii++) { + printf(" %ju", swap ? + (uintmax_t)le16toh(((uint16_t *)(void *)data)[ii]) : + (uintmax_t)((uint16_t *)(void *)data)[ii]); + } + break; + case NV_TYPE_INT32_ARRAY: + printf("(int32 array):"); + for (ii = 0; ii < dsize / 4; ii++) { + printf(" %jd", swap ? + (intmax_t)le32toh(((int32_t *)(void *)data)[ii]) : + (intmax_t)((int32_t *)(void *)data)[ii]); + } + break; + case NV_TYPE_UINT32_ARRAY: + printf("(uint32 array):"); + for (ii = 0; ii < dsize / 4; ii++) { + printf(" %ju", swap ? + (uintmax_t)le32toh(((uint32_t *)(void *)data)[ii]) : + (uintmax_t)((uint32_t *)(void *)data)[ii]); + } + break; + case NV_TYPE_INT64_ARRAY: + printf("(int64 array):"); + for (ii = 0; ii < dsize / 8; ii++) { + printf(" %ju", swap ? + (uintmax_t)le64toh(((uint64_t *)(void *)data)[ii]) : + (uintmax_t)((uint64_t *)(void *)data)[ii]); + } + break; + case NV_TYPE_UINT64_ARRAY: + printf("(uint64 array):"); + for (ii = 0; ii < dsize / 8; ii++) { + printf(" %ju", swap ? 
+ (uintmax_t)le64toh(((uint64_t *)(void *)data)[ii]) : + (uintmax_t)((uint64_t *)(void *)data)[ii]); + } + break; + case NV_TYPE_STRING: + printf("(string): %s", (char *)data); + break; + default: + assert(!"invalid condition"); + } + printf("\n"); + ptr += NVH_SIZE(nvh); + size -= NVH_SIZE(nvh); + } +} + +/* + * Local routines below. + */ + +static void +nv_add(struct nv *nv, const unsigned char *value, size_t vsize, int type, + const char *name) +{ + static unsigned char align[7]; + struct nvhdr *nvh; + size_t namesize; + + if (nv == NULL) { + errno = ENOMEM; + return; + } + + NV_CHECK(nv); + + namesize = strlen(name) + 1; + + nvh = malloc(sizeof(*nvh) + roundup2(namesize, 8)); + if (nvh == NULL) { + if (nv->nv_error == 0) + nv->nv_error = ENOMEM; + return; + } + nvh->nvh_type = NV_ORDER_HOST | type; + nvh->nvh_namesize = (uint8_t)namesize; + nvh->nvh_dsize = (uint32_t)vsize; + bcopy(name, nvh->nvh_name, namesize); + + /* Add header first. */ + if (ebuf_add_tail(nv->nv_ebuf, nvh, NVH_HSIZE(nvh)) < 0) { + assert(errno != 0); + if (nv->nv_error == 0) + nv->nv_error = errno; + return; + } + /* Add the actual data. */ + if (ebuf_add_tail(nv->nv_ebuf, value, vsize) < 0) { + assert(errno != 0); + if (nv->nv_error == 0) + nv->nv_error = errno; + return; + } + /* Align the data (if needed). 
*/ + vsize = roundup2(vsize, 8) - vsize; + if (vsize == 0) + return; + assert(vsize > 0 && vsize <= sizeof(align)); + if (ebuf_add_tail(nv->nv_ebuf, align, vsize) < 0) { + assert(errno != 0); + if (nv->nv_error == 0) + nv->nv_error = errno; + return; + } +} + +static void +nv_addv(struct nv *nv, const unsigned char *value, size_t vsize, int type, + const char *namefmt, va_list nameap) +{ + char name[255]; + size_t namesize; + + namesize = vsnprintf(name, sizeof(name), namefmt, nameap); + assert(namesize > 0 && namesize < sizeof(name)); + + nv_add(nv, value, vsize, type, name); +} + +static struct nvhdr * +nv_find(struct nv *nv, int type, const char *namefmt, va_list nameap) +{ + char name[255]; + struct nvhdr *nvh; + unsigned char *ptr; + size_t size, namesize; + + if (nv == NULL) { + errno = ENOMEM; + return (NULL); + } + + NV_CHECK(nv); + + namesize = vsnprintf(name, sizeof(name), namefmt, nameap); + assert(namesize > 0 && namesize < sizeof(name)); + namesize++; + + ptr = ebuf_data(nv->nv_ebuf, &size); + while (size > 0) { + assert(size >= sizeof(*nvh) + 2); + nvh = (struct nvhdr *)ptr; + assert(size >= NVH_SIZE(nvh)); + nv_swap(nvh, true); + if (strcmp(nvh->nvh_name, name) == 0) { + if ((nvh->nvh_type & NV_TYPE_MASK) != type) { + errno = EINVAL; + if (nv->nv_error == 0) + nv->nv_error = EINVAL; + return (NULL); + } + return (nvh); + } + ptr += NVH_SIZE(nvh); + size -= NVH_SIZE(nvh); + } + errno = ENOENT; + if (nv->nv_error == 0) + nv->nv_error = ENOENT; + return (NULL); +} + +static void +nv_swap(struct nvhdr *nvh, bool tohost) +{ + unsigned char *data, *end, *p; + size_t vsize; + + data = NVH_DATA(nvh); + if (tohost) { + if ((nvh->nvh_type & NV_ORDER_MASK) == NV_ORDER_HOST) + return; + nvh->nvh_dsize = le32toh(nvh->nvh_dsize); + end = data + nvh->nvh_dsize; + nvh->nvh_type &= ~NV_ORDER_MASK; + nvh->nvh_type |= NV_ORDER_HOST; + } else { + if ((nvh->nvh_type & NV_ORDER_MASK) == NV_ORDER_NETWORK) + return; + end = data + nvh->nvh_dsize; + nvh->nvh_dsize = 
htole32(nvh->nvh_dsize); + nvh->nvh_type &= ~NV_ORDER_MASK; + nvh->nvh_type |= NV_ORDER_NETWORK; + } + + vsize = 0; + + switch (nvh->nvh_type & NV_TYPE_MASK) { + case NV_TYPE_INT8: + case NV_TYPE_UINT8: + case NV_TYPE_INT8_ARRAY: + case NV_TYPE_UINT8_ARRAY: + break; + case NV_TYPE_INT16: + case NV_TYPE_UINT16: + case NV_TYPE_INT16_ARRAY: + case NV_TYPE_UINT16_ARRAY: + if (vsize == 0) + vsize = 2; + /* FALLTHOUGH */ + case NV_TYPE_INT32: + case NV_TYPE_UINT32: + case NV_TYPE_INT32_ARRAY: + case NV_TYPE_UINT32_ARRAY: + if (vsize == 0) + vsize = 4; + /* FALLTHOUGH */ + case NV_TYPE_INT64: + case NV_TYPE_UINT64: + case NV_TYPE_INT64_ARRAY: + case NV_TYPE_UINT64_ARRAY: + if (vsize == 0) + vsize = 8; + for (p = data; p < end; p += vsize) { + if (tohost) { + switch (vsize) { + case 2: + *(uint16_t *)(void *)p = + le16toh(*(uint16_t *)(void *)p); + break; + case 4: + *(uint32_t *)(void *)p = + le32toh(*(uint32_t *)(void *)p); + break; + case 8: + *(uint64_t *)(void *)p = + le64toh(*(uint64_t *)(void *)p); + break; + default: + assert(!"invalid condition"); + } + } else { + switch (vsize) { + case 2: + *(uint16_t *)(void *)p = + htole16(*(uint16_t *)(void *)p); + break; + case 4: + *(uint32_t *)(void *)p = + htole32(*(uint32_t *)(void *)p); + break; + case 8: + *(uint64_t *)(void *)p = + htole64(*(uint64_t *)(void *)p); + break; + default: + assert(!"invalid condition"); + } + } + } + break; + case NV_TYPE_STRING: + break; + default: + assert(!"unrecognized type"); + } +} diff --git a/sbin/hastd/nv.h b/sbin/hastd/nv.h new file mode 100644 index 0000000..1677548 --- /dev/null +++ b/sbin/hastd/nv.h @@ -0,0 +1,158 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _NV_H_ +#define _NV_H_ + +#include <sys/cdefs.h> + +#include <stdarg.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> + +#include <ebuf.h> + +#define NV_TYPE_INT8 1 +#define NV_TYPE_UINT8 2 +#define NV_TYPE_INT16 3 +#define NV_TYPE_UINT16 4 +#define NV_TYPE_INT32 5 +#define NV_TYPE_UINT32 6 +#define NV_TYPE_INT64 7 +#define NV_TYPE_UINT64 8 +#define NV_TYPE_INT8_ARRAY 9 +#define NV_TYPE_UINT8_ARRAY 10 +#define NV_TYPE_INT16_ARRAY 11 +#define NV_TYPE_UINT16_ARRAY 12 +#define NV_TYPE_INT32_ARRAY 13 +#define NV_TYPE_UINT32_ARRAY 14 +#define NV_TYPE_INT64_ARRAY 15 +#define NV_TYPE_UINT64_ARRAY 16 +#define NV_TYPE_STRING 17 + +#define NV_TYPE_MASK 0x7f +#define NV_TYPE_FIRST NV_TYPE_INT8 +#define NV_TYPE_LAST NV_TYPE_STRING + +#define NV_ORDER_NETWORK 0x00 +#define NV_ORDER_HOST 0x80 + +#define NV_ORDER_MASK 0x80 + +struct nv; + +struct nv *nv_alloc(void); +void nv_free(struct nv *nv); +int nv_error(const struct nv *nv); +int nv_set_error(struct nv *nv, int error); +int nv_validate(struct nv *nv, size_t *extrap); + +struct ebuf *nv_hton(struct nv *nv); +struct nv *nv_ntoh(struct ebuf *eb); + +void nv_add_int8(struct nv *nv, int8_t value, const char *namefmt, ...) + __printflike(3, 4); +void nv_add_uint8(struct nv *nv, uint8_t value, const char *namefmt, ...) + __printflike(3, 4); +void nv_add_int16(struct nv *nv, int16_t value, const char *namefmt, ...) + __printflike(3, 4); +void nv_add_uint16(struct nv *nv, uint16_t value, const char *namefmt, ...) + __printflike(3, 4); +void nv_add_int32(struct nv *nv, int32_t value, const char *namefmt, ...) + __printflike(3, 4); +void nv_add_uint32(struct nv *nv, uint32_t value, const char *namefmt, ...) + __printflike(3, 4); +void nv_add_int64(struct nv *nv, int64_t value, const char *namefmt, ...) + __printflike(3, 4); +void nv_add_uint64(struct nv *nv, uint64_t value, const char *namefmt, ...) 
+ __printflike(3, 4); +void nv_add_int8_array(struct nv *nv, const int8_t *value, size_t size, + const char *namefmt, ...) __printflike(4, 5); +void nv_add_uint8_array(struct nv *nv, const uint8_t *value, size_t size, + const char *namefmt, ...) __printflike(4, 5); +void nv_add_int16_array(struct nv *nv, const int16_t *value, size_t size, + const char *namefmt, ...) __printflike(4, 5); +void nv_add_uint16_array(struct nv *nv, const uint16_t *value, size_t size, + const char *namefmt, ...) __printflike(4, 5); +void nv_add_int32_array(struct nv *nv, const int32_t *value, size_t size, + const char *namefmt, ...) __printflike(4, 5); +void nv_add_uint32_array(struct nv *nv, const uint32_t *value, size_t size, + const char *namefmt, ...) __printflike(4, 5); +void nv_add_int64_array(struct nv *nv, const int64_t *value, size_t size, + const char *namefmt, ...) __printflike(4, 5); +void nv_add_uint64_array(struct nv *nv, const uint64_t *value, size_t size, + const char *namefmt, ...) __printflike(4, 5); +void nv_add_string(struct nv *nv, const char *value, const char *namefmt, ...) + __printflike(3, 4); +void nv_add_stringf(struct nv *nv, const char *name, const char *valuefmt, ...) + __printflike(3, 4); +void nv_add_stringv(struct nv *nv, const char *name, const char *valuefmt, + va_list valueap) __printflike(3, 0); + +int8_t nv_get_int8(struct nv *nv, const char *namefmt, ...) + __printflike(2, 3); +uint8_t nv_get_uint8(struct nv *nv, const char *namefmt, ...) + __printflike(2, 3); +int16_t nv_get_int16(struct nv *nv, const char *namefmt, ...) + __printflike(2, 3); +uint16_t nv_get_uint16(struct nv *nv, const char *namefmt, ...) + __printflike(2, 3); +int32_t nv_get_int32(struct nv *nv, const char *namefmt, ...) + __printflike(2, 3); +uint32_t nv_get_uint32(struct nv *nv, const char *namefmt, ...) + __printflike(2, 3); +int64_t nv_get_int64(struct nv *nv, const char *namefmt, ...) + __printflike(2, 3); +uint64_t nv_get_uint64(struct nv *nv, const char *namefmt, ...) 
+ __printflike(2, 3); +const int8_t *nv_get_int8_array(struct nv *nv, size_t *sizep, + const char *namefmt, ...) __printflike(3, 4); +const uint8_t *nv_get_uint8_array(struct nv *nv, size_t *sizep, + const char *namefmt, ...) __printflike(3, 4); +const int16_t *nv_get_int16_array(struct nv *nv, size_t *sizep, + const char *namefmt, ...) __printflike(3, 4); +const uint16_t *nv_get_uint16_array(struct nv *nv, size_t *sizep, + const char *namefmt, ...) __printflike(3, 4); +const int32_t *nv_get_int32_array(struct nv *nv, size_t *sizep, + const char *namefmt, ...) __printflike(3, 4); +const uint32_t *nv_get_uint32_array(struct nv *nv, size_t *sizep, + const char *namefmt, ...) __printflike(3, 4); +const int64_t *nv_get_int64_array(struct nv *nv, size_t *sizep, + const char *namefmt, ...) __printflike(3, 4); +const uint64_t *nv_get_uint64_array(struct nv *nv, size_t *sizep, + const char *namefmt, ...) __printflike(3, 4); +const char *nv_get_string(struct nv *nv, const char *namefmt, ...) + __printflike(2, 3); + +void nv_dump(struct nv *nv); + +#endif /* !_NV_H_ */ diff --git a/sbin/hastd/parse.y b/sbin/hastd/parse.y new file mode 100644 index 0000000..6755320 --- /dev/null +++ b/sbin/hastd/parse.y @@ -0,0 +1,507 @@ +%{ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
/*
 * Decide whether the given node name from the configuration file refers to
 * the host we are running on.
 *
 * The name matches if it is equal to the full host name, to the host name
 * up to (not including) the first dot, or -- ignoring case -- to the value
 * of the kern.hostuuid sysctl.  Exits with EX_OSERR if the host name or the
 * UUID cannot be obtained.
 */
static bool
isitme(const char *name)
{
	char buf[MAXHOSTNAMELEN];
	char *pos;
	size_t bufsize;

	/*
	 * First check if the given name matches our full hostname.
	 */
	if (gethostname(buf, sizeof(buf)) < 0)
		err(EX_OSERR, "gethostname() failed");
	/* gethostname(3) need not terminate a truncated name. */
	buf[sizeof(buf) - 1] = '\0';
	if (strcmp(buf, name) == 0)
		return (true);

	/*
	 * Now check if it matches first part of the host name.
	 * The lengths have to agree too, otherwise any name that merely
	 * starts with our short host name (eg. "node10" on host
	 * "node1.example.org") would be taken for us.
	 */
	pos = strchr(buf, '.');
	if (pos != NULL && pos != buf &&
	    strlen(name) == (size_t)(pos - buf) &&
	    strncmp(buf, name, pos - buf) == 0) {
		return (true);
	}

	/*
	 * At the end check if name is equal to our host's UUID.
	 */
	bufsize = sizeof(buf);
	if (sysctlbyname("kern.hostuuid", buf, &bufsize, NULL, 0) < 0)
		err(EX_OSERR, "sysctlbyname(kern.hostuuid) failed");
	if (strcasecmp(buf, name) == 0)
		return (true);

	/*
	 * Looks like this isn't about us.
	 */
	return (false);
}
+ */ + curres->hr_replication = depth0_replication; + } + } + + return (&lconfig); +} + +void +yy_config_free(struct hastd_config *config) +{ + struct hast_resource *res; + + while ((res = TAILQ_FIRST(&config->hc_resources)) != NULL) { + TAILQ_REMOVE(&config->hc_resources, res, hr_next); + free(res); + } +} +%} + +%token CONTROL LISTEN PORT REPLICATION EXTENTSIZE RESOURCE NAME LOCAL REMOTE ON +%token FULLSYNC MEMSYNC ASYNC +%token NUM STR OB CB + +%type <num> replication_type + +%union +{ + int num; + char *str; +} + +%token <num> NUM +%token <str> STR + +%% + +statements: + | + statements statement + ; + +statement: + control_statement + | + listen_statement + | + replication_statement + | + node_statement + | + resource_statement + ; + +control_statement: CONTROL STR + { + switch (depth) { + case 0: + if (strlcpy(depth0_control, $2, + sizeof(depth0_control)) >= + sizeof(depth0_control)) { + errx(EX_CONFIG, "control argument too long"); + } + break; + case 1: + if (mynode) { + if (strlcpy(lconfig.hc_controladdr, $2, + sizeof(lconfig.hc_controladdr)) >= + sizeof(lconfig.hc_controladdr)) { + errx(EX_CONFIG, + "control argument too long"); + } + } + break; + default: + assert(!"control at wrong depth level"); + } + } + ; + +listen_statement: LISTEN STR + { + switch (depth) { + case 0: + if (strlcpy(depth0_listen, $2, + sizeof(depth0_listen)) >= + sizeof(depth0_listen)) { + errx(EX_CONFIG, "listen argument too long"); + } + break; + case 1: + if (mynode) { + if (strlcpy(lconfig.hc_listenaddr, $2, + sizeof(lconfig.hc_listenaddr)) >= + sizeof(lconfig.hc_listenaddr)) { + errx(EX_CONFIG, + "listen argument too long"); + } + } + break; + default: + assert(!"listen at wrong depth level"); + } + } + ; + +replication_statement: REPLICATION replication_type + { + switch (depth) { + case 0: + depth0_replication = $2; + break; + case 1: + if (curres != NULL) + curres->hr_replication = $2; + break; + default: + assert(!"replication at wrong depth level"); + } + } + ; + 
+replication_type: + FULLSYNC { $$ = HAST_REPLICATION_FULLSYNC; } + | + MEMSYNC { $$ = HAST_REPLICATION_MEMSYNC; } + | + ASYNC { $$ = HAST_REPLICATION_ASYNC; } + ; + +node_statement: ON node_start OB node_entries CB + { + mynode = false; + } + ; + +node_start: STR + { + if (isitme($1)) + mynode = true; + } + ; + +node_entries: + | + node_entries node_entry + ; + +node_entry: + control_statement + | + listen_statement + ; + +resource_statement: RESOURCE resource_start OB resource_entries CB + { + if (curres != NULL) { + /* + * Let's see there are some resource-level settings + * that we can use for node-level settings. + */ + if (curres->hr_provname[0] == '\0' && + depth1_provname[0] != '\0') { + /* + * Provider name is not set at node-level, + * but is set at resource-level, use it. + */ + strlcpy(curres->hr_provname, depth1_provname, + sizeof(curres->hr_provname)); + } + if (curres->hr_localpath[0] == '\0' && + depth1_localpath[0] != '\0') { + /* + * Path to local provider is not set at + * node-level, but is set at resource-level, + * use it. + */ + strlcpy(curres->hr_localpath, depth1_localpath, + sizeof(curres->hr_localpath)); + } + + /* + * If provider name is not given, use resource name + * as provider name. + */ + if (curres->hr_provname[0] == '\0') { + strlcpy(curres->hr_provname, curres->hr_name, + sizeof(curres->hr_provname)); + } + + /* + * Remote address has to be configured at this point. + */ + if (curres->hr_remoteaddr[0] == '\0') { + errx(EX_CONFIG, + "remote address not configured for resource %s", + curres->hr_name); + } + /* + * Path to local provider has to be configured at this + * point. + */ + if (curres->hr_localpath[0] == '\0') { + errx(EX_CONFIG, + "path local component not configured for resource %s", + curres->hr_name); + } + + /* Put it onto resource list. 
*/
			TAILQ_INSERT_TAIL(&lconfig.hc_resources, curres, hr_next);
			curres = NULL;
		}
	}
	;

resource_start:	STR
	{
		/*
		 * Forget the resource-level "name" and "local" values, so we
		 * can tell whether they were set at resource level or not.
		 */
		depth1_localpath[0] = '\0';
		depth1_provname[0] = '\0';

		curres = calloc(1, sizeof(*curres));
		if (curres == NULL)
			errx(EX_TEMPFAIL, "cannot allocate memory for resource");
		if (strlcpy(curres->hr_name, $1, sizeof(curres->hr_name)) >=
		    sizeof(curres->hr_name))
			errx(EX_CONFIG, "resource name (%s) too long", $1);

		/* Start every resource from the same defaults. */
		curres->hr_previous_role = HAST_ROLE_INIT;
		curres->hr_role = HAST_ROLE_INIT;
		curres->hr_replication = -1;
		curres->hr_ggateunit = -1;
		curres->hr_localfd = -1;
		curres->hr_remoteaddr[0] = '\0';
		curres->hr_localpath[0] = '\0';
		curres->hr_provname[0] = '\0';
	}
	;

resource_entries:
	|
	resource_entries resource_entry
	;

resource_entry:
	replication_statement
	|
	name_statement
	|
	local_statement
	|
	resource_node_statement
	;

name_statement:	NAME STR
	{
		/*
		 * depth 1: remember the value in depth1_provname.
		 * depth 2: store it in the current resource, but only when
		 * mynode is set.
		 */
		if (depth == 1) {
			if (strlcpy(depth1_provname, $2,
			    sizeof(depth1_provname)) >=
			    sizeof(depth1_provname))
				errx(EX_CONFIG, "name argument too long");
		} else if (depth == 2) {
			if (mynode) {
				assert(curres != NULL);
				if (strlcpy(curres->hr_provname, $2,
				    sizeof(curres->hr_provname)) >=
				    sizeof(curres->hr_provname))
					errx(EX_CONFIG,
					    "name argument too long");
			}
		} else {
			assert(!"name at wrong depth level");
		}
	}
	;

local_statement:	LOCAL STR
	{
		/*
		 * depth 1: remember the value in depth1_localpath.
		 * depth 2: store it in the current resource, but only when
		 * mynode is set.
		 */
		if (depth == 1) {
			if (strlcpy(depth1_localpath, $2,
			    sizeof(depth1_localpath)) >=
			    sizeof(depth1_localpath))
				errx(EX_CONFIG, "local argument too long");
		} else if (depth == 2) {
			if (mynode) {
				assert(curres != NULL);
				if (strlcpy(curres->hr_localpath, $2,
				    sizeof(curres->hr_localpath)) >=
				    sizeof(curres->hr_localpath))
					errx(EX_CONFIG,
					    "local argument too long");
			}
		} else {
			assert(!"local at wrong depth level");
		}
	}
	;

resource_node_statement:ON resource_node_start OB resource_node_entries CB
	{
		/* Leaving the node section: it no longer applies to us. */
		mynode = false;
	}
	;

resource_node_start:	STR
	{
		/* Only mark the section as ours while a resource is open. */
		if (curres != NULL && isitme($1))
			mynode = true;
	}
	;

resource_node_entries:
	|
	resource_node_entries resource_node_entry
	;

resource_node_entry:
	name_statement
	|
	local_statement
	|
	remote_statement
	;

remote_statement:	REMOTE STR
	{
		assert(depth == 2);
		if (mynode) {
			assert(curres != NULL);
			if (strlcpy(curres->hr_remoteaddr, $2,
			    sizeof(curres->hr_remoteaddr)) >=
			    sizeof(curres->hr_remoteaddr))
				errx(EX_CONFIG, "remote argument too long");
		}
	}
	;
diff --git a/sbin/hastd/pjdlog.c b/sbin/hastd/pjdlog.c
new file mode 100644
index 0000000..38c5539
--- /dev/null
+++ b/sbin/hastd/pjdlog.c
@@ -0,0 +1,367 @@
/*-
 * Copyright (c) 2009-2010 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by Pawel Jakub Dawidek under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <assert.h> +#include <errno.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <syslog.h> + +#include "pjdlog.h" + +static int pjdlog_mode = PJDLOG_MODE_STD; +static int pjdlog_debug_level = 0; +static char pjdlog_prefix[128]; + +/* + * Configure where the logs should go. + * By default they are send to stdout/stderr, but after going into background + * (eg. by calling daemon(3)) application is responsible for changing mode to + * PJDLOG_MODE_SYSLOG, so logs will be send to syslog. + */ +void +pjdlog_mode_set(int mode) +{ + + assert(mode == PJDLOG_MODE_STD || mode == PJDLOG_MODE_SYSLOG); + + pjdlog_mode = mode; +} + +/* + * Return current mode. + */ +int +pjdlog_mode_get(void) +{ + + return (pjdlog_mode); +} + +/* + * Set debug level. All the logs above the level specified here will be + * ignored. + */ +void +pjdlog_debug_set(int level) +{ + + assert(level >= 0); + + pjdlog_debug_level = level; +} + +/* + * Return current debug level. + */ +int +pjdlog_debug_get(void) +{ + + return (pjdlog_debug_level); +} + +/* + * Set prefix that will be used before each log. + * Setting prefix to NULL will remove it. + */ +void +pjdlog_prefix_set(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + pjdlog_prefix_setv(fmt, ap); + va_end(ap); +} + +/* + * Set prefix that will be used before each log. 
+ * Setting prefix to NULL will remove it. + */ +void +pjdlog_prefix_setv(const char *fmt, va_list ap) +{ + + assert(fmt != NULL); + + vsnprintf(pjdlog_prefix, sizeof(pjdlog_prefix), fmt, ap); +} + +/* + * Convert log level into string. + */ +static const char * +pjdlog_level_string(int loglevel) +{ + + switch (loglevel) { + case LOG_EMERG: + return ("EMERG"); + case LOG_ALERT: + return ("ALERT"); + case LOG_CRIT: + return ("CRIT"); + case LOG_ERR: + return ("ERROR"); + case LOG_WARNING: + return ("WARNING"); + case LOG_NOTICE: + return ("NOTICE"); + case LOG_INFO: + return ("INFO"); + case LOG_DEBUG: + return ("DEBUG"); + } + assert(!"Invalid log level."); + abort(); /* XXX: gcc */ +} + +/* + * Common log routine. + */ +void +pjdlog_common(int loglevel, int debuglevel, int error, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + pjdlogv_common(loglevel, debuglevel, error, fmt, ap); + va_end(ap); +} + +/* + * Common log routine, which can handle regular log level as well as debug + * level. We decide here where to send the logs (stdout/stderr or syslog). + */ +void +pjdlogv_common(int loglevel, int debuglevel, int error, const char *fmt, + va_list ap) +{ + + assert(loglevel == LOG_EMERG || loglevel == LOG_ALERT || + loglevel == LOG_CRIT || loglevel == LOG_ERR || + loglevel == LOG_WARNING || loglevel == LOG_NOTICE || + loglevel == LOG_INFO || loglevel == LOG_DEBUG); + assert(loglevel != LOG_DEBUG || debuglevel > 0); + assert(error >= -1); + + /* Ignore debug above configured level. */ + if (loglevel == LOG_DEBUG && debuglevel > pjdlog_debug_level) + return; + + switch (pjdlog_mode) { + case PJDLOG_MODE_STD: + { + FILE *out; + + /* + * We send errors and warning to stderr and the rest to stdout. 
+ */ + switch (loglevel) { + case LOG_EMERG: + case LOG_ALERT: + case LOG_CRIT: + case LOG_ERR: + case LOG_WARNING: + out = stderr; + break; + case LOG_NOTICE: + case LOG_INFO: + case LOG_DEBUG: + out = stdout; + break; + default: + assert(!"Invalid loglevel."); + abort(); /* XXX: gcc */ + } + + fprintf(out, "[%s]", pjdlog_level_string(loglevel)); + /* Attach debuglevel if this is debug log. */ + if (loglevel == LOG_DEBUG) + fprintf(out, "[%d]", debuglevel); + fprintf(out, " "); + fprintf(out, "%s", pjdlog_prefix); + vfprintf(out, fmt, ap); + if (error != -1) + fprintf(out, ": %s.", strerror(error)); + fprintf(out, "\n"); + break; + } + case PJDLOG_MODE_SYSLOG: + { + char log[1024]; + int len; + + len = snprintf(log, sizeof(log), "%s", pjdlog_prefix); + if ((size_t)len < sizeof(log)) + len = vsnprintf(log + len, sizeof(log) - len, fmt, ap); + if (error != -1 && (size_t)len < sizeof(log)) { + (void)snprintf(log + len, sizeof(log) - len, ": %s.", + strerror(error)); + } + syslog(loglevel, "%s", log); + break; + } + default: + assert(!"Invalid mode."); + } +} + +/* + * Regular logs. + */ +void +pjdlogv(int loglevel, const char *fmt, va_list ap) +{ + + /* LOG_DEBUG is invalid here, pjdlogv?_debug() should be used. */ + assert(loglevel == LOG_EMERG || loglevel == LOG_ALERT || + loglevel == LOG_CRIT || loglevel == LOG_ERR || + loglevel == LOG_WARNING || loglevel == LOG_NOTICE || + loglevel == LOG_INFO); + + pjdlogv_common(loglevel, 0, -1, fmt, ap); +} + +/* + * Regular logs. + */ +void +pjdlog(int loglevel, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + pjdlogv(loglevel, fmt, ap); + va_end(ap); +} + +/* + * Debug logs. + */ +void +pjdlogv_debug(int debuglevel, const char *fmt, va_list ap) +{ + + pjdlogv_common(LOG_DEBUG, debuglevel, -1, fmt, ap); +} + +/* + * Debug logs. + */ +void +pjdlog_debug(int debuglevel, const char *fmt, ...) 
+{ + va_list ap; + + va_start(ap, fmt); + pjdlogv_debug(debuglevel, fmt, ap); + va_end(ap); +} + +/* + * Error logs with errno logging. + */ +void +pjdlogv_errno(int loglevel, const char *fmt, va_list ap) +{ + + pjdlogv_common(loglevel, 0, errno, fmt, ap); +} + +/* + * Error logs with errno logging. + */ +void +pjdlog_errno(int loglevel, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + pjdlogv_errno(loglevel, fmt, ap); + va_end(ap); +} + +/* + * Log error, errno and exit. + */ +void +pjdlogv_exit(int exitcode, const char *fmt, va_list ap) +{ + + pjdlogv_errno(LOG_ERR, fmt, ap); + exit(exitcode); +} + +/* + * Log error, errno and exit. + */ +void +pjdlog_exit(int exitcode, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + pjdlogv_exit(exitcode, fmt, ap); + /* NOTREACHED */ + va_end(ap); +} + +/* + * Log error and exit. + */ +void +pjdlogv_exitx(int exitcode, const char *fmt, va_list ap) +{ + + pjdlogv(LOG_ERR, fmt, ap); + exit(exitcode); +} + +/* + * Log error and exit. + */ +void +pjdlog_exitx(int exitcode, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + pjdlogv_exitx(exitcode, fmt, ap); + /* NOTREACHED */ + va_end(ap); +} diff --git a/sbin/hastd/pjdlog.h b/sbin/hastd/pjdlog.h new file mode 100644 index 0000000..2136b12 --- /dev/null +++ b/sbin/hastd/pjdlog.h @@ -0,0 +1,88 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _PJDLOG_H_ +#define _PJDLOG_H_ + +#include <sys/cdefs.h> + +#include <stdarg.h> +#include <sysexits.h> +#include <syslog.h> + +#define PJDLOG_MODE_STD 0 +#define PJDLOG_MODE_SYSLOG 1 + +void pjdlog_mode_set(int mode); +int pjdlog_mode_get(void); + +void pjdlog_debug_set(int level); +int pjdlog_debug_get(void); + +void pjdlog_prefix_set(const char *fmt, ...) __printflike(1, 2); +void pjdlog_prefix_setv(const char *fmt, va_list ap) __printflike(1, 0); + +void pjdlog_common(int loglevel, int debuglevel, int error, const char *fmt, + ...) __printflike(4, 5); +void pjdlogv_common(int loglevel, int debuglevel, int error, const char *fmt, + va_list ap) __printflike(4, 0); + +void pjdlog(int loglevel, const char *fmt, ...) __printflike(2, 3); +void pjdlogv(int loglevel, const char *fmt, va_list ap) __printflike(2, 0); + +#define pjdlogv_emergency(fmt, ap) pjdlogv(LOG_EMERG, (fmt), (ap)) +#define pjdlog_emergency(...) 
pjdlog(LOG_EMERG, __VA_ARGS__) +#define pjdlogv_alert(fmt, ap) pjdlogv(LOG_ALERT, (fmt), (ap)) +#define pjdlog_alert(...) pjdlog(LOG_ALERT, __VA_ARGS__) +#define pjdlogv_critical(fmt, ap) pjdlogv(LOG_CRIT, (fmt), (ap)) +#define pjdlog_critical(...) pjdlog(LOG_CRIT, __VA_ARGS__) +#define pjdlogv_error(fmt, ap) pjdlogv(LOG_ERR, (fmt), (ap)) +#define pjdlog_error(...) pjdlog(LOG_ERR, __VA_ARGS__) +#define pjdlogv_warning(fmt, ap) pjdlogv(LOG_WARNING, (fmt), (ap)) +#define pjdlog_warning(...) pjdlog(LOG_WARNING, __VA_ARGS__) +#define pjdlogv_notice(fmt, ap) pjdlogv(LOG_NOTICE, (fmt), (ap)) +#define pjdlog_notice(...) pjdlog(LOG_NOTICE, __VA_ARGS__) +#define pjdlogv_info(fmt, ap) pjdlogv(LOG_INFO, (fmt), (ap)) +#define pjdlog_info(...) pjdlog(LOG_INFO, __VA_ARGS__) + +void pjdlog_debug(int debuglevel, const char *fmt, ...) __printflike(2, 3); +void pjdlogv_debug(int debuglevel, const char *fmt, va_list ap) __printflike(2, 0); + +void pjdlog_errno(int loglevel, const char *fmt, ...) __printflike(2, 3); +void pjdlogv_errno(int loglevel, const char *fmt, va_list ap) __printflike(2, 0); + +void pjdlog_exit(int exitcode, const char *fmt, ...) __printflike(2, 3) __dead2; +void pjdlogv_exit(int exitcode, const char *fmt, va_list ap) __printflike(2, 0) __dead2; + +void pjdlog_exitx(int exitcode, const char *fmt, ...) __printflike(2, 3) __dead2; +void pjdlogv_exitx(int exitcode, const char *fmt, va_list ap) __printflike(2, 0) __dead2; + +#endif /* !_PJDLOG_H_ */ diff --git a/sbin/hastd/primary.c b/sbin/hastd/primary.c new file mode 100644 index 0000000..ed6e91c --- /dev/null +++ b/sbin/hastd/primary.c @@ -0,0 +1,1769 @@ +/*- + * Copyright (c) 2009 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/time.h> +#include <sys/bio.h> +#include <sys/disk.h> +#include <sys/refcount.h> +#include <sys/stat.h> + +#include <geom/gate/g_gate.h> + +#include <assert.h> +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <libgeom.h> +#include <pthread.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <sysexits.h> +#include <unistd.h> + +#include <activemap.h> +#include <nv.h> +#include <rangelock.h> + +#include "control.h" +#include "hast.h" +#include "hast_proto.h" +#include "hastd.h" +#include "metadata.h" +#include "proto.h" +#include "pjdlog.h" +#include "subr.h" +#include "synch.h" + +struct hio { + /* + * Number of components we are still waiting for. 
+ * When this field goes to 0, we can send the request back to the + * kernel. Each component has to decrease this counter by one + * even on failure. + */ + unsigned int hio_countdown; + /* + * Each component has a place to store its own error. + * Once the request is handled by all components we can decide if the + * request overall is successful or not. + */ + int *hio_errors; + /* + * Structure used to comunicate with GEOM Gate class. + */ + struct g_gate_ctl_io hio_ggio; + TAILQ_ENTRY(hio) *hio_next; +}; +#define hio_free_next hio_next[0] +#define hio_done_next hio_next[0] + +/* + * Free list holds unused structures. When free list is empty, we have to wait + * until some in-progress requests are freed. + */ +static TAILQ_HEAD(, hio) hio_free_list; +static pthread_mutex_t hio_free_list_lock; +static pthread_cond_t hio_free_list_cond; +/* + * There is one send list for every component. One requests is placed on all + * send lists - each component gets the same request, but each component is + * responsible for managing his own send list. + */ +static TAILQ_HEAD(, hio) *hio_send_list; +static pthread_mutex_t *hio_send_list_lock; +static pthread_cond_t *hio_send_list_cond; +/* + * There is one recv list for every component, although local components don't + * use recv lists as local requests are done synchronously. + */ +static TAILQ_HEAD(, hio) *hio_recv_list; +static pthread_mutex_t *hio_recv_list_lock; +static pthread_cond_t *hio_recv_list_cond; +/* + * Request is placed on done list by the slowest component (the one that + * decreased hio_countdown from 1 to 0). + */ +static TAILQ_HEAD(, hio) hio_done_list; +static pthread_mutex_t hio_done_list_lock; +static pthread_cond_t hio_done_list_cond; +/* + * Structure below are for interaction with sync thread. + */ +static bool sync_inprogress; +static pthread_mutex_t sync_lock; +static pthread_cond_t sync_cond; +/* + * The lock below allows to synchornize access to remote connections. 
+ */ +static pthread_rwlock_t *hio_remote_lock; +static pthread_mutex_t hio_guard_lock; +static pthread_cond_t hio_guard_cond; + +/* + * Lock to synchronize metadata updates. Also synchronize access to + * hr_primary_localcnt and hr_primary_remotecnt fields. + */ +static pthread_mutex_t metadata_lock; + +/* + * Maximum number of outstanding I/O requests. + */ +#define HAST_HIO_MAX 256 +/* + * Number of components. At this point there are only two components: local + * and remote, but in the future it might be possible to use multiple local + * and remote components. + */ +#define HAST_NCOMPONENTS 2 +/* + * Number of seconds to sleep before next reconnect try. + */ +#define RECONNECT_SLEEP 5 + +#define ISCONNECTED(res, no) \ + ((res)->hr_remotein != NULL && (res)->hr_remoteout != NULL) + +#define QUEUE_INSERT1(hio, name, ncomp) do { \ + bool _wakeup; \ + \ + mtx_lock(&hio_##name##_list_lock[(ncomp)]); \ + _wakeup = TAILQ_EMPTY(&hio_##name##_list[(ncomp)]); \ + TAILQ_INSERT_TAIL(&hio_##name##_list[(ncomp)], (hio), \ + hio_next[(ncomp)]); \ + mtx_unlock(&hio_##name##_list_lock[ncomp]); \ + if (_wakeup) \ + cv_signal(&hio_##name##_list_cond[(ncomp)]); \ +} while (0) +#define QUEUE_INSERT2(hio, name) do { \ + bool _wakeup; \ + \ + mtx_lock(&hio_##name##_list_lock); \ + _wakeup = TAILQ_EMPTY(&hio_##name##_list); \ + TAILQ_INSERT_TAIL(&hio_##name##_list, (hio), hio_##name##_next);\ + mtx_unlock(&hio_##name##_list_lock); \ + if (_wakeup) \ + cv_signal(&hio_##name##_list_cond); \ +} while (0) +#define QUEUE_TAKE1(hio, name, ncomp) do { \ + mtx_lock(&hio_##name##_list_lock[(ncomp)]); \ + while (((hio) = TAILQ_FIRST(&hio_##name##_list[(ncomp)])) == NULL) { \ + cv_wait(&hio_##name##_list_cond[(ncomp)], \ + &hio_##name##_list_lock[(ncomp)]); \ + } \ + TAILQ_REMOVE(&hio_##name##_list[(ncomp)], (hio), \ + hio_next[(ncomp)]); \ + mtx_unlock(&hio_##name##_list_lock[(ncomp)]); \ +} while (0) +#define QUEUE_TAKE2(hio, name) do { \ + mtx_lock(&hio_##name##_list_lock); \ + while 
(((hio) = TAILQ_FIRST(&hio_##name##_list)) == NULL) { \ + cv_wait(&hio_##name##_list_cond, \ + &hio_##name##_list_lock); \ + } \ + TAILQ_REMOVE(&hio_##name##_list, (hio), hio_##name##_next); \ + mtx_unlock(&hio_##name##_list_lock); \ +} while (0) + +#define SYNCREQ(hio) do { (hio)->hio_ggio.gctl_unit = -1; } while (0) +#define ISSYNCREQ(hio) ((hio)->hio_ggio.gctl_unit == -1) +#define SYNCREQDONE(hio) do { (hio)->hio_ggio.gctl_unit = -2; } while (0) +#define ISSYNCREQDONE(hio) ((hio)->hio_ggio.gctl_unit == -2) + +static struct hast_resource *gres; + +static pthread_mutex_t range_lock; +static struct rangelocks *range_regular; +static bool range_regular_wait; +static pthread_cond_t range_regular_cond; +static struct rangelocks *range_sync; +static bool range_sync_wait; +static pthread_cond_t range_sync_cond; + +static void *ggate_recv_thread(void *arg); +static void *local_send_thread(void *arg); +static void *remote_send_thread(void *arg); +static void *remote_recv_thread(void *arg); +static void *ggate_send_thread(void *arg); +static void *sync_thread(void *arg); +static void *guard_thread(void *arg); + +static void sighandler(int sig); + +static void +cleanup(struct hast_resource *res) +{ + int rerrno; + + /* Remember errno. */ + rerrno = errno; + + /* + * Close descriptor to /dev/hast/<name> + * to work-around race in the kernel. + */ + close(res->hr_localfd); + + /* Destroy ggate provider if we created one. */ + if (res->hr_ggateunit >= 0) { + struct g_gate_ctl_destroy ggiod; + + ggiod.gctl_version = G_GATE_VERSION; + ggiod.gctl_unit = res->hr_ggateunit; + ggiod.gctl_force = 1; + if (ioctl(res->hr_ggatefd, G_GATE_CMD_DESTROY, &ggiod) < 0) { + pjdlog_warning("Unable to destroy hast/%s device", + res->hr_provname); + } + res->hr_ggateunit = -1; + } + + /* Restore errno. */ + errno = rerrno; +} + +static void +primary_exit(int exitcode, const char *fmt, ...) 
+{ + va_list ap; + + assert(exitcode != EX_OK); + va_start(ap, fmt); + pjdlogv_errno(LOG_ERR, fmt, ap); + va_end(ap); + cleanup(gres); + exit(exitcode); +} + +static void +primary_exitx(int exitcode, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + pjdlogv(exitcode == EX_OK ? LOG_INFO : LOG_ERR, fmt, ap); + va_end(ap); + cleanup(gres); + exit(exitcode); +} + +static int +hast_activemap_flush(struct hast_resource *res) +{ + const unsigned char *buf; + size_t size; + + buf = activemap_bitmap(res->hr_amp, &size); + assert(buf != NULL); + assert((size % res->hr_local_sectorsize) == 0); + if (pwrite(res->hr_localfd, buf, size, METADATA_SIZE) != + (ssize_t)size) { + KEEP_ERRNO(pjdlog_errno(LOG_ERR, + "Unable to flush activemap to disk")); + return (-1); + } + return (0); +} + +static void +init_environment(struct hast_resource *res __unused) +{ + struct hio *hio; + unsigned int ii, ncomps; + + /* + * In the future it might be per-resource value. + */ + ncomps = HAST_NCOMPONENTS; + + /* + * Allocate memory needed by lists. 
+ */ + hio_send_list = malloc(sizeof(hio_send_list[0]) * ncomps); + if (hio_send_list == NULL) { + primary_exitx(EX_TEMPFAIL, + "Unable to allocate %zu bytes of memory for send lists.", + sizeof(hio_send_list[0]) * ncomps); + } + hio_send_list_lock = malloc(sizeof(hio_send_list_lock[0]) * ncomps); + if (hio_send_list_lock == NULL) { + primary_exitx(EX_TEMPFAIL, + "Unable to allocate %zu bytes of memory for send list locks.", + sizeof(hio_send_list_lock[0]) * ncomps); + } + hio_send_list_cond = malloc(sizeof(hio_send_list_cond[0]) * ncomps); + if (hio_send_list_cond == NULL) { + primary_exitx(EX_TEMPFAIL, + "Unable to allocate %zu bytes of memory for send list condition variables.", + sizeof(hio_send_list_cond[0]) * ncomps); + } + hio_recv_list = malloc(sizeof(hio_recv_list[0]) * ncomps); + if (hio_recv_list == NULL) { + primary_exitx(EX_TEMPFAIL, + "Unable to allocate %zu bytes of memory for recv lists.", + sizeof(hio_recv_list[0]) * ncomps); + } + hio_recv_list_lock = malloc(sizeof(hio_recv_list_lock[0]) * ncomps); + if (hio_recv_list_lock == NULL) { + primary_exitx(EX_TEMPFAIL, + "Unable to allocate %zu bytes of memory for recv list locks.", + sizeof(hio_recv_list_lock[0]) * ncomps); + } + hio_recv_list_cond = malloc(sizeof(hio_recv_list_cond[0]) * ncomps); + if (hio_recv_list_cond == NULL) { + primary_exitx(EX_TEMPFAIL, + "Unable to allocate %zu bytes of memory for recv list condition variables.", + sizeof(hio_recv_list_cond[0]) * ncomps); + } + hio_remote_lock = malloc(sizeof(hio_remote_lock[0]) * ncomps); + if (hio_remote_lock == NULL) { + primary_exitx(EX_TEMPFAIL, + "Unable to allocate %zu bytes of memory for remote connections locks.", + sizeof(hio_remote_lock[0]) * ncomps); + } + + /* + * Initialize lists, their locks and theirs condition variables. 
+ */ + TAILQ_INIT(&hio_free_list); + mtx_init(&hio_free_list_lock); + cv_init(&hio_free_list_cond); + for (ii = 0; ii < HAST_NCOMPONENTS; ii++) { + TAILQ_INIT(&hio_send_list[ii]); + mtx_init(&hio_send_list_lock[ii]); + cv_init(&hio_send_list_cond[ii]); + TAILQ_INIT(&hio_recv_list[ii]); + mtx_init(&hio_recv_list_lock[ii]); + cv_init(&hio_recv_list_cond[ii]); + rw_init(&hio_remote_lock[ii]); + } + TAILQ_INIT(&hio_done_list); + mtx_init(&hio_done_list_lock); + cv_init(&hio_done_list_cond); + mtx_init(&hio_guard_lock); + cv_init(&hio_guard_cond); + mtx_init(&metadata_lock); + + /* + * Allocate requests pool and initialize requests. + */ + for (ii = 0; ii < HAST_HIO_MAX; ii++) { + hio = malloc(sizeof(*hio)); + if (hio == NULL) { + primary_exitx(EX_TEMPFAIL, + "Unable to allocate %zu bytes of memory for hio request.", + sizeof(*hio)); + } + hio->hio_countdown = 0; + hio->hio_errors = malloc(sizeof(hio->hio_errors[0]) * ncomps); + if (hio->hio_errors == NULL) { + primary_exitx(EX_TEMPFAIL, + "Unable allocate %zu bytes of memory for hio errors.", + sizeof(hio->hio_errors[0]) * ncomps); + } + hio->hio_next = malloc(sizeof(hio->hio_next[0]) * ncomps); + if (hio->hio_next == NULL) { + primary_exitx(EX_TEMPFAIL, + "Unable allocate %zu bytes of memory for hio_next field.", + sizeof(hio->hio_next[0]) * ncomps); + } + hio->hio_ggio.gctl_version = G_GATE_VERSION; + hio->hio_ggio.gctl_data = malloc(MAXPHYS); + if (hio->hio_ggio.gctl_data == NULL) { + primary_exitx(EX_TEMPFAIL, + "Unable to allocate %zu bytes of memory for gctl_data.", + MAXPHYS); + } + hio->hio_ggio.gctl_length = MAXPHYS; + hio->hio_ggio.gctl_error = 0; + TAILQ_INSERT_HEAD(&hio_free_list, hio, hio_free_next); + } + + /* + * Turn on signals handling. 
+ */ + signal(SIGINT, sighandler); + signal(SIGTERM, sighandler); +} + +static void +init_local(struct hast_resource *res) +{ + unsigned char *buf; + size_t mapsize; + + if (metadata_read(res, true) < 0) + exit(EX_NOINPUT); + mtx_init(&res->hr_amp_lock); + if (activemap_init(&res->hr_amp, res->hr_datasize, res->hr_extentsize, + res->hr_local_sectorsize, res->hr_keepdirty) < 0) { + primary_exit(EX_TEMPFAIL, "Unable to create activemap"); + } + mtx_init(&range_lock); + cv_init(&range_regular_cond); + if (rangelock_init(&range_regular) < 0) + primary_exit(EX_TEMPFAIL, "Unable to create regular range lock"); + cv_init(&range_sync_cond); + if (rangelock_init(&range_sync) < 0) + primary_exit(EX_TEMPFAIL, "Unable to create sync range lock"); + mapsize = activemap_ondisk_size(res->hr_amp); + buf = calloc(1, mapsize); + if (buf == NULL) { + primary_exitx(EX_TEMPFAIL, + "Unable to allocate buffer for activemap."); + } + if (pread(res->hr_localfd, buf, mapsize, METADATA_SIZE) != + (ssize_t)mapsize) { + primary_exit(EX_NOINPUT, "Unable to read activemap"); + } + activemap_copyin(res->hr_amp, buf, mapsize); + if (res->hr_resuid != 0) + return; + /* + * We're using provider for the first time, so we have to generate + * resource unique identifier and initialize local and remote counts. + */ + arc4random_buf(&res->hr_resuid, sizeof(res->hr_resuid)); + res->hr_primary_localcnt = 1; + res->hr_primary_remotecnt = 0; + if (metadata_write(res) < 0) + exit(EX_NOINPUT); +} + +static void +init_remote(struct hast_resource *res) +{ + struct nv *nvout, *nvin; + const unsigned char *token; + unsigned char *map; + const char *errmsg; + int32_t extentsize; + int64_t datasize; + uint32_t mapsize; + size_t size; + + /* Prepare outgoing connection with remote node. */ + if (proto_client(res->hr_remoteaddr, &res->hr_remoteout) < 0) { + primary_exit(EX_OSERR, "Unable to create connection to %s", + res->hr_remoteaddr); + } + /* Try to connect, but accept failure. 
*/ + if (proto_connect(res->hr_remoteout) < 0) { + pjdlog_errno(LOG_WARNING, "Unable to connect to %s", + res->hr_remoteaddr); + goto close; + } + /* + * First handshake step. + * Setup outgoing connection with remote node. + */ + nvout = nv_alloc(); + nv_add_string(nvout, res->hr_name, "resource"); + if (nv_error(nvout) != 0) { + pjdlog_common(LOG_WARNING, 0, nv_error(nvout), + "Unable to allocate header for connection with %s", + res->hr_remoteaddr); + nv_free(nvout); + goto close; + } + if (hast_proto_send(res, res->hr_remoteout, nvout, NULL, 0) < 0) { + pjdlog_errno(LOG_WARNING, + "Unable to send handshake header to %s", + res->hr_remoteaddr); + nv_free(nvout); + goto close; + } + nv_free(nvout); + if (hast_proto_recv_hdr(res->hr_remoteout, &nvin) < 0) { + pjdlog_errno(LOG_WARNING, + "Unable to receive handshake header from %s", + res->hr_remoteaddr); + goto close; + } + errmsg = nv_get_string(nvin, "errmsg"); + if (errmsg != NULL) { + pjdlog_warning("%s", errmsg); + nv_free(nvin); + goto close; + } + token = nv_get_uint8_array(nvin, &size, "token"); + if (token == NULL) { + pjdlog_warning("Handshake header from %s has no 'token' field.", + res->hr_remoteaddr); + nv_free(nvin); + goto close; + } + if (size != sizeof(res->hr_token)) { + pjdlog_warning("Handshake header from %s contains 'token' of wrong size (got %zu, expected %zu).", + res->hr_remoteaddr, size, sizeof(res->hr_token)); + nv_free(nvin); + goto close; + } + bcopy(token, res->hr_token, sizeof(res->hr_token)); + nv_free(nvin); + + /* + * Second handshake step. + * Setup incoming connection with remote node. + */ + if (proto_client(res->hr_remoteaddr, &res->hr_remotein) < 0) { + pjdlog_errno(LOG_WARNING, "Unable to create connection to %s", + res->hr_remoteaddr); + } + /* Try to connect, but accept failure. 
*/ + if (proto_connect(res->hr_remotein) < 0) { + pjdlog_errno(LOG_WARNING, "Unable to connect to %s", + res->hr_remoteaddr); + goto close; + } + nvout = nv_alloc(); + nv_add_string(nvout, res->hr_name, "resource"); + nv_add_uint8_array(nvout, res->hr_token, sizeof(res->hr_token), + "token"); + nv_add_uint64(nvout, res->hr_resuid, "resuid"); + nv_add_uint64(nvout, res->hr_primary_localcnt, "localcnt"); + nv_add_uint64(nvout, res->hr_primary_remotecnt, "remotecnt"); + if (nv_error(nvout) != 0) { + pjdlog_common(LOG_WARNING, 0, nv_error(nvout), + "Unable to allocate header for connection with %s", + res->hr_remoteaddr); + nv_free(nvout); + goto close; + } + if (hast_proto_send(res, res->hr_remotein, nvout, NULL, 0) < 0) { + pjdlog_errno(LOG_WARNING, + "Unable to send handshake header to %s", + res->hr_remoteaddr); + nv_free(nvout); + goto close; + } + nv_free(nvout); + if (hast_proto_recv_hdr(res->hr_remoteout, &nvin) < 0) { + pjdlog_errno(LOG_WARNING, + "Unable to receive handshake header from %s", + res->hr_remoteaddr); + goto close; + } + errmsg = nv_get_string(nvin, "errmsg"); + if (errmsg != NULL) { + pjdlog_warning("%s", errmsg); + nv_free(nvin); + goto close; + } + datasize = nv_get_int64(nvin, "datasize"); + if (datasize != res->hr_datasize) { + pjdlog_warning("Data size differs between nodes (local=%jd, remote=%jd).", + (intmax_t)res->hr_datasize, (intmax_t)datasize); + nv_free(nvin); + goto close; + } + extentsize = nv_get_int32(nvin, "extentsize"); + if (extentsize != res->hr_extentsize) { + pjdlog_warning("Extent size differs between nodes (local=%zd, remote=%zd).", + (ssize_t)res->hr_extentsize, (ssize_t)extentsize); + nv_free(nvin); + goto close; + } + res->hr_secondary_localcnt = nv_get_uint64(nvin, "localcnt"); + res->hr_secondary_remotecnt = nv_get_uint64(nvin, "remotecnt"); + res->hr_syncsrc = nv_get_uint8(nvin, "syncsrc"); + map = NULL; + mapsize = nv_get_uint32(nvin, "mapsize"); + if (mapsize > 0) { + map = malloc(mapsize); + if (map == NULL) { + 
pjdlog_error("Unable to allocate memory for remote activemap (mapsize=%ju).", + (uintmax_t)mapsize); + nv_free(nvin); + goto close; + } + /* + * Remote node have some dirty extents on its own, lets + * download its activemap. + */ + if (hast_proto_recv_data(res, res->hr_remoteout, nvin, map, + mapsize) < 0) { + pjdlog_errno(LOG_ERR, + "Unable to receive remote activemap"); + nv_free(nvin); + free(map); + goto close; + } + /* + * Merge local and remote bitmaps. + */ + activemap_merge(res->hr_amp, map, mapsize); + free(map); + /* + * Now that we merged bitmaps from both nodes, flush it to the + * disk before we start to synchronize. + */ + (void)hast_activemap_flush(res); + } + pjdlog_info("Connected to %s.", res->hr_remoteaddr); + mtx_lock(&sync_lock); + sync_inprogress = true; + mtx_unlock(&sync_lock); + cv_signal(&sync_cond); + return; +close: + proto_close(res->hr_remoteout); + res->hr_remoteout = NULL; + if (res->hr_remotein != NULL) { + proto_close(res->hr_remotein); + res->hr_remotein = NULL; + } +} + +static void +init_ggate(struct hast_resource *res) +{ + struct g_gate_ctl_create ggiocreate; + struct g_gate_ctl_cancel ggiocancel; + + /* + * We communicate with ggate via /dev/ggctl. Open it. + */ + res->hr_ggatefd = open("/dev/" G_GATE_CTL_NAME, O_RDWR); + if (res->hr_ggatefd < 0) + primary_exit(EX_OSFILE, "Unable to open /dev/" G_GATE_CTL_NAME); + /* + * Create provider before trying to connect, as connection failure + * is not critical, but may take some time. 
+ */ + ggiocreate.gctl_version = G_GATE_VERSION; + ggiocreate.gctl_mediasize = res->hr_datasize; + ggiocreate.gctl_sectorsize = res->hr_local_sectorsize; + ggiocreate.gctl_flags = 0; + ggiocreate.gctl_maxcount = 128; + ggiocreate.gctl_timeout = 0; + ggiocreate.gctl_unit = G_GATE_NAME_GIVEN; + snprintf(ggiocreate.gctl_name, sizeof(ggiocreate.gctl_name), "hast/%s", + res->hr_provname); + bzero(ggiocreate.gctl_info, sizeof(ggiocreate.gctl_info)); + if (ioctl(res->hr_ggatefd, G_GATE_CMD_CREATE, &ggiocreate) == 0) { + pjdlog_info("Device hast/%s created.", res->hr_provname); + res->hr_ggateunit = ggiocreate.gctl_unit; + return; + } + if (errno != EEXIST) { + primary_exit(EX_OSERR, "Unable to create hast/%s device", + res->hr_provname); + } + pjdlog_debug(1, + "Device hast/%s already exists, we will try to take it over.", + res->hr_provname); + /* + * If we received EEXIST, we assume that the process who created the + * provider died and didn't clean up. In that case we will start from + * where he left of. + */ + ggiocancel.gctl_version = G_GATE_VERSION; + ggiocancel.gctl_unit = G_GATE_NAME_GIVEN; + snprintf(ggiocancel.gctl_name, sizeof(ggiocancel.gctl_name), "hast/%s", + res->hr_provname); + if (ioctl(res->hr_ggatefd, G_GATE_CMD_CANCEL, &ggiocancel) == 0) { + pjdlog_info("Device hast/%s recovered.", res->hr_provname); + res->hr_ggateunit = ggiocancel.gctl_unit; + return; + } + primary_exit(EX_OSERR, "Unable to take over hast/%s device", + res->hr_provname); +} + +void +hastd_primary(struct hast_resource *res) +{ + pthread_t td; + pid_t pid; + int error; + + gres = res; + + /* + * Create communication channel between parent and child. 
+ */ + if (proto_client("socketpair://", &res->hr_ctrl) < 0) { + KEEP_ERRNO((void)pidfile_remove(pfh)); + primary_exit(EX_OSERR, + "Unable to create control sockets between parent and child"); + } + + pid = fork(); + if (pid < 0) { + KEEP_ERRNO((void)pidfile_remove(pfh)); + primary_exit(EX_OSERR, "Unable to fork"); + } + + if (pid > 0) { + /* This is parent. */ + res->hr_workerpid = pid; + return; + } + (void)pidfile_close(pfh); + + setproctitle("%s (primary)", res->hr_name); + + init_local(res); + init_remote(res); + init_ggate(res); + init_environment(res); + error = pthread_create(&td, NULL, ggate_recv_thread, res); + assert(error == 0); + error = pthread_create(&td, NULL, local_send_thread, res); + assert(error == 0); + error = pthread_create(&td, NULL, remote_send_thread, res); + assert(error == 0); + error = pthread_create(&td, NULL, remote_recv_thread, res); + assert(error == 0); + error = pthread_create(&td, NULL, ggate_send_thread, res); + assert(error == 0); + error = pthread_create(&td, NULL, sync_thread, res); + assert(error == 0); + error = pthread_create(&td, NULL, ctrl_thread, res); + assert(error == 0); + (void)guard_thread(res); +} + +static void +reqlog(int loglevel, int debuglevel, struct g_gate_ctl_io *ggio, const char *fmt, ...) 
+{ + char msg[1024]; + va_list ap; + int len; + + va_start(ap, fmt); + len = vsnprintf(msg, sizeof(msg), fmt, ap); + va_end(ap); + if ((size_t)len < sizeof(msg)) { + switch (ggio->gctl_cmd) { + case BIO_READ: + (void)snprintf(msg + len, sizeof(msg) - len, + "READ(%ju, %ju).", (uintmax_t)ggio->gctl_offset, + (uintmax_t)ggio->gctl_length); + break; + case BIO_DELETE: + (void)snprintf(msg + len, sizeof(msg) - len, + "DELETE(%ju, %ju).", (uintmax_t)ggio->gctl_offset, + (uintmax_t)ggio->gctl_length); + break; + case BIO_FLUSH: + (void)snprintf(msg + len, sizeof(msg) - len, "FLUSH."); + break; + case BIO_WRITE: + (void)snprintf(msg + len, sizeof(msg) - len, + "WRITE(%ju, %ju).", (uintmax_t)ggio->gctl_offset, + (uintmax_t)ggio->gctl_length); + break; + default: + (void)snprintf(msg + len, sizeof(msg) - len, + "UNKNOWN(%u).", (unsigned int)ggio->gctl_cmd); + break; + } + } + pjdlog_common(loglevel, debuglevel, -1, "%s", msg); +} + +static void +remote_close(struct hast_resource *res, int ncomp) +{ + + rw_wlock(&hio_remote_lock[ncomp]); + /* + * A race is possible between dropping rlock and acquiring wlock - + * another thread can close connection in-between. + */ + if (!ISCONNECTED(res, ncomp)) { + assert(res->hr_remotein == NULL); + assert(res->hr_remoteout == NULL); + rw_unlock(&hio_remote_lock[ncomp]); + return; + } + + assert(res->hr_remotein != NULL); + assert(res->hr_remoteout != NULL); + + pjdlog_debug(2, "Closing old incoming connection to %s.", + res->hr_remoteaddr); + proto_close(res->hr_remotein); + res->hr_remotein = NULL; + pjdlog_debug(2, "Closing old outgoing connection to %s.", + res->hr_remoteaddr); + proto_close(res->hr_remoteout); + res->hr_remoteout = NULL; + + rw_unlock(&hio_remote_lock[ncomp]); + + /* + * Stop synchronization if in-progress. + */ + mtx_lock(&sync_lock); + if (sync_inprogress) + sync_inprogress = false; + mtx_unlock(&sync_lock); + + /* + * Wake up guard thread, so it can immediately start reconnect. 
+ */ + mtx_lock(&hio_guard_lock); + cv_signal(&hio_guard_cond); + mtx_unlock(&hio_guard_lock); +} + +/* + * Thread receives ggate I/O requests from the kernel and passes them to + * appropriate threads: + * WRITE - always goes to both local_send and remote_send threads + * READ (when the block is up-to-date on local component) - + * only local_send thread + * READ (when the block isn't up-to-date on local component) - + * only remote_send thread + * DELETE - always goes to both local_send and remote_send threads + * FLUSH - always goes to both local_send and remote_send threads + */ +static void * +ggate_recv_thread(void *arg) +{ + struct hast_resource *res = arg; + struct g_gate_ctl_io *ggio; + struct hio *hio; + unsigned int ii, ncomp, ncomps; + int error; + + ncomps = HAST_NCOMPONENTS; + + for (;;) { + pjdlog_debug(2, "ggate_recv: Taking free request."); + QUEUE_TAKE2(hio, free); + pjdlog_debug(2, "ggate_recv: (%p) Got free request.", hio); + ggio = &hio->hio_ggio; + ggio->gctl_unit = res->hr_ggateunit; + ggio->gctl_length = MAXPHYS; + ggio->gctl_error = 0; + pjdlog_debug(2, + "ggate_recv: (%p) Waiting for request from the kernel.", + hio); + if (ioctl(res->hr_ggatefd, G_GATE_CMD_START, ggio) < 0) { + if (sigexit_received) + pthread_exit(NULL); + primary_exit(EX_OSERR, "G_GATE_CMD_START failed"); + } + error = ggio->gctl_error; + switch (error) { + case 0: + break; + case ECANCELED: + /* Exit gracefully. */ + if (!sigexit_received) { + pjdlog_debug(2, + "ggate_recv: (%p) Received cancel from the kernel.", + hio); + pjdlog_info("Received cancel from the kernel, exiting."); + } + pthread_exit(NULL); + case ENOMEM: + /* + * Buffer too small? Impossible, we allocate MAXPHYS + * bytes - request can't be bigger than that. 
+ */ + /* FALLTHROUGH */ + case ENXIO: + default: + primary_exitx(EX_OSERR, "G_GATE_CMD_START failed: %s.", + strerror(error)); + } + for (ii = 0; ii < ncomps; ii++) + hio->hio_errors[ii] = EINVAL; + reqlog(LOG_DEBUG, 2, ggio, + "ggate_recv: (%p) Request received from the kernel: ", + hio); + /* + * Inform all components about new write request. + * For read request prefer local component unless the given + * range is out-of-date, then use remote component. + */ + switch (ggio->gctl_cmd) { + case BIO_READ: + pjdlog_debug(2, + "ggate_recv: (%p) Moving request to the send queue.", + hio); + refcount_init(&hio->hio_countdown, 1); + mtx_lock(&metadata_lock); + if (res->hr_syncsrc == HAST_SYNCSRC_UNDEF || + res->hr_syncsrc == HAST_SYNCSRC_PRIMARY) { + /* + * This range is up-to-date on local component, + * so handle request locally. + */ + /* Local component is 0 for now. */ + ncomp = 0; + } else /* if (res->hr_syncsrc == + HAST_SYNCSRC_SECONDARY) */ { + assert(res->hr_syncsrc == + HAST_SYNCSRC_SECONDARY); + /* + * This range is out-of-date on local component, + * so send request to the remote node. + */ + /* Remote component is 1 for now. 
*/ + ncomp = 1; + } + mtx_unlock(&metadata_lock); + QUEUE_INSERT1(hio, send, ncomp); + break; + case BIO_WRITE: + for (;;) { + mtx_lock(&range_lock); + if (rangelock_islocked(range_sync, + ggio->gctl_offset, ggio->gctl_length)) { + pjdlog_debug(2, + "regular: Range offset=%jd length=%zu locked.", + (intmax_t)ggio->gctl_offset, + (size_t)ggio->gctl_length); + range_regular_wait = true; + cv_wait(&range_regular_cond, &range_lock); + range_regular_wait = false; + mtx_unlock(&range_lock); + continue; + } + if (rangelock_add(range_regular, + ggio->gctl_offset, ggio->gctl_length) < 0) { + mtx_unlock(&range_lock); + pjdlog_debug(2, + "regular: Range offset=%jd length=%zu is already locked, waiting.", + (intmax_t)ggio->gctl_offset, + (size_t)ggio->gctl_length); + sleep(1); + continue; + } + mtx_unlock(&range_lock); + break; + } + mtx_lock(&res->hr_amp_lock); + if (activemap_write_start(res->hr_amp, + ggio->gctl_offset, ggio->gctl_length)) { + (void)hast_activemap_flush(res); + } + mtx_unlock(&res->hr_amp_lock); + /* FALLTHROUGH */ + case BIO_DELETE: + case BIO_FLUSH: + pjdlog_debug(2, + "ggate_recv: (%p) Moving request to the send queues.", + hio); + refcount_init(&hio->hio_countdown, ncomps); + for (ii = 0; ii < ncomps; ii++) + QUEUE_INSERT1(hio, send, ii); + break; + } + } + /* NOTREACHED */ + return (NULL); +} + +/* + * Thread reads from or writes to local component. + * If local read fails, it redirects it to remote_send thread. + */ +static void * +local_send_thread(void *arg) +{ + struct hast_resource *res = arg; + struct g_gate_ctl_io *ggio; + struct hio *hio; + unsigned int ncomp, rncomp; + ssize_t ret; + + /* Local component is 0 for now. */ + ncomp = 0; + /* Remote component is 1 for now. 
*/ + rncomp = 1; + + for (;;) { + pjdlog_debug(2, "local_send: Taking request."); + QUEUE_TAKE1(hio, send, ncomp); + pjdlog_debug(2, "local_send: (%p) Got request.", hio); + ggio = &hio->hio_ggio; + switch (ggio->gctl_cmd) { + case BIO_READ: + ret = pread(res->hr_localfd, ggio->gctl_data, + ggio->gctl_length, + ggio->gctl_offset + res->hr_localoff); + if (ret == ggio->gctl_length) + hio->hio_errors[ncomp] = 0; + else { + /* + * If READ failed, try to read from remote node. + */ + QUEUE_INSERT1(hio, send, rncomp); + continue; + } + break; + case BIO_WRITE: + ret = pwrite(res->hr_localfd, ggio->gctl_data, + ggio->gctl_length, + ggio->gctl_offset + res->hr_localoff); + if (ret < 0) + hio->hio_errors[ncomp] = errno; + else if (ret != ggio->gctl_length) + hio->hio_errors[ncomp] = EIO; + else + hio->hio_errors[ncomp] = 0; + break; + case BIO_DELETE: + ret = g_delete(res->hr_localfd, + ggio->gctl_offset + res->hr_localoff, + ggio->gctl_length); + if (ret < 0) + hio->hio_errors[ncomp] = errno; + else + hio->hio_errors[ncomp] = 0; + break; + case BIO_FLUSH: + ret = g_flush(res->hr_localfd); + if (ret < 0) + hio->hio_errors[ncomp] = errno; + else + hio->hio_errors[ncomp] = 0; + break; + } + if (refcount_release(&hio->hio_countdown)) { + if (ISSYNCREQ(hio)) { + mtx_lock(&sync_lock); + SYNCREQDONE(hio); + mtx_unlock(&sync_lock); + cv_signal(&sync_cond); + } else { + pjdlog_debug(2, + "local_send: (%p) Moving request to the done queue.", + hio); + QUEUE_INSERT2(hio, done); + } + } + } + /* NOTREACHED */ + return (NULL); +} + +/* + * Thread sends request to secondary node. + */ +static void * +remote_send_thread(void *arg) +{ + struct hast_resource *res = arg; + struct g_gate_ctl_io *ggio; + struct hio *hio; + struct nv *nv; + unsigned int ncomp; + bool wakeup; + uint64_t offset, length; + uint8_t cmd; + void *data; + + /* Remote component is 1 for now. 
*/ + ncomp = 1; + + for (;;) { + pjdlog_debug(2, "remote_send: Taking request."); + QUEUE_TAKE1(hio, send, ncomp); + pjdlog_debug(2, "remote_send: (%p) Got request.", hio); + ggio = &hio->hio_ggio; + switch (ggio->gctl_cmd) { + case BIO_READ: + cmd = HIO_READ; + data = NULL; + offset = ggio->gctl_offset; + length = ggio->gctl_length; + break; + case BIO_WRITE: + cmd = HIO_WRITE; + data = ggio->gctl_data; + offset = ggio->gctl_offset; + length = ggio->gctl_length; + break; + case BIO_DELETE: + cmd = HIO_DELETE; + data = NULL; + offset = ggio->gctl_offset; + length = ggio->gctl_length; + break; + case BIO_FLUSH: + cmd = HIO_FLUSH; + data = NULL; + offset = 0; + length = 0; + break; + default: + assert(!"invalid condition"); + abort(); + } + nv = nv_alloc(); + nv_add_uint8(nv, cmd, "cmd"); + nv_add_uint64(nv, (uint64_t)ggio->gctl_seq, "seq"); + nv_add_uint64(nv, offset, "offset"); + nv_add_uint64(nv, length, "length"); + if (nv_error(nv) != 0) { + hio->hio_errors[ncomp] = nv_error(nv); + pjdlog_debug(2, + "remote_send: (%p) Unable to prepare header to send.", + hio); + reqlog(LOG_ERR, 0, ggio, + "Unable to prepare header to send (%s): ", + strerror(nv_error(nv))); + /* Move failed request immediately to the done queue. */ + goto done_queue; + } + pjdlog_debug(2, + "remote_send: (%p) Moving request to the recv queue.", + hio); + /* + * Protect connection from disappearing. + */ + rw_rlock(&hio_remote_lock[ncomp]); + if (!ISCONNECTED(res, ncomp)) { + rw_unlock(&hio_remote_lock[ncomp]); + hio->hio_errors[ncomp] = ENOTCONN; + goto done_queue; + } + /* + * Move the request to recv queue before sending it, because + * in different order we can get reply before we move request + * to recv queue. + */ + mtx_lock(&hio_recv_list_lock[ncomp]); + wakeup = TAILQ_EMPTY(&hio_recv_list[ncomp]); + TAILQ_INSERT_TAIL(&hio_recv_list[ncomp], hio, hio_next[ncomp]); + mtx_unlock(&hio_recv_list_lock[ncomp]); + if (hast_proto_send(res, res->hr_remoteout, nv, data, + data != NULL ? 
length : 0) < 0) { + hio->hio_errors[ncomp] = errno; + rw_unlock(&hio_remote_lock[ncomp]); + remote_close(res, ncomp); + pjdlog_debug(2, + "remote_send: (%p) Unable to send request.", hio); + reqlog(LOG_ERR, 0, ggio, + "Unable to send request (%s): ", + strerror(hio->hio_errors[ncomp])); + /* + * Take request back from the receive queue and move + * it immediately to the done queue. + */ + mtx_lock(&hio_recv_list_lock[ncomp]); + TAILQ_REMOVE(&hio_recv_list[ncomp], hio, hio_next[ncomp]); + mtx_unlock(&hio_recv_list_lock[ncomp]); + goto done_queue; + } + rw_unlock(&hio_remote_lock[ncomp]); + nv_free(nv); + if (wakeup) + cv_signal(&hio_recv_list_cond[ncomp]); + continue; +done_queue: + nv_free(nv); + if (ISSYNCREQ(hio)) { + if (!refcount_release(&hio->hio_countdown)) + continue; + mtx_lock(&sync_lock); + SYNCREQDONE(hio); + mtx_unlock(&sync_lock); + cv_signal(&sync_cond); + continue; + } + if (ggio->gctl_cmd == BIO_WRITE) { + mtx_lock(&res->hr_amp_lock); + if (activemap_need_sync(res->hr_amp, ggio->gctl_offset, + ggio->gctl_length)) { + (void)hast_activemap_flush(res); + } + mtx_unlock(&res->hr_amp_lock); + } + if (!refcount_release(&hio->hio_countdown)) + continue; + pjdlog_debug(2, + "remote_send: (%p) Moving request to the done queue.", + hio); + QUEUE_INSERT2(hio, done); + } + /* NOTREACHED */ + return (NULL); +} + +/* + * Thread receives answer from secondary node and passes it to ggate_send + * thread. + */ +static void * +remote_recv_thread(void *arg) +{ + struct hast_resource *res = arg; + struct g_gate_ctl_io *ggio; + struct hio *hio; + struct nv *nv; + unsigned int ncomp; + uint64_t seq; + int error; + + /* Remote component is 1 for now. */ + ncomp = 1; + + for (;;) { + /* Wait until there is anything to receive. 
*/ + mtx_lock(&hio_recv_list_lock[ncomp]); + while (TAILQ_EMPTY(&hio_recv_list[ncomp])) { + pjdlog_debug(2, "remote_recv: No requests, waiting."); + cv_wait(&hio_recv_list_cond[ncomp], + &hio_recv_list_lock[ncomp]); + } + mtx_unlock(&hio_recv_list_lock[ncomp]); + rw_rlock(&hio_remote_lock[ncomp]); + if (!ISCONNECTED(res, ncomp)) { + rw_unlock(&hio_remote_lock[ncomp]); + /* + * Connection is dead, so move all pending requests to + * the done queue (one-by-one). + */ + mtx_lock(&hio_recv_list_lock[ncomp]); + hio = TAILQ_FIRST(&hio_recv_list[ncomp]); + assert(hio != NULL); + TAILQ_REMOVE(&hio_recv_list[ncomp], hio, + hio_next[ncomp]); + mtx_unlock(&hio_recv_list_lock[ncomp]); + goto done_queue; + } + if (hast_proto_recv_hdr(res->hr_remotein, &nv) < 0) { + pjdlog_errno(LOG_ERR, + "Unable to receive reply header"); + rw_unlock(&hio_remote_lock[ncomp]); + remote_close(res, ncomp); + continue; + } + rw_unlock(&hio_remote_lock[ncomp]); + seq = nv_get_uint64(nv, "seq"); + if (seq == 0) { + pjdlog_error("Header contains no 'seq' field."); + nv_free(nv); + continue; + } + mtx_lock(&hio_recv_list_lock[ncomp]); + TAILQ_FOREACH(hio, &hio_recv_list[ncomp], hio_next[ncomp]) { + if (hio->hio_ggio.gctl_seq == seq) { + TAILQ_REMOVE(&hio_recv_list[ncomp], hio, + hio_next[ncomp]); + break; + } + } + mtx_unlock(&hio_recv_list_lock[ncomp]); + if (hio == NULL) { + pjdlog_error("Found no request matching received 'seq' field (%ju).", + (uintmax_t)seq); + nv_free(nv); + continue; + } + error = nv_get_int16(nv, "error"); + if (error != 0) { + /* Request failed on remote side. 
*/ + hio->hio_errors[ncomp] = 0; + nv_free(nv); + goto done_queue; + } + ggio = &hio->hio_ggio; + switch (ggio->gctl_cmd) { + case BIO_READ: + rw_rlock(&hio_remote_lock[ncomp]); + if (!ISCONNECTED(res, ncomp)) { + rw_unlock(&hio_remote_lock[ncomp]); + nv_free(nv); + goto done_queue; + } + if (hast_proto_recv_data(res, res->hr_remotein, nv, + ggio->gctl_data, ggio->gctl_length) < 0) { + hio->hio_errors[ncomp] = errno; + pjdlog_errno(LOG_ERR, + "Unable to receive reply data"); + rw_unlock(&hio_remote_lock[ncomp]); + nv_free(nv); + remote_close(res, ncomp); + goto done_queue; + } + rw_unlock(&hio_remote_lock[ncomp]); + break; + case BIO_WRITE: + case BIO_DELETE: + case BIO_FLUSH: + break; + default: + assert(!"invalid condition"); + abort(); + } + hio->hio_errors[ncomp] = 0; + nv_free(nv); +done_queue: + if (refcount_release(&hio->hio_countdown)) { + if (ISSYNCREQ(hio)) { + mtx_lock(&sync_lock); + SYNCREQDONE(hio); + mtx_unlock(&sync_lock); + cv_signal(&sync_cond); + } else { + pjdlog_debug(2, + "remote_recv: (%p) Moving request to the done queue.", + hio); + QUEUE_INSERT2(hio, done); + } + } + } + /* NOTREACHED */ + return (NULL); +} + +/* + * Thread sends answer to the kernel. + */ +static void * +ggate_send_thread(void *arg) +{ + struct hast_resource *res = arg; + struct g_gate_ctl_io *ggio; + struct hio *hio; + unsigned int ii, ncomp, ncomps; + + ncomps = HAST_NCOMPONENTS; + + for (;;) { + pjdlog_debug(2, "ggate_send: Taking request."); + QUEUE_TAKE2(hio, done); + pjdlog_debug(2, "ggate_send: (%p) Got request.", hio); + ggio = &hio->hio_ggio; + for (ii = 0; ii < ncomps; ii++) { + if (hio->hio_errors[ii] == 0) { + /* + * One successful request is enough to declare + * success. + */ + ggio->gctl_error = 0; + break; + } + } + if (ii == ncomps) { + /* + * None of the requests were successful. + * Use first error. 
+ */ + ggio->gctl_error = hio->hio_errors[0]; + } + if (ggio->gctl_error == 0 && ggio->gctl_cmd == BIO_WRITE) { + mtx_lock(&res->hr_amp_lock); + activemap_write_complete(res->hr_amp, + ggio->gctl_offset, ggio->gctl_length); + mtx_unlock(&res->hr_amp_lock); + } + if (ggio->gctl_cmd == BIO_WRITE) { + /* + * Unlock range we locked. + */ + mtx_lock(&range_lock); + rangelock_del(range_regular, ggio->gctl_offset, + ggio->gctl_length); + if (range_sync_wait) + cv_signal(&range_sync_cond); + mtx_unlock(&range_lock); + /* + * Bump local count if this is first write after + * connection failure with remote node. + */ + ncomp = 1; + rw_rlock(&hio_remote_lock[ncomp]); + if (!ISCONNECTED(res, ncomp)) { + mtx_lock(&metadata_lock); + if (res->hr_primary_localcnt == + res->hr_secondary_remotecnt) { + res->hr_primary_localcnt++; + pjdlog_debug(1, + "Increasing localcnt to %ju.", + (uintmax_t)res->hr_primary_localcnt); + (void)metadata_write(res); + } + mtx_unlock(&metadata_lock); + } + rw_unlock(&hio_remote_lock[ncomp]); + } + if (ioctl(res->hr_ggatefd, G_GATE_CMD_DONE, ggio) < 0) + primary_exit(EX_OSERR, "G_GATE_CMD_DONE failed"); + pjdlog_debug(2, + "ggate_send: (%p) Moving request to the free queue.", hio); + QUEUE_INSERT2(hio, free); + } + /* NOTREACHED */ + return (NULL); +} + +/* + * Thread synchronize local and remote components. + */ +static void * +sync_thread(void *arg __unused) +{ + struct hast_resource *res = arg; + struct hio *hio; + struct g_gate_ctl_io *ggio; + unsigned int ii, ncomp, ncomps; + off_t offset, length, synced; + bool dorewind; + int syncext; + + ncomps = HAST_NCOMPONENTS; + dorewind = true; + synced = 0; + + for (;;) { + mtx_lock(&sync_lock); + while (!sync_inprogress) { + dorewind = true; + synced = 0; + cv_wait(&sync_cond, &sync_lock); + } + mtx_unlock(&sync_lock); + /* + * Obtain offset at which we should synchronize. + * Rewind synchronization if needed. 
+ */ + mtx_lock(&res->hr_amp_lock); + if (dorewind) + activemap_sync_rewind(res->hr_amp); + offset = activemap_sync_offset(res->hr_amp, &length, &syncext); + if (syncext != -1) { + /* + * We synchronized entire syncext extent, we can mark + * it as clean now. + */ + if (activemap_extent_complete(res->hr_amp, syncext)) + (void)hast_activemap_flush(res); + } + mtx_unlock(&res->hr_amp_lock); + if (dorewind) { + dorewind = false; + if (offset < 0) + pjdlog_info("Nodes are in sync."); + else { + pjdlog_info("Synchronization started. %ju bytes to go.", + (uintmax_t)(res->hr_extentsize * + activemap_ndirty(res->hr_amp))); + } + } + if (offset < 0) { + mtx_lock(&sync_lock); + sync_inprogress = false; + mtx_unlock(&sync_lock); + pjdlog_debug(1, "Nothing to synchronize."); + /* + * Synchronization complete, make both localcnt and + * remotecnt equal. + */ + ncomp = 1; + rw_rlock(&hio_remote_lock[ncomp]); + if (ISCONNECTED(res, ncomp)) { + if (synced > 0) { + pjdlog_info("Synchronization complete. " + "%jd bytes synchronized.", + (intmax_t)synced); + } + mtx_lock(&metadata_lock); + res->hr_syncsrc = HAST_SYNCSRC_UNDEF; + res->hr_primary_localcnt = + res->hr_secondary_localcnt; + res->hr_primary_remotecnt = + res->hr_secondary_remotecnt; + pjdlog_debug(1, + "Setting localcnt to %ju and remotecnt to %ju.", + (uintmax_t)res->hr_primary_localcnt, + (uintmax_t)res->hr_secondary_localcnt); + (void)metadata_write(res); + mtx_unlock(&metadata_lock); + } else if (synced > 0) { + pjdlog_info("Synchronization interrupted. " + "%jd bytes synchronized so far.", + (intmax_t)synced); + } + rw_unlock(&hio_remote_lock[ncomp]); + continue; + } + pjdlog_debug(2, "sync: Taking free request."); + QUEUE_TAKE2(hio, free); + pjdlog_debug(2, "sync: (%p) Got free request.", hio); + /* + * Lock the range we are going to synchronize. We don't want + * race where someone writes between our read and write. 
+ */ + for (;;) { + mtx_lock(&range_lock); + if (rangelock_islocked(range_regular, offset, length)) { + pjdlog_debug(2, + "sync: Range offset=%jd length=%jd locked.", + (intmax_t)offset, (intmax_t)length); + range_sync_wait = true; + cv_wait(&range_sync_cond, &range_lock); + range_sync_wait = false; + mtx_unlock(&range_lock); + continue; + } + if (rangelock_add(range_sync, offset, length) < 0) { + mtx_unlock(&range_lock); + pjdlog_debug(2, + "sync: Range offset=%jd length=%jd is already locked, waiting.", + (intmax_t)offset, (intmax_t)length); + sleep(1); + continue; + } + mtx_unlock(&range_lock); + break; + } + /* + * First read the data from synchronization source. + */ + SYNCREQ(hio); + ggio = &hio->hio_ggio; + ggio->gctl_cmd = BIO_READ; + ggio->gctl_offset = offset; + ggio->gctl_length = length; + ggio->gctl_error = 0; + for (ii = 0; ii < ncomps; ii++) + hio->hio_errors[ii] = EINVAL; + reqlog(LOG_DEBUG, 2, ggio, "sync: (%p) Sending sync request: ", + hio); + pjdlog_debug(2, "sync: (%p) Moving request to the send queue.", + hio); + mtx_lock(&metadata_lock); + if (res->hr_syncsrc == HAST_SYNCSRC_PRIMARY) { + /* + * This range is up-to-date on local component, + * so handle request locally. + */ + /* Local component is 0 for now. */ + ncomp = 0; + } else /* if (res->hr_syncsrc == HAST_SYNCSRC_SECONDARY) */ { + assert(res->hr_syncsrc == HAST_SYNCSRC_SECONDARY); + /* + * This range is out-of-date on local component, + * so send request to the remote node. + */ + /* Remote component is 1 for now. */ + ncomp = 1; + } + mtx_unlock(&metadata_lock); + refcount_init(&hio->hio_countdown, 1); + QUEUE_INSERT1(hio, send, ncomp); + + /* + * Let's wait for READ to finish. 
+ */ + mtx_lock(&sync_lock); + while (!ISSYNCREQDONE(hio)) + cv_wait(&sync_cond, &sync_lock); + mtx_unlock(&sync_lock); + + if (hio->hio_errors[ncomp] != 0) { + pjdlog_error("Unable to read synchronization data: %s.", + strerror(hio->hio_errors[ncomp])); + goto free_queue; + } + + /* + * We read the data from synchronization source, now write it + * to synchronization target. + */ + SYNCREQ(hio); + ggio->gctl_cmd = BIO_WRITE; + for (ii = 0; ii < ncomps; ii++) + hio->hio_errors[ii] = EINVAL; + reqlog(LOG_DEBUG, 2, ggio, "sync: (%p) Sending sync request: ", + hio); + pjdlog_debug(2, "sync: (%p) Moving request to the send queue.", + hio); + mtx_lock(&metadata_lock); + if (res->hr_syncsrc == HAST_SYNCSRC_PRIMARY) { + /* + * This range is up-to-date on local component, + * so we update remote component. + */ + /* Remote component is 1 for now. */ + ncomp = 1; + } else /* if (res->hr_syncsrc == HAST_SYNCSRC_SECONDARY) */ { + assert(res->hr_syncsrc == HAST_SYNCSRC_SECONDARY); + /* + * This range is out-of-date on local component, + * so we update it. + */ + /* Local component is 0 for now. */ + ncomp = 0; + } + mtx_unlock(&metadata_lock); + + pjdlog_debug(2, "sync: (%p) Moving request to the send queues.", + hio); + refcount_init(&hio->hio_countdown, 1); + QUEUE_INSERT1(hio, send, ncomp); + + /* + * Let's wait for WRITE to finish. 
+ */ + mtx_lock(&sync_lock); + while (!ISSYNCREQDONE(hio)) + cv_wait(&sync_cond, &sync_lock); + mtx_unlock(&sync_lock); + + if (hio->hio_errors[ncomp] != 0) { + pjdlog_error("Unable to write synchronization data: %s.", + strerror(hio->hio_errors[ncomp])); + goto free_queue; + } +free_queue: + mtx_lock(&range_lock); + rangelock_del(range_sync, offset, length); + if (range_regular_wait) + cv_signal(&range_regular_cond); + mtx_unlock(&range_lock); + + synced += length; + + pjdlog_debug(2, "sync: (%p) Moving request to the free queue.", + hio); + QUEUE_INSERT2(hio, free); + } + /* NOTREACHED */ + return (NULL); +} + +static void +sighandler(int sig) +{ + bool unlock; + + switch (sig) { + case SIGINT: + case SIGTERM: + sigexit_received = true; + break; + default: + assert(!"invalid condition"); + } + /* + * XXX: Racy, but if we cannot obtain hio_guard_lock here, we don't + * want to risk deadlock. + */ + unlock = mtx_trylock(&hio_guard_lock); + cv_signal(&hio_guard_cond); + if (unlock) + mtx_unlock(&hio_guard_lock); +} + +/* + * Thread guards remote connections and reconnects when needed, handles + * signals, etc. + */ +static void * +guard_thread(void *arg) +{ + struct hast_resource *res = arg; + unsigned int ii, ncomps; + int timeout; + + ncomps = HAST_NCOMPONENTS; + /* The is only one remote component for now. */ +#define ISREMOTE(no) ((no) == 1) + + for (;;) { + if (sigexit_received) { + primary_exitx(EX_OK, + "Termination signal received, exiting."); + } + /* + * If all the connection will be fine, we will sleep until + * someone wakes us up. + * If any of the connections will be broken and we won't be + * able to connect, we will sleep only for RECONNECT_SLEEP + * seconds so we can retry soon. 
+ */ + timeout = 0; + pjdlog_debug(2, "remote_guard: Checking connections."); + mtx_lock(&hio_guard_lock); + for (ii = 0; ii < ncomps; ii++) { + if (!ISREMOTE(ii)) + continue; + rw_rlock(&hio_remote_lock[ii]); + if (ISCONNECTED(res, ii)) { + assert(res->hr_remotein != NULL); + assert(res->hr_remoteout != NULL); + rw_unlock(&hio_remote_lock[ii]); + pjdlog_debug(2, + "remote_guard: Connection to %s is ok.", + res->hr_remoteaddr); + } else { + assert(res->hr_remotein == NULL); + assert(res->hr_remoteout == NULL); + /* + * Upgrade the lock. It doesn't have to be + * atomic as no other thread can change + * connection status from disconnected to + * connected. + */ + rw_unlock(&hio_remote_lock[ii]); + rw_wlock(&hio_remote_lock[ii]); + assert(res->hr_remotein == NULL); + assert(res->hr_remoteout == NULL); + pjdlog_debug(2, + "remote_guard: Reconnecting to %s.", + res->hr_remoteaddr); + init_remote(res); + if (ISCONNECTED(res, ii)) { + pjdlog_info("Successfully reconnected to %s.", + res->hr_remoteaddr); + } else { + /* Both connections should be NULL. */ + assert(res->hr_remotein == NULL); + assert(res->hr_remoteout == NULL); + pjdlog_debug(2, + "remote_guard: Reconnect to %s failed.", + res->hr_remoteaddr); + timeout = RECONNECT_SLEEP; + } + rw_unlock(&hio_remote_lock[ii]); + } + } + (void)cv_timedwait(&hio_guard_cond, &hio_guard_lock, timeout); + mtx_unlock(&hio_guard_lock); + } +#undef ISREMOTE + /* NOTREACHED */ + return (NULL); +} diff --git a/sbin/hastd/proto.c b/sbin/hastd/proto.c new file mode 100644 index 0000000..103f20c --- /dev/null +++ b/sbin/hastd/proto.c @@ -0,0 +1,261 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/queue.h> + +#include <assert.h> +#include <errno.h> +#include <stdint.h> + +#include "proto.h" +#include "proto_impl.h" + +#define PROTO_CONN_MAGIC 0x907041c +struct proto_conn { + int pc_magic; + struct hast_proto *pc_proto; + void *pc_ctx; + int pc_side; +#define PROTO_SIDE_CLIENT 0 +#define PROTO_SIDE_SERVER_LISTEN 1 +#define PROTO_SIDE_SERVER_WORK 2 +}; + +static LIST_HEAD(, hast_proto) protos = LIST_HEAD_INITIALIZER(protos); + +void +proto_register(struct hast_proto *proto) +{ + + LIST_INSERT_HEAD(&protos, proto, hp_next); +} + +static int +proto_common_setup(const char *addr, struct proto_conn **connp, int side) +{ + struct hast_proto *proto; + struct proto_conn *conn; + void *ctx; + int ret; + + assert(side == PROTO_SIDE_CLIENT || side == PROTO_SIDE_SERVER_LISTEN); + + conn = malloc(sizeof(*conn)); + if (conn == NULL) + return (-1); + + LIST_FOREACH(proto, &protos, hp_next) { + if (side == PROTO_SIDE_CLIENT) + ret = proto->hp_client(addr, &ctx); + else /* if (side == PROTO_SIDE_SERVER_LISTEN) */ + ret = proto->hp_server(addr, &ctx); + /* + * ret == 0 - success + * ret == -1 - addr is not for this protocol + * ret > 0 - right protocol, but an error occured + */ + if (ret >= 0) + break; + } + if (proto == NULL) { + /* Unrecognized address. */ + free(conn); + errno = EINVAL; + return (-1); + } + if (ret > 0) { + /* An error occured. 
*/ + free(conn); + errno = ret; + return (-1); + } + conn->pc_proto = proto; + conn->pc_ctx = ctx; + conn->pc_side = side; + conn->pc_magic = PROTO_CONN_MAGIC; + *connp = conn; + return (0); +} + +int +proto_client(const char *addr, struct proto_conn **connp) +{ + + return (proto_common_setup(addr, connp, PROTO_SIDE_CLIENT)); +} + +int +proto_connect(struct proto_conn *conn) +{ + int ret; + + assert(conn != NULL); + assert(conn->pc_magic == PROTO_CONN_MAGIC); + assert(conn->pc_side == PROTO_SIDE_CLIENT); + assert(conn->pc_proto != NULL); + + ret = conn->pc_proto->hp_connect(conn->pc_ctx); + if (ret != 0) { + errno = ret; + return (-1); + } + + return (0); +} + +int +proto_server(const char *addr, struct proto_conn **connp) +{ + + return (proto_common_setup(addr, connp, PROTO_SIDE_SERVER_LISTEN)); +} + +int +proto_accept(struct proto_conn *conn, struct proto_conn **newconnp) +{ + struct proto_conn *newconn; + int ret; + + assert(conn != NULL); + assert(conn->pc_magic == PROTO_CONN_MAGIC); + assert(conn->pc_side == PROTO_SIDE_SERVER_LISTEN); + assert(conn->pc_proto != NULL); + + newconn = malloc(sizeof(*newconn)); + if (newconn == NULL) + return (-1); + + ret = conn->pc_proto->hp_accept(conn->pc_ctx, &newconn->pc_ctx); + if (ret != 0) { + free(newconn); + errno = ret; + return (-1); + } + + newconn->pc_proto = conn->pc_proto; + newconn->pc_side = PROTO_SIDE_SERVER_WORK; + newconn->pc_magic = PROTO_CONN_MAGIC; + *newconnp = newconn; + + return (0); +} + +int +proto_send(struct proto_conn *conn, const void *data, size_t size) +{ + int ret; + + assert(conn != NULL); + assert(conn->pc_magic == PROTO_CONN_MAGIC); + assert(conn->pc_proto != NULL); + + ret = conn->pc_proto->hp_send(conn->pc_ctx, data, size); + if (ret != 0) { + errno = ret; + return (-1); + } + return (0); +} + +int +proto_recv(struct proto_conn *conn, void *data, size_t size) +{ + int ret; + + assert(conn != NULL); + assert(conn->pc_magic == PROTO_CONN_MAGIC); + assert(conn->pc_proto != NULL); + + ret = 
conn->pc_proto->hp_recv(conn->pc_ctx, data, size); + if (ret != 0) { /* Backend error codes are handed to the caller through errno. */ + errno = ret; + return (-1); + } + return (0); +} + +int +proto_descriptor(const struct proto_conn *conn) +{ /* Returns whatever the backend hp_descriptor() reports for this connection. */ + + assert(conn != NULL); + assert(conn->pc_magic == PROTO_CONN_MAGIC); + assert(conn->pc_proto != NULL); + + return (conn->pc_proto->hp_descriptor(conn->pc_ctx)); +} + +bool +proto_address_match(const struct proto_conn *conn, const char *addr) +{ /* Delegates the comparison to the backend hp_address_match(). */ + + assert(conn != NULL); + assert(conn->pc_magic == PROTO_CONN_MAGIC); + assert(conn->pc_proto != NULL); + + return (conn->pc_proto->hp_address_match(conn->pc_ctx, addr)); +} + +void +proto_local_address(const struct proto_conn *conn, char *addr, size_t size) +{ /* Lets the backend write its local address string into addr, a buffer of size bytes. */ + + assert(conn != NULL); + assert(conn->pc_magic == PROTO_CONN_MAGIC); + assert(conn->pc_proto != NULL); + + conn->pc_proto->hp_local_address(conn->pc_ctx, addr, size); +} + +void +proto_remote_address(const struct proto_conn *conn, char *addr, size_t size) +{ /* Lets the backend write the peer address string into addr, a buffer of size bytes. */ + + assert(conn != NULL); + assert(conn->pc_magic == PROTO_CONN_MAGIC); + assert(conn->pc_proto != NULL); + + conn->pc_proto->hp_remote_address(conn->pc_ctx, addr, size); +} + +void +proto_close(struct proto_conn *conn) +{ /* Closes the backend context, clears the magic and frees the handle; conn must not be used afterwards. */ + + assert(conn != NULL); + assert(conn->pc_magic == PROTO_CONN_MAGIC); + assert(conn->pc_proto != NULL); + + conn->pc_proto->hp_close(conn->pc_ctx); + conn->pc_magic = 0; + free(conn); +} diff --git a/sbin/hastd/proto.h b/sbin/hastd/proto.h new file mode 100644 index 0000000..cb196d8 --- /dev/null +++ b/sbin/hastd/proto.h @@ -0,0 +1,54 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _PROTO_H_ +#define _PROTO_H_ + +#include <stdbool.h> /* bool */ +#include <stdlib.h> /* size_t */ + +struct proto_conn; /* Only declared here; the layout is private to proto.c. */ + +int proto_client(const char *addr, struct proto_conn **connp); /* This and the other int-returning calls below, except proto_descriptor(), return 0 on success and -1 on failure. */ +int proto_connect(struct proto_conn *conn); +int proto_server(const char *addr, struct proto_conn **connp); +int proto_accept(struct proto_conn *conn, struct proto_conn **newconnp); +int proto_send(struct proto_conn *conn, const void *data, size_t size); +int proto_recv(struct proto_conn *conn, void *data, size_t size); +int proto_descriptor(const struct proto_conn *conn); /* Returns the value reported by the backend. */ +bool proto_address_match(const struct proto_conn *conn, const char *addr); +void proto_local_address(const struct proto_conn *conn, char *addr, + size_t size); +void proto_remote_address(const struct proto_conn *conn, char *addr, + size_t size); +void proto_close(struct proto_conn *conn); /* Frees conn. */ + +#endif /* !_PROTO_H_ */ diff --git a/sbin/hastd/proto_common.c b/sbin/hastd/proto_common.c new file mode 100644 index 0000000..22102d8 --- /dev/null +++ b/sbin/hastd/proto_common.c @@ -0,0 +1,85 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/socket.h> + +#include <assert.h> +#include <errno.h> +#include <stdlib.h> +#include <strings.h> + +#include "proto_impl.h" + +/* Maximum size of packet we want to use when sending data; a build may predefine MAX_SEND_SIZE to override it. */ +#ifndef MAX_SEND_SIZE +//#define MAX_SEND_SIZE 32768 +#define MAX_SEND_SIZE 131072 +#endif + +int +proto_common_send(int fd, const unsigned char *data, size_t size) +{ /* Sends size bytes from data on fd in pieces of at most MAX_SEND_SIZE bytes. Returns 0 on success or an errno value. */ + ssize_t done; + size_t sendsize; + + do { + sendsize = size < MAX_SEND_SIZE ?
size : MAX_SEND_SIZE; + done = send(fd, data, sendsize, MSG_NOSIGNAL); + if (done == 0) /* NOTE(review): a call with size == 0 presumably lands here, since send() is then asked for zero bytes, and is reported as ENOTCONN - confirm callers never pass size 0. */ + return (ENOTCONN); + else if (done < 0) { + if (errno == EAGAIN) /* Retry; only EAGAIN is retried here, every other errno is returned. */ + continue; + return (errno); + } + data += done; /* Advance past the part that was sent. */ + size -= done; + } while (size > 0); + + return (0); +} + +int +proto_common_recv(int fd, unsigned char *data, size_t size) +{ /* Reads into data with recv(MSG_WAITALL), retrying while it fails with EAGAIN. Returns 0 on success, ENOTCONN when recv() returns 0, or the errno value. */ + ssize_t done; + + do { + done = recv(fd, data, size, MSG_WAITALL); + } while (done == -1 && errno == EAGAIN); + if (done == 0) + return (ENOTCONN); + else if (done < 0) + return (errno); + return (0); /* NOTE(review): any positive count is treated as a complete read; MSG_WAITALL can presumably still return fewer than size bytes (signal, error, disconnect) - verify against recv(2). */ +} diff --git a/sbin/hastd/proto_impl.h b/sbin/hastd/proto_impl.h new file mode 100644 index 0000000..ea6548d --- /dev/null +++ b/sbin/hastd/proto_impl.h @@ -0,0 +1,75 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _PROTO_IMPL_H_ +#define _PROTO_IMPL_H_ + +#include <sys/queue.h> + +#include <stdbool.h> /* bool */ +#include <stdlib.h> /* size_t */ + +#define __constructor __attribute__((constructor)) /* Used on the per-backend registration functions. */ + +typedef int hp_client_t(const char *, void **); /* Returns 0, -1 when the address belongs to another protocol, or a positive errno value. */ +typedef int hp_connect_t(void *); /* Returns 0 or an errno value. */ +typedef int hp_server_t(const char *, void **); /* Same convention as hp_client_t. */ +typedef int hp_accept_t(void *, void **); /* Returns 0 or an errno value. */ +typedef int hp_send_t(void *, const unsigned char *, size_t); /* Returns 0 or an errno value. */ +typedef int hp_recv_t(void *, unsigned char *, size_t); /* Returns 0 or an errno value. */ +typedef int hp_descriptor_t(const void *); +typedef bool hp_address_match_t(const void *, const char *); +typedef void hp_local_address_t(const void *, char *, size_t); +typedef void hp_remote_address_t(const void *, char *, size_t); +typedef void hp_close_t(void *); + +struct hast_proto { /* Operations table describing one transport backend. */ + const char *hp_name; + hp_client_t *hp_client; + hp_connect_t *hp_connect; + hp_server_t *hp_server; + hp_accept_t *hp_accept; + hp_send_t *hp_send; + hp_recv_t *hp_recv; + hp_descriptor_t *hp_descriptor; + hp_address_match_t *hp_address_match; + hp_local_address_t *hp_local_address; + hp_remote_address_t *hp_remote_address; + hp_close_t *hp_close; + LIST_ENTRY(hast_proto) hp_next; /* Linkage on the list filled by proto_register(). */ +}; + +void proto_register(struct hast_proto *proto); + +int proto_common_send(int fd, const unsigned char *data, size_t size); /* Returns 0 or an errno value. */ +int proto_common_recv(int fd, unsigned char *data, size_t size); /* Returns 0 or an errno value. */ + +#endif /* !_PROTO_IMPL_H_ */ diff --git
a/sbin/hastd/proto_socketpair.c b/sbin/hastd/proto_socketpair.c new file mode 100644 index 0000000..0e2cfa2 --- /dev/null +++ b/sbin/hastd/proto_socketpair.c @@ -0,0 +1,272 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/socket.h> + +#include <assert.h> +#include <errno.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> + +#include "hast.h" +#include "proto_impl.h" + +#define SP_CTX_MAGIC 0x50c3741 +struct sp_ctx { + int sp_magic; + int sp_fd[2]; + int sp_side; +#define SP_SIDE_UNDEF 0 +#define SP_SIDE_CLIENT 1 +#define SP_SIDE_SERVER 2 +}; + +static void sp_close(void *ctx); + +static int +sp_client(const char *addr, void **ctxp) +{ + struct sp_ctx *spctx; + int ret; + + if (strcmp(addr, "socketpair://") != 0) + return (-1); + + spctx = malloc(sizeof(*spctx)); + if (spctx == NULL) + return (errno); + + if (socketpair(PF_UNIX, SOCK_STREAM, 0, spctx->sp_fd) < 0) { + ret = errno; + free(spctx); + return (ret); + } + + spctx->sp_side = SP_SIDE_UNDEF; + spctx->sp_magic = SP_CTX_MAGIC; + *ctxp = spctx; + + return (0); +} + +static int +sp_connect(void *ctx __unused) +{ + + assert(!"proto_connect() not supported on socketpairs"); + abort(); +} + +static int +sp_server(const char *addr __unused, void **ctxp __unused) +{ + + assert(!"proto_server() not supported on socketpairs"); + abort(); +} + +static int +sp_accept(void *ctx __unused, void **newctxp __unused) +{ + + assert(!"proto_server() not supported on socketpairs"); + abort(); +} + +static int +sp_send(void *ctx, const unsigned char *data, size_t size) +{ + struct sp_ctx *spctx = ctx; + int fd; + + assert(spctx != NULL); + assert(spctx->sp_magic == SP_CTX_MAGIC); + + switch (spctx->sp_side) { + case SP_SIDE_UNDEF: + /* + * If the first operation done by the caller is proto_send(), + * we assume this the client. + */ + /* FALLTHROUGH */ + spctx->sp_side = SP_SIDE_CLIENT; + /* Close other end. 
*/ + close(spctx->sp_fd[1]); /* FALLTHROUGH */ + case SP_SIDE_CLIENT: + assert(spctx->sp_fd[0] >= 0); + fd = spctx->sp_fd[0]; + break; + case SP_SIDE_SERVER: + assert(spctx->sp_fd[1] >= 0); + fd = spctx->sp_fd[1]; + break; + default: + abort(); + } + + return (proto_common_send(fd, data, size)); +} + +static int +sp_recv(void *ctx, unsigned char *data, size_t size) +{ /* Receives on the descriptor that belongs to this side, choosing the server side on first use. Returns 0 or an errno value. */ + struct sp_ctx *spctx = ctx; + int fd; + + assert(spctx != NULL); + assert(spctx->sp_magic == SP_CTX_MAGIC); + + switch (spctx->sp_side) { + case SP_SIDE_UNDEF: + /* + * If the first operation done by the caller is proto_recv(), + * we assume this is the server. + */ + /* Claim the server side, then continue into the server case. */ + spctx->sp_side = SP_SIDE_SERVER; + /* Close other end. */ + close(spctx->sp_fd[0]); /* FALLTHROUGH */ + case SP_SIDE_SERVER: + assert(spctx->sp_fd[1] >= 0); + fd = spctx->sp_fd[1]; + break; + case SP_SIDE_CLIENT: + assert(spctx->sp_fd[0] >= 0); + fd = spctx->sp_fd[0]; + break; + default: + abort(); + } + + return (proto_common_recv(fd, data, size)); +} + +static int +sp_descriptor(const void *ctx) +{ /* Returns the descriptor of the side already chosen; the side must have been decided by an earlier send or receive. */ + const struct sp_ctx *spctx = ctx; + + assert(spctx != NULL); + assert(spctx->sp_magic == SP_CTX_MAGIC); + assert(spctx->sp_side == SP_SIDE_CLIENT || + spctx->sp_side == SP_SIDE_SERVER); + + switch (spctx->sp_side) { + case SP_SIDE_CLIENT: + assert(spctx->sp_fd[0] >= 0); + return (spctx->sp_fd[0]); + case SP_SIDE_SERVER: + assert(spctx->sp_fd[1] >= 0); + return (spctx->sp_fd[1]); + } + + abort(); /* Reached only when sp_side is neither CLIENT nor SERVER. */ +} + +static bool +sp_address_match(const void *ctx __unused, const char *addr __unused) +{ /* Unsupported on socketpairs: always aborts. */ + + assert(!"proto_address_match() not supported on socketpairs"); + abort(); +} + +static void +sp_local_address(const void *ctx __unused, char *addr __unused, + size_t size __unused) +{ /* Unsupported on socketpairs: always aborts. */ + + assert(!"proto_local_address() not supported on socketpairs"); + abort(); +} + +static void +sp_remote_address(const void *ctx __unused, char *addr __unused, + size_t size __unused) +{ /* Unsupported on socketpairs: always aborts. */ + + assert(!"proto_remote_address() not supported on socketpairs"); + abort(); +} + +static void
+sp_close(void *ctx) +{ /* Closes the descriptor or descriptors this side still holds and frees the context. */ + struct sp_ctx *spctx = ctx; + + assert(spctx != NULL); + assert(spctx->sp_magic == SP_CTX_MAGIC); + + switch (spctx->sp_side) { + case SP_SIDE_UNDEF: /* No side was chosen, so both ends are still open. */ + close(spctx->sp_fd[0]); + close(spctx->sp_fd[1]); + break; + case SP_SIDE_CLIENT: /* The other end was closed when the side was chosen. */ + close(spctx->sp_fd[0]); + break; + case SP_SIDE_SERVER: + close(spctx->sp_fd[1]); + break; + default: + abort(); + } + + spctx->sp_magic = 0; /* Invalidate before freeing so stale use trips the magic assertions. */ + free(spctx); +} + +static struct hast_proto sp_proto = { /* Operations table registered by sp_ctor(). */ + .hp_name = "socketpair", + .hp_client = sp_client, + .hp_connect = sp_connect, + .hp_server = sp_server, + .hp_accept = sp_accept, + .hp_send = sp_send, + .hp_recv = sp_recv, + .hp_descriptor = sp_descriptor, + .hp_address_match = sp_address_match, + .hp_local_address = sp_local_address, + .hp_remote_address = sp_remote_address, + .hp_close = sp_close +}; + +static __constructor void +sp_ctor(void) +{ /* Registers the socketpair backend; carries the constructor attribute. */ + + proto_register(&sp_proto); +} diff --git a/sbin/hastd/proto_tcp4.c b/sbin/hastd/proto_tcp4.c new file mode 100644 index 0000000..2fba996 --- /dev/null +++ b/sbin/hastd/proto_tcp4.c @@ -0,0 +1,447 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> /* MAXHOSTNAMELEN */ + +#include <netinet/in.h> +#include <netinet/tcp.h> + +#include <assert.h> +#include <errno.h> +#include <netdb.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> + +#include "hast.h" +#include "pjdlog.h" +#include "proto_impl.h" + +#define TCP4_CTX_MAGIC 0x7c441c +struct tcp4_ctx { /* State of one TCP/IPv4 endpoint. */ + int tc_magic; /* TCP4_CTX_MAGIC while the context is valid. */ + struct sockaddr_in tc_sin; /* Address parsed at setup; used by connect() and bind(). */ + int tc_fd; /* Socket descriptor. */ + int tc_side; /* One of the TCP4_SIDE_ constants below. */ +#define TCP4_SIDE_CLIENT 0 +#define TCP4_SIDE_SERVER_LISTEN 1 +#define TCP4_SIDE_SERVER_WORK 2 +}; + +static void tcp4_close(void *ctx); + +static in_addr_t +str2ip(const char *str) +{ /* Returns the IPv4 address for a numeric address string or a resolvable host name, or INADDR_NONE when neither works. */ + struct hostent *hp; + in_addr_t ip; + + ip = inet_addr(str); + if (ip != INADDR_NONE) { + /* It is a valid IP address. */ + return (ip); + } + /* Check if it is a valid host name. */ + hp = gethostbyname(str); + if (hp == NULL) + return (INADDR_NONE); + return (((struct in_addr *)(void *)hp->h_addr)->s_addr); /* First address of the host entry. */ +} + +/* + * Function converts the given string to unsigned number.
+ */ +static int +numfromstr(const char *str, intmax_t minnum, intmax_t maxnum, intmax_t *nump) +{ /* Parses a string of decimal digits into *nump. Returns 0, or -1 with errno set to EINVAL when the string is empty, contains a non-digit, or lies outside the range minnum to maxnum. */ + intmax_t digit, num; + + if (str[0] == '\0') + goto invalid; /* Empty string. */ + num = 0; + for (; *str != '\0'; str++) { + if (*str < '0' || *str > '9') + goto invalid; /* Non-digit character. */ + digit = *str - '0'; + if (num > num * 10 + digit) + goto invalid; /* Overflow. NOTE(review): this test relies on signed wraparound, which C leaves undefined; the maxnum test below bounds num for the one caller visible here (maxnum 65535). */ + num = num * 10 + digit; + if (num > maxnum) + goto invalid; /* Too big. */ + } + if (num < minnum) + goto invalid; /* Too small. */ + *nump = num; + return (0); +invalid: + errno = EINVAL; + return (-1); +} + +static int +tcp4_addr(const char *addr, struct sockaddr_in *sinp) +{ /* Parses an optional tcp4:// or tcp:// prefix, a host and an optional :port into *sinp. Returns 0, -1 when the address is not a TCP one, or an errno value. */ + char iporhost[MAXHOSTNAMELEN]; + const char *pp; + size_t size; + in_addr_t ip; + + if (addr == NULL) + return (-1); + + if (strncasecmp(addr, "tcp4://", 7) == 0) + addr += 7; + else if (strncasecmp(addr, "tcp://", 6) == 0) + addr += 6; + else if (addr[0] != '/' && /* If this is not path... */ + strstr(addr, "://") == NULL)/* ...and has no prefix... */ + ; /* ...tcp4 is the default. */ + else + return (-1); + + sinp->sin_family = AF_INET; /* NOTE(review): only family, length, port and address are assigned; the rest of *sinp keeps whatever the caller passed in - confirm that is acceptable. */ + sinp->sin_len = sizeof(*sinp); + /* Extract optional port. */ + pp = strrchr(addr, ':'); + if (pp == NULL) { + /* Port not given, use the default. */ + sinp->sin_port = htons(HASTD_PORT); + } else { + intmax_t port; + + if (numfromstr(pp + 1, 1, 65535, &port) < 0) + return (errno); + sinp->sin_port = htons(port); + } + /* Extract host name or IP address. */ + if (pp == NULL) { + size = sizeof(iporhost); + if (strlcpy(iporhost, addr, size) >= size) + return (ENAMETOOLONG); + } else { + size = (size_t)(pp - addr + 1); /* Host length plus one for the terminating NUL. */ + if (size > sizeof(iporhost)) + return (ENAMETOOLONG); + strlcpy(iporhost, addr, size); + } + /* Convert string (IP address or host name) to in_addr_t.
*/ + ip = str2ip(iporhost); + if (ip == INADDR_NONE) + return (EINVAL); + sinp->sin_addr.s_addr = ip; + + return (0); +} + +static int +tcp4_common_setup(const char *addr, void **ctxp, int side) +{ /* Allocates a context, parses addr into tc_sin and creates the TCP socket. Returns 0 with *ctxp set, -1 when addr is not a TCP address, or an errno value. */ + struct tcp4_ctx *tctx; + int ret, val; + + tctx = malloc(sizeof(*tctx)); + if (tctx == NULL) + return (errno); + + /* Parse given address. */ + if ((ret = tcp4_addr(addr, &tctx->tc_sin)) != 0) { + free(tctx); + return (ret); + } + + tctx->tc_fd = socket(AF_INET, SOCK_STREAM, 0); + if (tctx->tc_fd == -1) { + ret = errno; + free(tctx); + return (ret); + } + + /* Socket settings. Failures are only logged; setup continues. */ + val = 1; + if (setsockopt(tctx->tc_fd, IPPROTO_TCP, TCP_NODELAY, &val, + sizeof(val)) == -1) { + pjdlog_warning("Unable to set TCP_NODELAY on %s", addr); + } + val = 131072; + if (setsockopt(tctx->tc_fd, SOL_SOCKET, SO_SNDBUF, &val, + sizeof(val)) == -1) { + pjdlog_warning("Unable to set send buffer size on %s", addr); + } + val = 131072; + if (setsockopt(tctx->tc_fd, SOL_SOCKET, SO_RCVBUF, &val, + sizeof(val)) == -1) { + pjdlog_warning("Unable to set receive buffer size on %s", addr); + } + + tctx->tc_side = side; + tctx->tc_magic = TCP4_CTX_MAGIC; + *ctxp = tctx; + + return (0); +} + +static int +tcp4_client(const char *addr, void **ctxp) +{ /* Client-side setup; the connection itself is made by tcp4_connect(). */ + + return (tcp4_common_setup(addr, ctxp, TCP4_SIDE_CLIENT)); +} + +static int +tcp4_connect(void *ctx) +{ /* Connects the client socket to the address parsed at setup time. Returns 0 or an errno value. */ + struct tcp4_ctx *tctx = ctx; + + assert(tctx != NULL); + assert(tctx->tc_magic == TCP4_CTX_MAGIC); + assert(tctx->tc_side == TCP4_SIDE_CLIENT); + assert(tctx->tc_fd >= 0); + + if (connect(tctx->tc_fd, (struct sockaddr *)&tctx->tc_sin, + sizeof(tctx->tc_sin)) < 0) { + return (errno); + } + + return (0); +} + +static int +tcp4_server(const char *addr, void **ctxp) +{ /* Creates a listening socket: common setup, SO_REUSEADDR, bind() and listen(). Returns 0, -1 when addr is not a TCP address, or an errno value; the context is released on failure. */ + struct tcp4_ctx *tctx; + int ret, val; + + ret = tcp4_common_setup(addr, ctxp, TCP4_SIDE_SERVER_LISTEN); + if (ret != 0) + return (ret); + + tctx = *ctxp; + + val = 1; + /* Ignore failure.
*/ + (void)setsockopt(tctx->tc_fd, SOL_SOCKET, SO_REUSEADDR, &val, + sizeof(val)); + + if (bind(tctx->tc_fd, (struct sockaddr *)&tctx->tc_sin, + sizeof(tctx->tc_sin)) < 0) { + ret = errno; + tcp4_close(tctx); + return (ret); + } + if (listen(tctx->tc_fd, 8) < 0) { + ret = errno; + tcp4_close(tctx); + return (ret); + } + + return (0); +} + +static int +tcp4_accept(void *ctx, void **newctxp) +{ /* Accepts one connection on a listening context and returns a new TCP4_SIDE_SERVER_WORK context in *newctxp. Returns 0 or an errno value. */ + struct tcp4_ctx *tctx = ctx; + struct tcp4_ctx *newtctx; + socklen_t fromlen; + int ret; + + assert(tctx != NULL); + assert(tctx->tc_magic == TCP4_CTX_MAGIC); + assert(tctx->tc_side == TCP4_SIDE_SERVER_LISTEN); + assert(tctx->tc_fd >= 0); + + newtctx = malloc(sizeof(*newtctx)); + if (newtctx == NULL) + return (errno); + + /* + * Store the peer address in the new context; the listening context + * keeps the address it was bound to. + */ + fromlen = sizeof(newtctx->tc_sin); + newtctx->tc_fd = accept(tctx->tc_fd, + (struct sockaddr *)&newtctx->tc_sin, &fromlen); + if (newtctx->tc_fd < 0) { + ret = errno; + free(newtctx); + return (ret); + } + + newtctx->tc_side = TCP4_SIDE_SERVER_WORK; + newtctx->tc_magic = TCP4_CTX_MAGIC; + *newctxp = newtctx; + + return (0); +} + +static int +tcp4_send(void *ctx, const unsigned char *data, size_t size) +{ /* Sends size bytes through proto_common_send(). Returns 0 or an errno value. */ + struct tcp4_ctx *tctx = ctx; + + assert(tctx != NULL); + assert(tctx->tc_magic == TCP4_CTX_MAGIC); + assert(tctx->tc_fd >= 0); + + return (proto_common_send(tctx->tc_fd, data, size)); +} + +static int +tcp4_recv(void *ctx, unsigned char *data, size_t size) +{ /* Receives through proto_common_recv(). Returns 0 or an errno value. */ + struct tcp4_ctx *tctx = ctx; + + assert(tctx != NULL); + assert(tctx->tc_magic == TCP4_CTX_MAGIC); + assert(tctx->tc_fd >= 0); + + return (proto_common_recv(tctx->tc_fd, data, size)); +} + +static int +tcp4_descriptor(const void *ctx) +{ /* Returns the socket descriptor of this context. */ + const struct tcp4_ctx *tctx = ctx; + + assert(tctx != NULL); + assert(tctx->tc_magic == TCP4_CTX_MAGIC); + + return (tctx->tc_fd); +} + +static void +sin2str(struct sockaddr_in *sinp, char *addr, size_t size) +{ + in_addr_t ip; + unsigned int port; + + assert(addr != NULL); + assert(sinp->sin_family == AF_INET); + + ip = ntohl(sinp->sin_addr.s_addr); + port =
ntohs(sinp->sin_port); + snprintf(addr, size, "tcp4://%u.%u.%u.%u:%u", ((ip >> 24) & 0xff), + ((ip >> 16) & 0xff), ((ip >> 8) & 0xff), (ip & 0xff), port); +} + +static bool +tcp4_address_match(const void *ctx, const char *addr) +{ /* True when the IPv4 address parsed from addr equals the address of the connected peer; ports are not compared. False when addr cannot be parsed or getpeername() fails. */ + const struct tcp4_ctx *tctx = ctx; + struct sockaddr_in sin; + socklen_t sinlen; + in_addr_t ip1, ip2; + + assert(tctx != NULL); + assert(tctx->tc_magic == TCP4_CTX_MAGIC); + + if (tcp4_addr(addr, &sin) != 0) + return (false); + ip1 = sin.sin_addr.s_addr; /* Saved before sin is reused for the peer address. */ + + sinlen = sizeof(sin); + if (getpeername(tctx->tc_fd, (struct sockaddr *)&sin, &sinlen) < 0) + return (false); + ip2 = sin.sin_addr.s_addr; + + return (ip1 == ip2); +} + +static void +tcp4_local_address(const void *ctx, char *addr, size_t size) +{ /* Writes the local socket address into addr in the tcp4 URL form produced by sin2str(), or N/A when getsockname() fails. */ + const struct tcp4_ctx *tctx = ctx; + struct sockaddr_in sin; + socklen_t sinlen; + + assert(tctx != NULL); + assert(tctx->tc_magic == TCP4_CTX_MAGIC); + + sinlen = sizeof(sin); + if (getsockname(tctx->tc_fd, (struct sockaddr *)&sin, &sinlen) < 0) { + strlcpy(addr, "N/A", size); + return; + } + sin2str(&sin, addr, size); +} + +static void +tcp4_remote_address(const void *ctx, char *addr, size_t size) +{ /* Writes the peer address into addr in the tcp4 URL form produced by sin2str(), or N/A when getpeername() fails. */ + const struct tcp4_ctx *tctx = ctx; + struct sockaddr_in sin; + socklen_t sinlen; + + assert(tctx != NULL); + assert(tctx->tc_magic == TCP4_CTX_MAGIC); + + sinlen = sizeof(sin); + if (getpeername(tctx->tc_fd, (struct sockaddr *)&sin, &sinlen) < 0) { + strlcpy(addr, "N/A", size); + return; + } + sin2str(&sin, addr, size); +} + +static void +tcp4_close(void *ctx) +{ /* Closes the socket if one is open, clears the magic and frees the context. */ + struct tcp4_ctx *tctx = ctx; + + assert(tctx != NULL); + assert(tctx->tc_magic == TCP4_CTX_MAGIC); + + if (tctx->tc_fd >= 0) + close(tctx->tc_fd); + tctx->tc_magic = 0; + free(tctx); +} + +static struct hast_proto tcp4_proto = { + .hp_name = "tcp4", + .hp_client = tcp4_client, + .hp_connect = tcp4_connect, + .hp_server = tcp4_server, + .hp_accept = tcp4_accept, + .hp_send = tcp4_send, + .hp_recv = tcp4_recv, + .hp_descriptor = tcp4_descriptor, + .hp_address_match =
tcp4_address_match, + .hp_local_address = tcp4_local_address, + .hp_remote_address = tcp4_remote_address, + .hp_close = tcp4_close +}; + +static __constructor void +tcp4_ctor(void) +{ /* Registers the tcp4 backend; carries the constructor attribute. */ + + proto_register(&tcp4_proto); +} diff --git a/sbin/hastd/proto_uds.c b/sbin/hastd/proto_uds.c new file mode 100644 index 0000000..0fac82f --- /dev/null +++ b/sbin/hastd/proto_uds.c @@ -0,0 +1,330 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE.
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/* UDS - UNIX Domain Socket */ + +#include <sys/un.h> + +#include <assert.h> +#include <errno.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> + +#include "hast.h" +#include "proto_impl.h" + +#define UDS_CTX_MAGIC 0xd541c +struct uds_ctx { /* State of one UNIX domain socket endpoint. */ + int uc_magic; /* UDS_CTX_MAGIC while the context is valid. */ + struct sockaddr_un uc_sun; /* Address parsed at setup; used by connect(), bind() and unlink(). */ + int uc_fd; /* Socket descriptor. */ + int uc_side; /* One of the UDS_SIDE_ constants below. */ +#define UDS_SIDE_CLIENT 0 +#define UDS_SIDE_SERVER_LISTEN 1 +#define UDS_SIDE_SERVER_WORK 2 +}; + +static void uds_close(void *ctx); + +static int +uds_addr(const char *addr, struct sockaddr_un *sunp) +{ /* Parses an optional uds:// or unix:// prefix followed by a path into *sunp. Returns 0, -1 when the address is not a UNIX socket one, or ENAMETOOLONG. */ + + if (addr == NULL) + return (-1); + + if (strncasecmp(addr, "uds://", 6) == 0) + addr += 6; + else if (strncasecmp(addr, "unix://", 7) == 0) + addr += 7; + else if (addr[0] == '/' && /* If it starts from /... */ + strstr(addr, "://") == NULL)/* ...and there is no prefix... */ + ; /* ...we assume it is us. */ + else + return (-1); + + sunp->sun_family = AF_UNIX; + if (strlcpy(sunp->sun_path, addr, sizeof(sunp->sun_path)) >= + sizeof(sunp->sun_path)) { + return (ENAMETOOLONG); + } + sunp->sun_len = SUN_LEN(sunp); + + return (0); +} + +static int +uds_common_setup(const char *addr, void **ctxp, int side) +{ /* Allocates a context, parses addr into uc_sun and creates the socket. Returns 0 with *ctxp set, -1 when addr is not a UNIX socket address, or an errno value. */ + struct uds_ctx *uctx; + int ret; + + uctx = malloc(sizeof(*uctx)); + if (uctx == NULL) + return (errno); + + /* Parse given address.
*/ + if ((ret = uds_addr(addr, &uctx->uc_sun)) != 0) { + free(uctx); + return (ret); + } + + uctx->uc_fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (uctx->uc_fd == -1) { + ret = errno; + free(uctx); + return (ret); + } + + uctx->uc_side = side; + uctx->uc_magic = UDS_CTX_MAGIC; + *ctxp = uctx; + + return (0); +} + +static int +uds_client(const char *addr, void **ctxp) +{ /* Client-side setup; the connection itself is made by uds_connect(). */ + + return (uds_common_setup(addr, ctxp, UDS_SIDE_CLIENT)); +} + +static int +uds_connect(void *ctx) +{ /* Connects the client socket to the path parsed at setup time. Returns 0 or an errno value. */ + struct uds_ctx *uctx = ctx; + + assert(uctx != NULL); + assert(uctx->uc_magic == UDS_CTX_MAGIC); + assert(uctx->uc_side == UDS_SIDE_CLIENT); + assert(uctx->uc_fd >= 0); + + if (connect(uctx->uc_fd, (struct sockaddr *)&uctx->uc_sun, + sizeof(uctx->uc_sun)) < 0) { + return (errno); + } + + return (0); +} + +static int +uds_server(const char *addr, void **ctxp) +{ /* Creates a listening socket: common setup, removal of any old socket file, bind() and listen(). Returns 0, -1 when addr is not a UNIX socket address, or an errno value; the context is released on failure. */ + struct uds_ctx *uctx; + int ret; + + ret = uds_common_setup(addr, ctxp, UDS_SIDE_SERVER_LISTEN); + if (ret != 0) + return (ret); + + uctx = *ctxp; + + unlink(uctx->uc_sun.sun_path); /* Remove a leftover socket file so bind() can create it. */ + if (bind(uctx->uc_fd, (struct sockaddr *)&uctx->uc_sun, + sizeof(uctx->uc_sun)) < 0) { + ret = errno; + uds_close(uctx); + return (ret); + } + if (listen(uctx->uc_fd, 8) < 0) { + ret = errno; + uds_close(uctx); + return (ret); + } + + return (0); +} + +static int +uds_accept(void *ctx, void **newctxp) +{ /* Accepts one connection on a listening context and returns a new UDS_SIDE_SERVER_WORK context in *newctxp. Returns 0 or an errno value. */ + struct uds_ctx *uctx = ctx; + struct uds_ctx *newuctx; + socklen_t fromlen; + int ret; + + assert(uctx != NULL); + assert(uctx->uc_magic == UDS_CTX_MAGIC); + assert(uctx->uc_side == UDS_SIDE_SERVER_LISTEN); + assert(uctx->uc_fd >= 0); + + newuctx = malloc(sizeof(*newuctx)); + if (newuctx == NULL) + return (errno); + + /* + * Receive the peer address into the new context. The listening + * context must keep the path it was bound to, because uds_close() + * uses uc_sun.sun_path; clearing the field first keeps sun_path a + * valid (empty) string when the peer has no name. + */ + memset(&newuctx->uc_sun, 0, sizeof(newuctx->uc_sun)); + fromlen = sizeof(newuctx->uc_sun); + newuctx->uc_fd = accept(uctx->uc_fd, + (struct sockaddr *)&newuctx->uc_sun, &fromlen); + if (newuctx->uc_fd < 0) { + ret = errno; + free(newuctx); + return (ret); + } + + newuctx->uc_side = UDS_SIDE_SERVER_WORK; + newuctx->uc_magic = UDS_CTX_MAGIC; + *newctxp = newuctx; + + return (0); +} + +static int
+uds_send(void *ctx, const unsigned char *data, size_t size) +{ /* Sends size bytes through proto_common_send(). Returns 0 or an errno value. */ + struct uds_ctx *uctx = ctx; + + assert(uctx != NULL); + assert(uctx->uc_magic == UDS_CTX_MAGIC); + assert(uctx->uc_fd >= 0); + + return (proto_common_send(uctx->uc_fd, data, size)); +} + +static int +uds_recv(void *ctx, unsigned char *data, size_t size) +{ /* Receives through proto_common_recv(). Returns 0 or an errno value. */ + struct uds_ctx *uctx = ctx; + + assert(uctx != NULL); + assert(uctx->uc_magic == UDS_CTX_MAGIC); + assert(uctx->uc_fd >= 0); + + return (proto_common_recv(uctx->uc_fd, data, size)); +} + +static int +uds_descriptor(const void *ctx) +{ /* Returns the socket descriptor of this context. */ + const struct uds_ctx *uctx = ctx; + + assert(uctx != NULL); + assert(uctx->uc_magic == UDS_CTX_MAGIC); + + return (uctx->uc_fd); +} + +static bool +uds_address_match(const void *ctx __unused, const char *addr __unused) +{ /* Unsupported on UNIX domain sockets: always aborts. */ + + assert(!"proto_address_match() not supported on UNIX domain sockets"); + abort(); +} + +static void +uds_local_address(const void *ctx, char *addr, size_t size) +{ /* Writes the local socket path into addr with a uds:// prefix, or N/A when getsockname() fails or the path is empty. */ + const struct uds_ctx *uctx = ctx; + struct sockaddr_un sun; + socklen_t sunlen; + + assert(uctx != NULL); + assert(uctx->uc_magic == UDS_CTX_MAGIC); + assert(addr != NULL); + + sunlen = sizeof(sun); + if (getsockname(uctx->uc_fd, (struct sockaddr *)&sun, &sunlen) < 0) { + strlcpy(addr, "N/A", size); + return; + } + assert(sun.sun_family == AF_UNIX); + if (sun.sun_path[0] == '\0') { /* The socket has no path to report. */ + strlcpy(addr, "N/A", size); + return; + } + snprintf(addr, size, "uds://%s", sun.sun_path); +} + +static void +uds_remote_address(const void *ctx, char *addr, size_t size) +{ /* Writes the peer socket path into addr with a uds:// prefix, or N/A when getpeername() fails or the path is empty. */ + const struct uds_ctx *uctx = ctx; + struct sockaddr_un sun; + socklen_t sunlen; + + assert(uctx != NULL); + assert(uctx->uc_magic == UDS_CTX_MAGIC); + assert(addr != NULL); + + sunlen = sizeof(sun); + if (getpeername(uctx->uc_fd, (struct sockaddr *)&sun, &sunlen) < 0) { + strlcpy(addr, "N/A", size); + return; + } + assert(sun.sun_family == AF_UNIX); + if (sun.sun_path[0] == '\0') { /* The peer has no path to report. */ + strlcpy(addr, "N/A", size); + return; + } + snprintf(addr, size, "uds://%s", sun.sun_path); +}
+
+/*
+ * Release a UDS protocol context: close its descriptor if one is open,
+ * unlink the path stored in uc_sun and free the context.
+ */
+static void
+uds_close(void *ctx)
+{
+	struct uds_ctx *uctx = ctx;
+
+	assert(uctx != NULL);
+	assert(uctx->uc_magic == UDS_CTX_MAGIC);
+
+	if (uctx->uc_fd >= 0)
+		close(uctx->uc_fd);
+	/*
+	 * NOTE(review): the path is unlinked unconditionally and the result is
+	 * ignored; confirm this is intended for every kind of context that can
+	 * reach this function, not only the one that created the socket file.
+	 */
+	unlink(uctx->uc_sun.sun_path);
+	/* Clear the magic so a stale pointer trips the asserts above. */
+	uctx->uc_magic = 0;
+	free(uctx);
+}
+
+/* Method table describing the "uds" protocol to the generic proto layer. */
+static struct hast_proto uds_proto = {
+	.hp_name = "uds",
+	.hp_client = uds_client,
+	.hp_connect = uds_connect,
+	.hp_server = uds_server,
+	.hp_accept = uds_accept,
+	.hp_send = uds_send,
+	.hp_recv = uds_recv,
+	.hp_descriptor = uds_descriptor,
+	.hp_address_match = uds_address_match,
+	.hp_local_address = uds_local_address,
+	.hp_remote_address = uds_remote_address,
+	.hp_close = uds_close
+};
+
+/* Registers uds_proto with proto_register(); marked __constructor. */
+static __constructor void
+uds_ctor(void)
+{
+
+	proto_register(&uds_proto);
+}
diff --git a/sbin/hastd/rangelock.c b/sbin/hastd/rangelock.c
new file mode 100644
index 0000000..02247d6
--- /dev/null
+++ b/sbin/hastd/rangelock.c
@@ -0,0 +1,137 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/queue.h>
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "rangelock.h"
+
+#define	RANGELOCKS_MAGIC	0x94310c
+struct rangelocks {
+	int	 rls_magic;		/* Magic value. */
+	TAILQ_HEAD(, rlock) rls_locks;	/* List of locked ranges. */
+};
+
+/* One locked range: [rl_start, rl_end). */
+struct rlock {
+	off_t	rl_start;
+	off_t	rl_end;
+	TAILQ_ENTRY(rlock) rl_next;
+};
+
+/*
+ * Allocate an empty set of range locks and hand it back through rlsp.
+ * Returns 0 on success and -1 when memory cannot be allocated.
+ */
+int
+rangelock_init(struct rangelocks **rlsp)
+{
+	struct rangelocks *locks;
+
+	assert(rlsp != NULL);
+
+	if ((locks = malloc(sizeof(*locks))) == NULL)
+		return (-1);
+
+	TAILQ_INIT(&locks->rls_locks);
+	locks->rls_magic = RANGELOCKS_MAGIC;
+
+	*rlsp = locks;
+	return (0);
+}
+
+/*
+ * Drop every range still on the list and release the set itself.
+ */
+void
+rangelock_free(struct rangelocks *rls)
+{
+	struct rlock *cur;
+
+	assert(rls->rls_magic == RANGELOCKS_MAGIC);
+
+	rls->rls_magic = 0;
+
+	while (!TAILQ_EMPTY(&rls->rls_locks)) {
+		cur = TAILQ_FIRST(&rls->rls_locks);
+		TAILQ_REMOVE(&rls->rls_locks, cur, rl_next);
+		free(cur);
+	}
+	free(rls);
+}
+
+/*
+ * Record the range starting at offset and spanning length bytes.
+ * Returns 0 on success and -1 when memory cannot be allocated.
+ */
+int
+rangelock_add(struct rangelocks *rls, off_t offset, off_t length)
+{
+	struct rlock *entry;
+
+	assert(rls->rls_magic == RANGELOCKS_MAGIC);
+
+	if ((entry = malloc(sizeof(*entry))) == NULL)
+		return (-1);
+	entry->rl_start = offset;
+	entry->rl_end = offset + length;
+	TAILQ_INSERT_TAIL(&rls->rls_locks, entry, rl_next);
+	return (0);
+}
+
+/*
+ * Remove the first recorded range that matches offset/length exactly.
+ * The range is expected to exist; this is enforced with an assertion.
+ */
+void
+rangelock_del(struct rangelocks *rls, off_t offset, off_t length)
+{
+	struct rlock *cur, *found;
+
+	assert(rls->rls_magic == RANGELOCKS_MAGIC);
+
+	found = NULL;
+	TAILQ_FOREACH(cur, &rls->rls_locks, rl_next) {
+		if (cur->rl_start != offset)
+			continue;
+		if (cur->rl_end != offset + length)
+			continue;
+		found = cur;
+		break;
+	}
+	assert(found != NULL);
+	TAILQ_REMOVE(&rls->rls_locks, found, rl_next);
+	free(found);
+}
+
+/*
+ * Report whether any recorded range intersects the given one.  A recorded
+ * range counts when it starts inside the queried range, ends inside it, or
+ * strictly encloses it.
+ */
+bool
+rangelock_islocked(struct rangelocks *rls, off_t offset, off_t length)
+{
+	struct rlock *cur;
+
+	assert(rls->rls_magic == RANGELOCKS_MAGIC);
+
+	TAILQ_FOREACH(cur, &rls->rls_locks, rl_next) {
+		if ((cur->rl_start >= offset &&
+		    cur->rl_start < offset + length) ||
+		    (cur->rl_end > offset &&
+		    cur->rl_end <= offset + length) ||
+		    (cur->rl_start < offset &&
+		    cur->rl_end > offset + length)) {
+			return (true);
+		}
+	}
+	return (false);
+}
diff --git a/sbin/hastd/rangelock.h b/sbin/hastd/rangelock.h
new file mode 100644
index 0000000..2ad9895
--- /dev/null
+++ b/sbin/hastd/rangelock.h
@@ -0,0 +1,46 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _RANGELOCK_H_ +#define _RANGELOCK_H_ + +#include <stdbool.h> +#include <unistd.h> + +struct rangelocks; + +int rangelock_init(struct rangelocks **rlsp); +void rangelock_free(struct rangelocks *rls); +int rangelock_add(struct rangelocks *rls, off_t offset, off_t length); +void rangelock_del(struct rangelocks *rls, off_t offset, off_t length); +bool rangelock_islocked(struct rangelocks *rls, off_t offset, off_t length); + +#endif /* !_RANGELOCK_H_ */ diff --git a/sbin/hastd/secondary.c b/sbin/hastd/secondary.c new file mode 100644 index 0000000..6af95b5 --- /dev/null +++ b/sbin/hastd/secondary.c @@ -0,0 +1,697 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/bio.h>
+#include <sys/disk.h>
+#include <sys/stat.h>
+
+#include <assert.h>
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libgeom.h>
+#include <pthread.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include <activemap.h>
+#include <nv.h>
+#include <pjdlog.h>
+
+#include "control.h"
+#include "hast.h"
+#include "hast_proto.h"
+#include "hastd.h"
+#include "metadata.h"
+#include "proto.h"
+#include "subr.h"
+#include "synch.h"
+
+/* One I/O request as it travels between the free, disk and send lists. */
+struct hio {
+	uint64_t	 hio_seq;	/* NOTE(review): not referenced in this file's visible code; the reply copies "seq" from hio_nv. */
+	int		 hio_error;	/* 0 or an errno value; sent back as "error". */
+	struct nv	*hio_nv;	/* Request header received from the primary. */
+	void		*hio_data;	/* MAXPHYS-byte data buffer. */
+	uint8_t		 hio_cmd;	/* HIO_* command taken from the header. */
+	uint64_t	 hio_offset;	/* "offset" field of the header. */
+	uint64_t	 hio_length;	/* "length" field of the header. */
+	TAILQ_ENTRY(hio) hio_next;	/* Linkage on whichever list holds it. */
+};
+
+/*
+ * Free list holds unused structures. When free list is empty, we have to wait
+ * until some in-progress requests are freed.
+ */
+static TAILQ_HEAD(, hio) hio_free_list;
+static pthread_mutex_t hio_free_list_lock;
+static pthread_cond_t hio_free_list_cond;
+/*
+ * Disk thread (the one that does I/O requests) takes requests from this list.
+ */
+static TAILQ_HEAD(, hio) hio_disk_list;
+static pthread_mutex_t hio_disk_list_lock;
+static pthread_cond_t hio_disk_list_cond;
+/*
+ * Send thread takes requests from this list and sends the replies back to
+ * the primary node.
+ */
+static TAILQ_HEAD(, hio) hio_send_list;
+static pthread_mutex_t hio_send_list_lock;
+static pthread_cond_t hio_send_list_cond;
+
+/*
+ * Maximum number of outstanding I/O requests.
+ */
+#define	HAST_HIO_MAX	256
+
+static void *recv_thread(void *arg);
+static void *disk_thread(void *arg);
+static void *send_thread(void *arg);
+
+/*
+ * Set up the three request lists with their locks and condition variables
+ * and fill the free list with HAST_HIO_MAX preallocated requests, each
+ * owning a MAXPHYS-byte data buffer.  Exits the process when memory cannot
+ * be allocated.
+ */
+static void
+init_environment(void)
+{
+	struct hio *hio;
+	unsigned int ii;
+
+	/*
+	 * Initialize lists, their locks and their condition variables.
+	 */
+	TAILQ_INIT(&hio_free_list);
+	mtx_init(&hio_free_list_lock);
+	cv_init(&hio_free_list_cond);
+	TAILQ_INIT(&hio_disk_list);
+	mtx_init(&hio_disk_list_lock);
+	cv_init(&hio_disk_list_cond);
+	TAILQ_INIT(&hio_send_list);
+	mtx_init(&hio_send_list_lock);
+	cv_init(&hio_send_list_cond);
+
+	/*
+	 * Allocate requests pool and initialize requests.
+	 */
+	for (ii = 0; ii < HAST_HIO_MAX; ii++) {
+		hio = malloc(sizeof(*hio));
+		if (hio == NULL) {
+			errx(EX_TEMPFAIL, "cannot allocate %zu bytes of memory "
+			    "for hio request", sizeof(*hio));
+		}
+		hio->hio_error = 0;
+		hio->hio_data = malloc(MAXPHYS);
+		if (hio->hio_data == NULL) {
+			errx(EX_TEMPFAIL, "cannot allocate %zu bytes of memory "
+			    "for gctl_data", (size_t)MAXPHYS);
+		}
+		TAILQ_INSERT_HEAD(&hio_free_list, hio, hio_next);
+	}
+}
+
+/*
+ * Read the metadata of the local component; exits with EX_NOINPUT when
+ * metadata_read() fails.
+ */
+static void
+init_local(struct hast_resource *res)
+{
+
+	if (metadata_read(res, true) < 0)
+		exit(EX_NOINPUT);
+}
+
+/*
+ * Answer the primary's handshake.
+ *
+ * Reads "resuid", "localcnt" and "remotecnt" from nvin, compares the counters
+ * with our own and replies over res->hr_remotein with "datasize",
+ * "extentsize", our counters, "mapsize" and either "syncsrc" plus the
+ * activemap or, on split-brain, "errmsg" and no map.  The process exits when
+ * the reply cannot be prepared or sent, and after reporting split-brain.
+ */
+static void
+init_remote(struct hast_resource *res, struct nv *nvin)
+{
+	uint64_t resuid;
+	struct nv *nvout;
+	unsigned char *map;
+	size_t mapsize;
+
+	map = NULL;
+	mapsize = 0;
+	nvout = nv_alloc();
+	nv_add_int64(nvout, (int64_t)res->hr_datasize, "datasize");
+	nv_add_int32(nvout, (int32_t)res->hr_extentsize, "extentsize");
+	resuid = nv_get_uint64(nvin, "resuid");
+	res->hr_primary_localcnt = nv_get_uint64(nvin, "localcnt");
+	res->hr_primary_remotecnt = nv_get_uint64(nvin, "remotecnt");
+	nv_add_uint64(nvout, res->hr_secondary_localcnt, "localcnt");
+	nv_add_uint64(nvout, res->hr_secondary_remotecnt, "remotecnt");
+	mapsize = activemap_calc_ondisk_size(res->hr_local_mediasize -
+	    METADATA_SIZE, res->hr_extentsize, res->hr_local_sectorsize);
+	map = malloc(mapsize);
+	if (map == NULL) {
+		pjdlog_exitx(EX_TEMPFAIL,
+		    "Unable to allocate memory (%zu bytes) for activemap.",
+		    mapsize);
+	}
+	nv_add_uint32(nvout, (uint32_t)mapsize, "mapsize");
+	/*
+	 * When we work as primary and secondary is missing we will increase
+	 * localcnt in our metadata. When secondary is connected and synced
+	 * we make localcnt be equal to remotecnt, which means nodes are more
+	 * or less in sync.
+	 * Split-brain condition is when both nodes are not able to communicate
+	 * and are both configured as primary nodes. In turn, they can both
+	 * make incompatible changes to the data and we have to detect that.
+	 * Under split-brain condition we will increase our localcnt on first
+	 * write and remote node will increase its localcnt on first write.
+	 * When we connect we can see that primary's localcnt is greater than
+	 * our remotecnt (primary was modified while we weren't watching) and
+	 * our localcnt is greater than primary's remotecnt (we were modified
+	 * while primary wasn't watching).
+	 * There are many possible combinations which are all gathered below.
+	 * Don't pay too much attention to exact numbers, the more important
+	 * is to compare them. We compare secondary's local with primary's
+	 * remote and secondary's remote with primary's local.
+	 * Note that every case where primary's localcnt is smaller than
+	 * secondary's remotecnt and where secondary's localcnt is smaller than
+	 * primary's remotecnt should be impossible in practise. We will perform
+	 * full synchronization then. Those cases are marked with an asterisk.
+	 * Regular synchronization means that only extents marked as dirty are
+	 * synchronized.
+	 *
+	 * SECONDARY METADATA	PRIMARY METADATA
+	 * local=3 remote=3	local=2 remote=2*  ?! Full sync from secondary.
+	 * local=3 remote=3	local=2 remote=3*  ?! Full sync from primary.
+	 * local=3 remote=3	local=2 remote=4*  ?! Full sync from primary.
+	 * local=3 remote=3	local=3 remote=2   Primary is out-of-date,
+	 *					   regular sync from secondary.
+	 * local=3 remote=3	local=3 remote=3   Regular sync just in case.
+	 * local=3 remote=3	local=3 remote=4*  ?! Full sync from primary.
+	 * local=3 remote=3	local=4 remote=2   Split-brain condition.
+	 * local=3 remote=3	local=4 remote=3   Secondary out-of-date,
+	 *					   regular sync from primary.
+	 * local=3 remote=3	local=4 remote=4*  ?! Full sync from primary.
+	 */
+	if (res->hr_resuid == 0) {
+		/*
+		 * Provider is used for the first time. Initialize everything.
+		 */
+		assert(res->hr_secondary_localcnt == 0);
+		res->hr_resuid = resuid;
+		if (metadata_write(res) < 0)
+			exit(EX_NOINPUT);
+		memset(map, 0xff, mapsize);
+		nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc");
+	} else if (
+	    /* Is primary out-of-date? */
+	    (res->hr_secondary_localcnt > res->hr_primary_remotecnt &&
+	     res->hr_secondary_remotecnt == res->hr_primary_localcnt) ||
+	    /* Are nodes more or less in sync? */
+	    (res->hr_secondary_localcnt == res->hr_primary_remotecnt &&
+	     res->hr_secondary_remotecnt == res->hr_primary_localcnt) ||
+	    /* Is secondary out-of-date? */
+	    (res->hr_secondary_localcnt == res->hr_primary_remotecnt &&
+	     res->hr_secondary_remotecnt < res->hr_primary_localcnt)) {
+		/*
+		 * Nodes are more or less in sync or one of the nodes is
+		 * out-of-date.
+		 * It doesn't matter at this point which one, we just have to
+		 * send out local bitmap to the remote node.
+		 */
+		if (pread(res->hr_localfd, map, mapsize, METADATA_SIZE) !=
+		    (ssize_t)mapsize) {
+			/*
+			 * pjdlog_exit() takes an exit code first, as every
+			 * other call in this file shows; LOG_ERR was a log
+			 * level passed by mistake.
+			 */
+			pjdlog_exit(EX_NOINPUT, "Unable to read activemap");
+		}
+		if (res->hr_secondary_localcnt > res->hr_primary_remotecnt &&
+		     res->hr_secondary_remotecnt == res->hr_primary_localcnt) {
+			/* Primary is out-of-date, sync from secondary. */
+			nv_add_uint8(nvout, HAST_SYNCSRC_SECONDARY, "syncsrc");
+		} else {
+			/*
+			 * Secondary is out-of-date or counts match.
+			 * Sync from primary.
+			 */
+			nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc");
+		}
+	} else if (res->hr_secondary_localcnt > res->hr_primary_remotecnt &&
+	     res->hr_primary_localcnt > res->hr_secondary_remotecnt) {
+		/*
+		 * Not good, we have split-brain condition.
+		 */
+		pjdlog_error("Split-brain detected, exiting.");
+		nv_add_string(nvout, "Split-brain condition!", "errmsg");
+		free(map);
+		map = NULL;
+		mapsize = 0;
+	} else /* if (res->hr_secondary_localcnt < res->hr_primary_remotecnt ||
+	    res->hr_primary_localcnt < res->hr_secondary_remotecnt) */ {
+		/*
+		 * This should never happen in practise, but we will perform
+		 * full synchronization.
+		 */
+		assert(res->hr_secondary_localcnt < res->hr_primary_remotecnt ||
+		    res->hr_primary_localcnt < res->hr_secondary_remotecnt);
+		/* mapsize still holds the value computed above. */
+		memset(map, 0xff, mapsize);
+		if (res->hr_secondary_localcnt > res->hr_primary_remotecnt) {
+			/* In this one of five cases sync from secondary. */
+			nv_add_uint8(nvout, HAST_SYNCSRC_SECONDARY, "syncsrc");
+		} else {
+			/* For the rest four cases sync from primary. */
+			nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc");
+		}
+		pjdlog_warning("This should never happen, asking for full synchronization (primary(local=%ju, remote=%ju), secondary(local=%ju, remote=%ju)).",
+		    (uintmax_t)res->hr_primary_localcnt,
+		    (uintmax_t)res->hr_primary_remotecnt,
+		    (uintmax_t)res->hr_secondary_localcnt,
+		    (uintmax_t)res->hr_secondary_remotecnt);
+	}
+	if (hast_proto_send(res, res->hr_remotein, nvout, map, mapsize) < 0) {
+		pjdlog_errno(LOG_WARNING, "Unable to send activemap to %s",
+		    res->hr_remoteaddr);
+		nv_free(nvout);
+		exit(EX_TEMPFAIL);
+	}
+	/*
+	 * The reply is on the wire; neither the header nor the map is used
+	 * again, so release them instead of leaking them for the lifetime of
+	 * the worker.  map is already NULL in the split-brain case.
+	 */
+	nv_free(nvout);
+	free(map);
+	if (res->hr_secondary_localcnt > res->hr_primary_remotecnt &&
+	     res->hr_primary_localcnt > res->hr_secondary_remotecnt) {
+		/* Exit on split-brain. */
+		exit(EX_CONFIG);
+	}
+}
+
+/*
+ * Start the secondary worker for the given resource.
+ *
+ * Creates the control channel and forks.  The parent closes its copies of
+ * the remote connections, records the worker pid and returns.  The child
+ * initializes the local component, answers the primary's handshake, starts
+ * the recv, disk and send threads and then runs ctrl_thread() itself.
+ */
+void
+hastd_secondary(struct hast_resource *res, struct nv *nvin)
+{
+	pthread_t td;
+	pid_t pid;
+	int error;
+
+	/*
+	 * Create communication channel between parent and child.
+	 */
+	if (proto_client("socketpair://", &res->hr_ctrl) < 0) {
+		KEEP_ERRNO((void)pidfile_remove(pfh));
+		pjdlog_exit(EX_OSERR,
+		    "Unable to create control sockets between parent and child");
+	}
+
+	pid = fork();
+	if (pid < 0) {
+		KEEP_ERRNO((void)pidfile_remove(pfh));
+		pjdlog_exit(EX_OSERR, "Unable to fork");
+	}
+
+	if (pid > 0) {
+		/* This is parent. */
+		proto_close(res->hr_remotein);
+		res->hr_remotein = NULL;
+		proto_close(res->hr_remoteout);
+		res->hr_remoteout = NULL;
+		res->hr_workerpid = pid;
+		return;
+	}
+	(void)pidfile_close(pfh);
+
+	setproctitle("%s (secondary)", res->hr_name);
+
+	init_local(res);
+	init_remote(res, nvin);
+	init_environment();
+
+	error = pthread_create(&td, NULL, recv_thread, res);
+	assert(error == 0);
+	error = pthread_create(&td, NULL, disk_thread, res);
+	assert(error == 0);
+	error = pthread_create(&td, NULL, send_thread, res);
+	assert(error == 0);
+	(void)ctrl_thread(res);
+}
+
+/*
+ * Log a message about a request: the caller's formatted prefix followed by
+ * a description of the request (command name plus offset and length where
+ * the command has them).  The description is left out when the prefix alone
+ * does not fit in the buffer or could not be formatted.
+ */
+static void
+reqlog(int loglevel, int debuglevel, int error, struct hio *hio, const char *fmt, ...)
+{
+	char msg[1024];
+	va_list ap;
+	int len;
+
+	va_start(ap, fmt);
+	len = vsnprintf(msg, sizeof(msg), fmt, ap);
+	va_end(ap);
+	/* A negative len becomes a huge size_t and is rejected here as well. */
+	if ((size_t)len < sizeof(msg)) {
+		switch (hio->hio_cmd) {
+		case HIO_READ:
+			(void)snprintf(msg + len, sizeof(msg) - len,
+			    "READ(%ju, %ju).", (uintmax_t)hio->hio_offset,
+			    (uintmax_t)hio->hio_length);
+			break;
+		case HIO_DELETE:
+			(void)snprintf(msg + len, sizeof(msg) - len,
+			    "DELETE(%ju, %ju).", (uintmax_t)hio->hio_offset,
+			    (uintmax_t)hio->hio_length);
+			break;
+		case HIO_FLUSH:
+			(void)snprintf(msg + len, sizeof(msg) - len, "FLUSH.");
+			break;
+		case HIO_WRITE:
+			(void)snprintf(msg + len, sizeof(msg) - len,
+			    "WRITE(%ju, %ju).", (uintmax_t)hio->hio_offset,
+			    (uintmax_t)hio->hio_length);
+			break;
+		default:
+			(void)snprintf(msg + len, sizeof(msg) - len,
+			    "UNKNOWN(%u).", (unsigned int)hio->hio_cmd);
+			break;
+		}
+	}
+	pjdlog_common(loglevel, debuglevel, error, "%s", msg);
+}
+
+/*
+ * Extract and validate the request described by hio->hio_nv.
+ *
+ * Fills in hio_cmd and, for READ, WRITE and DELETE, hio_offset and
+ * hio_length.  Returns 0 when the request is acceptable; otherwise logs the
+ * reason, stores EINVAL in hio->hio_error and returns it.
+ */
+static int
+requnpack(struct hast_resource *res, struct hio *hio)
+{
+	uint64_t datasize;
+
+	hio->hio_cmd = nv_get_uint8(hio->hio_nv, "cmd");
+	if (hio->hio_cmd == 0) {
+		pjdlog_error("Header contains no 'cmd' field.");
+		hio->hio_error = EINVAL;
+		goto end;
+	}
+	switch (hio->hio_cmd) {
+	case HIO_FLUSH:
+		/*
+		 * disk_thread() and send_thread() both handle FLUSH, and
+		 * neither uses an offset or a length for it, so there is
+		 * nothing more to validate.
+		 */
+		break;
+	case HIO_READ:
+	case HIO_WRITE:
+	case HIO_DELETE:
+		hio->hio_offset = nv_get_uint64(hio->hio_nv, "offset");
+		if (nv_error(hio->hio_nv) != 0) {
+			pjdlog_error("Header is missing 'offset' field.");
+			hio->hio_error = EINVAL;
+			goto end;
+		}
+		hio->hio_length = nv_get_uint64(hio->hio_nv, "length");
+		if (nv_error(hio->hio_nv) != 0) {
+			pjdlog_error("Header is missing 'length' field.");
+			hio->hio_error = EINVAL;
+			goto end;
+		}
+		if (hio->hio_length == 0) {
+			pjdlog_error("Data length is zero.");
+			hio->hio_error = EINVAL;
+			goto end;
+		}
+		if (hio->hio_length > MAXPHYS) {
+			pjdlog_error("Data length is too large (%ju > %ju).",
+			    (uintmax_t)hio->hio_length, (uintmax_t)MAXPHYS);
+			hio->hio_error = EINVAL;
+			goto end;
+		}
+		if ((hio->hio_offset % res->hr_local_sectorsize) != 0) {
+			pjdlog_error("Offset %ju is not multiple of sector size.",
+			    (uintmax_t)hio->hio_offset);
+			hio->hio_error = EINVAL;
+			goto end;
+		}
+		if ((hio->hio_length % res->hr_local_sectorsize) != 0) {
+			pjdlog_error("Length %ju is not multiple of sector size.",
+			    (uintmax_t)hio->hio_length);
+			hio->hio_error = EINVAL;
+			goto end;
+		}
+		/*
+		 * Compare without adding offset and length first: the offset
+		 * comes from the peer and the sum could wrap around.
+		 */
+		datasize = (uint64_t)res->hr_datasize;
+		if (hio->hio_length > datasize ||
+		    hio->hio_offset > datasize - hio->hio_length) {
+			pjdlog_error("Data offset is too large (%ju > %ju).",
+			    (uintmax_t)(hio->hio_offset + hio->hio_length),
+			    (uintmax_t)res->hr_datasize);
+			hio->hio_error = EINVAL;
+			goto end;
+		}
+		break;
+	default:
+		pjdlog_error("Header contains invalid 'cmd' (%hhu).",
+		    hio->hio_cmd);
+		hio->hio_error = EINVAL;
+		goto end;
+	}
+	hio->hio_error = 0;
+end:
+	return (hio->hio_error);
+}
+
+/*
+ * Thread receives requests from the primary node.
+ */
+static void *
+recv_thread(void *arg)
+{
+	struct hast_resource *res = arg;
+	struct hio *hio;
+	bool wakeup;
+
+	for (;;) {
+		/* Take a free request, sleeping until one is available. */
+		pjdlog_debug(2, "recv: Taking free request.");
+		mtx_lock(&hio_free_list_lock);
+		while ((hio = TAILQ_FIRST(&hio_free_list)) == NULL) {
+			pjdlog_debug(2, "recv: No free requests, waiting.");
+			cv_wait(&hio_free_list_cond, &hio_free_list_lock);
+		}
+		TAILQ_REMOVE(&hio_free_list, hio, hio_next);
+		mtx_unlock(&hio_free_list_lock);
+		pjdlog_debug(2, "recv: (%p) Got request.", hio);
+		if (hast_proto_recv_hdr(res->hr_remotein, &hio->hio_nv) < 0) {
+			pjdlog_exit(EX_TEMPFAIL,
+			    "Unable to receive request header");
+		}
+		/*
+		 * A request that fails validation skips the disk thread and
+		 * goes straight to the send thread, which reports hio_error.
+		 */
+		if (requnpack(res, hio) != 0)
+			goto send_queue;
+		reqlog(LOG_DEBUG, 2, -1, hio,
+		    "recv: (%p) Got request header: ", hio);
+		if (hio->hio_cmd == HIO_WRITE) {
+			/* Only WRITE requests are followed by a payload. */
+			if (hast_proto_recv_data(res, res->hr_remotein,
+			    hio->hio_nv, hio->hio_data, MAXPHYS) < 0) {
+				pjdlog_exit(EX_TEMPFAIL,
+				    "Unable to receive request data");
+			}
+		}
+		pjdlog_debug(2, "recv: (%p) Moving request to the disk queue.",
+		    hio);
+		mtx_lock(&hio_disk_list_lock);
+		/* The consumer only needs waking when the list was empty. */
+		wakeup = TAILQ_EMPTY(&hio_disk_list);
+		TAILQ_INSERT_TAIL(&hio_disk_list, hio, hio_next);
+		mtx_unlock(&hio_disk_list_lock);
+		if (wakeup)
+			cv_signal(&hio_disk_list_cond);
+		continue;
+send_queue:
+		pjdlog_debug(2, "recv: (%p) Moving request to the send queue.",
+		    hio);
+		mtx_lock(&hio_send_list_lock);
+		wakeup = TAILQ_EMPTY(&hio_send_list);
+		TAILQ_INSERT_TAIL(&hio_send_list, hio, hio_next);
+		mtx_unlock(&hio_send_list_lock);
+		if (wakeup)
+			cv_signal(&hio_send_list_cond);
+	}
+	/* NOTREACHED */
+	return (NULL);
+}
+
+/*
+ * Thread reads from or writes to local component and also handles DELETE and
+ * FLUSH requests.
+ */
+static void *
+disk_thread(void *arg)
+{
+	struct hast_resource *res = arg;
+	struct hio *hio;
+	ssize_t ret;
+	bool clear_activemap, wakeup;
+
+	clear_activemap = true;
+
+	for (;;) {
+		pjdlog_debug(2, "disk: Taking request.");
+		mtx_lock(&hio_disk_list_lock);
+		while ((hio = TAILQ_FIRST(&hio_disk_list)) == NULL) {
+			pjdlog_debug(2, "disk: No requests, waiting.");
+			cv_wait(&hio_disk_list_cond, &hio_disk_list_lock);
+		}
+		TAILQ_REMOVE(&hio_disk_list, hio, hio_next);
+		mtx_unlock(&hio_disk_list_lock);
+		/*
+		 * This loop runs its body at most once per request: it is
+		 * left with "break" on failure (so the clearing is retried
+		 * on the next request) and by clearing the flag on success.
+		 */
+		while (clear_activemap) {
+			unsigned char *map;
+			size_t mapsize;
+
+			/*
+			 * When first request is received, it means that primary
+			 * already received our activemap, merged it and stored
+			 * locally. We can now safely clear our activemap.
+			 */
+			mapsize =
+			    activemap_calc_ondisk_size(res->hr_local_mediasize -
+			    METADATA_SIZE, res->hr_extentsize,
+			    res->hr_local_sectorsize);
+			map = calloc(1, mapsize);
+			if (map == NULL) {
+				pjdlog_warning("Unable to allocate memory to clear local activemap.");
+				break;
+			}
+			if (pwrite(res->hr_localfd, map, mapsize,
+			    METADATA_SIZE) != (ssize_t)mapsize) {
+				pjdlog_errno(LOG_WARNING,
+				    "Unable to store cleared activemap");
+				free(map);
+				break;
+			}
+			free(map);
+			clear_activemap = false;
+			pjdlog_debug(1, "Local activemap cleared.");
+		}
+		reqlog(LOG_DEBUG, 2, -1, hio, "disk: (%p) Got request: ", hio);
+		/* Handle the actual request. */
+		switch (hio->hio_cmd) {
+		case HIO_READ:
+			ret = pread(res->hr_localfd, hio->hio_data,
+			    hio->hio_length,
+			    hio->hio_offset + res->hr_localoff);
+			if (ret < 0)
+				hio->hio_error = errno;
+			else if (ret != (int64_t)hio->hio_length)
+				hio->hio_error = EIO;
+			else
+				hio->hio_error = 0;
+			break;
+		case HIO_WRITE:
+			ret = pwrite(res->hr_localfd, hio->hio_data,
+			    hio->hio_length,
+			    hio->hio_offset + res->hr_localoff);
+			if (ret < 0)
+				hio->hio_error = errno;
+			else if (ret != (int64_t)hio->hio_length)
+				hio->hio_error = EIO;
+			else
+				hio->hio_error = 0;
+			break;
+		case HIO_DELETE:
+			ret = g_delete(res->hr_localfd,
+			    hio->hio_offset + res->hr_localoff,
+			    hio->hio_length);
+			if (ret < 0)
+				hio->hio_error = errno;
+			else
+				hio->hio_error = 0;
+			break;
+		case HIO_FLUSH:
+			ret = g_flush(res->hr_localfd);
+			if (ret < 0)
+				hio->hio_error = errno;
+			else
+				hio->hio_error = 0;
+			break;
+		}
+		if (hio->hio_error != 0) {
+			reqlog(LOG_ERR, 0, hio->hio_error, hio,
+			    "Request failed: ");
+		}
+		pjdlog_debug(2, "disk: (%p) Moving request to the send queue.",
+		    hio);
+		mtx_lock(&hio_send_list_lock);
+		wakeup = TAILQ_EMPTY(&hio_send_list);
+		TAILQ_INSERT_TAIL(&hio_send_list, hio, hio_next);
+		mtx_unlock(&hio_send_list_lock);
+		if (wakeup)
+			cv_signal(&hio_send_list_cond);
+	}
+	/* NOTREACHED */
+	return (NULL);
+}
+
+/*
+ * Thread sends requests back to primary node.
+ *
+ * Each reply carries the request's "seq", an "error" field when hio_error is
+ * non-zero, and the data buffer for a successful READ.  The request is then
+ * returned to the free list.
+ */
+static void *
+send_thread(void *arg)
+{
+	struct hast_resource *res = arg;
+	struct nv *nvout;
+	struct hio *hio;
+	void *data;
+	size_t length;
+	bool wakeup;
+
+	for (;;) {
+		pjdlog_debug(2, "send: Taking request.");
+		mtx_lock(&hio_send_list_lock);
+		while ((hio = TAILQ_FIRST(&hio_send_list)) == NULL) {
+			pjdlog_debug(2, "send: No requests, waiting.");
+			cv_wait(&hio_send_list_cond, &hio_send_list_lock);
+		}
+		TAILQ_REMOVE(&hio_send_list, hio, hio_next);
+		mtx_unlock(&hio_send_list_lock);
+		reqlog(LOG_DEBUG, 2, -1, hio, "send: (%p) Got request: ", hio);
+		nvout = nv_alloc();
+		/* Copy sequence number. */
+		nv_add_uint64(nvout, nv_get_uint64(hio->hio_nv, "seq"), "seq");
+		switch (hio->hio_cmd) {
+		case HIO_READ:
+			if (hio->hio_error == 0) {
+				data = hio->hio_data;
+				length = hio->hio_length;
+				break;
+			}
+			/*
+			 * We send no data in case of an error.
+			 */
+			/* FALLTHROUGH */
+		case HIO_DELETE:
+		case HIO_FLUSH:
+		case HIO_WRITE:
+			data = NULL;
+			length = 0;
+			break;
+		default:
+			/*
+			 * recv_thread() queues requests that requnpack()
+			 * rejected for a missing or unknown command, with
+			 * hio_error set.  Report that error to the primary
+			 * instead of aborting the worker on bad input.  An
+			 * unknown command without an error would be a local
+			 * logic bug, so that case still aborts.
+			 */
+			if (hio->hio_error == 0)
+				abort();
+			data = NULL;
+			length = 0;
+			break;
+		}
+		if (hio->hio_error != 0)
+			nv_add_int16(nvout, hio->hio_error, "error");
+		if (hast_proto_send(res, res->hr_remoteout, nvout, data,
+		    length) < 0) {
+			pjdlog_exit(EX_TEMPFAIL, "Unable to send reply.");
+		}
+		nv_free(nvout);
+		pjdlog_debug(2, "send: (%p) Moving request to the free queue.",
+		    hio);
+		/* The request header is no longer needed once replied to. */
+		nv_free(hio->hio_nv);
+		hio->hio_error = 0;
+		mtx_lock(&hio_free_list_lock);
+		wakeup = TAILQ_EMPTY(&hio_free_list);
+		TAILQ_INSERT_TAIL(&hio_free_list, hio, hio_next);
+		mtx_unlock(&hio_free_list_lock);
+		if (wakeup)
+			cv_signal(&hio_free_list_cond);
+	}
+	/* NOTREACHED */
+	return (NULL);
+}
diff --git a/sbin/hastd/subr.c b/sbin/hastd/subr.c
new file mode 100644
index 0000000..16ea93f
--- /dev/null
+++ b/sbin/hastd/subr.c
@@ -0,0 +1,118 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/disk.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+
+#include <pjdlog.h>
+
+#include "hast.h"
+#include "subr.h"
+
+/*
+ * Open the local component of the resource (unless it is open already) and
+ * fill in its media size and sector size.  Character devices are queried
+ * with ioctls; regular files use their length and a 512-byte sector.
+ * Returns 0 on success; on failure logs the reason and returns -1 with
+ * errno set.
+ */
+int
+provinfo(struct hast_resource *res, bool dowrite)
+{
+	struct stat st;
+	int oflags;
+
+	assert(res->hr_localpath != NULL && res->hr_localpath[0] != '\0');
+
+	if (res->hr_localfd == -1) {
+		oflags = dowrite ? O_RDWR : O_RDONLY;
+		res->hr_localfd = open(res->hr_localpath, oflags);
+		if (res->hr_localfd < 0) {
+			KEEP_ERRNO(pjdlog_errno(LOG_ERR, "Unable to open %s",
+			    res->hr_localpath));
+			return (-1);
+		}
+	}
+	if (fstat(res->hr_localfd, &st) < 0) {
+		KEEP_ERRNO(pjdlog_errno(LOG_ERR, "Unable to stat %s",
+		    res->hr_localpath));
+		return (-1);
+	}
+
+	switch (st.st_mode & S_IFMT) {
+	case S_IFCHR:
+		/*
+		 * If this is character device, it is most likely GEOM provider.
+		 */
+		if (ioctl(res->hr_localfd, DIOCGMEDIASIZE,
+		    &res->hr_local_mediasize) < 0) {
+			KEEP_ERRNO(pjdlog_errno(LOG_ERR,
+			    "Unable obtain provider %s mediasize",
+			    res->hr_localpath));
+			return (-1);
+		}
+		if (ioctl(res->hr_localfd, DIOCGSECTORSIZE,
+		    &res->hr_local_sectorsize) < 0) {
+			KEEP_ERRNO(pjdlog_errno(LOG_ERR,
+			    "Unable obtain provider %s sectorsize",
+			    res->hr_localpath));
+			return (-1);
+		}
+		break;
+	case S_IFREG:
+		/*
+		 * We also support regular files for which we hardcode
+		 * sector size of 512 bytes.
+		 */
+		res->hr_local_mediasize = st.st_size;
+		res->hr_local_sectorsize = 512;
+		break;
+	default:
+		/*
+		 * We support no other file types.
+		 */
+		pjdlog_error("%s is neither GEOM provider nor regular file.",
+		    res->hr_localpath);
+		errno = EFTYPE;
+		return (-1);
+	}
+	return (0);
+}
+
+/*
+ * Map a HAST_ROLE_* value to its name; unrecognized values yield "unknown".
+ */
+const char *
+role2str(int role)
+{
+	const char *name;
+
+	switch (role) {
+	case HAST_ROLE_INIT:
+		name = "init";
+		break;
+	case HAST_ROLE_PRIMARY:
+		name = "primary";
+		break;
+	case HAST_ROLE_SECONDARY:
+		name = "secondary";
+		break;
+	default:
+		name = "unknown";
+		break;
+	}
+	return (name);
+}
diff --git a/sbin/hastd/subr.h b/sbin/hastd/subr.h
new file mode 100644
index 0000000..c486f5c
--- /dev/null
+++ b/sbin/hastd/subr.h
@@ -0,0 +1,51 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SUBR_H_ +#define _SUBR_H_ + +#include <sys/types.h> +#include <stdbool.h> + +#include "hast.h" + +#define KEEP_ERRNO(work) do { \ + int _rerrno; \ + \ + _rerrno = errno; \ + work; \ + errno = _rerrno; \ +} while (0) + +int provinfo(struct hast_resource *res, bool dowrite); +const char *role2str(int role); + +#endif /* !_SUBR_H_ */ diff --git a/sbin/hastd/synch.h b/sbin/hastd/synch.h new file mode 100644 index 0000000..7269aea --- /dev/null +++ b/sbin/hastd/synch.h @@ -0,0 +1,162 @@ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYNCH_H_ +#define _SYNCH_H_ + +#include <assert.h> +#include <pthread.h> +#include <stdbool.h> +#include <time.h> + +static __inline void +mtx_init(pthread_mutex_t *lock) +{ + int error; + + error = pthread_mutex_init(lock, NULL); + assert(error == 0); +} +static __inline void +mtx_lock(pthread_mutex_t *lock) +{ + int error; + + error = pthread_mutex_lock(lock); + assert(error == 0); +} +static __inline bool +mtx_trylock(pthread_mutex_t *lock) +{ + int error; + + error = pthread_mutex_trylock(lock); + assert(error == 0 || error == EBUSY); + return (error == 0); +} +static __inline void +mtx_unlock(pthread_mutex_t *lock) +{ + int error; + + error = pthread_mutex_unlock(lock); + assert(error == 0); +} + +static __inline void +rw_init(pthread_rwlock_t *lock) +{ + int error; + + error = pthread_rwlock_init(lock, NULL); + assert(error == 0); +} +static __inline void +rw_rlock(pthread_rwlock_t *lock) +{ + int error; + + error = pthread_rwlock_rdlock(lock); + 
assert(error == 0); +} +static __inline void +rw_wlock(pthread_rwlock_t *lock) +{ + int error; + + error = pthread_rwlock_wrlock(lock); + assert(error == 0); +} +static __inline void +rw_unlock(pthread_rwlock_t *lock) +{ + int error; + + error = pthread_rwlock_unlock(lock); + assert(error == 0); +} + +static __inline void +cv_init(pthread_cond_t *cv) +{ + pthread_condattr_t attr; + int error; + + error = pthread_condattr_init(&attr); + assert(error == 0); + error = pthread_condattr_setclock(&attr, CLOCK_MONOTONIC); + assert(error == 0); + error = pthread_cond_init(cv, &attr); + assert(error == 0); +} +static __inline void +cv_wait(pthread_cond_t *cv, pthread_mutex_t *lock) +{ + int error; + + error = pthread_cond_wait(cv, lock); + assert(error == 0); +} +static __inline bool +cv_timedwait(pthread_cond_t *cv, pthread_mutex_t *lock, int timeout) +{ + struct timespec ts; + int error; + + if (timeout == 0) { + cv_wait(cv, lock); + return (false); + } + + error = clock_gettime(CLOCK_MONOTONIC, &ts); + assert(error == 0); + ts.tv_sec += timeout; + error = pthread_cond_timedwait(cv, lock, &ts); + assert(error == 0 || error == ETIMEDOUT); + return (error == ETIMEDOUT); +} +static __inline void +cv_signal(pthread_cond_t *cv) +{ + int error; + + error = pthread_cond_signal(cv); + assert(error == 0); +} +static __inline void +cv_broadcast(pthread_cond_t *cv) +{ + int error; + + error = pthread_cond_broadcast(cv); + assert(error == 0); +} +#endif /* !_SYNCH_H_ */ diff --git a/sbin/hastd/token.l b/sbin/hastd/token.l new file mode 100644 index 0000000..7b80384 --- /dev/null +++ b/sbin/hastd/token.l @@ -0,0 +1,66 @@ +%{ +/*- + * Copyright (c) 2009-2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Pawel Jakub Dawidek under sponsorship from + * the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <stdio.h> +#include <string.h> + +#include "hast.h" + +#include "y.tab.h" + +int depth; +int lineno; + +#define DP do { } while (0) +%} + +%% +control { DP; return CONTROL; } +listen { DP; return LISTEN; } +port { DP; return PORT; } +replication { DP; return REPLICATION; } +resource { DP; return RESOURCE; } +name { DP; return NAME; } +local { DP; return LOCAL; } +remote { DP; return REMOTE; } +on { DP; return ON; } +fullsync { DP; return FULLSYNC; } +memsync { DP; return MEMSYNC; } +async { DP; return ASYNC; } +[0-9]+ { DP; yylval.num = atoi(yytext); return NUM; } +[a-zA-Z0-9\.\-_/\:]+ { DP; yylval.str = strdup(yytext); return STR; } +\{ { DP; depth++; return OB; } +\} { DP; depth--; return CB; } +#.*$ /* ignore comments */; +\n { lineno++; } +[ \t]+ /* ignore whitespace */; +%% diff --git a/share/examples/Makefile b/share/examples/Makefile index 315eb91..99d92c0 100644 --- a/share/examples/Makefile +++ b/share/examples/Makefile @@ -13,6 +13,7 @@ LDIRS= BSD_daemon \ drivers \ etc \ find_interface \ + hast \ ibcs2 \ ipfw \ kld \ @@ -69,6 +70,11 @@ XFILES= BSD_daemon/FreeBSD.pfa \ find_interface/Makefile \ find_interface/README \ find_interface/find_interface.c \ + hast/ucarp.sh \ + hast/ucarp_down.sh \ + hast/ucarp_up.sh \ + hast/vip-down.sh \ + hast/vip-up.sh \ ibcs2/README \ ibcs2/hello.uu \ ipfw/change_rules.sh \ diff --git a/share/examples/hast/ucarp.sh b/share/examples/hast/ucarp.sh new file mode 100755 index 0000000..6a02c89 --- /dev/null +++ b/share/examples/hast/ucarp.sh @@ -0,0 +1,69 @@ +#!/bin/sh +# +# Copyright (c) 2010 The FreeBSD Foundation +# All rights reserved. +# +# This software was developed by Pawel Jakub Dawidek under sponsorship from +# the FreeBSD Foundation. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. 
Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# $FreeBSD$ + +# Shared IP address, unused for now. +addr="10.99.0.3" +# Password for UCARP communication. +pass="password" +# First node IP and interface for UCARP communication. +nodea_srcip="10.99.0.1" +nodea_ifnet="bge0" +# Second node IP and interface for UCARP communication. +nodeb_srcip="10.99.0.2" +nodeb_ifnet="em3" + +export PATH=/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin + +vhid="1" +upscript="/root/hast/sbin/hastd/vip-up.sh" +downscript="/root/hast/sbin/hastd/vip-down.sh" + +ifconfig "${nodea_ifnet}" 2>/dev/null | grep -q "inet ${nodea_srcip} " +if [ $? -eq 0 ]; then + srcip="${nodea_srcip}" + ifnet="${nodea_ifnet}" + node="node A" +fi +ifconfig "${nodeb_ifnet}" 2>/dev/null | grep -q "inet ${nodeb_srcip} " +if [ $? -eq 0 ]; then + if [ -n "${srcip}" -o -n "${ifnet}" ]; then + echo "Unable to determine which node is this (both match)." 
>/dev/stderr + exit 1 + fi + srcip="${nodeb_srcip}" + ifnet="${nodeb_ifnet}" + node="node B" +fi +if [ -z "${srcip}" -o -z "${ifnet}" ]; then + echo "Unable to determine which node is this (none match)." >/dev/stderr + exit 1 +fi +ucarp -i ${ifnet} -s ${srcip} -v ${vhid} -a ${addr} -p ${pass} -u "${upscript}" -d "${downscript}" diff --git a/share/examples/hast/ucarp_down.sh b/share/examples/hast/ucarp_down.sh new file mode 100755 index 0000000..a5b3428 --- /dev/null +++ b/share/examples/hast/ucarp_down.sh @@ -0,0 +1,98 @@ +#!/bin/sh +# +# Copyright (c) 2010 The FreeBSD Foundation +# All rights reserved. +# +# This software was developed by Pawel Jakub Dawidek under sponsorship from +# the FreeBSD Foundation. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. 
+# +# $FreeBSD$ + +# Resource name as defined in /etc/hast.conf. +resource="test" +# Supported file system types: UFS, ZFS +fstype="UFS" +# ZFS pool name. Required only when fstype == ZFS. +pool="test" +# File system mount point. Required only when fstype == UFS. +mountpoint="/mnt/test" +# Name of HAST provider as defined in /etc/hast.conf. +# Required only when fstype == UFS. +device="/dev/hast/${resource}" + +export PATH=/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin + +# Kill the UP script if it still runs in the background. +sig="TERM" +for i in `jot 30`; do + pgid=`pgrep -f ucarp_up.sh | head -1` + [ -n "${pgid}" ] || break + kill -${sig} -- -${pgid} + sig="KILL" + sleep 1 +done +if [ -n "${pgid}" ]; then + logger -p local0.error -t hast "UCARP UP process for resource ${resource} is still running after 30 seconds." + exit 1 +fi +logger -p local0.debug -t hast "UCARP UP is not running." + +case "${fstype}" in +UFS) + mount | egrep -q "^${device} on " + if [ $? -eq 0 ]; then + # Forcibly unmount file system. + out=`umount -f "${mountpoint}" 2>&1` + if [ $? -ne 0 ]; then + logger -p local0.error -t hast "Unable to unmount file system for resource ${resource}: ${out}." + exit 1 + fi + logger -p local0.debug -t hast "File system for resource ${resource} unmounted." + fi + ;; +ZFS) + zpool list | egrep -q "^${pool} " + if [ $? -eq 0 ]; then + # Forcibly export the ZFS pool. + out=`zpool export -f "${pool}" 2>&1` + if [ $? -ne 0 ]; then + logger -p local0.error -t hast "Unable to export pool for resource ${resource}: ${out}." + exit 1 + fi + logger -p local0.debug -t hast "ZFS pool for resource ${resource} exported." + fi + ;; +esac + +# Change role to secondary for our resource. +out=`hastctl role secondary "${resource}" 2>&1` +if [ $? -ne 0 ]; then + logger -p local0.error -t hast "Unable to change to role to secondary for resource ${resource}: ${out}." + exit 1 +fi +logger -p local0.debug -t hast "Role for resource ${resource} changed to secondary."
+ +logger -p local0.info -t hast "Successfully switched to secondary for resource ${resource}." + +exit 0 diff --git a/share/examples/hast/ucarp_up.sh b/share/examples/hast/ucarp_up.sh new file mode 100755 index 0000000..9e56040 --- /dev/null +++ b/share/examples/hast/ucarp_up.sh @@ -0,0 +1,105 @@ +#!/bin/sh +# +# Copyright (c) 2010 The FreeBSD Foundation +# All rights reserved. +# +# This software was developed by Pawel Jakub Dawidek under sponsorship from +# the FreeBSD Foundation. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# $FreeBSD$ + +# Resource name as defined in /etc/hast.conf. +resource="test" +# Supported file system types: UFS, ZFS +fstype="UFS" +# ZFS pool name. Required only when fstype == ZFS. +pool="test" +# File system mount point. 
Required only when fstype == UFS. +mountpoint="/mnt/test" +# Name of HAST provider as defined in /etc/hast.conf. +device="/dev/hast/${resource}" + +export PATH=/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin + +# If there is a secondary worker process, it means that the remote primary process is +# still running. We have to wait for it to terminate. +for i in `jot 30`; do + pgrep -f "hastd: ${resource} \(secondary\)" >/dev/null 2>&1 || break + sleep 1 +done +if pgrep -f "hastd: ${resource} \(secondary\)" >/dev/null 2>&1; then + logger -p local0.error -t hast "Secondary process for resource ${resource} is still running after 30 seconds." + exit 1 +fi +logger -p local0.debug -t hast "Secondary process in not running." + +# Change role to primary for our resource. +out=`hastctl role primary "${resource}" 2>&1` +if [ $? -ne 0 ]; then + logger -p local0.error -t hast "Unable to change to role to primary for resource ${resource}: ${out}." + exit 1 +fi +# Wait a few seconds for the provider to appear. +for i in `jot 50`; do + [ -c "${device}" ] && break + sleep 0.1 +done +if [ ! -c "${device}" ]; then + logger -p local0.error -t hast "Device ${device} didn't appear." + exit 1 +fi +logger -p local0.debug -t hast "Role for resource ${resource} changed to primary." + +case "${fstype}" in +UFS) + # Check the file system. + fsck -y -t ufs "${device}" >/dev/null 2>&1 + if [ $? -ne 0 ]; then + logger -p local0.error -t hast "File system check for resource ${resource} failed." + exit 1 + fi + logger -p local0.debug -t hast "File system check for resource ${resource} finished." + # Mount the file system. + out=`mount -t ufs "${device}" "${mountpoint}" 2>&1` + if [ $? -ne 0 ]; then + logger -p local0.error -t hast "File system mount for resource ${resource} failed: ${out}." + exit 1 + fi + logger -p local0.debug -t hast "File system for resource ${resource} mounted." + ;; +ZFS) + # Import the ZFS pool. Do it forcibly as it remembers the hostid of + # the other cluster node.
+ out=`zpool import -f "${pool}" 2>&1` + if [ $? -ne 0 ]; then + logger -p local0.error -t hast "ZFS pool import for resource ${resource} failed: ${out}." + exit 1 + fi + logger -p local0.debug -t hast "ZFS pool for resource ${resource} imported." + ;; +esac + +logger -p local0.info -t hast "Successfully switched to primary for resource ${resource}." + +exit 0 diff --git a/share/examples/hast/vip-down.sh b/share/examples/hast/vip-down.sh new file mode 100755 index 0000000..5e47609 --- /dev/null +++ b/share/examples/hast/vip-down.sh @@ -0,0 +1,5 @@ +#!/bin/sh +# $FreeBSD$ + +/root/hast/sbin/hastd/ucarp_down.sh +exit 0 diff --git a/share/examples/hast/vip-up.sh b/share/examples/hast/vip-up.sh new file mode 100755 index 0000000..61dabe9 --- /dev/null +++ b/share/examples/hast/vip-up.sh @@ -0,0 +1,7 @@ +#!/bin/sh +# $FreeBSD$ + +set -m +/root/hast/sbin/hastd/ucarp_up.sh & +set +m +exit 0 diff --git a/share/man/man5/rc.conf.5 b/share/man/man5/rc.conf.5 index f8d265b..96f64d3 100644 --- a/share/man/man5/rc.conf.5 +++ b/share/man/man5/rc.conf.5 @@ -24,7 +24,7 @@ .\" .\" $FreeBSD$ .\" -.Dd November 11, 2009 +.Dd February 12, 2010 .Dt RC.CONF 5 .Os .Sh NAME @@ -1746,6 +1746,27 @@ is set to .Dq Li YES , these are the flags to pass to .Xr inetd 8 . +.It Va hastd_enable +.Pq Vt bool +If set to +.Dq Li YES , +run the +.Xr hastd 8 +daemon. +.It Va hastd_program +.Pq Vt str +Path to +.Xr hastd 8 +(default +.Pa /sbin/hastd ) . +.It Va hastd_flags +.Pq Vt str +If +.Va hastd_enable +is set to +.Dq Li YES , +these are the flags to pass to +.Xr hastd 8 . .It Va named_enable .Pq Vt bool If set to diff --git a/sys/geom/gate/g_gate.c b/sys/geom/gate/g_gate.c index 26df0f4..952e856 100644 --- a/sys/geom/gate/g_gate.c +++ b/sys/geom/gate/g_gate.c @@ -1,7 +1,11 @@ /*- * Copyright (c) 2004-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org> + * Copyright (c) 2009-2010 The FreeBSD Foundation * All rights reserved. 
* + * Portions of this software were developed by Pawel Jakub Dawidek + * under sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -53,9 +57,14 @@ static MALLOC_DEFINE(M_GATE, "gg_data", "GEOM Gate Data"); SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, gate, CTLFLAG_RW, 0, "GEOM_GATE stuff"); -static u_int g_gate_debug = 0; -SYSCTL_UINT(_kern_geom_gate, OID_AUTO, debug, CTLFLAG_RW, &g_gate_debug, 0, +static int g_gate_debug = 0; +TUNABLE_INT("kern.geom.gate.debug", &g_gate_debug); +SYSCTL_INT(_kern_geom_gate, OID_AUTO, debug, CTLFLAG_RW, &g_gate_debug, 0, "Debug level"); +static u_int g_gate_maxunits = 256; +TUNABLE_INT("kern.geom.gate.maxunits", &g_gate_maxunits); +SYSCTL_UINT(_kern_geom_gate, OID_AUTO, maxunits, CTLFLAG_RDTUN, + &g_gate_maxunits, 0, "Maximum number of ggate devices"); struct g_class g_gate_class = { .name = G_GATE_CLASS_NAME, @@ -71,10 +80,9 @@ static struct cdevsw g_gate_cdevsw = { }; -static LIST_HEAD(, g_gate_softc) g_gate_list = - LIST_HEAD_INITIALIZER(g_gate_list); -static struct mtx g_gate_list_mtx; - +static struct g_gate_softc **g_gate_units; +static u_int g_gate_nunits; +static struct mtx g_gate_units_lock; static int g_gate_destroy(struct g_gate_softc *sc, boolean_t force) @@ -84,13 +92,13 @@ g_gate_destroy(struct g_gate_softc *sc, boolean_t force) struct bio *bp; g_topology_assert(); - mtx_assert(&g_gate_list_mtx, MA_OWNED); + mtx_assert(&g_gate_units_lock, MA_OWNED); pp = sc->sc_provider; if (!force && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { - mtx_unlock(&g_gate_list_mtx); + mtx_unlock(&g_gate_units_lock); return (EBUSY); } - mtx_unlock(&g_gate_list_mtx); + mtx_unlock(&g_gate_units_lock); mtx_lock(&sc->sc_queue_mtx); if ((sc->sc_flags & G_GATE_FLAG_DESTROY) == 0) sc->sc_flags |= G_GATE_FLAG_DESTROY; @@ -125,14 +133,15 @@ g_gate_destroy(struct g_gate_softc *sc, boolean_t 
force) } mtx_unlock(&sc->sc_queue_mtx); g_topology_unlock(); - mtx_lock(&g_gate_list_mtx); + mtx_lock(&g_gate_units_lock); /* One reference is ours. */ sc->sc_ref--; - while (sc->sc_ref > 0) { - msleep(&sc->sc_ref, &g_gate_list_mtx, 0, "gg:destroy", 0); - } - LIST_REMOVE(sc, sc_next); - mtx_unlock(&g_gate_list_mtx); + while (sc->sc_ref > 0) + msleep(&sc->sc_ref, &g_gate_units_lock, 0, "gg:destroy", 0); + g_gate_units[sc->sc_unit] = NULL; + KASSERT(g_gate_nunits > 0, ("negative g_gate_nunits?")); + g_gate_nunits--; + mtx_unlock(&g_gate_units_lock); mtx_destroy(&sc->sc_queue_mtx); g_topology_lock(); G_GATE_DEBUG(0, "Device %s destroyed.", gp->name); @@ -196,7 +205,7 @@ g_gate_start(struct bio *bp) if (sc->sc_queue_count > sc->sc_queue_size) { mtx_unlock(&sc->sc_queue_mtx); G_GATE_LOGREQ(1, bp, "Queue full, request canceled."); - g_io_deliver(bp, EIO); + g_io_deliver(bp, ENOMEM); return; } @@ -211,18 +220,29 @@ g_gate_start(struct bio *bp) } static struct g_gate_softc * -g_gate_hold(u_int unit) +g_gate_hold(u_int unit, const char *name) { - struct g_gate_softc *sc; - - mtx_lock(&g_gate_list_mtx); - LIST_FOREACH(sc, &g_gate_list, sc_next) { - if (sc->sc_unit == unit) + struct g_gate_softc *sc = NULL; + + mtx_lock(&g_gate_units_lock); + if (unit >= 0 && unit < g_gate_maxunits) + sc = g_gate_units[unit]; + else if (unit == G_GATE_NAME_GIVEN) { + KASSERT(name != NULL, ("name is NULL")); + for (unit = 0; unit < g_gate_maxunits; unit++) { + if (g_gate_units[unit] == NULL) + continue; + if (strcmp(name, + g_gate_units[unit]->sc_provider->name) != 0) { + continue; + } + sc = g_gate_units[unit]; break; + } } if (sc != NULL) sc->sc_ref++; - mtx_unlock(&g_gate_list_mtx); + mtx_unlock(&g_gate_units_lock); return (sc); } @@ -231,40 +251,34 @@ g_gate_release(struct g_gate_softc *sc) { g_topology_assert_not(); - mtx_lock(&g_gate_list_mtx); + mtx_lock(&g_gate_units_lock); sc->sc_ref--; KASSERT(sc->sc_ref >= 0, ("Negative sc_ref for %s.", sc->sc_name)); - if (sc->sc_ref == 0 && 
(sc->sc_flags & G_GATE_FLAG_DESTROY) != 0) { + if (sc->sc_ref == 0 && (sc->sc_flags & G_GATE_FLAG_DESTROY) != 0) wakeup(&sc->sc_ref); - mtx_unlock(&g_gate_list_mtx); - } else { - mtx_unlock(&g_gate_list_mtx); - } + mtx_unlock(&g_gate_units_lock); } static int -g_gate_getunit(int unit) +g_gate_getunit(int unit, int *errorp) { - struct g_gate_softc *sc; - mtx_assert(&g_gate_list_mtx, MA_OWNED); + mtx_assert(&g_gate_units_lock, MA_OWNED); if (unit >= 0) { - LIST_FOREACH(sc, &g_gate_list, sc_next) { - if (sc->sc_unit == unit) - return (-1); - } + if (unit >= g_gate_maxunits) + *errorp = EINVAL; + else if (g_gate_units[unit] == NULL) + return (unit); + else + *errorp = EEXIST; } else { - unit = 0; -once_again: - LIST_FOREACH(sc, &g_gate_list, sc_next) { - if (sc->sc_unit == unit) { - if (++unit > 666) - return (-1); - goto once_again; - } + for (unit = 0; unit < g_gate_maxunits; unit++) { + if (g_gate_units[unit] == NULL) + return (unit); } + *errorp = ENFILE; } - return (unit); + return (-1); } static void @@ -276,7 +290,7 @@ g_gate_guard(void *arg) sc = arg; binuptime(&curtime); - g_gate_hold(sc->sc_unit); + g_gate_hold(sc->sc_unit, NULL); mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH_SAFE(bp, &sc->sc_inqueue.queue, bio_queue, bp2) { if (curtime.sec - bp->bio_t0.sec < 5) @@ -311,7 +325,7 @@ g_gate_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, sc = gp->softc; if (sc == NULL || pp != NULL || cp != NULL) return; - g_gate_hold(sc->sc_unit); + g_gate_hold(sc->sc_unit, NULL); if ((sc->sc_flags & G_GATE_FLAG_READONLY) != 0) { sbuf_printf(sb, "%s<access>%s</access>\n", indent, "read-only"); } else if ((sc->sc_flags & G_GATE_FLAG_WRITEONLY) != 0) { @@ -328,6 +342,7 @@ g_gate_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, sbuf_printf(sb, "%s<queue_size>%u</queue_size>\n", indent, sc->sc_queue_size); sbuf_printf(sb, "%s<ref>%u</ref>\n", indent, sc->sc_ref); + sbuf_printf(sb, "%s<unit>%d</unit>\n", indent, sc->sc_unit); g_topology_unlock(); 
g_gate_release(sc); g_topology_lock(); @@ -339,6 +354,8 @@ g_gate_create(struct g_gate_ctl_create *ggio) struct g_gate_softc *sc; struct g_geom *gp; struct g_provider *pp; + char name[NAME_MAX]; + int error = 0, unit; if (ggio->gctl_mediasize == 0) { G_GATE_DEBUG(1, "Invalid media size."); @@ -357,15 +374,22 @@ g_gate_create(struct g_gate_ctl_create *ggio) G_GATE_DEBUG(1, "Invalid flags."); return (EINVAL); } - if (ggio->gctl_unit < -1) { + if (ggio->gctl_unit != G_GATE_UNIT_AUTO && + ggio->gctl_unit != G_GATE_NAME_GIVEN && + ggio->gctl_unit < 0) { G_GATE_DEBUG(1, "Invalid unit number."); return (EINVAL); } + if (ggio->gctl_unit == G_GATE_NAME_GIVEN && + ggio->gctl_name[0] == '\0') { + G_GATE_DEBUG(1, "No device name."); + return (EINVAL); + } sc = malloc(sizeof(*sc), M_GATE, M_WAITOK | M_ZERO); sc->sc_flags = (ggio->gctl_flags & G_GATE_USERFLAGS); strlcpy(sc->sc_info, ggio->gctl_info, sizeof(sc->sc_info)); - sc->sc_seq = 0; + sc->sc_seq = 1; bioq_init(&sc->sc_inqueue); bioq_init(&sc->sc_outqueue); mtx_init(&sc->sc_queue_mtx, "gg:queue", NULL, MTX_DEF); @@ -375,26 +399,44 @@ g_gate_create(struct g_gate_ctl_create *ggio) sc->sc_queue_size = G_GATE_MAX_QUEUE_SIZE; sc->sc_timeout = ggio->gctl_timeout; callout_init(&sc->sc_callout, CALLOUT_MPSAFE); - mtx_lock(&g_gate_list_mtx); - ggio->gctl_unit = g_gate_getunit(ggio->gctl_unit); - if (ggio->gctl_unit == -1) { - mtx_unlock(&g_gate_list_mtx); + mtx_lock(&g_gate_units_lock); + sc->sc_unit = g_gate_getunit(ggio->gctl_unit, &error); + if (sc->sc_unit < 0) { + mtx_unlock(&g_gate_units_lock); mtx_destroy(&sc->sc_queue_mtx); free(sc, M_GATE); - return (EBUSY); + return (error); + } + if (ggio->gctl_unit == G_GATE_NAME_GIVEN) + snprintf(name, sizeof(name), "%s", ggio->gctl_name); + else { + snprintf(name, sizeof(name), "%s%d", G_GATE_PROVIDER_NAME, + sc->sc_unit); } - sc->sc_unit = ggio->gctl_unit; - LIST_INSERT_HEAD(&g_gate_list, sc, sc_next); - mtx_unlock(&g_gate_list_mtx); + /* Check for name collision. 
*/ + for (unit = 0; unit < g_gate_maxunits; unit++) { + if (g_gate_units[unit] == NULL) + continue; + if (strcmp(name, g_gate_units[unit]->sc_provider->name) != 0) + continue; + mtx_unlock(&g_gate_units_lock); + mtx_destroy(&sc->sc_queue_mtx); + free(sc, M_GATE); + return (EEXIST); + } + g_gate_units[sc->sc_unit] = sc; + g_gate_nunits++; + mtx_unlock(&g_gate_units_lock); + + ggio->gctl_unit = sc->sc_unit; g_topology_lock(); - gp = g_new_geomf(&g_gate_class, "%s%d", G_GATE_PROVIDER_NAME, - sc->sc_unit); + gp = g_new_geomf(&g_gate_class, "%s", name); gp->start = g_gate_start; gp->access = g_gate_access; gp->dumpconf = g_gate_dumpconf; gp->softc = sc; - pp = g_new_providerf(gp, "%s%d", G_GATE_PROVIDER_NAME, sc->sc_unit); + pp = g_new_providerf(gp, "%s", name); pp->mediasize = ggio->gctl_mediasize; pp->sectorsize = ggio->gctl_sectorsize; sc->sc_provider = pp; @@ -446,11 +488,11 @@ g_gate_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct threa struct g_gate_ctl_destroy *ggio = (void *)addr; G_GATE_CHECK_VERSION(ggio); - sc = g_gate_hold(ggio->gctl_unit); + sc = g_gate_hold(ggio->gctl_unit, ggio->gctl_name); if (sc == NULL) return (ENXIO); g_topology_lock(); - mtx_lock(&g_gate_list_mtx); + mtx_lock(&g_gate_units_lock); error = g_gate_destroy(sc, ggio->gctl_force); g_topology_unlock(); if (error != 0) @@ -463,7 +505,7 @@ g_gate_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct threa struct bio *tbp, *lbp; G_GATE_CHECK_VERSION(ggio); - sc = g_gate_hold(ggio->gctl_unit); + sc = g_gate_hold(ggio->gctl_unit, ggio->gctl_name); if (sc == NULL) return (ENXIO); lbp = NULL; @@ -491,6 +533,8 @@ g_gate_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct threa break; } } + if (ggio->gctl_unit == G_GATE_NAME_GIVEN) + ggio->gctl_unit = sc->sc_unit; mtx_unlock(&sc->sc_queue_mtx); g_gate_release(sc); return (error); @@ -500,7 +544,7 @@ g_gate_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct threa struct g_gate_ctl_io 
*ggio = (void *)addr; G_GATE_CHECK_VERSION(ggio); - sc = g_gate_hold(ggio->gctl_unit); + sc = g_gate_hold(ggio->gctl_unit, NULL); if (sc == NULL) return (ENXIO); error = 0; @@ -561,7 +605,7 @@ start_end: struct g_gate_ctl_io *ggio = (void *)addr; G_GATE_CHECK_VERSION(ggio); - sc = g_gate_hold(ggio->gctl_unit); + sc = g_gate_hold(ggio->gctl_unit, NULL); if (sc == NULL) return (ENOENT); error = 0; @@ -631,20 +675,24 @@ g_gate_modevent(module_t mod, int type, void *data) switch (type) { case MOD_LOAD: - mtx_init(&g_gate_list_mtx, "gg_list_lock", NULL, MTX_DEF); + mtx_init(&g_gate_units_lock, "gg_units_lock", NULL, MTX_DEF); + g_gate_units = malloc(g_gate_maxunits * sizeof(g_gate_units[0]), + M_GATE, M_WAITOK | M_ZERO); + g_gate_nunits = 0; g_gate_device(); break; case MOD_UNLOAD: - mtx_lock(&g_gate_list_mtx); - if (!LIST_EMPTY(&g_gate_list)) { - mtx_unlock(&g_gate_list_mtx); + mtx_lock(&g_gate_units_lock); + if (g_gate_nunits > 0) { + mtx_unlock(&g_gate_units_lock); error = EBUSY; break; } - mtx_unlock(&g_gate_list_mtx); - mtx_destroy(&g_gate_list_mtx); + mtx_unlock(&g_gate_units_lock); + mtx_destroy(&g_gate_units_lock); if (status_dev != 0) destroy_dev(status_dev); + free(g_gate_units, M_GATE); break; default: return (EOPNOTSUPP); diff --git a/sys/geom/gate/g_gate.h b/sys/geom/gate/g_gate.h index cd2564d..4f41348 100644 --- a/sys/geom/gate/g_gate.h +++ b/sys/geom/gate/g_gate.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2004-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org> + * Copyright (c) 2004-2009 Pawel Jakub Dawidek <pjd@FreeBSD.org> * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -41,7 +41,7 @@ #define G_GATE_MOD_NAME "ggate" #define G_GATE_CTL_NAME "ggctl" -#define G_GATE_VERSION 1 +#define G_GATE_VERSION 2 /* * Maximum number of request that can be stored in @@ -54,6 +54,15 @@ #define G_GATE_FLAG_DESTROY 0x1000 #define G_GATE_USERFLAGS (G_GATE_FLAG_READONLY | G_GATE_FLAG_WRITEONLY) +/* + * Pick unit number automatically in /dev/ggate<unit>. + */ +#define G_GATE_UNIT_AUTO (-1) +/* + * Full provider name is given, so don't use ggate<unit>. + */ +#define G_GATE_NAME_GIVEN (-2) + #define G_GATE_CMD_CREATE _IOWR('m', 0, struct g_gate_ctl_create) #define G_GATE_CMD_DESTROY _IOWR('m', 1, struct g_gate_ctl_destroy) #define G_GATE_CMD_CANCEL _IOWR('m', 2, struct g_gate_ctl_cancel) @@ -120,20 +129,23 @@ struct g_gate_ctl_create { u_int gctl_flags; u_int gctl_maxcount; u_int gctl_timeout; + char gctl_name[NAME_MAX]; char gctl_info[G_GATE_INFOSIZE]; - int gctl_unit; /* out */ + int gctl_unit; /* in/out */ }; struct g_gate_ctl_destroy { u_int gctl_version; int gctl_unit; int gctl_force; + char gctl_name[NAME_MAX]; }; struct g_gate_ctl_cancel { u_int gctl_version; int gctl_unit; uintptr_t gctl_seq; + char gctl_name[NAME_MAX]; }; struct g_gate_ctl_io { |