summaryrefslogtreecommitdiffstats
path: root/sbin/hastd
diff options
context:
space:
mode:
authorpjd <pjd@FreeBSD.org>2010-02-18 23:16:19 +0000
committerpjd <pjd@FreeBSD.org>2010-02-18 23:16:19 +0000
commit1c1e2e8b7128696797c328aefd45f618c64ba613 (patch)
treeb457b029c3a2e668bca4294dab77cfceeba7dcbd /sbin/hastd
parenta448fe30c9ff8f38fe6cc08b23bf069aabb7438c (diff)
downloadFreeBSD-src-1c1e2e8b7128696797c328aefd45f618c64ba613.zip
FreeBSD-src-1c1e2e8b7128696797c328aefd45f618c64ba613.tar.gz
Please welcome HAST - Highly Available Storage.
HAST allows data to be stored transparently on two physically separated machines connected over a TCP/IP network. HAST works in a Primary-Secondary (Master-Backup, Master-Slave) configuration, which means that only one of the cluster nodes can be active at any given time. Only the Primary node is able to handle I/O requests to HAST-managed devices. Currently HAST is limited to two cluster nodes in total. HAST operates on the block level - it provides disk-like devices in the /dev/hast/ directory for use by file systems and/or applications. Working on the block level makes it transparent to file systems and applications. There is no difference between using a HAST-provided device and a raw disk, partition, etc. All of them are just regular GEOM providers in FreeBSD. For more information please consult the hastd(8), hastctl(8) and hast.conf(5) manual pages, as well as http://wiki.FreeBSD.org/HAST. Sponsored by: FreeBSD Foundation Sponsored by: OMCnet Internet Service GmbH Sponsored by: TransIP BV
Diffstat (limited to 'sbin/hastd')
-rw-r--r--sbin/hastd/Makefile37
-rw-r--r--sbin/hastd/activemap.c691
-rw-r--r--sbin/hastd/activemap.h69
-rw-r--r--sbin/hastd/control.c426
-rw-r--r--sbin/hastd/control.h44
-rw-r--r--sbin/hastd/ebuf.c252
-rw-r--r--sbin/hastd/ebuf.h51
-rw-r--r--sbin/hastd/hast.conf.5267
-rw-r--r--sbin/hastd/hast.h190
-rw-r--r--sbin/hastd/hast_proto.c401
-rw-r--r--sbin/hastd/hast_proto.h48
-rw-r--r--sbin/hastd/hastd.8232
-rw-r--r--sbin/hastd/hastd.c522
-rw-r--r--sbin/hastd/hastd.h48
-rw-r--r--sbin/hastd/hooks.c148
-rw-r--r--sbin/hastd/hooks.h40
-rw-r--r--sbin/hastd/metadata.c222
-rw-r--r--sbin/hastd/metadata.h48
-rw-r--r--sbin/hastd/nv.c882
-rw-r--r--sbin/hastd/nv.h158
-rw-r--r--sbin/hastd/parse.y507
-rw-r--r--sbin/hastd/pjdlog.c367
-rw-r--r--sbin/hastd/pjdlog.h88
-rw-r--r--sbin/hastd/primary.c1769
-rw-r--r--sbin/hastd/proto.c261
-rw-r--r--sbin/hastd/proto.h54
-rw-r--r--sbin/hastd/proto_common.c85
-rw-r--r--sbin/hastd/proto_impl.h75
-rw-r--r--sbin/hastd/proto_socketpair.c272
-rw-r--r--sbin/hastd/proto_tcp4.c447
-rw-r--r--sbin/hastd/proto_uds.c330
-rw-r--r--sbin/hastd/rangelock.c137
-rw-r--r--sbin/hastd/rangelock.h46
-rw-r--r--sbin/hastd/secondary.c697
-rw-r--r--sbin/hastd/subr.c118
-rw-r--r--sbin/hastd/subr.h51
-rw-r--r--sbin/hastd/synch.h162
-rw-r--r--sbin/hastd/token.l66
38 files changed, 10308 insertions, 0 deletions
diff --git a/sbin/hastd/Makefile b/sbin/hastd/Makefile
new file mode 100644
index 0000000..16a0b8f
--- /dev/null
+++ b/sbin/hastd/Makefile
@@ -0,0 +1,37 @@
+# $FreeBSD$
+
+.include <bsd.own.mk>
+
+PROG= hastd
+SRCS= activemap.c
+SRCS+= control.c
+SRCS+= ebuf.c
+SRCS+= hast_proto.c hastd.c hooks.c
+SRCS+= metadata.c
+SRCS+= nv.c
+SRCS+= secondary.c
+SRCS+= parse.y pjdlog.c primary.c
+SRCS+= proto.c proto_common.c proto_socketpair.c proto_tcp4.c proto_uds.c
+SRCS+= rangelock.c
+SRCS+= subr.c
+SRCS+= token.l
+SRCS+= y.tab.h
+WARNS?= 6
+MAN= hastd.8 hast.conf.5
+
+CFLAGS+=-I${.CURDIR}
+CFLAGS+=-DINET
+.if ${MK_INET6_SUPPORT} != "no"
+CFLAGS+=-DINET6
+.endif
+# This is needed to have WARNS > 1.
+CFLAGS+=-DYY_NO_UNPUT
+
+DPADD= ${LIBCRYPTO} ${LIBGEOM} ${LIBL} ${LIBPTHREAD} ${LIBUTIL}
+LDADD= -lcrypto -lgeom -ll -lpthread -lutil
+
+YFLAGS+=-v
+
+CLEANFILES=y.tab.c y.tab.h y.output
+
+.include <bsd.prog.mk>
diff --git a/sbin/hastd/activemap.c b/sbin/hastd/activemap.c
new file mode 100644
index 0000000..10eb641
--- /dev/null
+++ b/sbin/hastd/activemap.c
@@ -0,0 +1,691 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h> /* powerof2() */
+#include <sys/queue.h>
+
+#include <assert.h>
+#include <bitstring.h>
+#include <errno.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <activemap.h>
+
+/*
+ * Value stored in am_magic while the structure is valid; the public
+ * functions assert on it and activemap_free() resets it to 0.
+ */
+#define ACTIVEMAP_MAGIC 0xac71e4
+/*
+ * State of one activemap: the geometry of the media split into extents,
+ * per-extent counters of pending writes and the bitmaps of dirty extents.
+ */
+struct activemap {
+ int am_magic; /* Magic value. */
+ off_t am_mediasize; /* Media size in bytes. */
+ uint32_t am_extentsize; /* Extent size in bytes,
+ must be power of 2. */
+ uint8_t am_extentshift;/* 2 ^ am_extentshift == am_extentsize */
+ int am_nextents; /* Number of extents. */
+ size_t am_mapsize; /* Bitmap size in bytes. */
+ uint16_t *am_memtab; /* An array that holds number of pending
+ writes per extent. */
+ bitstr_t *am_diskmap; /* On-disk bitmap of dirty extents. */
+ bitstr_t *am_memmap; /* In-memory bitmap of dirty extents. */
+ size_t am_diskmapsize; /* Map size rounded up to sector size. */
+ uint64_t am_ndirty; /* Number of dirty regions. */
+ bitstr_t *am_syncmap; /* Bitmap of extents to sync. */
+ off_t am_syncoff; /* Next synchronization offset;
+ -1 means "look up the next extent
+ to synchronize", -2 means "nothing
+ to synchronize". */
+ TAILQ_HEAD(skeepdirty, keepdirty) am_keepdirty; /* List of extents that
+ we keep dirty to reduce bitmap
+ updates. */
+ int am_nkeepdirty; /* Number of am_keepdirty elements. */
+ int am_nkeepdirty_limit; /* Maximum number of am_keepdirty
+ elements. */
+};
+
+/*
+ * One element of the am_keepdirty list. keepdirty_add() moves or inserts
+ * elements at the head of the list, so the tail holds the least recently
+ * added extent.
+ */
+struct keepdirty {
+ int kd_extent; /* Extent number. */
+ TAILQ_ENTRY(keepdirty) kd_next; /* Linkage on am_keepdirty. */
+};
+
+/*
+ * Helper function taken from sys/systm.h to calculate extentshift.
+ *
+ * Returns the number of bits set in x. activemap_init() calls it with
+ * extentsize - 1, which for a power-of-2 extent size gives log2(extentsize).
+ */
+static uint32_t
+bitcount32(uint32_t x)
+{
+
+ /* Sum adjacent bits into 2-bit fields... */
+ x = (x & 0x55555555) + ((x & 0xaaaaaaaa) >> 1);
+ /* ...then 2-bit fields into 4-bit fields... */
+ x = (x & 0x33333333) + ((x & 0xcccccccc) >> 2);
+ /* ...then 4-bit fields into bytes. */
+ x = (x + (x >> 4)) & 0x0f0f0f0f;
+ /* Fold the four byte counts into the lowest byte. */
+ x = (x + (x >> 8));
+ x = (x + (x >> 16)) & 0x000000ff;
+ return (x);
+}
+
+/*
+ * Translate a byte offset on the media into the number of the extent that
+ * contains it. Both the offset and the result are range-checked.
+ */
+static __inline int
+off2ext(const struct activemap *amp, off_t offset)
+{
+	int idx;
+
+	assert(offset >= 0 && offset < amp->am_mediasize);
+	idx = (offset >> amp->am_extentshift);
+	assert(idx >= 0 && idx < amp->am_nextents);
+	return (idx);
+}
+
+/*
+ * Translate an extent number into the byte offset at which the extent
+ * starts. Both the extent number and the result are range-checked.
+ */
+static __inline off_t
+ext2off(const struct activemap *amp, int extent)
+{
+	off_t start;
+
+	assert(extent >= 0 && extent < amp->am_nextents);
+	start = ((off_t)extent << amp->am_extentshift);
+	assert(start >= 0 && start < amp->am_mediasize);
+	return (start);
+}
+
+/*
+ * Return the number of requests, each covering at most MAXPHYS bytes,
+ * needed to synchronize the given extent. Every extent except the last
+ * one spans am_extentsize bytes; the last one covers whatever remains of
+ * the media.
+ */
+static __inline int
+ext2reqs(const struct activemap *amp, int ext)
+{
+	off_t tail;
+
+	if (ext >= amp->am_nextents - 1) {
+		assert(ext == amp->am_nextents - 1);
+		tail = amp->am_mediasize % amp->am_extentsize;
+		/* A remainder of 0 means the last extent is a full one. */
+		if (tail == 0)
+			tail = amp->am_extentsize;
+		return (((tail - 1) / MAXPHYS) + 1);
+	}
+
+	return (((amp->am_extentsize - 1) / MAXPHYS) + 1);
+}
+
+/*
+ * Allocate and initialize an activemap.
+ *
+ * ampp       - where the pointer to the new map is stored on success;
+ *              left untouched on failure
+ * mediasize  - media size in bytes, must be greater than 0
+ * extentsize - extent size in bytes, must be a power of 2
+ * sectorsize - sector size in bytes, must be a power of 2; the on-disk
+ *              bitmap size is rounded up to a multiple of it
+ * keepdirty  - maximum number of elements on the keepdirty list, must be
+ *              greater than 0
+ *
+ * Returns 0 on success and -1 if any of the allocations failed.
+ */
+int
+activemap_init(struct activemap **ampp, uint64_t mediasize, uint32_t extentsize,
+    uint32_t sectorsize, uint32_t keepdirty)
+{
+	struct activemap *map;
+
+	assert(ampp != NULL);
+	assert(mediasize > 0);
+	assert(extentsize > 0);
+	assert(powerof2(extentsize));
+	assert(sectorsize > 0);
+	assert(powerof2(sectorsize));
+	assert(keepdirty > 0);
+
+	map = malloc(sizeof(*map));
+	if (map == NULL)
+		return (-1);
+
+	/* Media geometry and the bitmap sizes derived from it. */
+	map->am_mediasize = mediasize;
+	map->am_extentsize = extentsize;
+	map->am_extentshift = bitcount32(extentsize - 1);
+	map->am_nextents = ((mediasize - 1) / extentsize) + 1;
+	map->am_mapsize = sizeof(bitstr_t) * bitstr_size(map->am_nextents);
+	map->am_diskmapsize = roundup2(map->am_mapsize, sectorsize);
+
+	/* Nothing is dirty and there is nothing to synchronize yet. */
+	map->am_ndirty = 0;
+	map->am_syncoff = -2;
+	TAILQ_INIT(&map->am_keepdirty);
+	map->am_nkeepdirty = 0;
+	map->am_nkeepdirty_limit = keepdirty;
+
+	map->am_memtab = calloc(map->am_nextents, sizeof(map->am_memtab[0]));
+	map->am_diskmap = calloc(1, map->am_diskmapsize);
+	map->am_memmap = bit_alloc(map->am_nextents);
+	map->am_syncmap = bit_alloc(map->am_nextents);
+
+	if (map->am_memtab != NULL && map->am_diskmap != NULL &&
+	    map->am_memmap != NULL && map->am_syncmap != NULL) {
+		map->am_magic = ACTIVEMAP_MAGIC;
+		*ampp = map;
+		return (0);
+	}
+
+	/*
+	 * At least one of the allocations above failed. free(NULL) is
+	 * a no-op, so everything can be released unconditionally.
+	 */
+	free(map->am_memtab);
+	free(map->am_diskmap);
+	free(map->am_memmap);
+	free(map->am_syncmap);
+	map->am_magic = 0;
+	free(map);
+	errno = ENOMEM;
+	return (-1);
+}
+
+/*
+ * Look up the given extent on the keepdirty list. Returns the matching
+ * element or NULL if the extent is not on the list.
+ */
+static struct keepdirty *
+keepdirty_find(struct activemap *amp, int extent)
+{
+	struct keepdirty *cur;
+
+	TAILQ_FOREACH(cur, &amp->am_keepdirty, kd_next) {
+		if (cur->kd_extent == extent)
+			return (cur);
+	}
+	return (NULL);
+}
+
+/*
+ * Record the given extent at the head of the keepdirty list.
+ *
+ * If the extent is already on the list it is only moved to the head.
+ * Otherwise, when the list has reached am_nkeepdirty_limit elements, the
+ * element at the tail is recycled for the new extent; if the list is not
+ * full a new element is allocated. An allocation failure is silently
+ * ignored and the extent is simply not recorded.
+ */
+static void
+keepdirty_add(struct activemap *amp, int extent)
+{
+	struct keepdirty *kd;
+
+	kd = keepdirty_find(amp, extent);
+	if (kd != NULL) {
+		/*
+		 * Only move element at the beginning.
+		 */
+		TAILQ_REMOVE(&amp->am_keepdirty, kd, kd_next);
+		TAILQ_INSERT_HEAD(&amp->am_keepdirty, kd, kd_next);
+		return;
+	}
+	/*
+	 * Add new element, but first remove the most unused one if
+	 * we have too many.
+	 */
+	if (amp->am_nkeepdirty >= amp->am_nkeepdirty_limit) {
+		kd = TAILQ_LAST(&amp->am_keepdirty, skeepdirty);
+		assert(kd != NULL);
+		TAILQ_REMOVE(&amp->am_keepdirty, kd, kd_next);
+		amp->am_nkeepdirty--;
+		/*
+		 * The counter drops to 0 here when the limit is 1, which
+		 * activemap_init() permits, so only a negative value is
+		 * a bug.
+		 */
+		assert(amp->am_nkeepdirty >= 0);
+	}
+	if (kd == NULL)
+		kd = malloc(sizeof(*kd));
+	/* We can ignore allocation failure. */
+	if (kd != NULL) {
+		kd->kd_extent = extent;
+		amp->am_nkeepdirty++;
+		TAILQ_INSERT_HEAD(&amp->am_keepdirty, kd, kd_next);
+	}
+}
+
+/*
+ * Set the bit in the on-disk bitmap for every extent found on the
+ * keepdirty list.
+ */
+static void
+keepdirty_fill(struct activemap *amp)
+{
+	struct keepdirty *cur;
+
+	TAILQ_FOREACH(cur, &amp->am_keepdirty, kd_next) {
+		bit_set(amp->am_diskmap, cur->kd_extent);
+	}
+}
+
+/*
+ * Remove and free every element of the keepdirty list.
+ */
+static void
+keepdirty_free(struct activemap *amp)
+{
+	struct keepdirty *head;
+
+	for (;;) {
+		head = TAILQ_FIRST(&amp->am_keepdirty);
+		if (head == NULL)
+			break;
+		TAILQ_REMOVE(&amp->am_keepdirty, head, kd_next);
+		amp->am_nkeepdirty--;
+		free(head);
+	}
+	/* The counter has to agree with the now empty list. */
+	assert(amp->am_nkeepdirty == 0);
+}
+
+/*
+ * Release the keepdirty list, the pending-writes table and the three
+ * bitmaps allocated by activemap_init().
+ *
+ * NOTE(review): the structure itself, which activemap_init() obtains with
+ * malloc(), is not freed here - confirm whether the callers release it.
+ */
+void
+activemap_free(struct activemap *amp)
+{
+
+	assert(amp->am_magic == ACTIVEMAP_MAGIC);
+
+	/* Invalidate the magic so further use trips the assertions. */
+	amp->am_magic = 0;
+
+	keepdirty_free(amp);
+
+	free(amp->am_syncmap);
+	free(amp->am_memmap);
+	free(amp->am_diskmap);
+	free(amp->am_memtab);
+}
+
+/*
+ * Account for a write request that is about to be handled.
+ *
+ * For every extent touched by the range [offset, offset + length) the
+ * counter of pending writes is incremented and the extent is recorded on
+ * the keepdirty list. Returns true if at least one extent went from clean
+ * to dirty, which means the on-disk metadata should be updated.
+ */
+bool
+activemap_write_start(struct activemap *amp, off_t offset, off_t length)
+{
+	bool changed;
+	int ext, last;
+
+	assert(amp->am_magic == ACTIVEMAP_MAGIC);
+	assert(length > 0);
+
+	changed = false;
+	ext = off2ext(amp, offset);
+	last = off2ext(amp, offset + length - 1);
+
+	for (; ext <= last; ext++) {
+		if (amp->am_memtab[ext] == 0) {
+			/*
+			 * First pending write for this extent: mark it
+			 * dirty in the in-memory bitmap and let the caller
+			 * know that the bitmap has to be flushed to disk.
+			 */
+			assert(!bit_test(amp->am_memmap, ext));
+			bit_set(amp->am_memmap, ext);
+			amp->am_ndirty++;
+			changed = true;
+		}
+		amp->am_memtab[ext]++;
+		keepdirty_add(amp, ext);
+	}
+
+	return (changed);
+}
+
+/*
+ * Account for a write request that has been confirmed.
+ *
+ * For every extent touched by the range [offset, offset + length) the
+ * counter of pending writes is decremented. Returns true if at least one
+ * extent went from dirty to clean, which means the on-disk metadata should
+ * be updated.
+ */
+bool
+activemap_write_complete(struct activemap *amp, off_t offset, off_t length)
+{
+	bool changed;
+	int ext, last;
+
+	assert(amp->am_magic == ACTIVEMAP_MAGIC);
+	assert(length > 0);
+
+	changed = false;
+	ext = off2ext(amp, offset);
+	last = off2ext(amp, offset + length - 1);
+
+	for (; ext <= last; ext++) {
+		assert(amp->am_memtab[ext] > 0);
+		assert(bit_test(amp->am_memmap, ext));
+		amp->am_memtab[ext]--;
+		if (amp->am_memtab[ext] != 0)
+			continue;
+		/*
+		 * That was the last pending write for this extent: mark it
+		 * clean in the in-memory bitmap and let the caller know
+		 * that the bitmap has to be flushed to disk.
+		 */
+		bit_clear(amp->am_memmap, ext);
+		amp->am_ndirty--;
+		changed = true;
+	}
+
+	return (changed);
+}
+
+/*
+ * Account for one extent whose synchronization has finished.
+ *
+ * The counter of pending writes for the extent is decreased by the number
+ * of requests needed to synchronize it. Returns true if the extent became
+ * clean as a result, which means the on-disk metadata should be updated.
+ */
+bool
+activemap_extent_complete(struct activemap *amp, int extent)
+{
+	int nreqs;
+
+	assert(amp->am_magic == ACTIVEMAP_MAGIC);
+	assert(extent >= 0 && extent < amp->am_nextents);
+
+	nreqs = ext2reqs(amp, extent);
+	assert(amp->am_memtab[extent] >= nreqs);
+	amp->am_memtab[extent] -= nreqs;
+	assert(bit_test(amp->am_memmap, extent));
+
+	/* Regular writes are still pending, the extent stays dirty. */
+	if (amp->am_memtab[extent] != 0)
+		return (false);
+
+	bit_clear(amp->am_memmap, extent);
+	amp->am_ndirty--;
+	return (true);
+}
+
+/*
+ * Return the current value of the dirty-regions counter.
+ */
+uint64_t
+activemap_ndirty(const struct activemap *amp)
+{
+	uint64_t count;
+
+	assert(amp->am_magic == ACTIVEMAP_MAGIC);
+
+	count = amp->am_ndirty;
+	return (count);
+}
+
+/*
+ * Compare the on-disk bitmap with the in-memory bitmap over am_mapsize
+ * bytes. Returns true if they differ, in which case the bitmap should be
+ * flushed to the disk.
+ */
+bool
+activemap_differ(const struct activemap *amp)
+{
+	int cmp;
+
+	assert(amp->am_magic == ACTIVEMAP_MAGIC);
+
+	cmp = memcmp(amp->am_diskmap, amp->am_memmap, amp->am_mapsize);
+	return (cmp != 0);
+}
+
+/*
+ * Return the size of the bitmap in bytes (not rounded up to sector size).
+ */
+size_t
+activemap_size(const struct activemap *amp)
+{
+	size_t nbytes;
+
+	assert(amp->am_magic == ACTIVEMAP_MAGIC);
+
+	nbytes = amp->am_mapsize;
+	return (nbytes);
+}
+
+/*
+ * Return the number of bytes needed to store the bitmap on disk, i.e. the
+ * value of activemap_size() rounded up to the sector size.
+ */
+size_t
+activemap_ondisk_size(const struct activemap *amp)
+{
+	size_t nbytes;
+
+	assert(amp->am_magic == ACTIVEMAP_MAGIC);
+
+	nbytes = amp->am_diskmapsize;
+	return (nbytes);
+}
+
+/*
+ * Load a bitmap read from disk.
+ *
+ * The first am_mapsize bytes of buf are copied into the on-disk,
+ * in-memory and synchronization bitmaps; size has to be at least
+ * am_mapsize. If the bitmap has any dirty extent, the synchronization
+ * offset is rewound and the pending-writes counters and the dirty counter
+ * are recalculated from the bitmap.
+ */
+void
+activemap_copyin(struct activemap *amp, const unsigned char *buf, size_t size)
+{
+	int first, ext;
+
+	assert(amp->am_magic == ACTIVEMAP_MAGIC);
+	assert(size >= amp->am_mapsize);
+
+	memcpy(amp->am_diskmap, buf, amp->am_mapsize);
+	memcpy(amp->am_memmap, buf, amp->am_mapsize);
+	memcpy(amp->am_syncmap, buf, amp->am_mapsize);
+
+	bit_ffs(amp->am_memmap, amp->am_nextents, &first);
+	/* No dirty extents at all, nothing more to do. */
+	if (first == -1)
+		return;
+
+	/* Start synchronization from the first dirty extent. */
+	activemap_sync_rewind(amp);
+
+	/*
+	 * Dirty extents have to stay dirty until they are synchronized,
+	 * so the number of pending writes of each of them is set to the
+	 * number of requests needed to synchronize it.
+	 */
+	amp->am_ndirty = 0;
+	for (ext = first; ext < amp->am_nextents; ext++) {
+		if (!bit_test(amp->am_memmap, ext))
+			continue;
+		amp->am_memtab[ext] = ext2reqs(amp, ext);
+		amp->am_ndirty++;
+	}
+}
+
+/*
+ * Function merges the given bitmap with the existing one.
+ *
+ * buf is interpreted as a bitmap of dirty extents; size has to be at
+ * least am_mapsize. Every extent set in buf that is not yet marked for
+ * synchronization locally gets marked in the synchronization, in-memory
+ * and on-disk bitmaps. If buf contains no dirty extents the function
+ * returns without touching the local state.
+ */
+void
+activemap_merge(struct activemap *amp, const unsigned char *buf, size_t size)
+{
+ bitstr_t *remmap = __DECONST(bitstr_t *, buf);
+ int ext;
+
+ assert(amp->am_magic == ACTIVEMAP_MAGIC);
+ assert(size >= amp->am_mapsize);
+
+ bit_ffs(remmap, amp->am_nextents, &ext);
+ if (ext == -1) {
+ /* There are no dirty extents, so we can leave now. */
+ return;
+ }
+ /*
+ * We have dirty extents and we want them to stay that way until
+ * we synchronize, so we set number of pending writes to number
+ * of requests needed to synchronize one extent.
+ */
+ for (; ext < amp->am_nextents; ext++) {
+ /* Local extent is already marked for synchronization. */
+ if (bit_test(amp->am_syncmap, ext))
+ continue;
+ /* Remote extent isn't dirty. */
+ if (!bit_test(remmap, ext))
+ continue;
+ bit_set(amp->am_syncmap, ext);
+ bit_set(amp->am_memmap, ext);
+ bit_set(amp->am_diskmap, ext);
+ /* Count the extent as dirty only if it had no pending writes. */
+ if (amp->am_memtab[ext] == 0)
+ amp->am_ndirty++;
+ amp->am_memtab[ext] = ext2reqs(amp, ext);
+ }
+ /*
+ * Set synchronization offset to the first dirty extent.
+ */
+ activemap_sync_rewind(amp);
+}
+
+/*
+ * Prepare the bitmap that should be written to disk and return a pointer
+ * to it.
+ *
+ * The on-disk bitmap is refreshed from the in-memory bitmap and then the
+ * extents from the keepdirty list are marked in it as well. If sizep is
+ * not NULL, the on-disk size of the bitmap is stored there.
+ */
+const unsigned char *
+activemap_bitmap(struct activemap *amp, size_t *sizep)
+{
+
+	assert(amp->am_magic == ACTIVEMAP_MAGIC);
+
+	memcpy(amp->am_diskmap, amp->am_memmap, amp->am_mapsize);
+	keepdirty_fill(amp);
+
+	if (sizep != NULL)
+		*sizep = amp->am_diskmapsize;
+	return ((const unsigned char *)amp->am_diskmap);
+}
+
+/*
+ * Calculate the number of bytes needed to store on disk the bitmap for
+ * media of the given size and extent size: one bit per extent, rounded up
+ * to a multiple of the sector size. extentsize and sectorsize have to be
+ * powers of 2.
+ */
+size_t
+activemap_calc_ondisk_size(uint64_t mediasize, uint32_t extentsize,
+    uint32_t sectorsize)
+{
+	uint64_t nbits, nbytes;
+
+	assert(mediasize > 0);
+	assert(extentsize > 0);
+	assert(powerof2(extentsize));
+	assert(sectorsize > 0);
+	assert(powerof2(sectorsize));
+
+	/* One bit for every extent, including a partial last one. */
+	nbits = ((mediasize - 1) / extentsize) + 1;
+	nbytes = sizeof(bitstr_t) * bitstr_size(nbits);
+	return (roundup2(nbytes, sectorsize));
+}
+
+/*
+ * Restart synchronization. If at least one extent is marked in the
+ * synchronization bitmap, am_syncoff is set to -1 so that the next
+ * activemap_sync_offset() call starts from the first marked extent;
+ * otherwise it is set to -2, as there is nothing to synchronize.
+ */
+void
+activemap_sync_rewind(struct activemap *amp)
+{
+	int first;
+
+	assert(amp->am_magic == ACTIVEMAP_MAGIC);
+
+	bit_ffs(amp->am_syncmap, amp->am_nextents, &first);
+	amp->am_syncoff = (first == -1) ? -2 : -1;
+}
+
+/*
+ * Return next offset of where we should synchronize.
+ *
+ * On success the offset is returned and *lengthp is set to the number of
+ * bytes to synchronize starting at it (at most MAXPHYS, never crossing an
+ * extent boundary or the end of the media). -1 is returned when there is
+ * nothing (more) to synchronize; *lengthp is not modified in that case.
+ *
+ * *syncextp is set to the number of the extent that has just been
+ * finished (its bit is cleared in the synchronization bitmap here), or to
+ * -1 if this call did not finish any extent.
+ *
+ * am_syncoff keeps the state between calls: -2 means nothing to
+ * synchronize, -1 means the next extent has to be looked up in the
+ * synchronization bitmap, any other value is the offset handed out by the
+ * previous call.
+ */
+off_t
+activemap_sync_offset(struct activemap *amp, off_t *lengthp, int *syncextp)
+{
+ off_t syncoff, left;
+ int ext;
+
+ assert(amp->am_magic == ACTIVEMAP_MAGIC);
+ assert(lengthp != NULL);
+ assert(syncextp != NULL);
+
+ *syncextp = -1;
+
+ if (amp->am_syncoff == -2)
+ return (-1);
+
+ /*
+ * The previous offset was the last one in its extent if advancing by
+ * MAXPHYS would leave the media or land in a different extent.
+ */
+ if (amp->am_syncoff >= 0 &&
+ (amp->am_syncoff + MAXPHYS >= amp->am_mediasize ||
+ off2ext(amp, amp->am_syncoff) !=
+ off2ext(amp, amp->am_syncoff + MAXPHYS))) {
+ /*
+ * We are about to change extent, so mark previous one as clean.
+ */
+ ext = off2ext(amp, amp->am_syncoff);
+ bit_clear(amp->am_syncmap, ext);
+ *syncextp = ext;
+ amp->am_syncoff = -1;
+ }
+
+ if (amp->am_syncoff == -1) {
+ /*
+ * Let's find first extent to synchronize.
+ */
+ bit_ffs(amp->am_syncmap, amp->am_nextents, &ext);
+ if (ext == -1) {
+ amp->am_syncoff = -2;
+ return (-1);
+ }
+ amp->am_syncoff = ext2off(amp, ext);
+ } else {
+ /*
+ * We don't change extent, so just increase offset.
+ */
+ amp->am_syncoff += MAXPHYS;
+ if (amp->am_syncoff >= amp->am_mediasize) {
+ amp->am_syncoff = -2;
+ return (-1);
+ }
+ }
+
+ syncoff = amp->am_syncoff;
+ /* Bytes left from syncoff to the end of its extent... */
+ left = ext2off(amp, off2ext(amp, syncoff)) +
+ amp->am_extentsize - syncoff;
+ /* ...limited by the end of the media and by MAXPHYS. */
+ if (syncoff + left > amp->am_mediasize)
+ left = amp->am_mediasize - syncoff;
+ if (left > MAXPHYS)
+ left = MAXPHYS;
+
+ assert(left >= 0 && left <= MAXPHYS);
+ assert(syncoff >= 0 && syncoff < amp->am_mediasize);
+ assert(syncoff + left >= 0 && syncoff + left <= amp->am_mediasize);
+
+ *lengthp = left;
+ return (syncoff);
+}
+
+/*
+ * Mark the extent(s) containing the range [offset, offset + length) for
+ * synchronization. Most likely one of the components is unavailable.
+ *
+ * Extents already marked for synchronization are left alone. Every newly
+ * marked extent is also marked dirty in the in-memory bitmap and its
+ * counter of pending writes is increased by the number of requests needed
+ * to synchronize it. Returns true if any extent was newly marked.
+ */
+bool
+activemap_need_sync(struct activemap *amp, off_t offset, off_t length)
+{
+	bool changed;
+	int ext, last;
+
+	assert(amp->am_magic == ACTIVEMAP_MAGIC);
+
+	changed = false;
+	ext = off2ext(amp, offset);
+	last = off2ext(amp, offset + length - 1);
+
+	for (; ext <= last; ext++) {
+		if (bit_test(amp->am_syncmap, ext)) {
+			/* Already marked for synchronization. */
+			assert(bit_test(amp->am_memmap, ext));
+			continue;
+		}
+		bit_set(amp->am_syncmap, ext);
+		if (!bit_test(amp->am_memmap, ext)) {
+			bit_set(amp->am_memmap, ext);
+			amp->am_ndirty++;
+		}
+		amp->am_memtab[ext] += ext2reqs(amp, ext);
+		changed = true;
+	}
+
+	return (changed);
+}
+
+/*
+ * Print the in-memory (M), on-disk (D) and synchronization (S) bitmaps to
+ * standard output, one line per bitmap and one digit (0 or 1) per extent.
+ */
+void
+activemap_dump(const struct activemap *amp)
+{
+	const char *labels[] = { "M: ", "D: ", "S: " };
+	const bitstr_t *maps[3];
+	unsigned int ii;
+	int bit;
+
+	maps[0] = amp->am_memmap;
+	maps[1] = amp->am_diskmap;
+	maps[2] = amp->am_syncmap;
+
+	for (ii = 0; ii < 3; ii++) {
+		printf("%s", labels[ii]);
+		for (bit = 0; bit < amp->am_nextents; bit++)
+			printf("%d", bit_test(maps[ii], bit) ? 1 : 0);
+		printf("\n");
+	}
+}
diff --git a/sbin/hastd/activemap.h b/sbin/hastd/activemap.h
new file mode 100644
index 0000000..42f0221
--- /dev/null
+++ b/sbin/hastd/activemap.h
@@ -0,0 +1,69 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _ACTIVEMAP_H_
+#define _ACTIVEMAP_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+struct activemap;
+
+int activemap_init(struct activemap **ampp, uint64_t mediasize,
+ uint32_t extentsize, uint32_t sectorsize, uint32_t keepdirty);
+void activemap_free(struct activemap *amp);
+
+bool activemap_write_start(struct activemap *amp, off_t offset, off_t length);
+bool activemap_write_complete(struct activemap *amp, off_t offset,
+ off_t length);
+bool activemap_extent_complete(struct activemap *amp, int extent);
+uint64_t activemap_ndirty(const struct activemap *amp);
+
+bool activemap_differ(const struct activemap *amp);
+size_t activemap_size(const struct activemap *amp);
+size_t activemap_ondisk_size(const struct activemap *amp);
+void activemap_copyin(struct activemap *amp, const unsigned char *buf,
+ size_t size);
+void activemap_merge(struct activemap *amp, const unsigned char *buf,
+ size_t size);
+const unsigned char *activemap_bitmap(struct activemap *amp, size_t *sizep);
+
+size_t activemap_calc_ondisk_size(uint64_t mediasize, uint32_t extentsize,
+ uint32_t sectorsize);
+
+void activemap_sync_rewind(struct activemap *amp);
+off_t activemap_sync_offset(struct activemap *amp, off_t *lengthp,
+ int *syncextp);
+bool activemap_need_sync(struct activemap *amp, off_t offset, off_t length);
+
+void activemap_dump(const struct activemap *amp);
+
+#endif /* !_ACTIVEMAP_H_ */
diff --git a/sbin/hastd/control.c b/sbin/hastd/control.c
new file mode 100644
index 0000000..0ad39b4
--- /dev/null
+++ b/sbin/hastd/control.c
@@ -0,0 +1,426 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <signal.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "hast.h"
+#include "hastd.h"
+#include "hast_proto.h"
+#include "nv.h"
+#include "pjdlog.h"
+#include "proto.h"
+#include "subr.h"
+
+#include "control.h"
+
+/*
+ * Handle the set-role request for one resource.
+ *
+ * cfg   - configuration holding the list of resources
+ * nvout - response being built; entries are added under names suffixed
+ *         with 'no'
+ * role  - the requested role
+ * res   - the resource, or NULL to look it up in cfg by 'name'
+ * name  - resource name, always echoed back as "resource<no>"
+ * no    - index of this resource within the response
+ *
+ * If the resource cannot be found, EHAST_NOENTRY is reported as
+ * "error<no>". Otherwise the previous role is reported as "role<no>" and,
+ * if the role actually changes, the existing worker process (if any) is
+ * terminated and a new one is started when switching to primary.
+ */
+static void
+control_set_role(struct hastd_config *cfg, struct nv *nvout, uint8_t role,
+ struct hast_resource *res, const char *name, unsigned int no)
+{
+
+ assert(cfg != NULL);
+ assert(nvout != NULL);
+ assert(name != NULL);
+
+ /* Name is always needed. */
+ nv_add_string(nvout, name, "resource%u", no);
+
+ if (res == NULL) {
+ /* TAILQ_FOREACH leaves res == NULL when nothing matched. */
+ TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
+ if (strcmp(res->hr_name, name) == 0)
+ break;
+ }
+ if (res == NULL) {
+ nv_add_int16(nvout, EHAST_NOENTRY, "error%u", no);
+ return;
+ }
+ }
+ assert(res != NULL);
+
+ /* Send previous role back. */
+ nv_add_string(nvout, role2str(res->hr_role), "role%u", no);
+
+ /* Nothing changed, return here. */
+ if (role == res->hr_role)
+ return;
+
+ /* Log the change with a prefix that still shows the old role. */
+ pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role));
+ pjdlog_info("Role changed to %s.", role2str(role));
+
+ /* Change role to the new one. */
+ res->hr_role = role;
+ pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role));
+
+ /*
+ * If previous role was primary or secondary we have to kill process
+ * doing that work.
+ */
+ if (res->hr_workerpid != 0) {
+ if (kill(res->hr_workerpid, SIGTERM) < 0) {
+ pjdlog_errno(LOG_WARNING,
+ "Unable to kill worker process %u",
+ (unsigned int)res->hr_workerpid);
+ } else if (waitpid(res->hr_workerpid, NULL, 0) !=
+ res->hr_workerpid) {
+ pjdlog_errno(LOG_WARNING,
+ "Error while waiting for worker process %u",
+ (unsigned int)res->hr_workerpid);
+ } else {
+ pjdlog_debug(1, "Worker process %u stopped.",
+ (unsigned int)res->hr_workerpid);
+ }
+ /* The pid is forgotten even if kill() or waitpid() failed. */
+ res->hr_workerpid = 0;
+ }
+
+ /* Start worker process if we are changing to primary. */
+ if (role == HAST_ROLE_PRIMARY)
+ hastd_primary(res);
+ /* Drop the per-resource log prefix set above. */
+ pjdlog_prefix_set("%s", "");
+}
+
+/*
+ * Ask the worker process of the given resource for its status and copy
+ * the answer into the response.
+ *
+ * res   - resource whose worker is queried over res->hr_ctrl
+ * nvout - response being built; "status", "dirty", "extentsize" and
+ *         "keepdirty" are added under names suffixed with 'no'
+ * no    - index of this resource within the response
+ *
+ * If the request cannot be built, sent or answered, or the worker reports
+ * an error, the error code is added to nvout as "error" instead.
+ */
+static void
+control_status_worker(struct hast_resource *res, struct nv *nvout,
+    unsigned int no)
+{
+	struct nv *cnvin, *cnvout;
+	const char *str;
+	int error;
+
+	cnvin = cnvout = NULL;
+	error = 0;
+
+	/*
+	 * Prepare and send command to worker process.
+	 */
+	cnvout = nv_alloc();
+	nv_add_uint8(cnvout, HASTCTL_STATUS, "cmd");
+	error = nv_error(cnvout);
+	if (error != 0) {
+		/* LOG */
+		goto end;
+	}
+	if (hast_proto_send(res, res->hr_ctrl, cnvout, NULL, 0) < 0) {
+		error = errno;
+		/* LOG */
+		goto end;
+	}
+
+	/*
+	 * Receive response.
+	 */
+	if (hast_proto_recv_hdr(res->hr_ctrl, &cnvin) < 0) {
+		error = errno;
+		/* LOG */
+		goto end;
+	}
+
+	/*
+	 * The worker (ctrl_thread()) stores its error code with
+	 * nv_add_int16(), so it has to be read back as int16 as well.
+	 */
+	error = nv_get_int16(cnvin, "error");
+	if (error != 0)
+		goto end;
+
+	if ((str = nv_get_string(cnvin, "status")) == NULL) {
+		error = ENOENT;
+		/* LOG */
+		goto end;
+	}
+	nv_add_string(nvout, str, "status%u", no);
+	nv_add_uint64(nvout, nv_get_uint64(cnvin, "dirty"), "dirty%u", no);
+	nv_add_uint32(nvout, nv_get_uint32(cnvin, "extentsize"),
+	    "extentsize%u", no);
+	nv_add_uint32(nvout, nv_get_uint32(cnvin, "keepdirty"),
+	    "keepdirty%u", no);
+end:
+	/* Both the success and the failure paths release the messages. */
+	if (cnvin != NULL)
+		nv_free(cnvin);
+	if (cnvout != NULL)
+		nv_free(cnvout);
+	if (error != 0)
+		nv_add_int16(nvout, error, "error");
+}
+
+/*
+ * Handle the status request for one resource.
+ *
+ * cfg   - configuration holding the list of resources
+ * nvout - response being built; entries are added under names suffixed
+ *         with 'no'
+ * res   - the resource, or NULL to look it up in cfg by 'name'
+ * name  - resource name, always echoed back as "resource<no>"
+ * no    - index of this resource within the response
+ *
+ * The locally known settings are always reported; the worker process is
+ * queried in addition when the resource is primary or secondary and has
+ * a worker.
+ */
+static void
+control_status(struct hastd_config *cfg, struct nv *nvout,
+    struct hast_resource *res, const char *name, unsigned int no)
+{
+	const char *replication;
+
+	assert(cfg != NULL);
+	assert(nvout != NULL);
+	assert(name != NULL);
+
+	/* The name is reported even if the lookup below fails. */
+	nv_add_string(nvout, name, "resource%u", no);
+
+	if (res == NULL) {
+		/* No resource was given, so find it by name. */
+		TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
+			if (strcmp(res->hr_name, name) == 0)
+				break;
+		}
+		if (res == NULL) {
+			nv_add_int16(nvout, EHAST_NOENTRY, "error%u", no);
+			return;
+		}
+	}
+	assert(res != NULL);
+
+	nv_add_string(nvout, res->hr_provname, "provname%u", no);
+	nv_add_string(nvout, res->hr_localpath, "localpath%u", no);
+	nv_add_string(nvout, res->hr_remoteaddr, "remoteaddr%u", no);
+
+	switch (res->hr_replication) {
+	case HAST_REPLICATION_FULLSYNC:
+		replication = "fullsync";
+		break;
+	case HAST_REPLICATION_MEMSYNC:
+		replication = "memsync";
+		break;
+	case HAST_REPLICATION_ASYNC:
+		replication = "async";
+		break;
+	default:
+		replication = "unknown";
+		break;
+	}
+	nv_add_string(nvout, replication, "replication%u", no);
+	nv_add_string(nvout, role2str(res->hr_role), "role%u", no);
+
+	/* Only primary and secondary resources can have a worker. */
+	if (res->hr_role == HAST_ROLE_PRIMARY) {
+		assert(res->hr_workerpid != 0);
+	} else if (res->hr_role != HAST_ROLE_SECONDARY) {
+		return;
+	}
+	if (res->hr_workerpid == 0)
+		return;
+
+	/*
+	 * There is a worker process, so ask it for the remaining details.
+	 */
+	control_status_worker(res, nvout, no);
+}
+
+/*
+ * Accept one connection on the control socket and serve a single request.
+ *
+ * The request has to carry a non-zero "cmd" and a "resource0" entry; for
+ * HASTCTL_SET_ROLE it also carries "role". If "resource0" is "all", the
+ * command is applied to every configured resource, otherwise to each
+ * "resource<N>" entry in turn until one is missing. The response is then
+ * sent back and the connection is closed.
+ *
+ * Note that the paths which jump straight to 'close' (receive failure,
+ * missing "cmd", response allocation failure, error recorded in nvout)
+ * close the connection without sending any response.
+ */
+void
+control_handle(struct hastd_config *cfg)
+{
+ struct proto_conn *conn;
+ struct nv *nvin, *nvout;
+ unsigned int ii;
+ const char *str;
+ uint8_t cmd, role;
+ int error;
+
+ if (proto_accept(cfg->hc_controlconn, &conn) < 0) {
+ pjdlog_errno(LOG_ERR, "Unable to accept control connection");
+ return;
+ }
+
+ nvin = nvout = NULL;
+ role = HAST_ROLE_UNDEF;
+
+ if (hast_proto_recv_hdr(conn, &nvin) < 0) {
+ pjdlog_errno(LOG_ERR, "Unable to receive control header");
+ nvin = NULL;
+ goto close;
+ }
+
+ /* Obtain command code. 0 means that nv_get_uint8() failed. */
+ cmd = nv_get_uint8(nvin, "cmd");
+ if (cmd == 0) {
+ pjdlog_error("Control header is missing 'cmd' field.");
+ /* 'error' is not reported on this path: no response is sent. */
+ error = EHAST_INVALID;
+ goto close;
+ }
+
+ /* Allocate outgoing nv structure. */
+ nvout = nv_alloc();
+ if (nvout == NULL) {
+ pjdlog_error("Unable to allocate header for control response.");
+ /* 'error' is not reported on this path: no response is sent. */
+ error = EHAST_NOMEMORY;
+ goto close;
+ }
+
+ error = 0;
+
+ str = nv_get_string(nvin, "resource0");
+ if (str == NULL) {
+ pjdlog_error("Control header is missing 'resource0' field.");
+ error = EHAST_INVALID;
+ goto fail;
+ }
+ if (cmd == HASTCTL_SET_ROLE) {
+ role = nv_get_uint8(nvin, "role");
+ switch (role) {
+ case HAST_ROLE_INIT: /* Is that valid to set, hmm? */
+ case HAST_ROLE_PRIMARY:
+ case HAST_ROLE_SECONDARY:
+ break;
+ default:
+ pjdlog_error("Invalid role received (%hhu).", role);
+ error = EHAST_INVALID;
+ goto fail;
+ }
+ }
+ if (strcmp(str, "all") == 0) {
+ struct hast_resource *res;
+
+ /* All configured resources. */
+
+ /* ii numbers the per-resource entries in the response. */
+ ii = 0;
+ TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
+ switch (cmd) {
+ case HASTCTL_SET_ROLE:
+ control_set_role(cfg, nvout, role, res,
+ res->hr_name, ii++);
+ break;
+ case HASTCTL_STATUS:
+ control_status(cfg, nvout, res, res->hr_name,
+ ii++);
+ break;
+ default:
+ pjdlog_error("Invalid command received (%hhu).",
+ cmd);
+ error = EHAST_UNIMPLEMENTED;
+ goto fail;
+ }
+ }
+ } else {
+ /* Only selected resources. */
+
+ /* Walk resource0, resource1, ... until one is missing. */
+ for (ii = 0; ; ii++) {
+ str = nv_get_string(nvin, "resource%u", ii);
+ if (str == NULL)
+ break;
+ switch (cmd) {
+ case HASTCTL_SET_ROLE:
+ control_set_role(cfg, nvout, role, NULL, str,
+ ii);
+ break;
+ case HASTCTL_STATUS:
+ control_status(cfg, nvout, NULL, str, ii);
+ break;
+ default:
+ pjdlog_error("Invalid command received (%hhu).",
+ cmd);
+ error = EHAST_UNIMPLEMENTED;
+ goto fail;
+ }
+ }
+ }
+ /* Building the response failed; give up without replying. */
+ if (nv_error(nvout) != 0)
+ goto close;
+ /* Reached on success as well; 'error' is 0 in that case. */
+fail:
+ if (error != 0)
+ nv_add_int16(nvout, error, "error");
+
+ if (hast_proto_send(NULL, conn, nvout, NULL, 0) < 0)
+ pjdlog_errno(LOG_ERR, "Unable to send control response");
+close:
+ if (nvin != NULL)
+ nv_free(nvin);
+ if (nvout != NULL)
+ nv_free(nvout);
+ proto_close(conn);
+}
+
+/*
+ * Thread handles control requests from the parent.
+ *
+ * arg is the resource (struct hast_resource *) served by this worker.
+ * The thread loops forever: it receives a request on res->hr_ctrl,
+ * builds the answer and sends it back over the same connection. It only
+ * terminates, through pthread_exit(), when receiving fails and
+ * sigexit_received is set.
+ *
+ * For HASTCTL_STATUS the answer contains "status", "extentsize",
+ * "keepdirty" and "dirty"; for any other command it contains "error" set
+ * to EINVAL.
+ */
+void *
+ctrl_thread(void *arg)
+{
+ struct hast_resource *res = arg;
+ struct nv *nvin, *nvout;
+ uint8_t cmd;
+
+ for (;;) {
+ if (hast_proto_recv_hdr(res->hr_ctrl, &nvin) < 0) {
+ if (sigexit_received)
+ pthread_exit(NULL);
+ pjdlog_errno(LOG_ERR,
+ "Unable to receive control message");
+ continue;
+ }
+ /* As in control_handle(), a command code of 0 means failure. */
+ cmd = nv_get_uint8(nvin, "cmd");
+ if (cmd == 0) {
+ pjdlog_error("Control message is missing 'cmd' field.");
+ nv_free(nvin);
+ continue;
+ }
+ /* Only the command code is needed from the request. */
+ nv_free(nvin);
+ nvout = nv_alloc();
+ switch (cmd) {
+ case HASTCTL_STATUS:
+ /* "complete" requires both remote connections to be set. */
+ if (res->hr_remotein != NULL &&
+ res->hr_remoteout != NULL) {
+ nv_add_string(nvout, "complete", "status");
+ } else {
+ nv_add_string(nvout, "degraded", "status");
+ }
+ nv_add_uint32(nvout, (uint32_t)res->hr_extentsize,
+ "extentsize");
+ if (res->hr_role == HAST_ROLE_PRIMARY) {
+ nv_add_uint32(nvout,
+ (uint32_t)res->hr_keepdirty, "keepdirty");
+ /* Dirty regions times extent size, i.e. bytes. */
+ nv_add_uint64(nvout,
+ (uint64_t)(activemap_ndirty(res->hr_amp) *
+ res->hr_extentsize), "dirty");
+ } else {
+ /* Zeros are reported for any role other than primary. */
+ nv_add_uint32(nvout, (uint32_t)0, "keepdirty");
+ nv_add_uint64(nvout, (uint64_t)0, "dirty");
+ }
+ break;
+ default:
+ nv_add_int16(nvout, EINVAL, "error");
+ break;
+ }
+ /* No answer is sent if building it failed. */
+ if (nv_error(nvout) != 0) {
+ pjdlog_error("Unable to create answer on control message.");
+ nv_free(nvout);
+ continue;
+ }
+ if (hast_proto_send(NULL, res->hr_ctrl, nvout, NULL, 0) < 0) {
+ pjdlog_errno(LOG_ERR,
+ "Unable to send reply to control message");
+ }
+ nv_free(nvout);
+ }
+ /* NOTREACHED */
+ return (NULL);
+}
diff --git a/sbin/hastd/control.h b/sbin/hastd/control.h
new file mode 100644
index 0000000..15ea290
--- /dev/null
+++ b/sbin/hastd/control.h
@@ -0,0 +1,44 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _CONTROL_H_
+#define _CONTROL_H_
+
+#define HASTCTL_SET_ROLE 1 /* Same value as HASTCTL_CMD_SETROLE in hast.h. */
+#define HASTCTL_STATUS 2 /* Status query; the 'cmd' value ctrl_thread() answers. */
+
+struct hastd_config;
+
+void control_handle(struct hastd_config *cfg);
+
+void *ctrl_thread(void *arg); /* Thread body; arg is a struct hast_resource *. */
+
+#endif /* !_CONTROL_H_ */
diff --git a/sbin/hastd/ebuf.c b/sbin/hastd/ebuf.c
new file mode 100644
index 0000000..47b7530
--- /dev/null
+++ b/sbin/hastd/ebuf.c
@@ -0,0 +1,252 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include "ebuf.h"
+
+#define EBUF_MAGIC 0xeb0f41c
+struct ebuf {
+ /*
+  * Magic to assert the caller uses valid structure: set to EBUF_MAGIC by
+  * ebuf_alloc(), checked with assert() by the other ebuf functions and
+  * reset to 0 by ebuf_free().
+  */
+ int eb_magic;
+ /* Address where we did the allocation (start of the malloc()ed storage). */
+ unsigned char *eb_start;
+ /* Allocation end address (eb_start plus the allocated size). */
+ unsigned char *eb_end;
+ /*
+  * Start of real data.  The bytes between eb_start and eb_used are free
+  * room for ebuf_add_head().
+  */
+ unsigned char *eb_used;
+ /* Size of real data, in bytes, counted from eb_used. */
+ size_t eb_size;
+};
+
+static int ebuf_head_extent(struct ebuf *eb, size_t size);
+static int ebuf_tail_extent(struct ebuf *eb, size_t size);
+
+/*
+ * Allocate an empty extendable buffer with room for at least size bytes.
+ *
+ * PAGE_SIZE bytes of slack are allocated on top of size.  A quarter of the
+ * slack is left in front of the data so that entries can be added at the
+ * head without reallocating; the rest follows the data.
+ *
+ * Returns the new buffer, or NULL with errno set if memory cannot be
+ * obtained or if size is so large that adding the slack would overflow.
+ */
+struct ebuf *
+ebuf_alloc(size_t size)
+{
+	struct ebuf *eb;
+	int rerrno;
+
+	/*
+	 * Refuse sizes for which size + PAGE_SIZE wraps around; otherwise we
+	 * would silently allocate far less than the caller asked for.
+	 */
+	if (size > SIZE_MAX - PAGE_SIZE) {
+		errno = ENOMEM;
+		return (NULL);
+	}
+	size += PAGE_SIZE;
+
+	eb = malloc(sizeof(*eb));
+	if (eb == NULL)
+		return (NULL);
+	eb->eb_start = malloc(size);
+	if (eb->eb_start == NULL) {
+		/* free() may clobber errno; report malloc()'s error. */
+		rerrno = errno;
+		free(eb);
+		errno = rerrno;
+		return (NULL);
+	}
+	eb->eb_end = eb->eb_start + size;
+	/*
+	 * We set start address for real data not at the first entry, because
+	 * we want to be able to add data at the front.
+	 */
+	eb->eb_used = eb->eb_start + PAGE_SIZE / 4;
+	eb->eb_size = 0;
+	eb->eb_magic = EBUF_MAGIC;
+
+	return (eb);
+}
+
+/*
+ * Release the buffer and its storage.  eb must be a valid buffer; it may
+ * not be used afterwards.
+ */
+void
+ebuf_free(struct ebuf *eb)
+{
+	unsigned char *storage;
+
+	assert(eb != NULL && eb->eb_magic == EBUF_MAGIC);
+
+	/* Clear the magic so that a stale pointer fails the assertion. */
+	storage = eb->eb_start;
+	eb->eb_magic = 0;
+
+	free(storage);
+	free(eb);
+}
+
+/*
+ * Add size bytes in front of the data already held by the buffer.
+ *
+ * If data is NULL the space is only reserved and left for the caller to
+ * fill in.  Returns 0 on success, or -1 if the buffer had to be extended and
+ * the extension failed.
+ */
+int
+ebuf_add_head(struct ebuf *eb, const void *data, size_t size)
+{
+
+	assert(eb != NULL && eb->eb_magic == EBUF_MAGIC);
+
+	/*
+	 * When the free room in front of the data is too small we have to
+	 * extend the buffer first.
+	 */
+	if ((size_t)(eb->eb_used - eb->eb_start) < size &&
+	    ebuf_head_extent(eb, size) < 0)
+		return (-1);
+	assert((size_t)(eb->eb_used - eb->eb_start) >= size);
+
+	eb->eb_used -= size;
+	eb->eb_size += size;
+	/* Nothing to copy when the caller only reserves the place. */
+	if (data == NULL)
+		return (0);
+	bcopy(data, eb->eb_used, size);
+
+	return (0);
+}
+
+/*
+ * Add size bytes after the data already held by the buffer.
+ *
+ * If data is NULL the space is only reserved and left for the caller to
+ * fill in.  Returns 0 on success, or -1 if the buffer had to be extended and
+ * the extension failed.
+ */
+int
+ebuf_add_tail(struct ebuf *eb, const void *data, size_t size)
+{
+
+	assert(eb != NULL && eb->eb_magic == EBUF_MAGIC);
+
+	/*
+	 * When the free room behind the data is too small we have to extend
+	 * the buffer first.
+	 */
+	if ((size_t)(eb->eb_end - (eb->eb_used + eb->eb_size)) < size &&
+	    ebuf_tail_extent(eb, size) < 0)
+		return (-1);
+	assert((size_t)(eb->eb_end - (eb->eb_used + eb->eb_size)) >= size);
+
+	/* Copy only if the caller handed us something to copy. */
+	if (data != NULL)
+		bcopy(data, eb->eb_used + eb->eb_size, size);
+	eb->eb_size += size;
+
+	return (0);
+}
+
+/*
+ * Drop the first size bytes of data.  size may not exceed the amount of
+ * data held.  The storage itself is not shrunk.
+ */
+void
+ebuf_del_head(struct ebuf *eb, size_t size)
+{
+
+	assert(eb != NULL && eb->eb_magic == EBUF_MAGIC);
+	assert(eb->eb_size >= size);
+
+	eb->eb_size -= size;
+	eb->eb_used += size;
+}
+
+/*
+ * Drop the last size bytes of data.  size may not exceed the amount of data
+ * held.  The storage itself is not shrunk.
+ */
+void
+ebuf_del_tail(struct ebuf *eb, size_t size)
+{
+
+	assert(eb != NULL && eb->eb_magic == EBUF_MAGIC);
+	assert(eb->eb_size >= size);
+
+	eb->eb_size = eb->eb_size - size;
+}
+
+/*
+ * Return pointer to the data, or NULL when the buffer holds no data.
+ * If sizep is not NULL, the data size is stored there as well.
+ */
+void *
+ebuf_data(struct ebuf *eb, size_t *sizep)
+{
+
+	assert(eb != NULL && eb->eb_magic == EBUF_MAGIC);
+
+	if (sizep != NULL)
+		*sizep = eb->eb_size;
+	if (eb->eb_size == 0)
+		return (NULL);
+	return (eb->eb_used);
+}
+
+/*
+ * Return the number of data bytes currently held by the buffer.
+ */
+size_t
+ebuf_size(struct ebuf *eb)
+{
+	size_t held;
+
+	assert(eb != NULL && eb->eb_magic == EBUF_MAGIC);
+
+	held = eb->eb_size;
+	return (held);
+}
+
+/*
+ * Function adds size + (PAGE_SIZE / 4) bytes at the front of the buffer.
+ *
+ * A new, larger storage area is allocated and the data is copied into it,
+ * shifted towards the end by the amount added.  The old storage is released,
+ * so pointers previously obtained from ebuf_data() are no longer valid.
+ *
+ * Returns 0 on success, or -1 if the allocation fails, in which case the
+ * buffer is left untouched.
+ */
+static int
+ebuf_head_extent(struct ebuf *eb, size_t size)
+{
+	unsigned char *newstart, *newused;
+	size_t newsize;
+
+	assert(eb != NULL && eb->eb_magic == EBUF_MAGIC);
+
+	newsize = eb->eb_end - eb->eb_start + (PAGE_SIZE / 4) + size;
+
+	newstart = malloc(newsize);
+	if (newstart == NULL)
+		return (-1);
+	/* Keep the old head room and put all the new bytes in front of it. */
+	newused =
+	    newstart + (PAGE_SIZE / 4) + size + (eb->eb_used - eb->eb_start);
+
+	bcopy(eb->eb_used, newused, eb->eb_size);
+
+	/* The data lives in the new storage now; do not leak the old one. */
+	free(eb->eb_start);
+
+	eb->eb_start = newstart;
+	eb->eb_used = newused;
+	eb->eb_end = newstart + newsize;
+
+	return (0);
+}
+
+/*
+ * Function adds size + ((3 * PAGE_SIZE) / 4) bytes at the back.
+ *
+ * The storage is grown with realloc(), so it may move.  Returns 0 on
+ * success, or -1 if realloc() fails, in which case the buffer is left
+ * untouched.
+ */
+static int
+ebuf_tail_extent(struct ebuf *eb, size_t size)
+{
+	unsigned char *grown;
+	size_t capacity, dataoff;
+
+	assert(eb != NULL && eb->eb_magic == EBUF_MAGIC);
+
+	/* Remember where the data starts relative to the storage. */
+	dataoff = eb->eb_used - eb->eb_start;
+	capacity = eb->eb_end - eb->eb_start + size + ((3 * PAGE_SIZE) / 4);
+
+	grown = realloc(eb->eb_start, capacity);
+	if (grown == NULL)
+		return (-1);
+
+	eb->eb_start = grown;
+	eb->eb_used = grown + dataoff;
+	eb->eb_end = grown + capacity;
+
+	return (0);
+}
diff --git a/sbin/hastd/ebuf.h b/sbin/hastd/ebuf.h
new file mode 100644
index 0000000..06275e7
--- /dev/null
+++ b/sbin/hastd/ebuf.h
@@ -0,0 +1,51 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _EBUF_H_
+#define _EBUF_H_
+
+#include <stdlib.h> /* size_t */
+
+struct ebuf; /* Opaque here; the layout is private to ebuf.c. */
+
+struct ebuf *ebuf_alloc(size_t size); /* Returns NULL on allocation failure. */
+void ebuf_free(struct ebuf *eb); /* eb must be a valid, non-NULL buffer. */
+
+int ebuf_add_head(struct ebuf *eb, const void *data, size_t size); /* 0 or -1; NULL data only reserves space. */
+int ebuf_add_tail(struct ebuf *eb, const void *data, size_t size); /* 0 or -1; NULL data only reserves space. */
+
+void ebuf_del_head(struct ebuf *eb, size_t size); /* size must not exceed ebuf_size(). */
+void ebuf_del_tail(struct ebuf *eb, size_t size); /* size must not exceed ebuf_size(). */
+
+void *ebuf_data(struct ebuf *eb, size_t *sizep); /* NULL when the buffer is empty; sizep may be NULL. */
+size_t ebuf_size(struct ebuf *eb); /* Number of data bytes held. */
+
+#endif /* !_EBUF_H_ */
diff --git a/sbin/hastd/hast.conf.5 b/sbin/hastd/hast.conf.5
new file mode 100644
index 0000000..5734ee8
--- /dev/null
+++ b/sbin/hastd/hast.conf.5
@@ -0,0 +1,267 @@
+.\" Copyright (c) 2010 The FreeBSD Foundation
+.\" All rights reserved.
+.\"
+.\" This software was developed by Pawel Jakub Dawidek under sponsorship from
+.\" the FreeBSD Foundation.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd February 1, 2010
+.Dt HAST.CONF 5
+.Os
+.Sh NAME
+.Nm hast.conf
+.Nd configuration file for the
+.Xr hastd 8
+daemon and the
+.Xr hastctl 8
+utility.
+.Sh DESCRIPTION
+The
+.Nm
+file is used by both
+.Xr hastd 8
+daemon
+and
+.Xr hastctl 8
+control utility.
+The configuration file is designed in a way that exactly the same file can be
+(and should be) used on both HAST nodes.
+Every line starting with # is treated as comment and ignored.
+.Sh CONFIGURATION FILE SYNTAX
+General syntax of the
+.Nm
+file is as follows:
+.Bd -literal -offset indent
+# Global section
+control <addr>
+listen <addr>
+replication <mode>
+
+on <node> {
+ # Node section
+ control <addr>
+ listen <addr>
+}
+
+on <node> {
+ # Node section
+ control <addr>
+ listen <addr>
+}
+
+resource <name> {
+ # Resource section
+ replication <mode>
+ name <name>
+ local <path>
+
+ on <node> {
+ # Resource-node section
+ name <name>
+ # Required
+ local <path>
+ # Required
+ remote <addr>
+ }
+ on <node> {
+ # Resource-node section
+ name <name>
+ # Required
+ local <path>
+ # Required
+ remote <addr>
+ }
+}
+.Ed
+.Pp
+Most of the various available configuration parameters are optional.
+If parameter is not defined in the particular section, it will be
+inherited from the parent section.
+For example, if the
+.Ic listen
+parameter is not defined in the node section, it will be inherited from
+the global section.
+In case the global section does not define the
+.Ic listen
+parameter at all, the default value will be used.
+.Sh CONFIGURATION FILE DESCRIPTION
+The
+.Aq node
+argument can be replaced either by a full hostname as obtained by
+.Xr gethostname 3 ,
+only first part of the hostname, or by node's UUID as found in the
+.Va kern.hostuuid
+.Xr sysctl 8
+variable.
+.Pp
+The following statements are available:
+.Bl -tag -width ".Ic xxxx"
+.It Ic control Aq addr
+.Pp
+Address for communication with
+.Xr hastctl 8 .
+Each of the following examples defines the same control address:
+.Bd -literal -offset indent
+uds:///var/run/hastctl
+unix:///var/run/hastctl
+/var/run/hastctl
+.Ed
+.Pp
+The default value is
+.Pa uds:///var/run/hastctl .
+.It Ic listen Aq addr
+.Pp
+Address to listen on in form of:
+.Bd -literal -offset indent
+protocol://protocol-specific-address
+.Ed
+.Pp
+Each of the following examples defines the same listen address:
+.Bd -literal -offset indent
+0.0.0.0
+0.0.0.0:8457
+tcp://0.0.0.0
+tcp://0.0.0.0:8457
+tcp4://0.0.0.0
+tcp4://0.0.0.0:8457
+.Ed
+.Pp
+The default value is
+.Pa tcp4://0.0.0.0:8457 .
+.It Ic replication Aq mode
+.Pp
+Replication mode should be one of the following:
+.Bl -tag -width ".Ic xxxx"
+.It Ic memsync
+.Pp
+Report the write operation as completed when local write completes and
+when the remote node acknowledges the data receipt, but before it
+actually stores the data.
+The data on remote node will be stored directly after sending
+acknowledgement.
+This mode is intended to reduce latency, but still provides a very good
+reliability.
+The only situation where some small amount of data could be lost is when
+the data is stored on primary node and sent to the secondary.
+Secondary node then acknowledges data receipt and primary reports
+success to an application.
+However, it may happen that the secondary goes down before the received
+data is really stored locally.
+Before secondary node returns, primary node dies entirely.
+When the secondary node comes back to life it becomes the new primary.
+Unfortunately some small amount of data which was confirmed to be stored
+to the application was lost.
+The risk of such a situation is very small, which is the reason for this
+mode to be the default.
+.It Ic fullsync
+.Pp
+Mark the write operation as completed when local as well as remote
+write completes.
+This is the safest and the slowest replication mode.
+The
+.Ic fullsync
+replication mode is currently not implemented.
+.It Ic async
+.Pp
+The write operation is reported as complete right after the local write
+completes.
+This is the fastest and the most dangerous replication mode.
+This mode should be used when replicating to a distant node where
+latency is too high for other modes.
+The
+.Ic async
+replication mode is currently not implemented.
+.El
+.It Ic name Aq name
+.Pp
+GEOM provider name that will appear as
+.Pa /dev/hast/<name> .
+If name is not defined, resource name will be used as provider name.
+.It Ic local Aq path
+.Pp
+Path to the local component which will be used as backend provider for
+the resource.
+This can be either GEOM provider or regular file.
+.It Ic remote Aq addr
+.Pp
+Address of the remote
+.Nm hastd
+daemon.
+Format is the same as for the
+.Ic listen
+statement.
+When operating as a primary node this address will be used to connect to
+the secondary node.
+When operating as a secondary node only connections from this address
+will be accepted.
+.El
+.Sh EXAMPLES
+The example configuration file can look as follows:
+.Bd -literal -offset indent
+resource shared {
+ local /dev/da0
+
+ on hasta {
+ remote tcp4://10.0.0.2
+ }
+ on hastb {
+ remote tcp4://10.0.0.1
+ }
+}
+resource tank {
+ on hasta {
+ local /dev/mirror/tanka
+ remote tcp4://10.0.0.2
+ }
+ on hastb {
+ local /dev/mirror/tankb
+ remote tcp4://10.0.0.1
+ }
+}
+.Ed
+.Sh FILES
+.Bl -tag -width ".Pa /var/run/hastctl" -compact
+.It Pa /etc/hast.conf
+The default
+.Nm
+configuration file.
+.It Pa /var/run/hastctl
+Control socket used by the
+.Xr hastctl 8
+control utility to communicate with the
+.Xr hastd 8
+daemon.
+.El
+.Sh SEE ALSO
+.Xr gethostname 3 ,
+.Xr geom 4 ,
+.Xr hastctl 8 ,
+.Xr hastd 8 .
+.Sh AUTHORS
+The
+.Nm
+was written by
+.An Pawel Jakub Dawidek Aq pjd@FreeBSD.org
+under sponsorship of the FreeBSD Foundation.
diff --git a/sbin/hastd/hast.h b/sbin/hastd/hast.h
new file mode 100644
index 0000000..c5220b5
--- /dev/null
+++ b/sbin/hastd/hast.h
@@ -0,0 +1,190 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HAST_H_
+#define _HAST_H_
+
+#include <sys/queue.h>
+#include <sys/socket.h>
+
+#include <arpa/inet.h>
+
+#include <netinet/in.h>
+
+#include <limits.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+#include <activemap.h>
+
+#include "proto.h"
+
+#define HAST_PROTO_VERSION 0 /* Sent in, and required of, every hast_main_header. */
+
+#define EHAST_OK 0
+#define EHAST_NOENTRY 1
+#define EHAST_INVALID 2
+#define EHAST_NOMEMORY 3
+#define EHAST_UNIMPLEMENTED 4
+
+#define HASTCTL_CMD_UNKNOWN 0
+#define HASTCTL_CMD_SETROLE 1 /* Same value as HASTCTL_SET_ROLE in control.h. */
+#define HASTCTL_CMD_STATUS 2 /* Same value as HASTCTL_STATUS in control.h. */
+
+#define HAST_ROLE_UNDEF 0 /* HAST_ROLE_*: values of hr_role and hr_previous_role. */
+#define HAST_ROLE_INIT 1
+#define HAST_ROLE_PRIMARY 2
+#define HAST_ROLE_SECONDARY 3
+
+#define HAST_SYNCSRC_UNDEF 0 /* HAST_SYNCSRC_*: presumably the values of hr_syncsrc; verify against users. */
+#define HAST_SYNCSRC_PRIMARY 1
+#define HAST_SYNCSRC_SECONDARY 2
+
+#define HIO_UNDEF 0
+#define HIO_READ 1
+#define HIO_WRITE 2
+#define HIO_DELETE 3
+#define HIO_FLUSH 4
+
+#define HAST_CONFIG "/etc/hast.conf" /* Default configuration file, see hast.conf(5). */
+#define HAST_CONTROL "/var/run/hastctl" /* Control socket path documented in hast.conf(5). */
+#define HASTD_PORT 8457
+#define HASTD_LISTEN "tcp4://0.0.0.0:8457" /* Default 'listen' value documented in hast.conf(5). */
+#define HASTD_PIDFILE "/var/run/hastd.pid"
+
+/* Default extent size (2097152 = 2 * 1024 * 1024). */
+#define HAST_EXTENTSIZE 2097152
+/* Default maximum number of extents that are kept dirty. */
+#define HAST_KEEPDIRTY 64
+
+#define HAST_ADDRSIZE 1024 /* Size of the address buffers (hc_controladdr, hc_listenaddr, hr_remoteaddr). */
+#define HAST_TOKEN_SIZE 16 /* Size of hr_token in bytes. */
+
+struct hastd_config {
+ /* Address to communicate with hastctl(8) ('control' in hast.conf(5)). */
+ char hc_controladdr[HAST_ADDRSIZE];
+ /* Protocol-specific data. */
+ struct proto_conn *hc_controlconn;
+ /* Address to listen on ('listen' in hast.conf(5)). */
+ char hc_listenaddr[HAST_ADDRSIZE];
+ /* Protocol-specific data. */
+ struct proto_conn *hc_listenconn;
+ /* List of resources, linked through hast_resource.hr_next. */
+ TAILQ_HEAD(, hast_resource) hc_resources;
+};
+
+#define HAST_REPLICATION_FULLSYNC 0 /* hast.conf(5): fullsync is not implemented yet. */
+#define HAST_REPLICATION_MEMSYNC 1 /* hast.conf(5): memsync is the default mode. */
+#define HAST_REPLICATION_ASYNC 2 /* hast.conf(5): async is not implemented yet. */
+
+/*
+ * Structure that describes single resource.
+ */
+struct hast_resource {
+ /* Resource name. */
+ char hr_name[NAME_MAX];
+ /* Replication mode (HAST_REPLICATION_*). */
+ int hr_replication;
+ /* Provider name that will appear in /dev/hast/. */
+ char hr_provname[NAME_MAX];
+ /* Synchronization extent size. */
+ int hr_extentsize;
+ /* Maximum number of extents that are kept dirty. */
+ int hr_keepdirty;
+
+ /* Path to local component. */
+ char hr_localpath[PATH_MAX];
+ /* Descriptor to access local component. */
+ int hr_localfd;
+ /* Offset into local component. */
+ off_t hr_localoff;
+ /* Size of usable space. */
+ off_t hr_datasize;
+ /* Size of entire local provider. */
+ off_t hr_local_mediasize;
+ /* Sector size of local provider. */
+ unsigned int hr_local_sectorsize;
+
+ /* Descriptor for /dev/ggctl communication. */
+ int hr_ggatefd;
+ /* Unit number for ggate communication. */
+ int hr_ggateunit;
+
+ /* Address of the remote component. */
+ char hr_remoteaddr[HAST_ADDRSIZE];
+ /* Connection for incoming data. */
+ struct proto_conn *hr_remotein;
+ /* Connection for outgoing data. */
+ struct proto_conn *hr_remoteout;
+ /*
+  * Token to verify that both the in and the out connection are coming
+  * from the same node (not necessarily from the same address).
+  */
+ unsigned char hr_token[HAST_TOKEN_SIZE];
+
+ /* Resource unique identifier. */
+ uint64_t hr_resuid;
+ /* Primary's local modification count. */
+ uint64_t hr_primary_localcnt;
+ /* Primary's remote modification count. */
+ uint64_t hr_primary_remotecnt;
+ /* Secondary's local modification count. */
+ uint64_t hr_secondary_localcnt;
+ /* Secondary's remote modification count. */
+ uint64_t hr_secondary_remotecnt;
+ /* Synchronization source. */
+ uint8_t hr_syncsrc;
+
+ /* Resource role: HAST_ROLE_{INIT,PRIMARY,SECONDARY}. */
+ int hr_role;
+ /* Previous resource role: HAST_ROLE_{INIT,PRIMARY,SECONDARY}. */
+ int hr_previous_role;
+ /* PID of child worker process. 0 - no child. */
+ pid_t hr_workerpid;
+ /* Control connection between parent and child. */
+ struct proto_conn *hr_ctrl;
+
+ /* Activemap structure. */
+ struct activemap *hr_amp;
+ /* Lock used to synchronize access to hr_amp. */
+ pthread_mutex_t hr_amp_lock;
+
+ /* Next resource. */
+ TAILQ_ENTRY(hast_resource) hr_next;
+};
+
+struct hastd_config *yy_config_parse(const char *config);
+void yy_config_free(struct hastd_config *config);
+
+void yyerror(const char *);
+int yylex(void);
+int yyparse(void);
+
+#endif /* !_HAST_H_ */
diff --git a/sbin/hastd/hast_proto.c b/sbin/hastd/hast_proto.c
new file mode 100644
index 0000000..6e66006
--- /dev/null
+++ b/sbin/hastd/hast_proto.c
@@ -0,0 +1,401 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/endian.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <string.h>
+#include <strings.h>
+
+#include <openssl/sha.h>
+
+#include <hast.h>
+#include <ebuf.h>
+#include <nv.h>
+#include <pjdlog.h>
+#include <proto.h>
+
+#include "hast_proto.h"
+
+struct hast_main_header {
+ /* Protocol version; receivers require HAST_PROTO_VERSION. */
+ uint8_t version;
+ /* Size of nv headers that follow, in bytes, little-endian on the wire. */
+ uint32_t size;
+} __packed;
+
+typedef int hps_send_t(struct hast_resource *, struct nv *nv, void **, size_t *, bool *);
+typedef int hps_recv_t(struct hast_resource *, struct nv *nv, void **, size_t *, bool *);
+
+struct hast_pipe_stage { /* One data transformation stage. */
+ const char *hps_name;
+ hps_send_t *hps_send; /* Applied to outgoing data. */
+ hps_recv_t *hps_recv; /* Applied to incoming data. */
+};
+
+static int compression_send(struct hast_resource *res, struct nv *nv,
+ void **datap, size_t *sizep, bool *freedatap);
+static int compression_recv(struct hast_resource *res, struct nv *nv,
+ void **datap, size_t *sizep, bool *freedatap);
+static int checksum_send(struct hast_resource *res, struct nv *nv,
+ void **datap, size_t *sizep, bool *freedatap);
+static int checksum_recv(struct hast_resource *res, struct nv *nv,
+ void **datap, size_t *sizep, bool *freedatap);
+
+static struct hast_pipe_stage pipeline[] = { /* Walked first-to-last on send, last-to-first on receive. */
+ { "compression", compression_send, compression_recv },
+ { "checksum", checksum_send, checksum_recv }
+};
+
+/*
+ * Send-side compression stage.
+ *
+ * TODO: For now we emulate compression.  With 80% probability we "succeed":
+ * a new buffer is allocated that holds the original size as a little-endian
+ * 32-bit value followed by an unmodified copy of the data, "compression" is
+ * set to "null" in nv, and *datap, *sizep and *freedatap are updated to
+ * describe the new buffer (the old one is freed if *freedatap was set).
+ * Otherwise everything is left as it was.
+ *
+ * Returns 0 on success (including the "not compressed" case), or -1 on
+ * allocation or nv failure.
+ */
+static int
+compression_send(struct hast_resource *res, struct nv *nv, void **datap,
+    size_t *sizep, bool *freedatap)
+{
+	unsigned char *packed;
+	uint32_t *lenp;
+	size_t plain;
+
+	res = res; /* TODO */
+
+	if (arc4random_uniform(100) >= 80) {
+		/*
+		 * Compression failed, so we leave everything as it was.
+		 * It is not critical for compression to succeed.
+		 */
+		return (0);
+	}
+
+	/*
+	 * Compression succeeded (but we will grow by 4 bytes, not
+	 * shrink for now).
+	 */
+	plain = *sizep;
+	packed = malloc(sizeof(uint32_t) + plain);
+	if (packed == NULL)
+		return (-1);
+	lenp = (void *)packed;
+	*lenp = htole32((uint32_t)plain);
+	nv_add_string(nv, "null", "compression");
+	if (nv_error(nv) != 0) {
+		free(packed);
+		errno = nv_error(nv);
+		return (-1);
+	}
+	bcopy(*datap, packed + sizeof(uint32_t), plain);
+	/* Release the buffer we replace if an earlier stage allocated it. */
+	if (*freedatap)
+		free(*datap);
+	*freedatap = true;
+	*datap = packed;
+	*sizep = sizeof(uint32_t) + plain;
+
+	return (0);
+}
+
+/*
+ * Receive-side compression stage.
+ *
+ * TODO: For now we emulate compression.  If nv carries no "compression"
+ * entry the data is left alone.  For the "null" algorithm the data consists
+ * of a little-endian 32-bit original size followed by the original bytes;
+ * those bytes are copied into a newly allocated buffer and *datap, *sizep
+ * and *freedatap are updated to describe it (the old buffer is freed if
+ * *freedatap was set).
+ *
+ * The size prefix comes from the peer, so it is validated against the
+ * number of bytes actually received before anything is copied.
+ *
+ * Returns 0 on success, or -1 on unknown algorithm, malformed data (errno
+ * set to EINVAL) or allocation failure.
+ */
+static int
+compression_recv(struct hast_resource *res, struct nv *nv, void **datap,
+    size_t *sizep, bool *freedatap)
+{
+	unsigned char *newbuf;
+	const char *algo;
+	uint32_t lesize;
+	size_t origsize;
+
+	res = res; /* TODO */
+
+	algo = nv_get_string(nv, "compression");
+	if (algo == NULL)
+		return (0);	/* No compression. */
+	if (strcmp(algo, "null") != 0) {
+		pjdlog_error("Unknown compression algorithm '%s'.", algo);
+		return (-1);	/* Unknown compression algorithm. */
+	}
+
+	if (*sizep < sizeof(lesize)) {
+		pjdlog_error("Compressed data is too short (%zu bytes).",
+		    *sizep);
+		errno = EINVAL;
+		return (-1);	/* No room for the size prefix. */
+	}
+	/* Copy the prefix out; *datap is not known to be aligned. */
+	bcopy(*datap, &lesize, sizeof(lesize));
+	origsize = le32toh(lesize);
+	if (origsize > *sizep - sizeof(lesize)) {
+		pjdlog_error("Invalid original size (%zu) for %zu bytes of data.",
+		    origsize, *sizep);
+		errno = EINVAL;
+		return (-1);	/* Prefix claims more than we received. */
+	}
+
+	newbuf = malloc(origsize);
+	if (newbuf == NULL)
+		return (-1);
+	bcopy((unsigned char *)*datap + sizeof(uint32_t), newbuf, origsize);
+	if (*freedatap)
+		free(*datap);
+	*freedatap = true;
+	*datap = newbuf;
+	*sizep = origsize;
+
+	return (0);
+}
+
+/*
+ * Send-side checksum stage: computes the SHA256 of the data and records it
+ * in nv as "checksum" = "sha256" plus the digest under "hash".  The data
+ * itself is not modified.  Always returns 0.
+ */
+static int
+checksum_send(struct hast_resource *res, struct nv *nv, void **datap,
+    size_t *sizep, bool *freedatap __unused)
+{
+	unsigned char digest[SHA256_DIGEST_LENGTH];
+	SHA256_CTX sha;
+
+	res = res; /* TODO */
+
+	SHA256_Init(&sha);
+	SHA256_Update(&sha, *datap, *sizep);
+	SHA256_Final(digest, &sha);
+
+	nv_add_string(nv, "sha256", "checksum");
+	nv_add_uint8_array(nv, digest, sizeof(digest), "hash");
+
+	return (0);
+}
+
+/*
+ * Receive-side checksum stage.
+ *
+ * If nv carries no "checksum" entry there is nothing to verify.  Otherwise
+ * the algorithm has to be "sha256", a "hash" entry of the right size has to
+ * be present, and it has to match the SHA256 of the received data.
+ *
+ * Returns 0 if there is no checksum or the checksum matches, -1 otherwise.
+ */
+static int
+checksum_recv(struct hast_resource *res, struct nv *nv, void **datap,
+    size_t *sizep, bool *freedatap __unused)
+{
+	unsigned char computed[SHA256_DIGEST_LENGTH];
+	const unsigned char *received;
+	const char *algo;
+	size_t rsize;
+	SHA256_CTX sha;
+
+	res = res; /* TODO */
+
+	algo = nv_get_string(nv, "checksum");
+	if (algo == NULL) {
+		/* No checksum. */
+		return (0);
+	}
+	if (strcmp(algo, "sha256") != 0) {
+		/* Unknown checksum algorithm. */
+		pjdlog_error("Unknown checksum algorithm '%s'.", algo);
+		return (-1);
+	}
+	received = nv_get_uint8_array(nv, &rsize, "hash");
+	if (received == NULL) {
+		/* Hash not found. */
+		pjdlog_error("Checksum algorithm is present, but hash is missing.");
+		return (-1);
+	}
+	if (rsize != sizeof(computed)) {
+		/* Different hash size. */
+		pjdlog_error("Invalid hash size (%zu) for %s, should be %zu.",
+		    rsize, algo, sizeof(computed));
+		return (-1);
+	}
+
+	SHA256_Init(&sha);
+	SHA256_Update(&sha, *datap, *sizep);
+	SHA256_Final(computed, &sha);
+
+	if (memcmp(received, computed, sizeof(computed)) != 0) {
+		/* Hash mismatch. */
+		pjdlog_error("Hash mismatch.");
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Send the given nv structure via conn.
+ * We keep headers in nv structure and pass data in separate argument.
+ * There can be no data at all (data is NULL then).
+ *
+ * When data is given its size is recorded in nv as "size".  What goes on the
+ * wire is a struct hast_main_header, the serialized nv and then the data.
+ * Returns 0 on success and -1 on failure.
+ */
+int
+hast_proto_send(struct hast_resource *res, struct proto_conn *conn,
+    struct nv *nv, const void *data, size_t size)
+{
+	struct hast_main_header mainhdr;
+	struct ebuf *hdrbuf;
+	void *payload, *wire;
+	size_t wiresize;
+	bool ownpayload;
+	int error;
+
+	/* Stages may replace the payload, hence the non-const alias. */
+	payload = (void *)(uintptr_t)data;
+	ownpayload = false;
+	error = -1;
+
+	if (data != NULL) {
+		/* The stage pipeline is switched off for the time being. */
+		if (false) {
+			unsigned int stage;
+
+			for (stage = 0;
+			    stage < sizeof(pipeline) / sizeof(pipeline[0]);
+			    stage++) {
+				error = pipeline[stage].hps_send(res, nv,
+				    &payload, &size, &ownpayload);
+				if (error == -1)
+					goto out;
+			}
+			error = -1;
+		}
+		nv_add_uint32(nv, size, "size");
+		if (nv_error(nv) != 0) {
+			errno = nv_error(nv);
+			goto out;
+		}
+	}
+
+	hdrbuf = nv_hton(nv);
+	if (hdrbuf == NULL)
+		goto out;
+
+	/* Put the main header in front of the serialized nv. */
+	mainhdr.version = HAST_PROTO_VERSION;
+	mainhdr.size = htole32((uint32_t)ebuf_size(hdrbuf));
+	if (ebuf_add_head(hdrbuf, &mainhdr, sizeof(mainhdr)) < 0)
+		goto out;
+
+	wire = ebuf_data(hdrbuf, &wiresize);
+	if (proto_send(conn, wire, wiresize) < 0)
+		goto out;
+	if (data != NULL && proto_send(conn, payload, size) < 0)
+		goto out;
+
+	error = 0;
+out:
+	if (ownpayload)
+		free(payload);
+	return (error);
+}
+
+/*
+ * Receive the main header and the nv headers from conn.
+ *
+ * On success 0 is returned and the received nv is stored in *nvp; the
+ * caller is expected to release it with nv_free().  On failure -1 is
+ * returned and *nvp is not touched.  errno is set to ERPCMISMATCH when the
+ * protocol version does not match and to EINVAL when the peer announces
+ * empty headers.
+ */
+int
+hast_proto_recv_hdr(struct proto_conn *conn, struct nv **nvp)
+{
+	struct hast_main_header hdr;
+	struct nv *nv;
+	struct ebuf *eb;
+	void *hptr;
+
+	eb = NULL;
+	nv = NULL;
+
+	if (proto_recv(conn, &hdr, sizeof(hdr)) < 0)
+		goto fail;
+
+	if (hdr.version != HAST_PROTO_VERSION) {
+		errno = ERPCMISMATCH;
+		goto fail;
+	}
+
+	hdr.size = le32toh(hdr.size);
+	if (hdr.size == 0) {
+		/*
+		 * ebuf_data() returns NULL for an empty buffer, so empty
+		 * headers would trip the assertion below.  The size comes
+		 * from the peer; fail the receive instead of aborting.
+		 */
+		errno = EINVAL;
+		goto fail;
+	}
+
+	eb = ebuf_alloc(hdr.size);
+	if (eb == NULL)
+		goto fail;
+	/* Reserve the space; proto_recv() below fills it in. */
+	if (ebuf_add_tail(eb, NULL, hdr.size) < 0)
+		goto fail;
+	hptr = ebuf_data(eb, NULL);
+	assert(hptr != NULL);
+	if (proto_recv(conn, hptr, hdr.size) < 0)
+		goto fail;
+	nv = nv_ntoh(eb);
+	if (nv == NULL)
+		goto fail;
+
+	*nvp = nv;
+	return (0);
+fail:
+	if (nv != NULL)
+		nv_free(nv);
+	else if (eb != NULL)
+		ebuf_free(eb);
+	return (-1);
+}
+
+/*
+ * Receive the data that follows already received headers.
+ *
+ * nv holds the headers; its "size" entry tells how many bytes of data the
+ * peer is sending.  data is the caller's buffer and size is its capacity.
+ * A "size" of 0 means there is no data and only clears the nv error.
+ *
+ * Returns 0 on success and -1 on failure.  A peer announcing more data than
+ * fits into the buffer is refused with errno set to EINVAL.
+ */
+int
+hast_proto_recv_data(struct hast_resource *res, struct proto_conn *conn,
+    struct nv *nv, void *data, size_t size)
+{
+	unsigned int ii;
+	bool freedata;
+	size_t dsize;
+	void *dptr;
+	int ret;
+
+	assert(data != NULL);
+	assert(size > 0);
+
+	ret = -1;
+	freedata = false;
+	dptr = data;
+
+	dsize = nv_get_uint32(nv, "size");
+	if (dsize > size) {
+		/*
+		 * The size comes from the peer; never receive more than the
+		 * caller's buffer can hold.
+		 */
+		errno = EINVAL;
+		goto end;
+	}
+	if (dsize == 0)
+		(void)nv_set_error(nv, 0);
+	else {
+		if (proto_recv(conn, data, dsize) < 0)
+			goto end;
+		/* The stage pipeline is switched off for the time being. */
+		if (false) {
+			for (ii = sizeof(pipeline) / sizeof(pipeline[0]);
+			    ii > 0; ii--) {
+				assert(!"to be verified");
+				ret = pipeline[ii - 1].hps_recv(res, nv, &dptr,
+				    &dsize, &freedata);
+				if (ret == -1)
+					goto end;
+			}
+			ret = -1;
+			if (dsize < size)
+				goto end;
+			/* TODO: 'size' doesn't seem right here. It is maximum data size. */
+			if (dptr != data)
+				bcopy(dptr, data, dsize);
+		}
+	}
+
+	ret = 0;
+end:
+	if (ret < 0)
+		printf("%s:%u %s\n", __func__, __LINE__, strerror(errno));
+	if (freedata)
+		free(dptr);
+	return (ret);
+}
+
+/*
+ * Receive headers and, if the headers announce any, the data as well.
+ *
+ * data and size describe the caller's data buffer.  On success the received
+ * nv is stored in *nvp.  On failure a negative value is returned, the nv (if
+ * one was received) is freed and *nvp is not touched.
+ */
+int
+hast_proto_recv(struct hast_resource *res, struct proto_conn *conn,
+    struct nv **nvp, void *data, size_t size)
+{
+	struct nv *hdrs;
+	int error;
+
+	error = hast_proto_recv_hdr(conn, &hdrs);
+	if (error < 0)
+		return (error);
+
+	if (nv_get_uint32(hdrs, "size") != 0) {
+		error = hast_proto_recv_data(res, conn, hdrs, data, size);
+	} else {
+		/* No data follows; clear the error left by the lookup. */
+		(void)nv_set_error(hdrs, 0);
+	}
+
+	if (error < 0) {
+		nv_free(hdrs);
+		return (error);
+	}
+	*nvp = hdrs;
+	return (error);
+}
diff --git a/sbin/hastd/hast_proto.h b/sbin/hastd/hast_proto.h
new file mode 100644
index 0000000..3894e38
--- /dev/null
+++ b/sbin/hastd/hast_proto.h
@@ -0,0 +1,48 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HAST_PROTO_H_
+#define _HAST_PROTO_H_
+
+#include <stdlib.h> /* size_t */
+
+#include <nv.h>
+#include <proto.h>
+
+/*
+ * Send header 'nv' over 'conn', optionally followed by 'size' bytes of
+ * 'data'.
+ * NOTE(review): the implementation is not visible in this part of the diff;
+ * callers in hastd.c pass res == NULL, data == NULL and size == 0 to send a
+ * header only - confirm the exact contract in hast_proto.c.
+ */
+int hast_proto_send(struct hast_resource *res, struct proto_conn *conn,
+ struct nv *nv, const void *data, size_t size);
+/*
+ * Receive a header and, if its "size" field is non-zero, the data that
+ * follows into 'data'.  On success the header is stored in *nvp; on failure
+ * a negative value is returned and the header is freed.
+ */
+int hast_proto_recv(struct hast_resource *res, struct proto_conn *conn,
+ struct nv **nvp, void *data, size_t size);
+/*
+ * Receive only the header.  Returns 0 and stores the header in *nvp on
+ * success, -1 on failure.
+ */
+int hast_proto_recv_hdr(struct proto_conn *conn, struct nv **nvp);
+/*
+ * Receive the data announced by the "size" field of the previously received
+ * header 'nv' into 'data'.  Returns 0 on success, -1 on failure.
+ */
+int hast_proto_recv_data(struct hast_resource *res, struct proto_conn *conn,
+ struct nv *nv, void *data, size_t size);
+
+#endif /* !_HAST_PROTO_H_ */
diff --git a/sbin/hastd/hastd.8 b/sbin/hastd/hastd.8
new file mode 100644
index 0000000..276b3d3
--- /dev/null
+++ b/sbin/hastd/hastd.8
@@ -0,0 +1,232 @@
+.\" Copyright (c) 2010 The FreeBSD Foundation
+.\" All rights reserved.
+.\"
+.\" This software was developed by Pawel Jakub Dawidek under sponsorship from
+.\" the FreeBSD Foundation.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd February 1, 2010
+.Dt HASTD 8
+.Os
+.Sh NAME
+.Nm hastd
+.Nd "Highly Available Storage daemon"
+.Sh SYNOPSIS
+.Nm
+.Op Fl dFh
+.Op Fl c Ar config
+.Op Fl P Ar pidfile
+.Sh DESCRIPTION
+The
+.Nm
+daemon is responsible for managing highly available GEOM providers.
+.Pp
+.Nm
+allows data to be transparently stored on two physically separated machines
+connected over the TCP/IP network.
+Only one machine (cluster node) can actively use storage provided by
+.Nm .
+This machine is called primary.
+The
+.Nm
+daemon operates on block level, which makes it transparent for file
+systems and applications.
+.Pp
+There is one main
+.Nm
+daemon which starts a new worker process as soon as a role for the given
+resource is changed to primary or as soon as a role for the given
+resource is changed to secondary and the remote (primary) node
+successfully connects to it.
+Every worker process gets a new process title (see
+.Xr setproctitle 3 ) ,
+which describes its role and resource it controls.
+The exact format is:
+.Bd -literal -offset indent
+hastd: <resource name> (<role>)
+.Ed
+.Pp
+When (and only when)
+.Nm
+operates in primary role for the given resource, corresponding
+.Pa /dev/hast/<name>
+disk-like device (GEOM provider) is created.
+File systems and applications can use this provider to send I/O
+requests to.
+Every write, delete and flush operation
+.Dv ( BIO_WRITE , BIO_DELETE , BIO_FLUSH )
+is sent to the local component and synchronously replicated
+to the remote (secondary) node if it is available.
+Read operations
+.Dv ( BIO_READ )
+are handled locally unless I/O error occurs or local version of the data
+is not up-to-date yet (synchronization is in progress).
+.Pp
+The
+.Nm
+daemon uses the GEOM Gate class to receive I/O requests from the
+in-kernel GEOM infrastructure.
+The
+.Nm geom_gate.ko
+module is loaded automatically if the kernel was not compiled with the
+following option:
+.Bd -ragged -offset indent
+.Cd "options GEOM_GATE"
+.Ed
+.Pp
+The connection between two
+.Nm
+daemons is always initiated from the one running as primary to the one
+running as secondary.
+When primary
+.Nm
+is unable to connect or connection fails, it will try to re-establish
+connection every few seconds.
+Once connection is established, primary
+.Nm
+will synchronize every extent that was modified during connection outage
+to the secondary
+.Nm .
+.Pp
+It is possible that in case of connection outage between the nodes
+.Nm
+primary role for the given resource will be configured on both nodes.
+This in turn leads to incompatible data modifications.
+Such condition is called split-brain and cannot be automatically
+resolved by the
+.Nm
+daemon as this would most likely lead to data corruption or loss of
+important changes.
+Even though it cannot be fixed by
+.Nm
+itself, it will be detected and further connection between independently
+modified nodes will not be possible.
+Once this situation is manually resolved by an administrator, resource
+on one of the nodes can be initialized (erasing local data), which makes
+connection to the remote node possible again.
+Connection of freshly initialized component will trigger full resource
+synchronization.
+.Pp
+The
+.Nm
+daemon itself never picks its role automatically.
+The role has to be configured with the
+.Xr hastctl 8
+control utility by additional software like
+.Nm ucarp
+or
+.Nm heartbeat
+that can reliably manage role separation and switch secondary node to
+primary role in case of original primary failure.
+.Pp
+The
+.Nm
+daemon can be started with the following command line arguments:
+.Bl -tag -width ".Fl P Ar pidfile"
+.It Fl c Ar config
+Specify alternative location of the configuration file.
+The default location is
+.Pa /etc/hast.conf .
+.It Fl d
+Print or log debugging information.
+This option can be specified multiple times to raise the verbosity
+level.
+.It Fl F
+Start the
+.Nm
+daemon in the foreground.
+By default
+.Nm
+starts in the background.
+.It Fl h
+Print the
+.Nm
+usage message.
+.It Fl P Ar pidfile
+Specify alternative location of a file where main process PID will be
+stored.
+The default location is
+.Pa /var/run/hastd.pid .
+.El
+.Sh EXIT STATUS
+Exit status is 0 on success, or one of the values described in
+.Xr sysexits 3
+on failure.
+.Sh EXAMPLES
+Launch
+.Nm
+on both nodes.
+Set role for resource
+.Nm shared
+to primary on
+.Nm nodeA
+and to secondary on
+.Nm nodeB .
+Create file system on
+.Pa /dev/hast/shared
+provider and mount it.
+.Bd -literal -offset indent
+nodeB# hastd
+nodeB# hastctl role secondary shared
+
+nodeA# hastd
+nodeA# hastctl role primary shared
+nodeA# newfs -U /dev/hast/shared
+nodeA# mount -o noatime /dev/hast/shared /shared
+.Ed
+.Sh FILES
+.Bl -tag -width ".Pa /var/run/hastctl" -compact
+.It Pa /etc/hast.conf
+The configuration file for
+.Nm
+and
+.Xr hastctl 8 .
+.It Pa /var/run/hastctl
+Control socket used by the
+.Xr hastctl 8
+control utility to communicate with
+.Nm .
+.It Pa /var/run/hastd.pid
+The default location of the
+.Nm
+PID file.
+.El
+.Sh SEE ALSO
+.Xr sysexits 3 ,
+.Xr geom 4 ,
+.Xr hast.conf 5 ,
+.Xr ggatec 8 ,
+.Xr ggated 8 ,
+.Xr ggatel 8 ,
+.Xr hastctl 8 ,
+.Xr mount 8 ,
+.Xr newfs 8 ,
+.Xr g_bio 9 .
+.Sh AUTHORS
+The
+.Nm
+was developed by
+.An Pawel Jakub Dawidek Aq pjd@FreeBSD.org
+under sponsorship of the FreeBSD Foundation.
diff --git a/sbin/hastd/hastd.c b/sbin/hastd/hastd.c
new file mode 100644
index 0000000..19f0893
--- /dev/null
+++ b/sbin/hastd/hastd.c
@@ -0,0 +1,522 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/linker.h>
+#include <sys/module.h>
+#include <sys/wait.h>
+
+#include <assert.h>
+#include <err.h>
+#include <errno.h>
+#include <libutil.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include <activemap.h>
+#include <pjdlog.h>
+
+#include "control.h"
+#include "hast.h"
+#include "hast_proto.h"
+#include "hastd.h"
+#include "subr.h"
+
+/* Path to configuration file. */
+static const char *cfgpath = HAST_CONFIG;
+/* Hastd configuration. */
+static struct hastd_config *cfg;
+/*
+ * The two flags below are written from sighandler() and polled by
+ * main_loop(), so they are volatile sig_atomic_t: that is the only object
+ * type a signal handler may portably store to.
+ */
+/* Was SIGCHLD signal received? */
+static volatile sig_atomic_t sigchld_received = false;
+/* Was SIGHUP signal received? */
+static volatile sig_atomic_t sighup_received = false;
+/*
+ * Was SIGINT or SIGTERM signal received?
+ * Kept as plain bool because hastd.h declares it as 'extern bool'.
+ */
+bool sigexit_received = false;
+/* PID file handle. */
+struct pidfh *pfh;
+
+/*
+ * Print the accepted command line options with errx(3) and terminate with
+ * the EX_USAGE exit code.
+ */
+static void
+usage(void)
+{
+
+ errx(EX_USAGE, "[-dFh] [-c config] [-P pidfile]");
+}
+
+/*
+ * Signal handler for SIGCHLD and SIGHUP.  It only records which signal
+ * arrived; the actual work is done later from main_loop().
+ */
+static void
+sighandler(int sig)
+{
+
+	if (sig == SIGCHLD) {
+		sigchld_received = true;
+	} else if (sig == SIGHUP) {
+		sighup_received = true;
+	} else {
+		/* No other signal is expected here. */
+		assert(!"invalid condition");
+	}
+}
+
+/*
+ * Make sure the g_gate kernel module is available, loading geom_gate if it
+ * is not already present.  Exits with EX_OSERR when loading fails for a
+ * reason other than EEXIST.
+ */
+static void
+g_gate_load(void)
+{
+
+	/* Already present in the kernel - nothing to do. */
+	if (modfind("g_gate") != -1)
+		return;
+
+	/* Not present in kernel, try loading it. */
+	if (kldload("geom_gate") != -1 && modfind("g_gate") != -1)
+		return;
+
+	if (errno == EEXIST)
+		return;
+
+	pjdlog_exit(EX_OSERR, "Unable to load geom_gate module");
+}
+
+/*
+ * Reap all terminated worker processes and log how each of them ended.
+ *
+ * A worker counts as having exited gracefully only if it really exited
+ * (WIFEXITED) with status 0.  A worker killed by a signal is reported as
+ * such; WEXITSTATUS() is not meaningful in that case and would typically
+ * read as 0.
+ *
+ * When the resource of the reaped worker is in primary role, the worker is
+ * restarted after a one second delay.
+ */
+static void
+child_exit(void)
+{
+	struct hast_resource *res;
+	int status;
+	pid_t pid;
+
+	while ((pid = wait3(&status, WNOHANG, NULL)) > 0) {
+		/* Find resource related to the process that just exited. */
+		TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
+			if (pid == res->hr_workerpid)
+				break;
+		}
+		if (res == NULL) {
+			/*
+			 * This can happen when new connection arrives and we
+			 * cancel child responsible for the old one.
+			 */
+			continue;
+		}
+		pjdlog_prefix_set("[%s] (%s) ", res->hr_name,
+		    role2str(res->hr_role));
+		if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
+			pjdlog_debug(1,
+			    "Worker process exited gracefully (pid=%u).",
+			    (unsigned int)pid);
+		} else if (WIFSIGNALED(status)) {
+			pjdlog_error("Worker process killed (pid=%u, signal=%d).",
+			    (unsigned int)pid, WTERMSIG(status));
+		} else {
+			pjdlog_error("Worker process failed (pid=%u, status=%d).",
+			    (unsigned int)pid, WEXITSTATUS(status));
+		}
+		res->hr_workerpid = 0;
+		if (res->hr_role == HAST_ROLE_PRIMARY) {
+			sleep(1);
+			pjdlog_info("Restarting worker process.");
+			hastd_primary(res);
+		}
+		pjdlog_prefix_set("%s", "");
+	}
+}
+
+/*
+ * Placeholder for configuration reload (invoked from main_loop() after
+ * SIGHUP).  For now it only logs that reloading is not implemented.
+ */
+static void
+hastd_reload(void)
+{
+
+ /* TODO */
+ pjdlog_warning("Configuration reload is not implemented.");
+}
+
+/*
+ * Accept a connection on the listen socket and validate it.
+ *
+ * The remote address must match at least one configured resource before any
+ * data is read.  The received header must then name a configured resource
+ * which the remote address may access, which is in secondary role and, if a
+ * token is supplied, whose stored token matches.
+ *
+ * A connection without a token is the initial one: any running worker or
+ * half-open connection for the resource is cancelled, a fresh token is
+ * generated and sent back, and the connection is remembered as
+ * hr_remotein.  A connection with a valid token becomes hr_remoteout and
+ * hastd_secondary() is called.
+ *
+ * Once the header was parsed, failures are reported to the remote node in
+ * an "errmsg" field before the connection is closed.
+ */
+static void
+listen_accept(void)
+{
+	struct hast_resource *res;
+	struct proto_conn *conn;
+	struct nv *nvin, *nvout, *nverr;
+	const char *resname;
+	const unsigned char *token;
+	char laddr[256], raddr[256];
+	size_t size;
+	pid_t pid;
+	int status;
+
+	proto_local_address(cfg->hc_listenconn, laddr, sizeof(laddr));
+	pjdlog_debug(1, "Accepting connection to %s.", laddr);
+
+	if (proto_accept(cfg->hc_listenconn, &conn) < 0) {
+		pjdlog_errno(LOG_ERR, "Unable to accept connection %s", laddr);
+		return;
+	}
+
+	proto_local_address(conn, laddr, sizeof(laddr));
+	proto_remote_address(conn, raddr, sizeof(raddr));
+	/* The peer (raddr) connected to our local address (laddr). */
+	pjdlog_info("Connection from %s to %s.", raddr, laddr);
+
+	nvin = nvout = nverr = NULL;
+
+	/*
+	 * Before receiving any data see if remote host has access to any
+	 * resource.
+	 */
+	TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
+		if (proto_address_match(conn, res->hr_remoteaddr))
+			break;
+	}
+	if (res == NULL) {
+		pjdlog_error("Client %s isn't known.", raddr);
+		goto close;
+	}
+	/* Ok, remote host can access at least one resource. */
+
+	if (hast_proto_recv_hdr(conn, &nvin) < 0) {
+		pjdlog_errno(LOG_ERR, "Unable to receive header from %s",
+		    raddr);
+		goto close;
+	}
+
+	resname = nv_get_string(nvin, "resource");
+	if (resname == NULL) {
+		pjdlog_error("No 'resource' field in the header received from %s.",
+		    raddr);
+		goto close;
+	}
+	pjdlog_debug(2, "%s: resource=%s", raddr, resname);
+	token = nv_get_uint8_array(nvin, &size, "token");
+	/*
+	 * NULL token means that this is first connection.
+	 * 'res' still points at the resource matched by address above; it is
+	 * used here only for the size of its token field.
+	 */
+	if (token != NULL && size != sizeof(res->hr_token)) {
+		pjdlog_error("Received token of invalid size from %s (expected %zu, got %zu).",
+		    raddr, sizeof(res->hr_token), size);
+		goto close;
+	}
+
+	/*
+	 * From now on we want to send errors to the remote node.
+	 */
+	nverr = nv_alloc();
+
+	/* Find resource related to this connection. */
+	TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
+		if (strcmp(resname, res->hr_name) == 0)
+			break;
+	}
+	/* Have we found the resource? */
+	if (res == NULL) {
+		pjdlog_error("No resource '%s' as requested by %s.",
+		    resname, raddr);
+		nv_add_stringf(nverr, "errmsg", "Resource not configured.");
+		goto fail;
+	}
+
+	/* Now that we know resource name setup log prefix. */
+	pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role));
+
+	/* Does the remote host have access to this resource? */
+	if (!proto_address_match(conn, res->hr_remoteaddr)) {
+		pjdlog_error("Client %s has no access to the resource.", raddr);
+		nv_add_stringf(nverr, "errmsg", "No access to the resource.");
+		goto fail;
+	}
+	/* Is the resource marked as secondary? */
+	if (res->hr_role != HAST_ROLE_SECONDARY) {
+		pjdlog_error("We act as %s for the resource and not as %s as requested by %s.",
+		    role2str(res->hr_role), role2str(HAST_ROLE_SECONDARY),
+		    raddr);
+		nv_add_stringf(nverr, "errmsg",
+		    "Remote node acts as %s for the resource and not as %s.",
+		    role2str(res->hr_role), role2str(HAST_ROLE_SECONDARY));
+		goto fail;
+	}
+	/* Does token (if exists) match? */
+	if (token != NULL && memcmp(token, res->hr_token,
+	    sizeof(res->hr_token)) != 0) {
+		pjdlog_error("Token received from %s doesn't match.", raddr);
+		nv_add_stringf(nverr, "errmsg", "Token doesn't match.");
+		goto fail;
+	}
+	/*
+	 * If there is no token, but we have half-open connection
+	 * (only remotein) or full connection (worker process is running)
+	 * we have to cancel those and accept the new connection.
+	 */
+	if (token == NULL) {
+		assert(res->hr_remoteout == NULL);
+		pjdlog_debug(1, "Initial connection from %s.", raddr);
+		if (res->hr_workerpid != 0) {
+			assert(res->hr_remotein == NULL);
+			pjdlog_debug(1,
+			    "Worker process exists (pid=%u), stopping it.",
+			    (unsigned int)res->hr_workerpid);
+			/* Stop child process. */
+			if (kill(res->hr_workerpid, SIGINT) < 0) {
+				pjdlog_errno(LOG_ERR,
+				    "Unable to stop worker process (pid=%u)",
+				    (unsigned int)res->hr_workerpid);
+				/*
+				 * Other than logging the problem we
+				 * ignore it - nothing smart to do.
+				 */
+			}
+			/* Wait for it to exit. */
+			else if ((pid = waitpid(res->hr_workerpid,
+			    &status, 0)) != res->hr_workerpid) {
+				pjdlog_errno(LOG_ERR,
+				    "Waiting for worker process (pid=%u) failed",
+				    (unsigned int)res->hr_workerpid);
+				/* See above. */
+			} else if (status != 0) {
+				pjdlog_error("Worker process (pid=%u) exited ungracefully: status=%d.",
+				    (unsigned int)res->hr_workerpid, status);
+				/* See above. */
+			} else {
+				pjdlog_debug(1,
+				    "Worker process (pid=%u) exited gracefully.",
+				    (unsigned int)res->hr_workerpid);
+			}
+			res->hr_workerpid = 0;
+		} else if (res->hr_remotein != NULL) {
+			char oaddr[256];
+
+			/*
+			 * Report the address of the old half-open connection
+			 * that is being cancelled, not of the new one.
+			 */
+			proto_remote_address(res->hr_remotein, oaddr,
+			    sizeof(oaddr));
+			pjdlog_debug(1,
+			    "Canceling half-open connection from %s on connection from %s.",
+			    oaddr, raddr);
+			proto_close(res->hr_remotein);
+			res->hr_remotein = NULL;
+		}
+	}
+
+	/*
+	 * Checks and cleanups are done.
+	 */
+
+	if (token == NULL) {
+		arc4random_buf(res->hr_token, sizeof(res->hr_token));
+		nvout = nv_alloc();
+		nv_add_uint8_array(nvout, res->hr_token,
+		    sizeof(res->hr_token), "token");
+		if (nv_error(nvout) != 0) {
+			pjdlog_common(LOG_ERR, 0, nv_error(nvout),
+			    "Unable to prepare return header for %s", raddr);
+			nv_add_stringf(nverr, "errmsg",
+			    "Remote node was unable to prepare return header: %s.",
+			    strerror(nv_error(nvout)));
+			goto fail;
+		}
+		if (hast_proto_send(NULL, conn, nvout, NULL, 0) < 0) {
+			int error = errno;
+
+			pjdlog_errno(LOG_ERR, "Unable to send response to %s",
+			    raddr);
+			nv_add_stringf(nverr, "errmsg",
+			    "Remote node was unable to send response: %s.",
+			    strerror(error));
+			goto fail;
+		}
+		res->hr_remotein = conn;
+		pjdlog_debug(1, "Incoming connection from %s configured.",
+		    raddr);
+	} else {
+		res->hr_remoteout = conn;
+		pjdlog_debug(1, "Outgoing connection to %s configured.", raddr);
+		hastd_secondary(res, nvin);
+	}
+	nv_free(nvin);
+	/* nvout is allocated only for the initial (token-less) connection. */
+	if (nvout != NULL)
+		nv_free(nvout);
+	nv_free(nverr);
+	pjdlog_prefix_set("%s", "");
+	return;
+fail:
+	if (nv_error(nverr) != 0) {
+		pjdlog_common(LOG_ERR, 0, nv_error(nverr),
+		    "Unable to prepare error header for %s", raddr);
+		goto close;
+	}
+	if (hast_proto_send(NULL, conn, nverr, NULL, 0) < 0) {
+		pjdlog_errno(LOG_ERR, "Unable to send error to %s", raddr);
+		goto close;
+	}
+close:
+	if (nvin != NULL)
+		nv_free(nvin);
+	if (nvout != NULL)
+		nv_free(nvout);
+	if (nverr != NULL)
+		nv_free(nverr);
+	proto_close(conn);
+	pjdlog_prefix_set("%s", "");
+}
+
+/*
+ * Main loop of the daemon.  Each iteration first handles signals recorded
+ * by sighandler(), then waits in select(2) on the control and listen
+ * connections and dispatches to control_handle() or listen_accept().
+ * The loop has no exit; the process terminates from here only through
+ * pjdlog_exit() when select(2) fails with an error other than EINTR.
+ */
+static void
+main_loop(void)
+{
+ fd_set rfds, wfds;
+ int fd, maxfd, ret;
+
+ for (;;) {
+ /* Act on signals noted by the handler since the last pass. */
+ if (sigchld_received) {
+ sigchld_received = false;
+ child_exit();
+ }
+ if (sighup_received) {
+ sighup_received = false;
+ hastd_reload();
+ }
+
+ maxfd = 0;
+ FD_ZERO(&rfds);
+ FD_ZERO(&wfds);
+
+ /* Setup descriptors for select(2). */
+ /*
+ * SETUP_FD adds the connection's descriptor (when non-negative) to
+ * both the read and write sets and keeps track of the highest one.
+ */
+#define SETUP_FD(conn) do { \
+ fd = proto_descriptor(conn); \
+ if (fd >= 0) { \
+ maxfd = fd > maxfd ? fd : maxfd; \
+ FD_SET(fd, &rfds); \
+ FD_SET(fd, &wfds); \
+ } \
+} while (0)
+ SETUP_FD(cfg->hc_controlconn);
+ SETUP_FD(cfg->hc_listenconn);
+#undef SETUP_FD
+
+ ret = select(maxfd + 1, &rfds, &wfds, NULL, NULL);
+ if (ret == -1) {
+ /*
+ * Interrupted by a signal: go back to the top of the loop so
+ * the flags set by the handler are processed.
+ */
+ if (errno == EINTR)
+ continue;
+ KEEP_ERRNO((void)pidfile_remove(pfh));
+ pjdlog_exit(EX_OSERR, "select() failed");
+ }
+
+ /*
+ * ISSET_FD is true when the connection's descriptor was reported by
+ * select(2) in either the read or the write set.
+ */
+#define ISSET_FD(conn) \
+ (FD_ISSET((fd = proto_descriptor(conn)), &rfds) || FD_ISSET(fd, &wfds))
+ if (ISSET_FD(cfg->hc_controlconn))
+ control_handle(cfg);
+ if (ISSET_FD(cfg->hc_listenconn))
+ listen_accept();
+#undef ISSET_FD
+ }
+}
+
+/*
+ * hastd entry point.
+ *
+ * Loads the g_gate module, parses the command line, opens the PID file,
+ * parses the configuration, installs signal handlers and starts listening
+ * on the control and remote addresses.  Unless -F was given it then
+ * daemonizes, switches logging to syslog and writes the PID file.  Finally
+ * it enters main_loop().
+ */
+int
+main(int argc, char *argv[])
+{
+	const char *pidpath = HASTD_PIDFILE;
+	pid_t runningpid;
+	bool daemonize = true;
+	int verbosity = 0;
+	int opt;
+
+	g_gate_load();
+
+	while ((opt = getopt(argc, argv, "c:dFhP:")) != -1) {
+		switch (opt) {
+		case 'c':
+			cfgpath = optarg;
+			break;
+		case 'd':
+			verbosity++;
+			break;
+		case 'F':
+			daemonize = false;
+			break;
+		case 'P':
+			pidpath = optarg;
+			break;
+		case 'h':
+		default:
+			usage();
+		}
+	}
+	argc -= optind;
+	argv += optind;
+
+	pjdlog_debug_set(verbosity);
+
+	pfh = pidfile_open(pidpath, 0600, &runningpid);
+	if (pfh == NULL) {
+		if (errno == EEXIST) {
+			pjdlog_exitx(EX_TEMPFAIL,
+			    "Another hastd is already running, pid: %jd.",
+			    (intmax_t)runningpid);
+		}
+		/* Any other pidfile problem is not fatal - only warn. */
+		pjdlog_errno(LOG_WARNING, "Cannot open or create pidfile");
+	}
+
+	cfg = yy_config_parse(cfgpath);
+	assert(cfg != NULL);
+
+	signal(SIGHUP, sighandler);
+	signal(SIGCHLD, sighandler);
+
+	/* Control socket first... */
+	if (proto_server(cfg->hc_controladdr, &cfg->hc_controlconn) < 0) {
+		KEEP_ERRNO((void)pidfile_remove(pfh));
+		pjdlog_exit(EX_OSERR, "Unable to listen on control address %s",
+		    cfg->hc_controladdr);
+	}
+	/* ...then the socket remote nodes connect to. */
+	if (proto_server(cfg->hc_listenaddr, &cfg->hc_listenconn) < 0) {
+		KEEP_ERRNO((void)pidfile_remove(pfh));
+		pjdlog_exit(EX_OSERR, "Unable to listen on address %s",
+		    cfg->hc_listenaddr);
+	}
+
+	if (daemonize) {
+		if (daemon(0, 0) < 0) {
+			KEEP_ERRNO((void)pidfile_remove(pfh));
+			pjdlog_exit(EX_OSERR, "Unable to daemonize");
+		}
+
+		/* From now on log through syslog. */
+		pjdlog_mode_set(PJDLOG_MODE_SYSLOG);
+
+		/* Record our PID in the PID file. */
+		if (pidfile_write(pfh) < 0) {
+			pjdlog_errno(LOG_WARNING,
+			    "Unable to write PID to a file");
+		}
+	}
+
+	main_loop();
+
+	exit(0);
+}
diff --git a/sbin/hastd/hastd.h b/sbin/hastd/hastd.h
new file mode 100644
index 0000000..199de8c
--- /dev/null
+++ b/sbin/hastd/hastd.h
@@ -0,0 +1,48 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HASTD_H_
+#define _HASTD_H_
+
+#include <sys/param.h>
+#include <libutil.h>
+
+#include <nv.h>
+
+#include "hast.h"
+
+/* Was SIGINT or SIGTERM signal received?  Defined in hastd.c. */
+extern bool sigexit_received;
+/* PID file handle of the main process.  Defined in hastd.c. */
+extern struct pidfh *pfh;
+
+/*
+ * NOTE(review): neither function below is implemented in hastd.c.
+ * hastd.c calls hastd_primary() to restart the worker of a resource in
+ * primary role, and hastd_secondary() with the received header once the
+ * second connection for a resource in secondary role has been accepted.
+ * Confirm the exact contracts against their definitions.
+ */
+void hastd_primary(struct hast_resource *res);
+void hastd_secondary(struct hast_resource *res, struct nv *nvin);
+
+#endif /* !_HASTD_H_ */
diff --git a/sbin/hastd/hooks.c b/sbin/hastd/hooks.c
new file mode 100644
index 0000000..1fdeb75
--- /dev/null
+++ b/sbin/hastd/hooks.c
@@ -0,0 +1,148 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include <assert.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <syslog.h>
+#include <libgen.h>
+#include <paths.h>
+
+#include <pjdlog.h>
+
+#include "hooks.h"
+
+/*
+ * Prepare the descriptor table of a process about to execute a hook:
+ * close every descriptor and then attach stdin, stdout and stderr to
+ * /dev/null.  Failures are logged as warnings and otherwise ignored.
+ */
+static void
+descriptors(void)
+{
+	long limit;
+	int nullfd;
+
+	/*
+	 * Close all descriptors.
+	 */
+	limit = sysconf(_SC_OPEN_MAX);
+	if (limit < 0) {
+		pjdlog_errno(LOG_WARNING, "sysconf(_SC_OPEN_MAX) failed");
+		limit = 1024;
+	}
+	nullfd = 0;
+	while (nullfd <= limit) {
+		close(nullfd);
+		nullfd++;
+	}
+
+	/*
+	 * Standard input reads from /dev/null.
+	 */
+	nullfd = open(_PATH_DEVNULL, O_RDONLY);
+	if (nullfd >= 0) {
+		if (nullfd != STDIN_FILENO) {
+			if (dup2(nullfd, STDIN_FILENO) < 0) {
+				pjdlog_errno(LOG_WARNING,
+				    "Unable to duplicate descriptor for stdin");
+			}
+			close(nullfd);
+		}
+	} else {
+		pjdlog_errno(LOG_WARNING, "Unable to open %s for reading",
+		    _PATH_DEVNULL);
+	}
+
+	/*
+	 * Standard output and standard error write to /dev/null.
+	 */
+	nullfd = open(_PATH_DEVNULL, O_WRONLY);
+	if (nullfd < 0) {
+		pjdlog_errno(LOG_WARNING, "Unable to open %s for writing",
+		    _PATH_DEVNULL);
+		return;
+	}
+	if (nullfd != STDOUT_FILENO && dup2(nullfd, STDOUT_FILENO) < 0) {
+		pjdlog_errno(LOG_WARNING,
+		    "Unable to duplicate descriptor for stdout");
+	}
+	if (nullfd != STDERR_FILENO && dup2(nullfd, STDERR_FILENO) < 0) {
+		pjdlog_errno(LOG_WARNING,
+		    "Unable to duplicate descriptor for stderr");
+	}
+	/* Drop the extra descriptor unless it is one of the two itself. */
+	if (nullfd != STDOUT_FILENO && nullfd != STDERR_FILENO)
+		close(nullfd);
+}
+
+/*
+ * Variadic front-end for hook_execv(): collects the arguments following
+ * 'path' into a va_list and returns whatever hook_execv() returns.
+ */
+int
+hook_exec(const char *path, ...)
+{
+	va_list args;
+	int status;
+
+	va_start(args, path);
+	status = hook_execv(path, args);
+	va_end(args);
+
+	return (status);
+}
+
+/*
+ * Execute the hook 'path' and wait for it to finish.
+ *
+ * 'ap' supplies the hook's arguments as a NULL-terminated list of char *;
+ * argv[0] is set to the base name of 'path'.  At most 62 arguments fit
+ * before the terminating NULL (asserted).
+ *
+ * Returns 0 without doing anything when 'path' is NULL or empty, -1 when
+ * fork(2) fails or the hook was killed by a signal, and the hook's exit
+ * status otherwise.
+ */
+int
+hook_execv(const char *path, va_list ap)
+{
+	char *args[64];
+	unsigned int ii;
+	pid_t pid, wpid;
+	int status;
+
+	if (path == NULL || path[0] == '\0')
+		return (0);
+
+	memset(args, 0, sizeof(args));
+	args[0] = basename(path);
+	for (ii = 1; ii < sizeof(args) / sizeof(args[0]); ii++) {
+		args[ii] = va_arg(ap, char *);
+		if (args[ii] == NULL)
+			break;
+	}
+	/* The list must have been NULL-terminated within the array. */
+	assert(ii < sizeof(args) / sizeof(args[0]));
+
+	pid = fork();
+	switch (pid) {
+	case -1:	/* Error. */
+		pjdlog_errno(LOG_ERR, "Unable to fork %s", path);
+		return (-1);
+	case 0:		/* Child. */
+		descriptors();
+		execv(path, args);
+		pjdlog_errno(LOG_ERR, "Unable to execute %s", path);
+		/*
+		 * NOTE(review): EX_SOFTWARE comes from <sysexits.h>, which
+		 * is not among this file's visible includes - confirm it is
+		 * provided indirectly (e.g. by pjdlog.h).
+		 */
+		exit(EX_SOFTWARE);
+	default:	/* Parent. */
+		break;
+	}
+
+	wpid = waitpid(pid, &status, 0);
+	assert(wpid == pid);
+
+	/*
+	 * WEXITSTATUS() is only meaningful for a hook that really exited.
+	 * For one killed by a signal it would typically read as 0 and the
+	 * failure would be reported as success.
+	 */
+	if (WIFSIGNALED(status)) {
+		pjdlog_error("Hook %s was killed by signal %d.", path,
+		    WTERMSIG(status));
+		return (-1);
+	}
+
+	return (WEXITSTATUS(status));
+}
diff --git a/sbin/hastd/hooks.h b/sbin/hastd/hooks.h
new file mode 100644
index 0000000..799b781
--- /dev/null
+++ b/sbin/hastd/hooks.h
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HOOKS_H_
+#define _HOOKS_H_
+
+#include <stdarg.h>
+
+/*
+ * Run the hook 'path'; the remaining arguments are char * values passed to
+ * the hook and must end with a NULL.  Returns the result of hook_execv().
+ */
+int hook_exec(const char *path, ...);
+/*
+ * va_list variant of hook_exec(): forks, executes 'path' and waits for it.
+ * Returns 0 when 'path' is NULL or empty, -1 when fork(2) fails, otherwise
+ * the hook's exit status.
+ */
+int hook_execv(const char *path, va_list ap);
+
+#endif /* !_HOOKS_H_ */
diff --git a/sbin/hastd/metadata.c b/sbin/hastd/metadata.c
new file mode 100644
index 0000000..9bca66b
--- /dev/null
+++ b/sbin/hastd/metadata.c
@@ -0,0 +1,222 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include <ebuf.h>
+#include <nv.h>
+#include <pjdlog.h>
+#include <subr.h>
+
+#include "metadata.h"
+
+/*
+ * Read the on-disk metadata of the given resource and store the decoded
+ * values in res.
+ *
+ * If res->hr_localfd is -1, provinfo() is called first (it is defined
+ * elsewhere; presumably it opens the local provider and fills hr_localfd)
+ * and, when openrw is true, an exclusive non-blocking flock(2) is taken on
+ * the descriptor.  A provider that does not support locking (EOPNOTSUPP)
+ * is tolerated with a warning.
+ *
+ * Returns 0 on success.  On failure returns -1 with errno set to the saved
+ * error and, if the descriptor was obtained by this call, closes it and
+ * resets hr_localfd to -1.
+ */
+int
+metadata_read(struct hast_resource *res, bool openrw)
+{
+	unsigned char *buf;
+	struct ebuf *eb;
+	struct nv *nv;
+	ssize_t done;
+	const char *str;
+	int rerrno;
+	bool opened_here;
+
+	opened_here = false;
+	rerrno = 0;
+
+	/*
+	 * Is this first metadata_read() call for this resource?
+	 */
+	if (res->hr_localfd == -1) {
+		if (provinfo(res, openrw) < 0) {
+			rerrno = errno;
+			goto fail;
+		}
+		opened_here = true;
+		pjdlog_debug(1, "Obtained info about %s.", res->hr_localpath);
+		if (openrw) {
+			if (flock(res->hr_localfd, LOCK_EX | LOCK_NB) == 0) {
+				pjdlog_debug(1, "Locked %s.",
+				    res->hr_localpath);
+			} else if (errno == EOPNOTSUPP) {
+				/*
+				 * Not fatal, so do not remember it as the
+				 * error of a later, unrelated failure.
+				 */
+				pjdlog_warning("Unable to lock %s (operation not supported), but continuing.",
+				    res->hr_localpath);
+			} else {
+				rerrno = errno;
+				pjdlog_errno(LOG_ERR,
+				    "Unable to lock %s",
+				    res->hr_localpath);
+				goto fail;
+			}
+		}
+	}
+
+	eb = ebuf_alloc(METADATA_SIZE);
+	if (eb == NULL) {
+		rerrno = errno;
+		pjdlog_errno(LOG_ERR,
+		    "Unable to allocate memory to read metadata");
+		goto fail;
+	}
+	if (ebuf_add_tail(eb, NULL, METADATA_SIZE) < 0) {
+		rerrno = errno;
+		pjdlog_errno(LOG_ERR,
+		    "Unable to allocate memory to read metadata");
+		/* The buffer is still ours; do not leak it. */
+		ebuf_free(eb);
+		goto fail;
+	}
+	buf = ebuf_data(eb, NULL);
+	assert(buf != NULL);
+	done = pread(res->hr_localfd, buf, METADATA_SIZE, 0);
+	if (done < 0 || done != METADATA_SIZE) {
+		rerrno = errno;
+		pjdlog_errno(LOG_ERR, "Unable to read metadata");
+		ebuf_free(eb);
+		goto fail;
+	}
+	nv = nv_ntoh(eb);
+	if (nv == NULL) {
+		rerrno = errno;
+		pjdlog_errno(LOG_ERR, "Metadata read from %s is invalid",
+		    res->hr_localpath);
+		ebuf_free(eb);
+		goto fail;
+	}
+	/* From here on nv owns eb; nv_free() releases both. */
+
+	/*
+	 * A missing "resource" entry yields NULL here and is reported through
+	 * the nv_error() check below, so only compare when it is present.
+	 */
+	str = nv_get_string(nv, "resource");
+	if (str != NULL && strcmp(str, res->hr_name) != 0) {
+		pjdlog_error("Provider %s is not part of resource %s.",
+		    res->hr_localpath, res->hr_name);
+		nv_free(nv);
+		goto fail;
+	}
+
+	res->hr_datasize = nv_get_uint64(nv, "datasize");
+	res->hr_extentsize = (int)nv_get_uint32(nv, "extentsize");
+	res->hr_keepdirty = (int)nv_get_uint32(nv, "keepdirty");
+	res->hr_localoff = nv_get_uint64(nv, "offset");
+	res->hr_resuid = nv_get_uint64(nv, "resuid");
+	if (res->hr_role != HAST_ROLE_PRIMARY) {
+		/* Secondary or init role. */
+		res->hr_secondary_localcnt = nv_get_uint64(nv, "localcnt");
+		res->hr_secondary_remotecnt = nv_get_uint64(nv, "remotecnt");
+	}
+	if (res->hr_role != HAST_ROLE_SECONDARY) {
+		/* Primary or init role. */
+		res->hr_primary_localcnt = nv_get_uint64(nv, "localcnt");
+		res->hr_primary_remotecnt = nv_get_uint64(nv, "remotecnt");
+	}
+	str = nv_get_string(nv, "prevrole");
+	if (str != NULL) {
+		if (strcmp(str, "primary") == 0)
+			res->hr_previous_role = HAST_ROLE_PRIMARY;
+		else if (strcmp(str, "secondary") == 0)
+			res->hr_previous_role = HAST_ROLE_SECONDARY;
+	}
+
+	/* Any failed lookup above left a sticky error in nv. */
+	if (nv_error(nv) != 0) {
+		errno = rerrno = nv_error(nv);
+		pjdlog_errno(LOG_ERR, "Unable to read metadata from %s",
+		    res->hr_localpath);
+		nv_free(nv);
+		goto fail;
+	}
+	/* All values were copied into res; release the decoded metadata. */
+	nv_free(nv);
+	return (0);
+fail:
+	if (opened_here) {
+		close(res->hr_localfd);
+		res->hr_localfd = -1;
+	}
+	errno = rerrno;
+	return (-1);
+}
+
+/*
+ * Serialize the metadata of the given resource and write it as one
+ * METADATA_SIZE block at offset 0 of res->hr_localfd.  The unused tail of
+ * the block is zero-filled.
+ *
+ * The counters written depend on the current role: primary/init roles store
+ * the primary counters, the secondary role stores the secondary counters.
+ * hr_previous_role is updated to the current role once the metadata has been
+ * assembled.
+ *
+ * Returns 0 on success and -1 on failure (the reason is logged).  The
+ * temporary buffer and nv structure are released on every path.
+ */
+int
+metadata_write(struct hast_resource *res)
+{
+	struct ebuf *eb;
+	struct nv *nv;
+	unsigned char *buf, *ptr;
+	size_t size;
+	ssize_t done;
+	int ret;
+
+	buf = calloc(1, METADATA_SIZE);
+	if (buf == NULL) {
+		pjdlog_error("Unable to allocate %zu bytes for metadata.",
+		    (size_t)METADATA_SIZE);
+		return (-1);
+	}
+
+	ret = -1;
+
+	/*
+	 * A NULL nv is tolerated by the nv_add_*() calls and reported by
+	 * nv_error() below, so no separate check is needed here.
+	 */
+	nv = nv_alloc();
+	nv_add_string(nv, res->hr_name, "resource");
+	nv_add_uint64(nv, (uint64_t)res->hr_datasize, "datasize");
+	nv_add_uint32(nv, (uint32_t)res->hr_extentsize, "extentsize");
+	nv_add_uint32(nv, (uint32_t)res->hr_keepdirty, "keepdirty");
+	nv_add_uint64(nv, (uint64_t)res->hr_localoff, "offset");
+	nv_add_uint64(nv, res->hr_resuid, "resuid");
+	if (res->hr_role == HAST_ROLE_PRIMARY ||
+	    res->hr_role == HAST_ROLE_INIT) {
+		nv_add_uint64(nv, res->hr_primary_localcnt, "localcnt");
+		nv_add_uint64(nv, res->hr_primary_remotecnt, "remotecnt");
+	} else /* if (res->hr_role == HAST_ROLE_SECONDARY) */ {
+		assert(res->hr_role == HAST_ROLE_SECONDARY);
+		nv_add_uint64(nv, res->hr_secondary_localcnt, "localcnt");
+		nv_add_uint64(nv, res->hr_secondary_remotecnt, "remotecnt");
+	}
+	nv_add_string(nv, role2str(res->hr_role), "prevrole");
+	if (nv_error(nv) != 0) {
+		pjdlog_error("Unable to create metadata.");
+		goto end;
+	}
+	res->hr_previous_role = res->hr_role;
+	/* eb is owned by nv; it is released by nv_free() below. */
+	eb = nv_hton(nv);
+	assert(eb != NULL);
+	ptr = ebuf_data(eb, &size);
+	assert(ptr != NULL);
+	assert(size < METADATA_SIZE);
+	bcopy(ptr, buf, size);
+	done = pwrite(res->hr_localfd, buf, METADATA_SIZE, 0);
+	if (done < 0 || done != METADATA_SIZE) {
+		pjdlog_errno(LOG_ERR, "Unable to write metadata");
+		goto end;
+	}
+
+	ret = 0;
+end:
+	/* Shared by success and failure so nothing is leaked. */
+	free(buf);
+	nv_free(nv);
+	return (ret);
+}
diff --git a/sbin/hastd/metadata.h b/sbin/hastd/metadata.h
new file mode 100644
index 0000000..83d35f4
--- /dev/null
+++ b/sbin/hastd/metadata.h
@@ -0,0 +1,48 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _METADATA_H_
+#define _METADATA_H_
+
+#include <stdbool.h>
+
+#include <hast.h>
+
+/*
+ * Maximum size of metadata.
+ * XXX: We should take sector size into account.
+ */
+#define METADATA_SIZE 4096
+
+/*
+ * metadata_read() reads METADATA_SIZE bytes from offset 0 of the resource's
+ * local descriptor and fills the resource fields from them; openrw is passed
+ * to provinfo() and also requests an exclusive lock when the descriptor is
+ * obtained by this call.
+ * metadata_write() stores the resource fields in a METADATA_SIZE block at
+ * offset 0 of the local descriptor.
+ * Both return 0 on success and -1 on failure.
+ */
+int metadata_read(struct hast_resource *res, bool openrw);
+int metadata_write(struct hast_resource *res);
+
+#endif /* !_METADATA_H_ */
diff --git a/sbin/hastd/nv.c b/sbin/hastd/nv.c
new file mode 100644
index 0000000..0b4e362
--- /dev/null
+++ b/sbin/hastd/nv.c
@@ -0,0 +1,882 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/endian.h>
+
+#include <assert.h>
+#include <bitstring.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <ebuf.h>
+#include <nv.h>
+
+/* Value kept in nv_magic of every live nv structure; see NV_CHECK(). */
+#define NV_MAGIC 0xaea1e
+struct nv {
+	int nv_magic;		/* NV_MAGIC while valid, cleared by nv_free(). */
+	int nv_error;		/* Sticky errno-style error, 0 if none. */
+	struct ebuf *nv_ebuf;	/* Serialized entries, back to back. */
+};
+
+/*
+ * Header of a single serialized entry.  An entry consists of this header,
+ * the NUL-terminated name padded to a multiple of 8 bytes, and the data
+ * padded to a multiple of 8 bytes.
+ */
+struct nvhdr {
+	uint8_t nvh_type;	/* NV_TYPE_* combined with NV_ORDER_*. */
+	uint8_t nvh_namesize;	/* Name length including the '\0'. */
+	uint32_t nvh_dsize;	/* Data size in bytes, without padding. */
+	char nvh_name[0];
+} __packed;
+/* Pointer to the data of the entry (right after the padded name). */
+#define NVH_DATA(nvh) ((unsigned char *)(nvh) + NVH_HSIZE(nvh))
+/* Size of the header including the padded name. */
+#define NVH_HSIZE(nvh) \
+	(sizeof(struct nvhdr) + roundup2((nvh)->nvh_namesize, 8))
+/* Data size in host byte order, whatever order the entry is stored in. */
+#define NVH_DSIZE(nvh) \
+	(((nvh)->nvh_type & NV_ORDER_MASK) == NV_ORDER_HOST ? \
+	(nvh)->nvh_dsize : \
+	le32toh((nvh)->nvh_dsize))
+/* Size of the whole entry: padded header plus padded data. */
+#define NVH_SIZE(nvh) (NVH_HSIZE(nvh) + roundup2(NVH_DSIZE(nvh), 8))
+
+/* Assert that nv points at a live nv structure. */
+#define NV_CHECK(nv) do { \
+	assert((nv) != NULL); \
+	assert((nv)->nv_magic == NV_MAGIC); \
+} while (0)
+
+static void nv_add(struct nv *nv, const unsigned char *value, size_t vsize,
+ int type, const char *name);
+static void nv_addv(struct nv *nv, const unsigned char *value, size_t vsize,
+ int type, const char *namefmt, va_list nameap);
+static struct nvhdr *nv_find(struct nv *nv, int type, const char *namefmt,
+ va_list nameap);
+static void nv_swap(struct nvhdr *nvh, bool tohost);
+
+/*
+ * Create an empty nv structure backed by a fresh zero-length ebuf.
+ * Returns NULL if either allocation fails.
+ */
+struct nv *
+nv_alloc(void)
+{
+	struct ebuf *eb;
+	struct nv *nv;
+
+	nv = malloc(sizeof(*nv));
+	if (nv == NULL)
+		return (NULL);
+
+	eb = ebuf_alloc(0);
+	if (eb == NULL) {
+		free(nv);
+		return (NULL);
+	}
+
+	nv->nv_ebuf = eb;
+	nv->nv_error = 0;
+	nv->nv_magic = NV_MAGIC;
+	return (nv);
+}
+
+/*
+ * Release the nv structure together with the ebuf it owns.
+ * Passing NULL is allowed and does nothing.
+ */
+void
+nv_free(struct nv *nv)
+{
+
+	if (nv != NULL) {
+		NV_CHECK(nv);
+
+		/* Invalidate the magic so stale pointers trip NV_CHECK(). */
+		nv->nv_magic = 0;
+		ebuf_free(nv->nv_ebuf);
+		free(nv);
+	}
+}
+
+/*
+ * Report the error recorded in the nv structure (0 if none).
+ * A NULL nv is reported as ENOMEM.
+ */
+int
+nv_error(const struct nv *nv)
+{
+	int error;
+
+	error = ENOMEM;
+	if (nv != NULL) {
+		NV_CHECK(nv);
+		error = nv->nv_error;
+	}
+	return (error);
+}
+
+/*
+ * Replace the error recorded in the nv structure and hand back the one that
+ * was stored before.  A NULL nv is left alone and reported as ENOMEM.
+ */
+int
+nv_set_error(struct nv *nv, int error)
+{
+	int previous;
+
+	previous = ENOMEM;
+	if (nv != NULL) {
+		NV_CHECK(nv);
+		previous = nv->nv_error;
+		nv->nv_error = error;
+	}
+	return (previous);
+}
+
+/*
+ * Validate correctness of the entire nv structure and all its elements.
+ * If extrap is not NULL, store number of extra bytes at the end of the buffer.
+ *
+ * The buffer may come from disk or from the network, so nothing in it is
+ * trusted: every size is checked against the number of bytes that are
+ * really left before it is used.
+ *
+ * Returns 0 if the content is valid.  Otherwise returns -1, sets errno and,
+ * if no error was recorded yet, records it in the nv structure.  A NULL nv
+ * fails with ENOMEM.
+ */
+int
+nv_validate(struct nv *nv, size_t *extrap)
+{
+	struct nvhdr *nvh;
+	unsigned char *data, *ptr;
+	size_t dsize, hsize, psize, size, vsize;
+	int error;
+
+	if (nv == NULL) {
+		errno = ENOMEM;
+		return (-1);
+	}
+
+	NV_CHECK(nv);
+	assert(nv->nv_error == 0);
+
+	/* TODO: Check that names are unique? */
+
+	error = 0;
+	ptr = ebuf_data(nv->nv_ebuf, &size);
+	while (size > 0) {
+		/*
+		 * Zeros at the end of the buffer are acceptable.
+		 */
+		if (ptr[0] == '\0')
+			break;
+		/*
+		 * Minimum size at this point is size of nvhdr structure, one
+		 * character long name plus terminating '\0'.
+		 */
+		if (size < sizeof(*nvh) + 2) {
+			error = EINVAL;
+			break;
+		}
+		nvh = (struct nvhdr *)ptr;
+		/*
+		 * The name size includes the terminating '\0', so it can
+		 * never be zero.  Rejecting it here keeps the checks below
+		 * from looking at nvh_name[-1] and from running strlen()
+		 * over a name that has no guaranteed terminator.
+		 */
+		if (nvh->nvh_namesize == 0) {
+			error = EINVAL;
+			break;
+		}
+		hsize = NVH_HSIZE(nvh);
+		if (size < hsize) {
+			error = EINVAL;
+			break;
+		}
+		if (nvh->nvh_name[nvh->nvh_namesize - 1] != '\0') {
+			error = EINVAL;
+			break;
+		}
+		if (strlen(nvh->nvh_name) !=
+		    (size_t)(nvh->nvh_namesize - 1)) {
+			error = EINVAL;
+			break;
+		}
+		if ((nvh->nvh_type & NV_TYPE_MASK) < NV_TYPE_FIRST ||
+		    (nvh->nvh_type & NV_TYPE_MASK) > NV_TYPE_LAST) {
+			error = EINVAL;
+			break;
+		}
+		dsize = NVH_DSIZE(nvh);
+		if (dsize == 0) {
+			error = EINVAL;
+			break;
+		}
+		/*
+		 * Compare the raw data size with what is left first, so that
+		 * rounding it up below cannot wrap around for a huge value
+		 * and make an oversized entry look like it fits.
+		 */
+		if (dsize > size - hsize) {
+			error = EINVAL;
+			break;
+		}
+		psize = roundup2(dsize, 8);
+		if (psize > size - hsize) {
+			error = EINVAL;
+			break;
+		}
+		vsize = 0;
+		switch (nvh->nvh_type & NV_TYPE_MASK) {
+		case NV_TYPE_INT8:
+		case NV_TYPE_UINT8:
+			if (vsize == 0)
+				vsize = 1;
+			/* FALLTHROUGH */
+		case NV_TYPE_INT16:
+		case NV_TYPE_UINT16:
+			if (vsize == 0)
+				vsize = 2;
+			/* FALLTHROUGH */
+		case NV_TYPE_INT32:
+		case NV_TYPE_UINT32:
+			if (vsize == 0)
+				vsize = 4;
+			/* FALLTHROUGH */
+		case NV_TYPE_INT64:
+		case NV_TYPE_UINT64:
+			if (vsize == 0)
+				vsize = 8;
+			/* A scalar must be exactly one element long. */
+			if (dsize != vsize)
+				error = EINVAL;
+			break;
+		case NV_TYPE_INT8_ARRAY:
+		case NV_TYPE_UINT8_ARRAY:
+			break;
+		case NV_TYPE_INT16_ARRAY:
+		case NV_TYPE_UINT16_ARRAY:
+			if (vsize == 0)
+				vsize = 2;
+			/* FALLTHROUGH */
+		case NV_TYPE_INT32_ARRAY:
+		case NV_TYPE_UINT32_ARRAY:
+			if (vsize == 0)
+				vsize = 4;
+			/* FALLTHROUGH */
+		case NV_TYPE_INT64_ARRAY:
+		case NV_TYPE_UINT64_ARRAY:
+			if (vsize == 0)
+				vsize = 8;
+			/* An array must hold a whole number of elements. */
+			if ((dsize % vsize) != 0)
+				error = EINVAL;
+			break;
+		case NV_TYPE_STRING:
+			/* Exactly one '\0', and it has to be the last byte. */
+			data = NVH_DATA(nvh);
+			if (data[dsize - 1] != '\0') {
+				error = EINVAL;
+				break;
+			}
+			if (strlen((char *)data) != dsize - 1) {
+				error = EINVAL;
+				break;
+			}
+			break;
+		default:
+			assert(!"invalid condition");
+		}
+		if (error != 0)
+			break;
+		ptr += hsize + psize;
+		size -= hsize + psize;
+	}
+	if (error != 0) {
+		errno = error;
+		if (nv->nv_error == 0)
+			nv->nv_error = error;
+		return (-1);
+	}
+	if (extrap != NULL)
+		*extrap = size;
+	return (0);
+}
+
+/*
+ * Switch every entry of the nv structure to network byte order, in place,
+ * and return the ebuf holding them.  The ebuf still belongs to the nv
+ * structure.
+ */
+struct ebuf *
+nv_hton(struct nv *nv)
+{
+	struct nvhdr *hdr;
+	unsigned char *cur;
+	size_t left, step;
+
+	NV_CHECK(nv);
+	assert(nv->nv_error == 0);
+
+	for (cur = ebuf_data(nv->nv_ebuf, &left); left > 0;
+	    cur += step, left -= step) {
+		/*
+		 * The smallest possible entry is the nvhdr structure plus a
+		 * one character long name and its terminating '\0'.
+		 */
+		assert(left >= sizeof(*hdr) + 2);
+		hdr = (struct nvhdr *)cur;
+		assert(NVH_SIZE(hdr) <= left);
+		nv_swap(hdr, false);
+		/* Measured after the swap, like the entry is stored now. */
+		step = NVH_SIZE(hdr);
+	}
+
+	return (nv->nv_ebuf);
+}
+
+/*
+ * Wrap an ebuf received from the network (or read from disk) in a new nv
+ * structure.  The content is validated first; on success trailing zero
+ * bytes are trimmed and the nv structure takes over the ebuf.
+ * On failure NULL is returned with errno set and the ebuf is left to the
+ * caller.
+ */
+struct nv *
+nv_ntoh(struct ebuf *eb)
+{
+	struct nv *nv;
+	size_t trailing;
+	int saved_errno;
+
+	assert(eb != NULL);
+
+	nv = malloc(sizeof(*nv));
+	if (nv == NULL)
+		return (NULL);
+	nv->nv_magic = NV_MAGIC;
+	nv->nv_ebuf = eb;
+	nv->nv_error = 0;
+
+	if (nv_validate(nv, &trailing) >= 0) {
+		/*
+		 * Remove extra zeros at the end of the buffer.
+		 */
+		ebuf_del_tail(eb, trailing);
+		return (nv);
+	}
+
+	/* Keep the validation error across free(). */
+	saved_errno = errno;
+	nv->nv_magic = 0;
+	free(nv);
+	errno = saved_errno;
+	return (NULL);
+}
+
+/*
+ * Generate nv_add_<type>(): add a single integer of the given fixed-width
+ * type under a name built printf-style from namefmt and the arguments that
+ * follow it.  The value is stored as sizeof(value) raw bytes tagged with
+ * NV_TYPE_<TYPE>.  The functions return nothing; failures are recorded in
+ * the nv structure by nv_add() and can be read with nv_error().
+ */
+#define NV_DEFINE_ADD(type, TYPE) \
+void \
+nv_add_##type(struct nv *nv, type##_t value, const char *namefmt, ...) \
+{ \
+ va_list nameap; \
+ \
+ va_start(nameap, namefmt); \
+ nv_addv(nv, (unsigned char *)&value, sizeof(value), \
+ NV_TYPE_##TYPE, namefmt, nameap); \
+ va_end(nameap); \
+}
+
+NV_DEFINE_ADD(int8, INT8)
+NV_DEFINE_ADD(uint8, UINT8)
+NV_DEFINE_ADD(int16, INT16)
+NV_DEFINE_ADD(uint16, UINT16)
+NV_DEFINE_ADD(int32, INT32)
+NV_DEFINE_ADD(uint32, UINT32)
+NV_DEFINE_ADD(int64, INT64)
+NV_DEFINE_ADD(uint64, UINT64)
+
+#undef NV_DEFINE_ADD
+
+/*
+ * Generate nv_add_<type>_array(): add nsize elements of the given
+ * fixed-width type (sizeof(value[0]) * nsize bytes) under a name built
+ * printf-style from namefmt, tagged with NV_TYPE_<TYPE>_ARRAY.
+ * Failures are recorded in the nv structure, see nv_error().
+ */
+#define NV_DEFINE_ADD_ARRAY(type, TYPE) \
+void \
+nv_add_##type##_array(struct nv *nv, const type##_t *value, \
+ size_t nsize, const char *namefmt, ...) \
+{ \
+ va_list nameap; \
+ \
+ va_start(nameap, namefmt); \
+ nv_addv(nv, (const unsigned char *)value, \
+ sizeof(value[0]) * nsize, NV_TYPE_##TYPE##_ARRAY, namefmt, \
+ nameap); \
+ va_end(nameap); \
+}
+
+NV_DEFINE_ADD_ARRAY(int8, INT8)
+NV_DEFINE_ADD_ARRAY(uint8, UINT8)
+NV_DEFINE_ADD_ARRAY(int16, INT16)
+NV_DEFINE_ADD_ARRAY(uint16, UINT16)
+NV_DEFINE_ADD_ARRAY(int32, INT32)
+NV_DEFINE_ADD_ARRAY(uint32, UINT32)
+NV_DEFINE_ADD_ARRAY(int64, INT64)
+NV_DEFINE_ADD_ARRAY(uint64, UINT64)
+
+#undef NV_DEFINE_ADD_ARRAY
+
+/*
+ * Add a string (stored together with its terminating '\0') under a name
+ * built printf-style from namefmt and the arguments that follow it.
+ * Failures are recorded in the nv structure, see nv_error().
+ */
+void
+nv_add_string(struct nv *nv, const char *value, const char *namefmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, namefmt);
+	nv_addv(nv, (const unsigned char *)value, strlen(value) + 1,
+	    NV_TYPE_STRING, namefmt, ap);
+	va_end(ap);
+}
+
+/*
+ * Add a string under the fixed name, with the value built printf-style from
+ * valuefmt and the arguments that follow it.  Variadic front end of
+ * nv_add_stringv().
+ */
+void
+nv_add_stringf(struct nv *nv, const char *name, const char *valuefmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, valuefmt);
+	nv_add_stringv(nv, name, valuefmt, ap);
+	va_end(ap);
+}
+
+/*
+ * Add a string under the fixed name, with the value built from valuefmt and
+ * the va_list.  The formatted value is stored with its terminating '\0'.
+ *
+ * Like the other nv_add_*() functions this tolerates a NULL nv (errno is set
+ * to ENOMEM, as nv_add() does); otherwise a formatting failure is recorded
+ * in the nv structure as ENOMEM unless an earlier error is already there.
+ */
+void
+nv_add_stringv(struct nv *nv, const char *name, const char *valuefmt,
+    va_list valueap)
+{
+	char *value;
+	ssize_t size;
+
+	size = vasprintf(&value, valuefmt, valueap);
+	if (size < 0) {
+		/* nv may be NULL here; do not dereference it blindly. */
+		if (nv == NULL)
+			errno = ENOMEM;
+		else if (nv->nv_error == 0)
+			nv->nv_error = ENOMEM;
+		return;
+	}
+	/* Account for the terminating '\0'. */
+	size++;
+	nv_add(nv, (const unsigned char *)value, size, NV_TYPE_STRING, name);
+	free(value);
+}
+
+/*
+ * Generate nv_get_<type>(): look up the entry whose name is built
+ * printf-style from namefmt and whose type is NV_TYPE_<TYPE>, and return
+ * its value.  The value is copied out with bcopy(), so it does not have to
+ * be aligned inside the buffer.
+ * Returns 0 when nv_find() fails (no such name or wrong type); nv_find()
+ * records that failure in the nv structure, so callers tell a stored 0 from
+ * a failed lookup with nv_error().
+ */
+#define NV_DEFINE_GET(type, TYPE) \
+type##_t \
+nv_get_##type(struct nv *nv, const char *namefmt, ...) \
+{ \
+ struct nvhdr *nvh; \
+ va_list nameap; \
+ type##_t value; \
+ \
+ va_start(nameap, namefmt); \
+ nvh = nv_find(nv, NV_TYPE_##TYPE, namefmt, nameap); \
+ va_end(nameap); \
+ if (nvh == NULL) \
+ return (0); \
+ assert((nvh->nvh_type & NV_ORDER_MASK) == NV_ORDER_HOST); \
+ assert(sizeof(value) == nvh->nvh_dsize); \
+ bcopy(NVH_DATA(nvh), &value, sizeof(value)); \
+ \
+ return (value); \
+}
+
+NV_DEFINE_GET(int8, INT8)
+NV_DEFINE_GET(uint8, UINT8)
+NV_DEFINE_GET(int16, INT16)
+NV_DEFINE_GET(uint16, UINT16)
+NV_DEFINE_GET(int32, INT32)
+NV_DEFINE_GET(uint32, UINT32)
+NV_DEFINE_GET(int64, INT64)
+NV_DEFINE_GET(uint64, UINT64)
+
+#undef NV_DEFINE_GET
+
+/*
+ * Generate nv_get_<type>_array(): look up the entry whose name is built
+ * printf-style from namefmt and whose type is NV_TYPE_<TYPE>_ARRAY.
+ * Returns a pointer straight into the nv buffer (nothing is copied) and, if
+ * sizep is not NULL, stores the number of elements there.
+ * Returns NULL when nv_find() fails; the failure is recorded in the nv
+ * structure.
+ * NOTE(review): the returned pointer is NVH_DATA() cast to the element
+ * type; nothing here guarantees it is aligned for that type - confirm
+ * before dereferencing it on strict-alignment platforms.
+ */
+#define NV_DEFINE_GET_ARRAY(type, TYPE) \
+const type##_t * \
+nv_get_##type##_array(struct nv *nv, size_t *sizep, \
+ const char *namefmt, ...) \
+{ \
+ struct nvhdr *nvh; \
+ va_list nameap; \
+ \
+ va_start(nameap, namefmt); \
+ nvh = nv_find(nv, NV_TYPE_##TYPE##_ARRAY, namefmt, nameap); \
+ va_end(nameap); \
+ if (nvh == NULL) \
+ return (NULL); \
+ assert((nvh->nvh_type & NV_ORDER_MASK) == NV_ORDER_HOST); \
+ assert((nvh->nvh_dsize % sizeof(type##_t)) == 0); \
+ if (sizep != NULL) \
+ *sizep = nvh->nvh_dsize / sizeof(type##_t); \
+ return ((type##_t *)(void *)NVH_DATA(nvh)); \
+}
+
+NV_DEFINE_GET_ARRAY(int8, INT8)
+NV_DEFINE_GET_ARRAY(uint8, UINT8)
+NV_DEFINE_GET_ARRAY(int16, INT16)
+NV_DEFINE_GET_ARRAY(uint16, UINT16)
+NV_DEFINE_GET_ARRAY(int32, INT32)
+NV_DEFINE_GET_ARRAY(uint32, UINT32)
+NV_DEFINE_GET_ARRAY(int64, INT64)
+NV_DEFINE_GET_ARRAY(uint64, UINT64)
+
+#undef NV_DEFINE_GET_ARRAY
+
+/*
+ * Look up the string stored under the name built printf-style from namefmt
+ * and the arguments that follow it.
+ *
+ * Returns a pointer straight into the nv buffer (nothing is copied), so it
+ * must not be used after nv_free().  Returns NULL when nv_find() fails (no
+ * such name or not a string); that failure is recorded in the nv structure.
+ */
+const char *
+nv_get_string(struct nv *nv, const char *namefmt, ...)
+{
+	struct nvhdr *nvh;
+	va_list nameap;
+	char *str;
+
+	va_start(nameap, namefmt);
+	nvh = nv_find(nv, NV_TYPE_STRING, namefmt, nameap);
+	va_end(nameap);
+	if (nvh == NULL)
+		return (NULL);
+	assert((nvh->nvh_type & NV_ORDER_MASK) == NV_ORDER_HOST);
+	assert(nvh->nvh_dsize >= 1);
+	/* NVH_DATA() yields unsigned char *; convert explicitly. */
+	str = (char *)NVH_DATA(nvh);
+	assert(str[nvh->nvh_dsize - 1] == '\0');
+	assert(strlen(str) == nvh->nvh_dsize - 1);
+	return (str);
+}
+
+/*
+ * Dump content of the nv structure.
+ *
+ * Every entry is printed to stdout on its own line as
+ * " name(type): value[ value...]".  Entries stored in network byte order are
+ * converted for printing only; the buffer itself is not modified.  If the
+ * structure does not validate, only "error: <errno>" is printed.
+ *
+ * Signed values are converted back to their signed type after byte
+ * swapping, so negative numbers are printed as such in both byte orders.
+ *
+ * NOTE(review): values are read through cast pointers into the buffer,
+ * which assumes the data is suitably aligned for its type - confirm on
+ * strict-alignment platforms.
+ */
+void
+nv_dump(struct nv *nv)
+{
+	struct nvhdr *nvh;
+	unsigned char *data, *ptr;
+	size_t dsize, size;
+	unsigned int ii;
+	bool swap;
+
+	if (nv_validate(nv, NULL) < 0) {
+		printf("error: %d\n", errno);
+		return;
+	}
+
+	NV_CHECK(nv);
+	assert(nv->nv_error == 0);
+
+	ptr = ebuf_data(nv->nv_ebuf, &size);
+	while (size > 0) {
+		assert(size >= sizeof(*nvh) + 2);
+		nvh = (struct nvhdr *)ptr;
+		assert(size >= NVH_SIZE(nvh));
+		swap = ((nvh->nvh_type & NV_ORDER_MASK) == NV_ORDER_NETWORK);
+		dsize = NVH_DSIZE(nvh);
+		data = NVH_DATA(nvh);
+		printf(" %s", nvh->nvh_name);
+		switch (nvh->nvh_type & NV_TYPE_MASK) {
+		case NV_TYPE_INT8:
+			printf("(int8): %jd", (intmax_t)(*(int8_t *)data));
+			break;
+		case NV_TYPE_UINT8:
+			printf("(uint8): %ju", (uintmax_t)(*(uint8_t *)data));
+			break;
+		case NV_TYPE_INT16:
+			printf("(int16): %jd", swap ?
+			    (intmax_t)(int16_t)le16toh(*(int16_t *)(void *)data) :
+			    (intmax_t)*(int16_t *)(void *)data);
+			break;
+		case NV_TYPE_UINT16:
+			printf("(uint16): %ju", swap ?
+			    (uintmax_t)le16toh(*(uint16_t *)(void *)data) :
+			    (uintmax_t)*(uint16_t *)(void *)data);
+			break;
+		case NV_TYPE_INT32:
+			printf("(int32): %jd", swap ?
+			    (intmax_t)(int32_t)le32toh(*(int32_t *)(void *)data) :
+			    (intmax_t)*(int32_t *)(void *)data);
+			break;
+		case NV_TYPE_UINT32:
+			printf("(uint32): %ju", swap ?
+			    (uintmax_t)le32toh(*(uint32_t *)(void *)data) :
+			    (uintmax_t)*(uint32_t *)(void *)data);
+			break;
+		case NV_TYPE_INT64:
+			printf("(int64): %jd", swap ?
+			    (intmax_t)(int64_t)le64toh(*(int64_t *)(void *)data) :
+			    (intmax_t)*(int64_t *)(void *)data);
+			break;
+		case NV_TYPE_UINT64:
+			printf("(uint64): %ju", swap ?
+			    (uintmax_t)le64toh(*(uint64_t *)(void *)data) :
+			    (uintmax_t)*(uint64_t *)(void *)data);
+			break;
+		case NV_TYPE_INT8_ARRAY:
+			printf("(int8 array):");
+			for (ii = 0; ii < dsize; ii++)
+				printf(" %jd", (intmax_t)((int8_t *)data)[ii]);
+			break;
+		case NV_TYPE_UINT8_ARRAY:
+			printf("(uint8 array):");
+			for (ii = 0; ii < dsize; ii++)
+				printf(" %ju", (uintmax_t)((uint8_t *)data)[ii]);
+			break;
+		case NV_TYPE_INT16_ARRAY:
+			printf("(int16 array):");
+			for (ii = 0; ii < dsize / 2; ii++) {
+				printf(" %jd", swap ?
+				    (intmax_t)(int16_t)le16toh(((int16_t *)(void *)data)[ii]) :
+				    (intmax_t)((int16_t *)(void *)data)[ii]);
+			}
+			break;
+		case NV_TYPE_UINT16_ARRAY:
+			printf("(uint16 array):");
+			for (ii = 0; ii < dsize / 2; ii++) {
+				printf(" %ju", swap ?
+				    (uintmax_t)le16toh(((uint16_t *)(void *)data)[ii]) :
+				    (uintmax_t)((uint16_t *)(void *)data)[ii]);
+			}
+			break;
+		case NV_TYPE_INT32_ARRAY:
+			printf("(int32 array):");
+			for (ii = 0; ii < dsize / 4; ii++) {
+				printf(" %jd", swap ?
+				    (intmax_t)(int32_t)le32toh(((int32_t *)(void *)data)[ii]) :
+				    (intmax_t)((int32_t *)(void *)data)[ii]);
+			}
+			break;
+		case NV_TYPE_UINT32_ARRAY:
+			printf("(uint32 array):");
+			for (ii = 0; ii < dsize / 4; ii++) {
+				printf(" %ju", swap ?
+				    (uintmax_t)le32toh(((uint32_t *)(void *)data)[ii]) :
+				    (uintmax_t)((uint32_t *)(void *)data)[ii]);
+			}
+			break;
+		case NV_TYPE_INT64_ARRAY:
+			/* Signed, exactly like the scalar int64 case. */
+			printf("(int64 array):");
+			for (ii = 0; ii < dsize / 8; ii++) {
+				printf(" %jd", swap ?
+				    (intmax_t)(int64_t)le64toh(((int64_t *)(void *)data)[ii]) :
+				    (intmax_t)((int64_t *)(void *)data)[ii]);
+			}
+			break;
+		case NV_TYPE_UINT64_ARRAY:
+			printf("(uint64 array):");
+			for (ii = 0; ii < dsize / 8; ii++) {
+				printf(" %ju", swap ?
+				    (uintmax_t)le64toh(((uint64_t *)(void *)data)[ii]) :
+				    (uintmax_t)((uint64_t *)(void *)data)[ii]);
+			}
+			break;
+		case NV_TYPE_STRING:
+			printf("(string): %s", (char *)data);
+			break;
+		default:
+			assert(!"invalid condition");
+		}
+		printf("\n");
+		ptr += NVH_SIZE(nvh);
+		size -= NVH_SIZE(nvh);
+	}
+}
+
+/*
+ * Local routines below.
+ */
+
+/*
+ * Append one entry to the nv structure: the header with the name padded to
+ * a multiple of 8 bytes, followed by vsize bytes of data padded with zeros
+ * to a multiple of 8 bytes.  The entry is stored in host byte order.
+ *
+ * Nothing is returned.  A NULL nv only sets errno to ENOMEM; any other
+ * failure is recorded in the nv structure unless an earlier error is
+ * already there (see nv_error()).  A name that does not fit the 8-bit name
+ * size field is rejected with EINVAL instead of being stored with a
+ * truncated size.
+ */
+static void
+nv_add(struct nv *nv, const unsigned char *value, size_t vsize, int type,
+    const char *name)
+{
+	static unsigned char align[7];
+	struct nvhdr *nvh;
+	size_t namesize;
+	int error;
+
+	if (nv == NULL) {
+		errno = ENOMEM;
+		return;
+	}
+
+	NV_CHECK(nv);
+
+	namesize = strlen(name) + 1;
+	if (namesize > UINT8_MAX) {
+		if (nv->nv_error == 0)
+			nv->nv_error = EINVAL;
+		return;
+	}
+
+	/*
+	 * Zeroed allocation: the bytes between the end of the name and the
+	 * 8-byte boundary are copied into the buffer as well and must not
+	 * carry stale heap content to disk or to the network.
+	 */
+	nvh = calloc(1, sizeof(*nvh) + roundup2(namesize, 8));
+	if (nvh == NULL) {
+		if (nv->nv_error == 0)
+			nv->nv_error = ENOMEM;
+		return;
+	}
+	nvh->nvh_type = NV_ORDER_HOST | type;
+	nvh->nvh_namesize = (uint8_t)namesize;
+	nvh->nvh_dsize = (uint32_t)vsize;
+	bcopy(name, nvh->nvh_name, namesize);
+
+	/* Add header first. */
+	error = 0;
+	if (ebuf_add_tail(nv->nv_ebuf, nvh, NVH_HSIZE(nvh)) < 0) {
+		assert(errno != 0);
+		error = errno;
+	}
+	/*
+	 * The ebuf keeps its own copy of whatever is added (callers hand in
+	 * stack variables), so the temporary header is released here on
+	 * success and on failure alike.
+	 */
+	free(nvh);
+	if (error != 0) {
+		if (nv->nv_error == 0)
+			nv->nv_error = error;
+		return;
+	}
+	/* Add the actual data. */
+	if (ebuf_add_tail(nv->nv_ebuf, value, vsize) < 0) {
+		assert(errno != 0);
+		if (nv->nv_error == 0)
+			nv->nv_error = errno;
+		return;
+	}
+	/* Align the data (if needed). */
+	vsize = roundup2(vsize, 8) - vsize;
+	if (vsize == 0)
+		return;
+	assert(vsize > 0 && vsize <= sizeof(align));
+	if (ebuf_add_tail(nv->nv_ebuf, align, vsize) < 0) {
+		assert(errno != 0);
+		if (nv->nv_error == 0)
+			nv->nv_error = errno;
+		return;
+	}
+}
+
+/*
+ * Build the entry name printf-style from namefmt/nameap into a local buffer
+ * and hand everything over to nv_add().  The formatted name has to be
+ * non-empty and fit the buffer; this is asserted.
+ */
+static void
+nv_addv(struct nv *nv, const unsigned char *value, size_t vsize, int type,
+    const char *namefmt, va_list nameap)
+{
+	char formatted[255];
+	size_t len;
+
+	len = vsnprintf(formatted, sizeof(formatted), namefmt, nameap);
+	assert(len > 0 && len < sizeof(formatted));
+
+	nv_add(nv, value, vsize, type, formatted);
+}
+
+/*
+ * Find the entry whose name is built printf-style from namefmt/nameap.
+ * Every entry visited on the way is converted to host byte order in place.
+ *
+ * Returns the header of the first entry with a matching name if its type is
+ * the requested one.  Otherwise returns NULL and sets errno: EINVAL when the
+ * name exists with another type, ENOENT when there is no such name, ENOMEM
+ * when nv is NULL.  EINVAL and ENOENT are also recorded in the nv structure
+ * unless an earlier error is already there.
+ */
+static struct nvhdr *
+nv_find(struct nv *nv, int type, const char *namefmt, va_list nameap)
+{
+	char wanted[255];
+	struct nvhdr *hdr;
+	unsigned char *cur;
+	size_t left, len, step;
+
+	if (nv == NULL) {
+		errno = ENOMEM;
+		return (NULL);
+	}
+
+	NV_CHECK(nv);
+
+	len = vsnprintf(wanted, sizeof(wanted), namefmt, nameap);
+	assert(len > 0 && len < sizeof(wanted));
+
+	for (cur = ebuf_data(nv->nv_ebuf, &left); left > 0;
+	    cur += step, left -= step) {
+		assert(left >= sizeof(*hdr) + 2);
+		hdr = (struct nvhdr *)cur;
+		assert(left >= NVH_SIZE(hdr));
+		nv_swap(hdr, true);
+		step = NVH_SIZE(hdr);
+		if (strcmp(hdr->nvh_name, wanted) != 0)
+			continue;
+		if ((hdr->nvh_type & NV_TYPE_MASK) == type)
+			return (hdr);
+		/* Right name, wrong type. */
+		errno = EINVAL;
+		if (nv->nv_error == 0)
+			nv->nv_error = EINVAL;
+		return (NULL);
+	}
+
+	errno = ENOENT;
+	if (nv->nv_error == 0)
+		nv->nv_error = ENOENT;
+	return (NULL);
+}
+
+/*
+ * Convert a single entry, in place, to host byte order (tohost == true) or
+ * to network byte order (tohost == false).  Network order is little-endian.
+ * The data size field and every 16/32/64-bit element of the data are
+ * converted and the order flag in nvh_type is updated; an entry that is
+ * already in the requested order is left untouched.  8-bit values and
+ * strings need no conversion.
+ *
+ * Elements are moved with memcpy() rather than accessed through cast
+ * pointers: the header is 6 bytes long and padding is applied to sizes, not
+ * to addresses, so the data is in general not aligned for its type.
+ */
+static void
+nv_swap(struct nvhdr *nvh, bool tohost)
+{
+	unsigned char *data, *end, *p;
+	size_t vsize;
+	uint16_t v16;
+	uint32_t v32;
+	uint64_t v64;
+
+	data = NVH_DATA(nvh);
+	if (tohost) {
+		if ((nvh->nvh_type & NV_ORDER_MASK) == NV_ORDER_HOST)
+			return;
+		/* The size has to be converted before it can be used. */
+		nvh->nvh_dsize = le32toh(nvh->nvh_dsize);
+		end = data + nvh->nvh_dsize;
+		nvh->nvh_type &= ~NV_ORDER_MASK;
+		nvh->nvh_type |= NV_ORDER_HOST;
+	} else {
+		if ((nvh->nvh_type & NV_ORDER_MASK) == NV_ORDER_NETWORK)
+			return;
+		/* Use the size while it is still in host order. */
+		end = data + nvh->nvh_dsize;
+		nvh->nvh_dsize = htole32(nvh->nvh_dsize);
+		nvh->nvh_type &= ~NV_ORDER_MASK;
+		nvh->nvh_type |= NV_ORDER_NETWORK;
+	}
+
+	/* Element width of the data, 0 if there is nothing to convert. */
+	vsize = 0;
+
+	switch (nvh->nvh_type & NV_TYPE_MASK) {
+	case NV_TYPE_INT8:
+	case NV_TYPE_UINT8:
+	case NV_TYPE_INT8_ARRAY:
+	case NV_TYPE_UINT8_ARRAY:
+	case NV_TYPE_STRING:
+		break;
+	case NV_TYPE_INT16:
+	case NV_TYPE_UINT16:
+	case NV_TYPE_INT16_ARRAY:
+	case NV_TYPE_UINT16_ARRAY:
+		vsize = 2;
+		break;
+	case NV_TYPE_INT32:
+	case NV_TYPE_UINT32:
+	case NV_TYPE_INT32_ARRAY:
+	case NV_TYPE_UINT32_ARRAY:
+		vsize = 4;
+		break;
+	case NV_TYPE_INT64:
+	case NV_TYPE_UINT64:
+	case NV_TYPE_INT64_ARRAY:
+	case NV_TYPE_UINT64_ARRAY:
+		vsize = 8;
+		break;
+	default:
+		assert(!"unrecognized type");
+	}
+	if (vsize == 0)
+		return;
+
+	for (p = data; p < end; p += vsize) {
+		switch (vsize) {
+		case 2:
+			memcpy(&v16, p, sizeof(v16));
+			v16 = tohost ? le16toh(v16) : htole16(v16);
+			memcpy(p, &v16, sizeof(v16));
+			break;
+		case 4:
+			memcpy(&v32, p, sizeof(v32));
+			v32 = tohost ? le32toh(v32) : htole32(v32);
+			memcpy(p, &v32, sizeof(v32));
+			break;
+		case 8:
+			memcpy(&v64, p, sizeof(v64));
+			v64 = tohost ? le64toh(v64) : htole64(v64);
+			memcpy(p, &v64, sizeof(v64));
+			break;
+		default:
+			assert(!"invalid condition");
+		}
+	}
+}
diff --git a/sbin/hastd/nv.h b/sbin/hastd/nv.h
new file mode 100644
index 0000000..1677548
--- /dev/null
+++ b/sbin/hastd/nv.h
@@ -0,0 +1,158 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NV_H_
+#define _NV_H_
+
+#include <sys/cdefs.h>
+
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <ebuf.h>
+
+/*
+ * Type of the value stored in an entry (low 7 bits of the type byte).
+ */
+#define NV_TYPE_INT8 1
+#define NV_TYPE_UINT8 2
+#define NV_TYPE_INT16 3
+#define NV_TYPE_UINT16 4
+#define NV_TYPE_INT32 5
+#define NV_TYPE_UINT32 6
+#define NV_TYPE_INT64 7
+#define NV_TYPE_UINT64 8
+#define NV_TYPE_INT8_ARRAY 9
+#define NV_TYPE_UINT8_ARRAY 10
+#define NV_TYPE_INT16_ARRAY 11
+#define NV_TYPE_UINT16_ARRAY 12
+#define NV_TYPE_INT32_ARRAY 13
+#define NV_TYPE_UINT32_ARRAY 14
+#define NV_TYPE_INT64_ARRAY 15
+#define NV_TYPE_UINT64_ARRAY 16
+#define NV_TYPE_STRING 17
+
+/* Mask selecting the type and the range of valid types. */
+#define NV_TYPE_MASK 0x7f
+#define NV_TYPE_FIRST NV_TYPE_INT8
+#define NV_TYPE_LAST NV_TYPE_STRING
+
+/*
+ * Byte order the entry is stored in (top bit of the type byte).
+ * Network order is little-endian.
+ */
+#define NV_ORDER_NETWORK 0x00
+#define NV_ORDER_HOST 0x80
+
+#define NV_ORDER_MASK 0x80
+
+/* Opaque container of name/value entries; defined in nv.c. */
+struct nv;
+
+/*
+ * Life cycle and error state.  nv_alloc() returns NULL on allocation
+ * failure; nv_free(), nv_error() and nv_set_error() accept a NULL nv
+ * (the latter two report ENOMEM for it).  nv_validate() returns 0 or -1
+ * with errno set.
+ */
+struct nv *nv_alloc(void);
+void nv_free(struct nv *nv);
+int nv_error(const struct nv *nv);
+int nv_set_error(struct nv *nv, int error);
+int nv_validate(struct nv *nv, size_t *extrap);
+
+/*
+ * Conversion to and from the network representation.  nv_hton() returns
+ * the ebuf owned by nv; nv_ntoh() validates the ebuf and returns NULL with
+ * errno set if it is not valid.
+ */
+struct ebuf *nv_hton(struct nv *nv);
+struct nv *nv_ntoh(struct ebuf *eb);
+
+/*
+ * Adding entries.  Note the argument order: the value comes first, the
+ * printf-style name format last.  Nothing is returned; failures are
+ * recorded in nv and reported by nv_error().
+ */
+void nv_add_int8(struct nv *nv, int8_t value, const char *namefmt, ...)
+ __printflike(3, 4);
+void nv_add_uint8(struct nv *nv, uint8_t value, const char *namefmt, ...)
+ __printflike(3, 4);
+void nv_add_int16(struct nv *nv, int16_t value, const char *namefmt, ...)
+ __printflike(3, 4);
+void nv_add_uint16(struct nv *nv, uint16_t value, const char *namefmt, ...)
+ __printflike(3, 4);
+void nv_add_int32(struct nv *nv, int32_t value, const char *namefmt, ...)
+ __printflike(3, 4);
+void nv_add_uint32(struct nv *nv, uint32_t value, const char *namefmt, ...)
+ __printflike(3, 4);
+void nv_add_int64(struct nv *nv, int64_t value, const char *namefmt, ...)
+ __printflike(3, 4);
+void nv_add_uint64(struct nv *nv, uint64_t value, const char *namefmt, ...)
+ __printflike(3, 4);
+void nv_add_int8_array(struct nv *nv, const int8_t *value, size_t size,
+ const char *namefmt, ...) __printflike(4, 5);
+void nv_add_uint8_array(struct nv *nv, const uint8_t *value, size_t size,
+ const char *namefmt, ...) __printflike(4, 5);
+void nv_add_int16_array(struct nv *nv, const int16_t *value, size_t size,
+ const char *namefmt, ...) __printflike(4, 5);
+void nv_add_uint16_array(struct nv *nv, const uint16_t *value, size_t size,
+ const char *namefmt, ...) __printflike(4, 5);
+void nv_add_int32_array(struct nv *nv, const int32_t *value, size_t size,
+ const char *namefmt, ...) __printflike(4, 5);
+void nv_add_uint32_array(struct nv *nv, const uint32_t *value, size_t size,
+ const char *namefmt, ...) __printflike(4, 5);
+void nv_add_int64_array(struct nv *nv, const int64_t *value, size_t size,
+ const char *namefmt, ...) __printflike(4, 5);
+void nv_add_uint64_array(struct nv *nv, const uint64_t *value, size_t size,
+ const char *namefmt, ...) __printflike(4, 5);
+void nv_add_string(struct nv *nv, const char *value, const char *namefmt, ...)
+ __printflike(3, 4);
+/* For these two the name is fixed and the value is printf-formatted. */
+void nv_add_stringf(struct nv *nv, const char *name, const char *valuefmt, ...)
+ __printflike(3, 4);
+void nv_add_stringv(struct nv *nv, const char *name, const char *valuefmt,
+ va_list valueap) __printflike(3, 0);
+
+/*
+ * Reading entries by printf-style name.  Scalar getters return 0 and
+ * pointer getters return NULL when the lookup fails; the failure is
+ * recorded in nv and reported by nv_error().  Array and string getters
+ * return pointers into the nv buffer, valid until nv_free().
+ */
+int8_t nv_get_int8(struct nv *nv, const char *namefmt, ...)
+ __printflike(2, 3);
+uint8_t nv_get_uint8(struct nv *nv, const char *namefmt, ...)
+ __printflike(2, 3);
+int16_t nv_get_int16(struct nv *nv, const char *namefmt, ...)
+ __printflike(2, 3);
+uint16_t nv_get_uint16(struct nv *nv, const char *namefmt, ...)
+ __printflike(2, 3);
+int32_t nv_get_int32(struct nv *nv, const char *namefmt, ...)
+ __printflike(2, 3);
+uint32_t nv_get_uint32(struct nv *nv, const char *namefmt, ...)
+ __printflike(2, 3);
+int64_t nv_get_int64(struct nv *nv, const char *namefmt, ...)
+ __printflike(2, 3);
+uint64_t nv_get_uint64(struct nv *nv, const char *namefmt, ...)
+ __printflike(2, 3);
+const int8_t *nv_get_int8_array(struct nv *nv, size_t *sizep,
+ const char *namefmt, ...) __printflike(3, 4);
+const uint8_t *nv_get_uint8_array(struct nv *nv, size_t *sizep,
+ const char *namefmt, ...) __printflike(3, 4);
+const int16_t *nv_get_int16_array(struct nv *nv, size_t *sizep,
+ const char *namefmt, ...) __printflike(3, 4);
+const uint16_t *nv_get_uint16_array(struct nv *nv, size_t *sizep,
+ const char *namefmt, ...) __printflike(3, 4);
+const int32_t *nv_get_int32_array(struct nv *nv, size_t *sizep,
+ const char *namefmt, ...) __printflike(3, 4);
+const uint32_t *nv_get_uint32_array(struct nv *nv, size_t *sizep,
+ const char *namefmt, ...) __printflike(3, 4);
+const int64_t *nv_get_int64_array(struct nv *nv, size_t *sizep,
+ const char *namefmt, ...) __printflike(3, 4);
+const uint64_t *nv_get_uint64_array(struct nv *nv, size_t *sizep,
+ const char *namefmt, ...) __printflike(3, 4);
+const char *nv_get_string(struct nv *nv, const char *namefmt, ...)
+ __printflike(2, 3);
+
+/* Print every entry to stdout (debugging aid). */
+void nv_dump(struct nv *nv);
+
+#endif /* !_NV_H_ */
diff --git a/sbin/hastd/parse.y b/sbin/hastd/parse.y
new file mode 100644
index 0000000..6755320
--- /dev/null
+++ b/sbin/hastd/parse.y
@@ -0,0 +1,507 @@
+%{
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h> /* MAXHOSTNAMELEN */
+#include <sys/queue.h>
+#include <sys/sysctl.h>
+
+#include <arpa/inet.h>
+
+#include <assert.h>
+#include <err.h>
+#include <stdio.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include "hast.h"
+
+extern int depth;
+extern int lineno;
+
+extern FILE *yyin;
+extern char *yytext;
+
+static struct hastd_config lconfig;
+static struct hast_resource *curres;
+static bool mynode;
+
+static char depth0_control[HAST_ADDRSIZE];
+static char depth0_listen[HAST_ADDRSIZE];
+static int depth0_replication;
+
+static char depth1_provname[PATH_MAX];
+static char depth1_localpath[PATH_MAX];
+
+/*
+ * Check whether the given node name refers to the local machine.
+ *
+ * The name matches when it is equal to our full hostname, to the first
+ * label of our hostname (the part before the first '.'), or - compared
+ * case-insensitively - to the value of the kern.hostuuid sysctl.
+ * The process exits with EX_OSERR if the hostname or the host UUID cannot
+ * be obtained.
+ */
+static bool
+isitme(const char *name)
+{
+	char buf[MAXHOSTNAMELEN];
+	char *pos;
+	size_t bufsize;
+
+	/*
+	 * First check if the given name matches our full hostname.
+	 */
+	if (gethostname(buf, sizeof(buf)) < 0)
+		err(EX_OSERR, "gethostname() failed");
+	if (strcmp(buf, name) == 0)
+		return (true);
+
+	/*
+	 * Now check if it matches first part of the host name.
+	 * The lengths have to be equal as well, otherwise any name that
+	 * merely starts with our short host name (eg. "foobar" when we are
+	 * "foo.example.org") would be taken for us.
+	 */
+	pos = strchr(buf, '.');
+	if (pos != NULL && pos != buf &&
+	    strlen(name) == (size_t)(pos - buf) &&
+	    strncmp(buf, name, pos - buf) == 0) {
+		return (true);
+	}
+
+	/*
+	 * At the end check if name is equal to our host's UUID.
+	 */
+	bufsize = sizeof(buf);
+	if (sysctlbyname("kern.hostuuid", buf, &bufsize, NULL, 0) < 0)
+		err(EX_OSERR, "sysctlbyname(kern.hostuuid) failed");
+	if (strcasecmp(buf, name) == 0)
+		return (true);
+
+	/*
+	 * Looks like this isn't about us.
+	 */
+	return (false);
+}
+
+/*
+ * Report a parse error: print the given message to stderr together with the
+ * current line number (lineno) and the text of the current token (yytext).
+ */
+void
+yyerror(const char *str)
+{
+
+ fprintf(stderr, "error at line %d near '%s': %s\n",
+ lineno, yytext, str);
+}
+
+/*
+ * Parse the configuration file given by the 'config' path and return a
+ * pointer to the resulting configuration, which is stored in the static
+ * 'lconfig' structure.
+ * The process exits with EX_OSFILE if the file cannot be opened and with
+ * EX_CONFIG (after freeing the resources collected so far) if yyparse()
+ * fails.
+ * Control and listen addresses that are still empty after parsing are filled
+ * from the depth0 (global or default) values; the same is done for the
+ * replication mode of resources that left it unset (-1).
+ */
+struct hastd_config *
+yy_config_parse(const char *config)
+{
+ int ret;
+
+ curres = NULL;
+ mynode = false;
+
+ depth0_replication = HAST_REPLICATION_MEMSYNC;
+ strlcpy(depth0_control, HAST_CONTROL, sizeof(depth0_control));
+ strlcpy(depth0_listen, HASTD_LISTEN, sizeof(depth0_listen));
+
+ TAILQ_INIT(&lconfig.hc_resources);
+
+ yyin = fopen(config, "r");
+ if (yyin == NULL)
+ err(EX_OSFILE, "cannot open configuration file %s", config);
+ ret = yyparse();
+ fclose(yyin);
+ if (ret != 0) {
+ yy_config_free(&lconfig);
+ exit(EX_CONFIG);
+ }
+
+ /*
+ * Let's see if everything is set up.
+ */
+ if (lconfig.hc_controladdr[0] == '\0') {
+ strlcpy(lconfig.hc_controladdr, depth0_control,
+ sizeof(lconfig.hc_controladdr));
+ }
+ if (lconfig.hc_listenaddr[0] == '\0') {
+ strlcpy(lconfig.hc_listenaddr, depth0_listen,
+ sizeof(lconfig.hc_listenaddr));
+ }
+ TAILQ_FOREACH(curres, &lconfig.hc_resources, hr_next) {
+ assert(curres->hr_provname[0] != '\0');
+ assert(curres->hr_localpath[0] != '\0');
+ assert(curres->hr_remoteaddr[0] != '\0');
+
+ if (curres->hr_replication == -1) {
+ /*
+ * Replication is not set at resource-level.
+ * Use global or default setting.
+ */
+ curres->hr_replication = depth0_replication;
+ }
+ }
+
+ return (&lconfig);
+}
+
+/*
+ * Remove every resource from the config->hc_resources list and free it.
+ * The config structure itself is not freed here.
+ */
+void
+yy_config_free(struct hastd_config *config)
+{
+ struct hast_resource *res;
+
+ while ((res = TAILQ_FIRST(&config->hc_resources)) != NULL) {
+ TAILQ_REMOVE(&config->hc_resources, res, hr_next);
+ free(res);
+ }
+}
+%}
+
+%token CONTROL LISTEN PORT REPLICATION EXTENTSIZE RESOURCE NAME LOCAL REMOTE ON
+%token FULLSYNC MEMSYNC ASYNC
+%token NUM STR OB CB
+
+%type <num> replication_type
+
+%union
+{
+ int num;
+ char *str;
+}
+
+%token <num> NUM
+%token <str> STR
+
+%%
+
+statements:
+ |
+ statements statement
+ ;
+
+statement:
+ control_statement
+ |
+ listen_statement
+ |
+ replication_statement
+ |
+ node_statement
+ |
+ resource_statement
+ ;
+
+control_statement: CONTROL STR
+ {
+ switch (depth) {
+ case 0:
+ if (strlcpy(depth0_control, $2,
+ sizeof(depth0_control)) >=
+ sizeof(depth0_control)) {
+ errx(EX_CONFIG, "control argument too long");
+ }
+ break;
+ case 1:
+ if (mynode) {
+ if (strlcpy(lconfig.hc_controladdr, $2,
+ sizeof(lconfig.hc_controladdr)) >=
+ sizeof(lconfig.hc_controladdr)) {
+ errx(EX_CONFIG,
+ "control argument too long");
+ }
+ }
+ break;
+ default:
+ assert(!"control at wrong depth level");
+ }
+ }
+ ;
+
+listen_statement: LISTEN STR
+ {
+ switch (depth) {
+ case 0:
+ if (strlcpy(depth0_listen, $2,
+ sizeof(depth0_listen)) >=
+ sizeof(depth0_listen)) {
+ errx(EX_CONFIG, "listen argument too long");
+ }
+ break;
+ case 1:
+ if (mynode) {
+ if (strlcpy(lconfig.hc_listenaddr, $2,
+ sizeof(lconfig.hc_listenaddr)) >=
+ sizeof(lconfig.hc_listenaddr)) {
+ errx(EX_CONFIG,
+ "listen argument too long");
+ }
+ }
+ break;
+ default:
+ assert(!"listen at wrong depth level");
+ }
+ }
+ ;
+
+replication_statement: REPLICATION replication_type
+ {
+ switch (depth) {
+ case 0:
+ depth0_replication = $2;
+ break;
+ case 1:
+ if (curres != NULL)
+ curres->hr_replication = $2;
+ break;
+ default:
+ assert(!"replication at wrong depth level");
+ }
+ }
+ ;
+
+replication_type:
+ FULLSYNC { $$ = HAST_REPLICATION_FULLSYNC; }
+ |
+ MEMSYNC { $$ = HAST_REPLICATION_MEMSYNC; }
+ |
+ ASYNC { $$ = HAST_REPLICATION_ASYNC; }
+ ;
+
+node_statement: ON node_start OB node_entries CB
+ {
+ mynode = false;
+ }
+ ;
+
+node_start: STR
+ {
+ if (isitme($1))
+ mynode = true;
+ }
+ ;
+
+node_entries:
+ |
+ node_entries node_entry
+ ;
+
+node_entry:
+ control_statement
+ |
+ listen_statement
+ ;
+
+resource_statement: RESOURCE resource_start OB resource_entries CB
+ {
+ if (curres != NULL) {
+ /*
+ * Let's see if there are some resource-level settings
+ * that we can use for node-level settings.
+ */
+ if (curres->hr_provname[0] == '\0' &&
+ depth1_provname[0] != '\0') {
+ /*
+ * Provider name is not set at node-level,
+ * but is set at resource-level, use it.
+ */
+ strlcpy(curres->hr_provname, depth1_provname,
+ sizeof(curres->hr_provname));
+ }
+ if (curres->hr_localpath[0] == '\0' &&
+ depth1_localpath[0] != '\0') {
+ /*
+ * Path to local provider is not set at
+ * node-level, but is set at resource-level,
+ * use it.
+ */
+ strlcpy(curres->hr_localpath, depth1_localpath,
+ sizeof(curres->hr_localpath));
+ }
+
+ /*
+ * If provider name is not given, use resource name
+ * as provider name.
+ */
+ if (curres->hr_provname[0] == '\0') {
+ strlcpy(curres->hr_provname, curres->hr_name,
+ sizeof(curres->hr_provname));
+ }
+
+ /*
+ * Remote address has to be configured at this point.
+ */
+ if (curres->hr_remoteaddr[0] == '\0') {
+ errx(EX_CONFIG,
+ "remote address not configured for resource %s",
+ curres->hr_name);
+ }
+ /*
+ * Path to local provider has to be configured at this
+ * point.
+ */
+ if (curres->hr_localpath[0] == '\0') {
+ errx(EX_CONFIG,
+ "path local component not configured for resource %s",
+ curres->hr_name);
+ }
+
+ /* Put it onto resource list. */
+ TAILQ_INSERT_TAIL(&lconfig.hc_resources, curres, hr_next);
+ curres = NULL;
+ }
+ }
+ ;
+
+resource_start: STR
+ {
+ /*
+ * Clear those, so we can tell if they were set at
+ * resource-level or not.
+ */
+ depth1_provname[0] = '\0';
+ depth1_localpath[0] = '\0';
+
+ curres = calloc(1, sizeof(*curres));
+ if (curres == NULL) {
+ errx(EX_TEMPFAIL,
+ "cannot allocate memory for resource");
+ }
+ if (strlcpy(curres->hr_name, $1,
+ sizeof(curres->hr_name)) >=
+ sizeof(curres->hr_name)) {
+ errx(EX_CONFIG,
+ "resource name (%s) too long", $1);
+ }
+ curres->hr_role = HAST_ROLE_INIT;
+ curres->hr_previous_role = HAST_ROLE_INIT;
+ curres->hr_replication = -1;
+ curres->hr_provname[0] = '\0';
+ curres->hr_localpath[0] = '\0';
+ curres->hr_localfd = -1;
+ curres->hr_remoteaddr[0] = '\0';
+ curres->hr_ggateunit = -1;
+ }
+ ;
+
+resource_entries:
+ |
+ resource_entries resource_entry
+ ;
+
+resource_entry:
+ replication_statement
+ |
+ name_statement
+ |
+ local_statement
+ |
+ resource_node_statement
+ ;
+
+name_statement: NAME STR
+ {
+ switch (depth) {
+ case 1:
+ if (strlcpy(depth1_provname, $2,
+ sizeof(depth1_provname)) >=
+ sizeof(depth1_provname)) {
+ errx(EX_CONFIG, "name argument too long");
+ }
+ break;
+ case 2:
+ if (mynode) {
+ assert(curres != NULL);
+ if (strlcpy(curres->hr_provname, $2,
+ sizeof(curres->hr_provname)) >=
+ sizeof(curres->hr_provname)) {
+ errx(EX_CONFIG,
+ "name argument too long");
+ }
+ }
+ break;
+ default:
+ assert(!"name at wrong depth level");
+ }
+ }
+ ;
+
+local_statement: LOCAL STR
+ {
+ switch (depth) {
+ case 1:
+ if (strlcpy(depth1_localpath, $2,
+ sizeof(depth1_localpath)) >=
+ sizeof(depth1_localpath)) {
+ errx(EX_CONFIG, "local argument too long");
+ }
+ break;
+ case 2:
+ if (mynode) {
+ assert(curres != NULL);
+ if (strlcpy(curres->hr_localpath, $2,
+ sizeof(curres->hr_localpath)) >=
+ sizeof(curres->hr_localpath)) {
+ errx(EX_CONFIG,
+ "local argument too long");
+ }
+ }
+ break;
+ default:
+ assert(!"local at wrong depth level");
+ }
+ }
+ ;
+
+resource_node_statement:ON resource_node_start OB resource_node_entries CB
+ {
+ mynode = false;
+ }
+ ;
+
+resource_node_start: STR
+ {
+ if (curres != NULL && isitme($1))
+ mynode = true;
+ }
+ ;
+
+resource_node_entries:
+ |
+ resource_node_entries resource_node_entry
+ ;
+
+resource_node_entry:
+ name_statement
+ |
+ local_statement
+ |
+ remote_statement
+ ;
+
+remote_statement: REMOTE STR
+ {
+ assert(depth == 2);
+ if (mynode) {
+ assert(curres != NULL);
+ if (strlcpy(curres->hr_remoteaddr, $2,
+ sizeof(curres->hr_remoteaddr)) >=
+ sizeof(curres->hr_remoteaddr)) {
+ errx(EX_CONFIG, "remote argument too long");
+ }
+ }
+ }
+ ;
diff --git a/sbin/hastd/pjdlog.c b/sbin/hastd/pjdlog.c
new file mode 100644
index 0000000..38c5539
--- /dev/null
+++ b/sbin/hastd/pjdlog.c
@@ -0,0 +1,367 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <assert.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syslog.h>
+
+#include "pjdlog.h"
+
+static int pjdlog_mode = PJDLOG_MODE_STD;
+static int pjdlog_debug_level = 0;
+static char pjdlog_prefix[128];
+
+/*
+ * Configure where the logs should go.
+ * By default they are sent to stdout/stderr, but after going into background
+ * (eg. by calling daemon(3)) application is responsible for changing mode to
+ * PJDLOG_MODE_SYSLOG, so logs will be sent to syslog.
+ * The mode has to be either PJDLOG_MODE_STD or PJDLOG_MODE_SYSLOG.
+ */
+void
+pjdlog_mode_set(int mode)
+{
+
+ assert(mode == PJDLOG_MODE_STD || mode == PJDLOG_MODE_SYSLOG);
+
+ pjdlog_mode = mode;
+}
+
+/*
+ * Return current mode, which is PJDLOG_MODE_STD unless it was changed with
+ * pjdlog_mode_set().
+ */
+int
+pjdlog_mode_get(void)
+{
+
+ return (pjdlog_mode);
+}
+
+/*
+ * Set debug level. All the debug logs with debug level above the level
+ * specified here will be ignored.
+ * The level must not be negative.
+ */
+void
+pjdlog_debug_set(int level)
+{
+
+ assert(level >= 0);
+
+ pjdlog_debug_level = level;
+}
+
+/*
+ * Return current debug level, which is 0 unless it was changed with
+ * pjdlog_debug_set().
+ */
+int
+pjdlog_debug_get(void)
+{
+
+ return (pjdlog_debug_level);
+}
+
+/*
+ * Set prefix that will be used before each log.
+ * The prefix is built from the printf(3)-like format and its arguments.
+ * The format must not be NULL (it is asserted in pjdlog_prefix_setv());
+ * to remove the prefix set it to an empty string, eg.
+ * pjdlog_prefix_set("%s", "").
+ */
+void
+pjdlog_prefix_set(const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ pjdlog_prefix_setv(fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * Set prefix that will be used before each log.
+ * The format must not be NULL; to remove the prefix format an empty string.
+ * The result is stored in the static pjdlog_prefix buffer and is truncated
+ * by vsnprintf(3) if it does not fit.
+ */
+void
+pjdlog_prefix_setv(const char *fmt, va_list ap)
+{
+
+ assert(fmt != NULL);
+
+ vsnprintf(pjdlog_prefix, sizeof(pjdlog_prefix), fmt, ap);
+}
+
+/*
+ * Convert log level into string.
+ * The returned string is a constant literal. Any value other than the eight
+ * LOG_* priorities handled below trips the assertion (or abort(3) when
+ * assertions are compiled out).
+ */
+static const char *
+pjdlog_level_string(int loglevel)
+{
+
+ switch (loglevel) {
+ case LOG_EMERG:
+ return ("EMERG");
+ case LOG_ALERT:
+ return ("ALERT");
+ case LOG_CRIT:
+ return ("CRIT");
+ case LOG_ERR:
+ return ("ERROR");
+ case LOG_WARNING:
+ return ("WARNING");
+ case LOG_NOTICE:
+ return ("NOTICE");
+ case LOG_INFO:
+ return ("INFO");
+ case LOG_DEBUG:
+ return ("DEBUG");
+ }
+ assert(!"Invalid log level.");
+ abort(); /* XXX: gcc */
+}
+
+/*
+ * Common log routine.
+ * Variadic front-end for pjdlogv_common(): collects the arguments following
+ * the printf(3)-like format and passes everything through unchanged.
+ */
+void
+pjdlog_common(int loglevel, int debuglevel, int error, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ pjdlogv_common(loglevel, debuglevel, error, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * Common log routine, which can handle regular log level as well as debug
+ * level. We decide here where to send the logs (stdout/stderr or syslog).
+ *
+ * loglevel   - one of the syslog(3) LOG_* priorities,
+ * debuglevel - debug level of the message; has to be greater than 0 for
+ *              LOG_DEBUG logs, which are dropped when it is above the
+ *              configured debug level,
+ * error      - errno value whose strerror(3) description is appended to the
+ *              message, or -1 if there is nothing to append,
+ * fmt, ap    - printf(3)-like format and its arguments.
+ */
+void
+pjdlogv_common(int loglevel, int debuglevel, int error, const char *fmt,
+    va_list ap)
+{
+
+	assert(loglevel == LOG_EMERG || loglevel == LOG_ALERT ||
+	    loglevel == LOG_CRIT || loglevel == LOG_ERR ||
+	    loglevel == LOG_WARNING || loglevel == LOG_NOTICE ||
+	    loglevel == LOG_INFO || loglevel == LOG_DEBUG);
+	assert(loglevel != LOG_DEBUG || debuglevel > 0);
+	assert(error >= -1);
+
+	/* Ignore debug above configured level. */
+	if (loglevel == LOG_DEBUG && debuglevel > pjdlog_debug_level)
+		return;
+
+	switch (pjdlog_mode) {
+	case PJDLOG_MODE_STD:
+	    {
+		FILE *out;
+
+		/*
+		 * We send errors and warning to stderr and the rest to stdout.
+		 */
+		switch (loglevel) {
+		case LOG_EMERG:
+		case LOG_ALERT:
+		case LOG_CRIT:
+		case LOG_ERR:
+		case LOG_WARNING:
+			out = stderr;
+			break;
+		case LOG_NOTICE:
+		case LOG_INFO:
+		case LOG_DEBUG:
+			out = stdout;
+			break;
+		default:
+			assert(!"Invalid loglevel.");
+			abort();	/* XXX: gcc */
+		}
+
+		fprintf(out, "[%s]", pjdlog_level_string(loglevel));
+		/* Attach debuglevel if this is debug log. */
+		if (loglevel == LOG_DEBUG)
+			fprintf(out, "[%d]", debuglevel);
+		fprintf(out, " ");
+		fprintf(out, "%s", pjdlog_prefix);
+		vfprintf(out, fmt, ap);
+		if (error != -1)
+			fprintf(out, ": %s.", strerror(error));
+		fprintf(out, "\n");
+		break;
+	    }
+	case PJDLOG_MODE_SYSLOG:
+	    {
+		char log[1024];
+		int len;
+
+		len = snprintf(log, sizeof(log), "%s", pjdlog_prefix);
+		/*
+		 * Accumulate the length (prefix plus message), so that the
+		 * error description below is appended after the message
+		 * instead of being written on top of its tail.
+		 */
+		if ((size_t)len < sizeof(log))
+			len += vsnprintf(log + len, sizeof(log) - len, fmt, ap);
+		if (error != -1 && (size_t)len < sizeof(log)) {
+			(void)snprintf(log + len, sizeof(log) - len, ": %s.",
+			    strerror(error));
+		}
+		syslog(loglevel, "%s", log);
+		break;
+	    }
+	default:
+		assert(!"Invalid mode.");
+	}
+}
+
+/*
+ * Regular logs.
+ * Log the message at the given level, which can be any syslog(3) priority
+ * except LOG_DEBUG. No errno description is appended.
+ */
+void
+pjdlogv(int loglevel, const char *fmt, va_list ap)
+{
+
+ /* LOG_DEBUG is invalid here, pjdlogv?_debug() should be used. */
+ assert(loglevel == LOG_EMERG || loglevel == LOG_ALERT ||
+ loglevel == LOG_CRIT || loglevel == LOG_ERR ||
+ loglevel == LOG_WARNING || loglevel == LOG_NOTICE ||
+ loglevel == LOG_INFO);
+
+ pjdlogv_common(loglevel, 0, -1, fmt, ap);
+}
+
+/*
+ * Regular logs.
+ * Variadic front-end for pjdlogv(); LOG_DEBUG is not a valid level here.
+ */
+void
+pjdlog(int loglevel, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ pjdlogv(loglevel, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * Debug logs.
+ * The message is logged at LOG_DEBUG level without errno description.
+ * The debuglevel has to be greater than 0; messages with debuglevel above
+ * the configured debug level are dropped by pjdlogv_common().
+ */
+void
+pjdlogv_debug(int debuglevel, const char *fmt, va_list ap)
+{
+
+ pjdlogv_common(LOG_DEBUG, debuglevel, -1, fmt, ap);
+}
+
+/*
+ * Debug logs.
+ * Variadic front-end for pjdlogv_debug().
+ */
+void
+pjdlog_debug(int debuglevel, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ pjdlogv_debug(debuglevel, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * Error logs with errno logging.
+ * The current value of errno is passed to pjdlogv_common(), which appends
+ * its strerror(3) description to the message. The debug level passed is 0,
+ * so LOG_DEBUG cannot be used as loglevel here.
+ */
+void
+pjdlogv_errno(int loglevel, const char *fmt, va_list ap)
+{
+
+ pjdlogv_common(loglevel, 0, errno, fmt, ap);
+}
+
+/*
+ * Error logs with errno logging.
+ * Variadic front-end for pjdlogv_errno().
+ */
+void
+pjdlog_errno(int loglevel, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ pjdlogv_errno(loglevel, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * Log error, errno and exit.
+ * The message is logged at LOG_ERR level and the process terminates with
+ * the given exit code; this function does not return.
+ */
+void
+pjdlogv_exit(int exitcode, const char *fmt, va_list ap)
+{
+
+ pjdlogv_errno(LOG_ERR, fmt, ap);
+ exit(exitcode);
+}
+
+/*
+ * Log error, errno and exit.
+ * Variadic front-end for pjdlogv_exit(); this function does not return.
+ */
+void
+pjdlog_exit(int exitcode, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ pjdlogv_exit(exitcode, fmt, ap);
+ /* NOTREACHED */
+ va_end(ap);
+}
+
+/*
+ * Log error and exit.
+ * The message is logged at LOG_ERR level without errno description and the
+ * process terminates with the given exit code; this function does not return.
+ */
+void
+pjdlogv_exitx(int exitcode, const char *fmt, va_list ap)
+{
+
+ pjdlogv(LOG_ERR, fmt, ap);
+ exit(exitcode);
+}
+
+/*
+ * Log error and exit.
+ * Variadic front-end for pjdlogv_exitx(); this function does not return.
+ */
+void
+pjdlog_exitx(int exitcode, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ pjdlogv_exitx(exitcode, fmt, ap);
+ /* NOTREACHED */
+ va_end(ap);
+}
diff --git a/sbin/hastd/pjdlog.h b/sbin/hastd/pjdlog.h
new file mode 100644
index 0000000..2136b12
--- /dev/null
+++ b/sbin/hastd/pjdlog.h
@@ -0,0 +1,88 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _PJDLOG_H_
+#define _PJDLOG_H_
+
+#include <sys/cdefs.h>
+
+#include <stdarg.h>
+#include <sysexits.h>
+#include <syslog.h>
+
+#define PJDLOG_MODE_STD 0
+#define PJDLOG_MODE_SYSLOG 1
+
+void pjdlog_mode_set(int mode);
+int pjdlog_mode_get(void);
+
+void pjdlog_debug_set(int level);
+int pjdlog_debug_get(void);
+
+void pjdlog_prefix_set(const char *fmt, ...) __printflike(1, 2);
+void pjdlog_prefix_setv(const char *fmt, va_list ap) __printflike(1, 0);
+
+void pjdlog_common(int loglevel, int debuglevel, int error, const char *fmt,
+ ...) __printflike(4, 5);
+void pjdlogv_common(int loglevel, int debuglevel, int error, const char *fmt,
+ va_list ap) __printflike(4, 0);
+
+void pjdlog(int loglevel, const char *fmt, ...) __printflike(2, 3);
+void pjdlogv(int loglevel, const char *fmt, va_list ap) __printflike(2, 0);
+
+#define pjdlogv_emergency(fmt, ap) pjdlogv(LOG_EMERG, (fmt), (ap))
+#define pjdlog_emergency(...) pjdlog(LOG_EMERG, __VA_ARGS__)
+#define pjdlogv_alert(fmt, ap) pjdlogv(LOG_ALERT, (fmt), (ap))
+#define pjdlog_alert(...) pjdlog(LOG_ALERT, __VA_ARGS__)
+#define pjdlogv_critical(fmt, ap) pjdlogv(LOG_CRIT, (fmt), (ap))
+#define pjdlog_critical(...) pjdlog(LOG_CRIT, __VA_ARGS__)
+#define pjdlogv_error(fmt, ap) pjdlogv(LOG_ERR, (fmt), (ap))
+#define pjdlog_error(...) pjdlog(LOG_ERR, __VA_ARGS__)
+#define pjdlogv_warning(fmt, ap) pjdlogv(LOG_WARNING, (fmt), (ap))
+#define pjdlog_warning(...) pjdlog(LOG_WARNING, __VA_ARGS__)
+#define pjdlogv_notice(fmt, ap) pjdlogv(LOG_NOTICE, (fmt), (ap))
+#define pjdlog_notice(...) pjdlog(LOG_NOTICE, __VA_ARGS__)
+#define pjdlogv_info(fmt, ap) pjdlogv(LOG_INFO, (fmt), (ap))
+#define pjdlog_info(...) pjdlog(LOG_INFO, __VA_ARGS__)
+
+void pjdlog_debug(int debuglevel, const char *fmt, ...) __printflike(2, 3);
+void pjdlogv_debug(int debuglevel, const char *fmt, va_list ap) __printflike(2, 0);
+
+void pjdlog_errno(int loglevel, const char *fmt, ...) __printflike(2, 3);
+void pjdlogv_errno(int loglevel, const char *fmt, va_list ap) __printflike(2, 0);
+
+void pjdlog_exit(int exitcode, const char *fmt, ...) __printflike(2, 3) __dead2;
+void pjdlogv_exit(int exitcode, const char *fmt, va_list ap) __printflike(2, 0) __dead2;
+
+void pjdlog_exitx(int exitcode, const char *fmt, ...) __printflike(2, 3) __dead2;
+void pjdlogv_exitx(int exitcode, const char *fmt, va_list ap) __printflike(2, 0) __dead2;
+
+#endif /* !_PJDLOG_H_ */
diff --git a/sbin/hastd/primary.c b/sbin/hastd/primary.c
new file mode 100644
index 0000000..ed6e91c
--- /dev/null
+++ b/sbin/hastd/primary.c
@@ -0,0 +1,1769 @@
+/*-
+ * Copyright (c) 2009 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/bio.h>
+#include <sys/disk.h>
+#include <sys/refcount.h>
+#include <sys/stat.h>
+
+#include <geom/gate/g_gate.h>
+
+#include <assert.h>
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libgeom.h>
+#include <pthread.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include <activemap.h>
+#include <nv.h>
+#include <rangelock.h>
+
+#include "control.h"
+#include "hast.h"
+#include "hast_proto.h"
+#include "hastd.h"
+#include "metadata.h"
+#include "proto.h"
+#include "pjdlog.h"
+#include "subr.h"
+#include "synch.h"
+
+struct hio {
+ /*
+ * Number of components we are still waiting for.
+ * When this field goes to 0, we can send the request back to the
+ * kernel. Each component has to decrease this counter by one
+ * even on failure.
+ */
+ unsigned int hio_countdown;
+ /*
+ * Each component has a place to store its own error.
+ * Once the request is handled by all components we can decide if the
+ * request overall is successful or not.
+ */
+ int *hio_errors;
+ /*
+ * Structure used to communicate with GEOM Gate class.
+ */
+ struct g_gate_ctl_io hio_ggio;
+ /*
+ * List linkage entries, indexed by component number by the
+ * per-component queue macros. Entry 0 is also used for the free and
+ * done lists (see hio_free_next and hio_done_next below).
+ */
+ TAILQ_ENTRY(hio) *hio_next;
+};
+#define hio_free_next hio_next[0]
+#define hio_done_next hio_next[0]
+
+/*
+ * Free list holds unused structures. When free list is empty, we have to wait
+ * until some in-progress requests are freed.
+ */
+static TAILQ_HEAD(, hio) hio_free_list;
+static pthread_mutex_t hio_free_list_lock;
+static pthread_cond_t hio_free_list_cond;
+/*
+ * There is one send list for every component. One request is placed on all
+ * send lists - each component gets the same request, but each component is
+ * responsible for managing its own send list.
+ */
+static TAILQ_HEAD(, hio) *hio_send_list;
+static pthread_mutex_t *hio_send_list_lock;
+static pthread_cond_t *hio_send_list_cond;
+/*
+ * There is one recv list for every component, although local components don't
+ * use recv lists as local requests are done synchronously.
+ */
+static TAILQ_HEAD(, hio) *hio_recv_list;
+static pthread_mutex_t *hio_recv_list_lock;
+static pthread_cond_t *hio_recv_list_cond;
+/*
+ * Request is placed on done list by the slowest component (the one that
+ * decreased hio_countdown from 1 to 0).
+ */
+static TAILQ_HEAD(, hio) hio_done_list;
+static pthread_mutex_t hio_done_list_lock;
+static pthread_cond_t hio_done_list_cond;
+/*
+ * Structures below are for interaction with sync thread.
+ */
+static bool sync_inprogress;
+static pthread_mutex_t sync_lock;
+static pthread_cond_t sync_cond;
+/*
+ * The lock below is used to synchronize access to remote connections.
+ */
+static pthread_rwlock_t *hio_remote_lock;
+static pthread_mutex_t hio_guard_lock;
+static pthread_cond_t hio_guard_cond;
+
+/*
+ * Lock to synchronize metadata updates. Also synchronize access to
+ * hr_primary_localcnt and hr_primary_remotecnt fields.
+ */
+static pthread_mutex_t metadata_lock;
+
+/*
+ * Maximum number of outstanding I/O requests.
+ */
+#define HAST_HIO_MAX 256
+/*
+ * Number of components. At this point there are only two components: local
+ * and remote, but in the future it might be possible to use multiple local
+ * and remote components.
+ */
+#define HAST_NCOMPONENTS 2
+/*
+ * Number of seconds to sleep before next reconnect try.
+ */
+#define RECONNECT_SLEEP 5
+
+#define ISCONNECTED(res, no) \
+ ((res)->hr_remotein != NULL && (res)->hr_remoteout != NULL)
+
+/*
+ * Append hio to the tail of the per-component list hio_<name>_list[ncomp],
+ * linking it through hio_next[ncomp], with the list's lock held.
+ * The list's condition variable is signalled (after dropping the lock) only
+ * if the list was empty before the insertion.
+ */
+#define	QUEUE_INSERT1(hio, name, ncomp)	do {				\
+	bool _wakeup;							\
+									\
+	mtx_lock(&hio_##name##_list_lock[(ncomp)]);			\
+	_wakeup = TAILQ_EMPTY(&hio_##name##_list[(ncomp)]);		\
+	TAILQ_INSERT_TAIL(&hio_##name##_list[(ncomp)], (hio),		\
+	    hio_next[(ncomp)]);						\
+	mtx_unlock(&hio_##name##_list_lock[(ncomp)]);			\
+	if (_wakeup)							\
+		cv_signal(&hio_##name##_list_cond[(ncomp)]);		\
+} while (0)
+/*
+ * Append hio to the tail of the single list hio_<name>_list, linking it
+ * through hio_<name>_next, with the list's lock held.
+ * The list's condition variable is signalled (after dropping the lock) only
+ * if the list was empty before the insertion.
+ */
+#define QUEUE_INSERT2(hio, name) do { \
+ bool _wakeup; \
+ \
+ mtx_lock(&hio_##name##_list_lock); \
+ _wakeup = TAILQ_EMPTY(&hio_##name##_list); \
+ TAILQ_INSERT_TAIL(&hio_##name##_list, (hio), hio_##name##_next);\
+ mtx_unlock(&hio_##name##_list_lock); \
+ if (_wakeup) \
+ cv_signal(&hio_##name##_list_cond); \
+} while (0)
+/*
+ * Remove the first element of the per-component list hio_<name>_list[ncomp]
+ * and store it in hio. If the list is empty, wait on the list's condition
+ * variable until an element shows up.
+ */
+#define QUEUE_TAKE1(hio, name, ncomp) do { \
+ mtx_lock(&hio_##name##_list_lock[(ncomp)]); \
+ while (((hio) = TAILQ_FIRST(&hio_##name##_list[(ncomp)])) == NULL) { \
+ cv_wait(&hio_##name##_list_cond[(ncomp)], \
+ &hio_##name##_list_lock[(ncomp)]); \
+ } \
+ TAILQ_REMOVE(&hio_##name##_list[(ncomp)], (hio), \
+ hio_next[(ncomp)]); \
+ mtx_unlock(&hio_##name##_list_lock[(ncomp)]); \
+} while (0)
+/*
+ * Remove the first element of the single list hio_<name>_list and store it
+ * in hio. If the list is empty, wait on the list's condition variable until
+ * an element shows up.
+ */
+#define QUEUE_TAKE2(hio, name) do { \
+ mtx_lock(&hio_##name##_list_lock); \
+ while (((hio) = TAILQ_FIRST(&hio_##name##_list)) == NULL) { \
+ cv_wait(&hio_##name##_list_cond, \
+ &hio_##name##_list_lock); \
+ } \
+ TAILQ_REMOVE(&hio_##name##_list, (hio), hio_##name##_next); \
+ mtx_unlock(&hio_##name##_list_lock); \
+} while (0)
+
+#define SYNCREQ(hio) do { (hio)->hio_ggio.gctl_unit = -1; } while (0)
+#define ISSYNCREQ(hio) ((hio)->hio_ggio.gctl_unit == -1)
+#define SYNCREQDONE(hio) do { (hio)->hio_ggio.gctl_unit = -2; } while (0)
+#define ISSYNCREQDONE(hio) ((hio)->hio_ggio.gctl_unit == -2)
+
+static struct hast_resource *gres;
+
+static pthread_mutex_t range_lock;
+static struct rangelocks *range_regular;
+static bool range_regular_wait;
+static pthread_cond_t range_regular_cond;
+static struct rangelocks *range_sync;
+static bool range_sync_wait;
+static pthread_cond_t range_sync_cond;
+
+static void *ggate_recv_thread(void *arg);
+static void *local_send_thread(void *arg);
+static void *remote_send_thread(void *arg);
+static void *remote_recv_thread(void *arg);
+static void *ggate_send_thread(void *arg);
+static void *sync_thread(void *arg);
+static void *guard_thread(void *arg);
+
+static void sighandler(int sig);
+
+/*
+ * Release what the primary worker holds for the given resource before
+ * exiting: close res->hr_localfd and, if a ggate unit was created
+ * (hr_ggateunit >= 0), ask GEOM Gate to forcibly destroy it and mark the
+ * unit as gone (-1).
+ * The value of errno seen by the caller is preserved.
+ */
+static void
+cleanup(struct hast_resource *res)
+{
+	int rerrno;
+
+	/* Remember errno. */
+	rerrno = errno;
+
+	/*
+	 * Close descriptor to /dev/hast/<name>
+	 * to work-around race in the kernel.
+	 */
+	close(res->hr_localfd);
+
+	/* Destroy ggate provider if we created one. */
+	if (res->hr_ggateunit >= 0) {
+		struct g_gate_ctl_destroy ggiod;
+
+		ggiod.gctl_version = G_GATE_VERSION;
+		ggiod.gctl_unit = res->hr_ggateunit;
+		ggiod.gctl_force = 1;
+		if (ioctl(res->hr_ggatefd, G_GATE_CMD_DESTROY, &ggiod) < 0) {
+			/* Include the reason (errno) of the failure. */
+			pjdlog_errno(LOG_WARNING,
+			    "Unable to destroy hast/%s device",
+			    res->hr_provname);
+		}
+		res->hr_ggateunit = -1;
+	}
+
+	/* Restore errno. */
+	errno = rerrno;
+}
+
+/*
+ * Log the printf(3)-like message together with errno description at LOG_ERR
+ * level, clean up the global resource (gres) and exit with the given exit
+ * code, which must not be EX_OK.
+ */
+static void
+primary_exit(int exitcode, const char *fmt, ...)
+{
+ va_list ap;
+
+ assert(exitcode != EX_OK);
+ va_start(ap, fmt);
+ pjdlogv_errno(LOG_ERR, fmt, ap);
+ va_end(ap);
+ cleanup(gres);
+ exit(exitcode);
+}
+
+/*
+ * Log the printf(3)-like message without errno description (at LOG_INFO
+ * level when exitcode is EX_OK, at LOG_ERR level otherwise), clean up the
+ * global resource (gres) and exit with the given exit code.
+ */
+static void
+primary_exitx(int exitcode, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ pjdlogv(exitcode == EX_OK ? LOG_INFO : LOG_ERR, fmt, ap);
+ va_end(ap);
+ cleanup(gres);
+ exit(exitcode);
+}
+
+/*
+ * Write the bitmap obtained from activemap_bitmap() to the local descriptor
+ * (hr_localfd) at offset METADATA_SIZE. The bitmap size is expected to be
+ * a multiple of the local sector size.
+ * Returns 0 on success and -1 if pwrite(2) fails or writes less than
+ * requested; the failure is logged (KEEP_ERRNO() presumably preserves errno
+ * across the logging call - confirm against its definition).
+ */
+static int
+hast_activemap_flush(struct hast_resource *res)
+{
+ const unsigned char *buf;
+ size_t size;
+
+ buf = activemap_bitmap(res->hr_amp, &size);
+ assert(buf != NULL);
+ assert((size % res->hr_local_sectorsize) == 0);
+ if (pwrite(res->hr_localfd, buf, size, METADATA_SIZE) !=
+ (ssize_t)size) {
+ KEEP_ERRNO(pjdlog_errno(LOG_ERR,
+ "Unable to flush activemap to disk"));
+ return (-1);
+ }
+ return (0);
+}
+
+/*
+ * Allocate size bytes on behalf of init_environment().
+ *
+ * On failure the process is terminated through primary_exitx() with
+ * EX_TEMPFAIL; "what" names the purpose of the allocation in the message.
+ * Taking the size as size_t keeps the argument matched to the %zu
+ * conversion even when the caller passes an int constant such as MAXPHYS.
+ */
+static void *
+init_environment_alloc(size_t size, const char *what)
+{
+	void *ptr;
+
+	ptr = malloc(size);
+	if (ptr == NULL) {
+		primary_exitx(EX_TEMPFAIL,
+		    "Unable to allocate %zu bytes of memory for %s.",
+		    size, what);
+	}
+	return (ptr);
+}
+
+/*
+ * Set up the process-wide state used by the primary worker threads:
+ * per-component send/recv queues with their locks and condition variables,
+ * the free and done queues, the guard and metadata locks, a pool of
+ * HAST_HIO_MAX preallocated requests (each with a MAXPHYS data buffer) and
+ * the SIGINT/SIGTERM handlers.
+ *
+ * The resource argument is currently unused.  Any allocation failure
+ * terminates the process with EX_TEMPFAIL.
+ */
+static void
+init_environment(struct hast_resource *res __unused)
+{
+	struct hio *hio;
+	unsigned int ii, ncomps;
+
+	/*
+	 * In the future it might be per-resource value.
+	 */
+	ncomps = HAST_NCOMPONENTS;
+
+	/*
+	 * Allocate memory needed by lists.
+	 */
+	hio_send_list = init_environment_alloc(
+	    sizeof(hio_send_list[0]) * ncomps, "send lists");
+	hio_send_list_lock = init_environment_alloc(
+	    sizeof(hio_send_list_lock[0]) * ncomps, "send list locks");
+	hio_send_list_cond = init_environment_alloc(
+	    sizeof(hio_send_list_cond[0]) * ncomps,
+	    "send list condition variables");
+	hio_recv_list = init_environment_alloc(
+	    sizeof(hio_recv_list[0]) * ncomps, "recv lists");
+	hio_recv_list_lock = init_environment_alloc(
+	    sizeof(hio_recv_list_lock[0]) * ncomps, "recv list locks");
+	hio_recv_list_cond = init_environment_alloc(
+	    sizeof(hio_recv_list_cond[0]) * ncomps,
+	    "recv list condition variables");
+	hio_remote_lock = init_environment_alloc(
+	    sizeof(hio_remote_lock[0]) * ncomps,
+	    "remote connections locks");
+
+	/*
+	 * Initialize lists, their locks and theirs condition variables.
+	 */
+	TAILQ_INIT(&hio_free_list);
+	mtx_init(&hio_free_list_lock);
+	cv_init(&hio_free_list_cond);
+	/* Iterate over exactly as many components as were allocated above. */
+	for (ii = 0; ii < ncomps; ii++) {
+		TAILQ_INIT(&hio_send_list[ii]);
+		mtx_init(&hio_send_list_lock[ii]);
+		cv_init(&hio_send_list_cond[ii]);
+		TAILQ_INIT(&hio_recv_list[ii]);
+		mtx_init(&hio_recv_list_lock[ii]);
+		cv_init(&hio_recv_list_cond[ii]);
+		rw_init(&hio_remote_lock[ii]);
+	}
+	TAILQ_INIT(&hio_done_list);
+	mtx_init(&hio_done_list_lock);
+	cv_init(&hio_done_list_cond);
+	mtx_init(&hio_guard_lock);
+	cv_init(&hio_guard_cond);
+	mtx_init(&metadata_lock);
+
+	/*
+	 * Allocate requests pool and initialize requests.
+	 */
+	for (ii = 0; ii < HAST_HIO_MAX; ii++) {
+		hio = init_environment_alloc(sizeof(*hio), "hio request");
+		hio->hio_countdown = 0;
+		hio->hio_errors = init_environment_alloc(
+		    sizeof(hio->hio_errors[0]) * ncomps, "hio errors");
+		hio->hio_next = init_environment_alloc(
+		    sizeof(hio->hio_next[0]) * ncomps, "hio_next field");
+		hio->hio_ggio.gctl_version = G_GATE_VERSION;
+		hio->hio_ggio.gctl_data = init_environment_alloc(MAXPHYS,
+		    "gctl_data");
+		hio->hio_ggio.gctl_length = MAXPHYS;
+		hio->hio_ggio.gctl_error = 0;
+		TAILQ_INSERT_HEAD(&hio_free_list, hio, hio_free_next);
+	}
+
+	/*
+	 * Turn on signals handling.
+	 */
+	signal(SIGINT, sighandler);
+	signal(SIGTERM, sighandler);
+}
+
+/*
+ * Prepare the local component of the resource.
+ *
+ * Reads the metadata, creates the activemap and both range locks, and loads
+ * the on-disk activemap (stored at offset METADATA_SIZE) into memory.  If
+ * the resource has no unique identifier yet (hr_resuid == 0), one is
+ * generated, the primary local/remote counts are initialized to 1/0 and the
+ * metadata is written back.
+ *
+ * Any failure terminates the process.
+ */
+static void
+init_local(struct hast_resource *res)
+{
+	unsigned char *buf;
+	size_t mapsize;
+
+	if (metadata_read(res, true) < 0)
+		exit(EX_NOINPUT);
+	mtx_init(&res->hr_amp_lock);
+	if (activemap_init(&res->hr_amp, res->hr_datasize, res->hr_extentsize,
+	    res->hr_local_sectorsize, res->hr_keepdirty) < 0) {
+		primary_exit(EX_TEMPFAIL, "Unable to create activemap");
+	}
+	mtx_init(&range_lock);
+	cv_init(&range_regular_cond);
+	if (rangelock_init(&range_regular) < 0)
+		primary_exit(EX_TEMPFAIL, "Unable to create regular range lock");
+	cv_init(&range_sync_cond);
+	if (rangelock_init(&range_sync) < 0)
+		primary_exit(EX_TEMPFAIL, "Unable to create sync range lock");
+	mapsize = activemap_ondisk_size(res->hr_amp);
+	buf = calloc(1, mapsize);
+	if (buf == NULL) {
+		primary_exitx(EX_TEMPFAIL,
+		    "Unable to allocate buffer for activemap.");
+	}
+	if (pread(res->hr_localfd, buf, mapsize, METADATA_SIZE) !=
+	    (ssize_t)mapsize) {
+		primary_exit(EX_NOINPUT, "Unable to read activemap");
+	}
+	activemap_copyin(res->hr_amp, buf, mapsize);
+	/*
+	 * The temporary read buffer is not used past this point, so release
+	 * it instead of leaking it for the lifetime of the worker.
+	 * NOTE(review): relies on activemap_copyin() copying the data rather
+	 * than keeping a reference to buf - confirm against activemap.c.
+	 */
+	free(buf);
+	if (res->hr_resuid != 0)
+		return;
+	/*
+	 * We're using provider for the first time, so we have to generate
+	 * resource unique identifier and initialize local and remote counts.
+	 */
+	arc4random_buf(&res->hr_resuid, sizeof(res->hr_resuid));
+	res->hr_primary_localcnt = 1;
+	res->hr_primary_remotecnt = 0;
+	if (metadata_write(res) < 0)
+		exit(EX_NOINPUT);
+}
+
+/*
+ * Connect to the remote node and perform the two-step handshake.
+ *
+ * Step one opens the outgoing connection (hr_remoteout), sends the resource
+ * name and stores the "token" returned by the remote node.  Step two opens
+ * the incoming connection (hr_remotein), sends the token together with the
+ * resource identifier and the primary counts, and validates the reply
+ * (data size and extent size must match the local values).  If the remote
+ * node reports a non-zero "mapsize", its activemap is downloaded, merged
+ * into the local one and flushed to disk.  On success synchronization is
+ * started by setting sync_inprogress and signalling sync_cond.
+ *
+ * Failing to create the outgoing connection object terminates the process;
+ * every other failure is logged and leaves both connections closed (NULL).
+ */
+static void
+init_remote(struct hast_resource *res)
+{
+	struct nv *nvout, *nvin;
+	const unsigned char *token;
+	unsigned char *map;
+	const char *errmsg;
+	int32_t extentsize;
+	int64_t datasize;
+	uint32_t mapsize;
+	size_t size;
+
+	/* Prepare outgoing connection with remote node. */
+	if (proto_client(res->hr_remoteaddr, &res->hr_remoteout) < 0) {
+		primary_exit(EX_OSERR, "Unable to create connection to %s",
+		    res->hr_remoteaddr);
+	}
+	/* Try to connect, but accept failure. */
+	if (proto_connect(res->hr_remoteout) < 0) {
+		pjdlog_errno(LOG_WARNING, "Unable to connect to %s",
+		    res->hr_remoteaddr);
+		goto close;
+	}
+	/*
+	 * First handshake step.
+	 * Setup outgoing connection with remote node.
+	 */
+	nvout = nv_alloc();
+	nv_add_string(nvout, res->hr_name, "resource");
+	if (nv_error(nvout) != 0) {
+		pjdlog_common(LOG_WARNING, 0, nv_error(nvout),
+		    "Unable to allocate header for connection with %s",
+		    res->hr_remoteaddr);
+		nv_free(nvout);
+		goto close;
+	}
+	if (hast_proto_send(res, res->hr_remoteout, nvout, NULL, 0) < 0) {
+		pjdlog_errno(LOG_WARNING,
+		    "Unable to send handshake header to %s",
+		    res->hr_remoteaddr);
+		nv_free(nvout);
+		goto close;
+	}
+	nv_free(nvout);
+	if (hast_proto_recv_hdr(res->hr_remoteout, &nvin) < 0) {
+		pjdlog_errno(LOG_WARNING,
+		    "Unable to receive handshake header from %s",
+		    res->hr_remoteaddr);
+		goto close;
+	}
+	errmsg = nv_get_string(nvin, "errmsg");
+	if (errmsg != NULL) {
+		pjdlog_warning("%s", errmsg);
+		nv_free(nvin);
+		goto close;
+	}
+	token = nv_get_uint8_array(nvin, &size, "token");
+	if (token == NULL) {
+		pjdlog_warning("Handshake header from %s has no 'token' field.",
+		    res->hr_remoteaddr);
+		nv_free(nvin);
+		goto close;
+	}
+	if (size != sizeof(res->hr_token)) {
+		pjdlog_warning("Handshake header from %s contains 'token' of wrong size (got %zu, expected %zu).",
+		    res->hr_remoteaddr, size, sizeof(res->hr_token));
+		nv_free(nvin);
+		goto close;
+	}
+	bcopy(token, res->hr_token, sizeof(res->hr_token));
+	nv_free(nvin);
+
+	/*
+	 * Second handshake step.
+	 * Setup incoming connection with remote node.
+	 */
+	if (proto_client(res->hr_remoteaddr, &res->hr_remotein) < 0) {
+		pjdlog_errno(LOG_WARNING, "Unable to create connection to %s",
+		    res->hr_remoteaddr);
+		/*
+		 * Without a connection object there is nothing to connect;
+		 * give up on the remote node instead of falling through.
+		 */
+		goto close;
+	}
+	/* Try to connect, but accept failure. */
+	if (proto_connect(res->hr_remotein) < 0) {
+		pjdlog_errno(LOG_WARNING, "Unable to connect to %s",
+		    res->hr_remoteaddr);
+		goto close;
+	}
+	nvout = nv_alloc();
+	nv_add_string(nvout, res->hr_name, "resource");
+	nv_add_uint8_array(nvout, res->hr_token, sizeof(res->hr_token),
+	    "token");
+	nv_add_uint64(nvout, res->hr_resuid, "resuid");
+	nv_add_uint64(nvout, res->hr_primary_localcnt, "localcnt");
+	nv_add_uint64(nvout, res->hr_primary_remotecnt, "remotecnt");
+	if (nv_error(nvout) != 0) {
+		pjdlog_common(LOG_WARNING, 0, nv_error(nvout),
+		    "Unable to allocate header for connection with %s",
+		    res->hr_remoteaddr);
+		nv_free(nvout);
+		goto close;
+	}
+	if (hast_proto_send(res, res->hr_remotein, nvout, NULL, 0) < 0) {
+		pjdlog_errno(LOG_WARNING,
+		    "Unable to send handshake header to %s",
+		    res->hr_remoteaddr);
+		nv_free(nvout);
+		goto close;
+	}
+	nv_free(nvout);
+	/* The reply to the second step is read from the outgoing connection. */
+	if (hast_proto_recv_hdr(res->hr_remoteout, &nvin) < 0) {
+		pjdlog_errno(LOG_WARNING,
+		    "Unable to receive handshake header from %s",
+		    res->hr_remoteaddr);
+		goto close;
+	}
+	errmsg = nv_get_string(nvin, "errmsg");
+	if (errmsg != NULL) {
+		pjdlog_warning("%s", errmsg);
+		nv_free(nvin);
+		goto close;
+	}
+	datasize = nv_get_int64(nvin, "datasize");
+	if (datasize != res->hr_datasize) {
+		pjdlog_warning("Data size differs between nodes (local=%jd, remote=%jd).",
+		    (intmax_t)res->hr_datasize, (intmax_t)datasize);
+		nv_free(nvin);
+		goto close;
+	}
+	extentsize = nv_get_int32(nvin, "extentsize");
+	if (extentsize != res->hr_extentsize) {
+		pjdlog_warning("Extent size differs between nodes (local=%zd, remote=%zd).",
+		    (ssize_t)res->hr_extentsize, (ssize_t)extentsize);
+		nv_free(nvin);
+		goto close;
+	}
+	res->hr_secondary_localcnt = nv_get_uint64(nvin, "localcnt");
+	res->hr_secondary_remotecnt = nv_get_uint64(nvin, "remotecnt");
+	res->hr_syncsrc = nv_get_uint8(nvin, "syncsrc");
+	map = NULL;
+	mapsize = nv_get_uint32(nvin, "mapsize");
+	if (mapsize > 0) {
+		map = malloc(mapsize);
+		if (map == NULL) {
+			pjdlog_error("Unable to allocate memory for remote activemap (mapsize=%ju).",
+			    (uintmax_t)mapsize);
+			nv_free(nvin);
+			goto close;
+		}
+		/*
+		 * Remote node have some dirty extents on its own, lets
+		 * download its activemap.
+		 */
+		if (hast_proto_recv_data(res, res->hr_remoteout, nvin, map,
+		    mapsize) < 0) {
+			pjdlog_errno(LOG_ERR,
+			    "Unable to receive remote activemap");
+			nv_free(nvin);
+			free(map);
+			goto close;
+		}
+		/*
+		 * Merge local and remote bitmaps.
+		 */
+		activemap_merge(res->hr_amp, map, mapsize);
+		free(map);
+		/*
+		 * Now that we merged bitmaps from both nodes, flush it to the
+		 * disk before we start to synchronize.
+		 */
+		(void)hast_activemap_flush(res);
+	}
+	/* The reply header is fully consumed; do not leak it on success. */
+	nv_free(nvin);
+	pjdlog_info("Connected to %s.", res->hr_remoteaddr);
+	mtx_lock(&sync_lock);
+	sync_inprogress = true;
+	mtx_unlock(&sync_lock);
+	cv_signal(&sync_cond);
+	return;
+close:
+	proto_close(res->hr_remoteout);
+	res->hr_remoteout = NULL;
+	if (res->hr_remotein != NULL) {
+		proto_close(res->hr_remotein);
+		res->hr_remotein = NULL;
+	}
+}
+
+/*
+ * Open the ggate control device and make the hast/<provname> provider
+ * available.
+ *
+ * A new provider is created first.  If creation fails with EEXIST the
+ * existing provider is taken over with G_GATE_CMD_CANCEL.  On success
+ * res->hr_ggatefd and res->hr_ggateunit are set; any other failure
+ * terminates the process.
+ */
+static void
+init_ggate(struct hast_resource *res)
+{
+	struct g_gate_ctl_create create_req;
+	struct g_gate_ctl_cancel cancel_req;
+
+	/*
+	 * We communicate with ggate via /dev/ggctl. Open it.
+	 */
+	res->hr_ggatefd = open("/dev/" G_GATE_CTL_NAME, O_RDWR);
+	if (res->hr_ggatefd < 0)
+		primary_exit(EX_OSFILE, "Unable to open /dev/" G_GATE_CTL_NAME);
+
+	/*
+	 * Create provider before trying to connect, as connection failure
+	 * is not critical, but may take some time.
+	 */
+	create_req.gctl_version = G_GATE_VERSION;
+	create_req.gctl_mediasize = res->hr_datasize;
+	create_req.gctl_sectorsize = res->hr_local_sectorsize;
+	create_req.gctl_flags = 0;
+	create_req.gctl_maxcount = 128;
+	create_req.gctl_timeout = 0;
+	create_req.gctl_unit = G_GATE_NAME_GIVEN;
+	snprintf(create_req.gctl_name, sizeof(create_req.gctl_name),
+	    "hast/%s", res->hr_provname);
+	bzero(create_req.gctl_info, sizeof(create_req.gctl_info));
+	if (ioctl(res->hr_ggatefd, G_GATE_CMD_CREATE, &create_req) == 0) {
+		pjdlog_info("Device hast/%s created.", res->hr_provname);
+		res->hr_ggateunit = create_req.gctl_unit;
+		return;
+	}
+
+	/* Only an already existing provider is recoverable. */
+	if (errno != EEXIST) {
+		primary_exit(EX_OSERR, "Unable to create hast/%s device",
+		    res->hr_provname);
+	}
+	pjdlog_debug(1,
+	    "Device hast/%s already exists, we will try to take it over.",
+	    res->hr_provname);
+
+	/*
+	 * If we received EEXIST, we assume that the process who created the
+	 * provider died and didn't clean up. In that case we will start from
+	 * where he left of.
+	 */
+	cancel_req.gctl_version = G_GATE_VERSION;
+	cancel_req.gctl_unit = G_GATE_NAME_GIVEN;
+	snprintf(cancel_req.gctl_name, sizeof(cancel_req.gctl_name),
+	    "hast/%s", res->hr_provname);
+	if (ioctl(res->hr_ggatefd, G_GATE_CMD_CANCEL, &cancel_req) == 0) {
+		pjdlog_info("Device hast/%s recovered.", res->hr_provname);
+		res->hr_ggateunit = cancel_req.gctl_unit;
+		return;
+	}
+	primary_exit(EX_OSERR, "Unable to take over hast/%s device",
+	    res->hr_provname);
+}
+
+/*
+ * Start the primary worker for the given resource.
+ *
+ * A control channel is created and the process forks.  The parent records
+ * the child's pid in res->hr_workerpid and returns.  The child initializes
+ * the local component, the remote connection, the ggate provider and the
+ * shared environment, starts the worker threads and then runs
+ * guard_thread() in the calling thread.
+ */
+void
+hastd_primary(struct hast_resource *res)
+{
+	/* Worker threads, listed in the order in which they are started. */
+	static void *(*const workers[])(void *) = {
+		ggate_recv_thread,
+		local_send_thread,
+		remote_send_thread,
+		remote_recv_thread,
+		ggate_send_thread,
+		sync_thread,
+		ctrl_thread
+	};
+	pthread_t td;
+	pid_t pid;
+	size_t ii;
+	int error;
+
+	gres = res;
+
+	/*
+	 * Create communication channel between parent and child.
+	 */
+	if (proto_client("socketpair://", &res->hr_ctrl) < 0) {
+		KEEP_ERRNO((void)pidfile_remove(pfh));
+		primary_exit(EX_OSERR,
+		    "Unable to create control sockets between parent and child");
+	}
+
+	pid = fork();
+	if (pid < 0) {
+		KEEP_ERRNO((void)pidfile_remove(pfh));
+		primary_exit(EX_OSERR, "Unable to fork");
+	}
+	if (pid > 0) {
+		/* This is parent. */
+		res->hr_workerpid = pid;
+		return;
+	}
+
+	/* From here on we are the child (worker) process. */
+	(void)pidfile_close(pfh);
+
+	setproctitle("%s (primary)", res->hr_name);
+
+	init_local(res);
+	init_remote(res);
+	init_ggate(res);
+	init_environment(res);
+
+	for (ii = 0; ii < sizeof(workers) / sizeof(workers[0]); ii++) {
+		error = pthread_create(&td, NULL, workers[ii], res);
+		assert(error == 0);
+	}
+	(void)guard_thread(res);
+}
+
+/*
+ * Log a printf-style message followed by a short description of the given
+ * ggate request: the command name and, for READ, DELETE and WRITE, its
+ * offset and length.
+ *
+ * The description is appended only when the formatted message fits in the
+ * local buffer.  The result is passed to pjdlog_common() with the given
+ * log and debug levels.
+ */
+static void
+reqlog(int loglevel, int debuglevel, struct g_gate_ctl_io *ggio,
+    const char *fmt, ...)
+{
+	char msg[1024];
+	char *tail;
+	size_t room;
+	va_list args;
+	int len;
+
+	va_start(args, fmt);
+	len = vsnprintf(msg, sizeof(msg), fmt, args);
+	va_end(args);
+
+	if ((size_t)len < sizeof(msg)) {
+		/* Append the request description right after the message. */
+		tail = msg + len;
+		room = sizeof(msg) - (size_t)len;
+		switch (ggio->gctl_cmd) {
+		case BIO_READ:
+			(void)snprintf(tail, room, "READ(%ju, %ju).",
+			    (uintmax_t)ggio->gctl_offset,
+			    (uintmax_t)ggio->gctl_length);
+			break;
+		case BIO_DELETE:
+			(void)snprintf(tail, room, "DELETE(%ju, %ju).",
+			    (uintmax_t)ggio->gctl_offset,
+			    (uintmax_t)ggio->gctl_length);
+			break;
+		case BIO_FLUSH:
+			(void)snprintf(tail, room, "FLUSH.");
+			break;
+		case BIO_WRITE:
+			(void)snprintf(tail, room, "WRITE(%ju, %ju).",
+			    (uintmax_t)ggio->gctl_offset,
+			    (uintmax_t)ggio->gctl_length);
+			break;
+		default:
+			(void)snprintf(tail, room, "UNKNOWN(%u).",
+			    (unsigned int)ggio->gctl_cmd);
+			break;
+		}
+	}
+	pjdlog_common(loglevel, debuglevel, -1, "%s", msg);
+}
+
+/*
+ * Close both connections to the remote node for component ncomp.
+ *
+ * The connections are closed under the write side of hio_remote_lock.  If
+ * the component is already disconnected nothing else is done.  Otherwise
+ * any synchronization in progress is stopped and hio_guard_cond is
+ * signalled.
+ */
+static void
+remote_close(struct hast_resource *res, int ncomp)
+{
+
+	rw_wlock(&hio_remote_lock[ncomp]);
+	/*
+	 * A race is possible between dropping rlock and acquiring wlock -
+	 * another thread can close connection in-between.
+	 */
+	if (!ISCONNECTED(res, ncomp)) {
+		/* Somebody else got here first; nothing left to close. */
+		assert(res->hr_remotein == NULL);
+		assert(res->hr_remoteout == NULL);
+		rw_unlock(&hio_remote_lock[ncomp]);
+		return;
+	}
+
+	assert(res->hr_remotein != NULL);
+	assert(res->hr_remoteout != NULL);
+
+	/* Incoming side first, then outgoing. */
+	pjdlog_debug(2, "Closing old incoming connection to %s.",
+	    res->hr_remoteaddr);
+	proto_close(res->hr_remotein);
+	res->hr_remotein = NULL;
+
+	pjdlog_debug(2, "Closing old outgoing connection to %s.",
+	    res->hr_remoteaddr);
+	proto_close(res->hr_remoteout);
+	res->hr_remoteout = NULL;
+
+	rw_unlock(&hio_remote_lock[ncomp]);
+
+	/*
+	 * Stop synchronization if in-progress.
+	 */
+	mtx_lock(&sync_lock);
+	sync_inprogress = false;
+	mtx_unlock(&sync_lock);
+
+	/*
+	 * Wake up guard thread, so it can immediately start reconnect.
+	 */
+	mtx_lock(&hio_guard_lock);
+	cv_signal(&hio_guard_cond);
+	mtx_unlock(&hio_guard_lock);
+}
+
+/*
+ * Thread receives ggate I/O requests from the kernel and passes them to
+ * appropriate threads:
+ * WRITE - always goes to both local_send and remote_send threads
+ * READ (when the block is up-to-date on local component) -
+ * only local_send thread
+ * READ (when the block isn't up-to-date on local component) -
+ * only remote_send thread
+ * DELETE - always goes to both local_send and remote_send threads
+ * FLUSH - always goes to both local_send and remote_send threads
+ *
+ * arg is the struct hast_resource being served. Each request is carried in
+ * a struct hio taken from the free queue; its per-component error slots are
+ * preset to EINVAL and hio_countdown is set to the number of components the
+ * request is queued to. For WRITE the range is first added to range_regular
+ * (waiting while it is locked in range_sync) and the write is registered in
+ * the activemap, which is flushed to disk when activemap_write_start()
+ * returns non-zero.
+ * The thread exits when the kernel reports ECANCELED, or when the
+ * G_GATE_CMD_START ioctl fails while sigexit_received is set; any other
+ * failure terminates the process.
+ * NOTE(review): the request-type switch has no default case, so a request
+ * with any other gctl_cmd is neither queued nor put back on the free queue.
+ */
+static void *
+ggate_recv_thread(void *arg)
+{
+ struct hast_resource *res = arg;
+ struct g_gate_ctl_io *ggio;
+ struct hio *hio;
+ unsigned int ii, ncomp, ncomps;
+ int error;
+
+ ncomps = HAST_NCOMPONENTS;
+
+ for (;;) {
+ pjdlog_debug(2, "ggate_recv: Taking free request.");
+ QUEUE_TAKE2(hio, free);
+ pjdlog_debug(2, "ggate_recv: (%p) Got free request.", hio);
+ ggio = &hio->hio_ggio;
+ ggio->gctl_unit = res->hr_ggateunit;
+ ggio->gctl_length = MAXPHYS; /* Reset before every G_GATE_CMD_START. */
+ ggio->gctl_error = 0;
+ pjdlog_debug(2,
+ "ggate_recv: (%p) Waiting for request from the kernel.",
+ hio);
+ if (ioctl(res->hr_ggatefd, G_GATE_CMD_START, ggio) < 0) {
+ if (sigexit_received)
+ pthread_exit(NULL);
+ primary_exit(EX_OSERR, "G_GATE_CMD_START failed");
+ }
+ error = ggio->gctl_error;
+ switch (error) {
+ case 0:
+ break;
+ case ECANCELED:
+ /* Exit gracefully. */
+ if (!sigexit_received) {
+ pjdlog_debug(2,
+ "ggate_recv: (%p) Received cancel from the kernel.",
+ hio);
+ pjdlog_info("Received cancel from the kernel, exiting.");
+ }
+ pthread_exit(NULL);
+ case ENOMEM:
+ /*
+ * Buffer too small? Impossible, we allocate MAXPHYS
+ * bytes - request can't be bigger than that.
+ */
+ /* FALLTHROUGH */
+ case ENXIO:
+ default:
+ primary_exitx(EX_OSERR, "G_GATE_CMD_START failed: %s.",
+ strerror(error));
+ }
+ for (ii = 0; ii < ncomps; ii++)
+ hio->hio_errors[ii] = EINVAL; /* Placeholder until a component stores its result. */
+ reqlog(LOG_DEBUG, 2, ggio,
+ "ggate_recv: (%p) Request received from the kernel: ",
+ hio);
+ /*
+ * Inform all components about new write request.
+ * For read request prefer local component unless the given
+ * range is out-of-date, then use remote component.
+ */
+ switch (ggio->gctl_cmd) {
+ case BIO_READ:
+ pjdlog_debug(2,
+ "ggate_recv: (%p) Moving request to the send queue.",
+ hio);
+ refcount_init(&hio->hio_countdown, 1); /* READ is queued to a single component. */
+ mtx_lock(&metadata_lock);
+ if (res->hr_syncsrc == HAST_SYNCSRC_UNDEF ||
+ res->hr_syncsrc == HAST_SYNCSRC_PRIMARY) {
+ /*
+ * This range is up-to-date on local component,
+ * so handle request locally.
+ */
+ /* Local component is 0 for now. */
+ ncomp = 0;
+ } else /* if (res->hr_syncsrc ==
+ HAST_SYNCSRC_SECONDARY) */ {
+ assert(res->hr_syncsrc ==
+ HAST_SYNCSRC_SECONDARY);
+ /*
+ * This range is out-of-date on local component,
+ * so send request to the remote node.
+ */
+ /* Remote component is 1 for now. */
+ ncomp = 1;
+ }
+ mtx_unlock(&metadata_lock);
+ QUEUE_INSERT1(hio, send, ncomp);
+ break;
+ case BIO_WRITE:
+ for (;;) {
+ mtx_lock(&range_lock);
+ if (rangelock_islocked(range_sync,
+ ggio->gctl_offset, ggio->gctl_length)) {
+ pjdlog_debug(2,
+ "regular: Range offset=%jd length=%zu locked.",
+ (intmax_t)ggio->gctl_offset,
+ (size_t)ggio->gctl_length);
+ range_regular_wait = true;
+ cv_wait(&range_regular_cond, &range_lock);
+ range_regular_wait = false;
+ mtx_unlock(&range_lock);
+ continue; /* Re-check the range from the start. */
+ }
+ if (rangelock_add(range_regular,
+ ggio->gctl_offset, ggio->gctl_length) < 0) {
+ mtx_unlock(&range_lock);
+ pjdlog_debug(2,
+ "regular: Range offset=%jd length=%zu is already locked, waiting.",
+ (intmax_t)ggio->gctl_offset,
+ (size_t)ggio->gctl_length);
+ sleep(1); /* Pause before retrying the whole check. */
+ continue;
+ }
+ mtx_unlock(&range_lock);
+ break;
+ }
+ mtx_lock(&res->hr_amp_lock);
+ if (activemap_write_start(res->hr_amp,
+ ggio->gctl_offset, ggio->gctl_length)) {
+ (void)hast_activemap_flush(res);
+ }
+ mtx_unlock(&res->hr_amp_lock);
+ /* FALLTHROUGH */
+ case BIO_DELETE:
+ case BIO_FLUSH:
+ pjdlog_debug(2,
+ "ggate_recv: (%p) Moving request to the send queues.",
+ hio);
+ refcount_init(&hio->hio_countdown, ncomps); /* One reference per component. */
+ for (ii = 0; ii < ncomps; ii++)
+ QUEUE_INSERT1(hio, send, ii);
+ break;
+ }
+ }
+ /* NOTREACHED */
+ return (NULL);
+}
+
+/*
+ * Thread reads from or writes to local component.
+ * If local read fails, it redirects it to remote_send thread.
+ *
+ * arg is the struct hast_resource being served. Requests are taken from the
+ * send queue of component 0 and executed against res->hr_localfd, with
+ * res->hr_localoff added to the request offset for READ, WRITE and DELETE.
+ * The outcome is stored in hio_errors[0]; a short WRITE is reported as EIO.
+ * When refcount_release() reports that this was the last outstanding
+ * component, a synchronization request is marked with SYNCREQDONE() and
+ * sync_cond is signalled, while any other request is moved to the done
+ * queue.
+ * NOTE(review): the command switch has no default case; for any other
+ * gctl_cmd the error slot is left as the caller set it.
+ */
+static void *
+local_send_thread(void *arg)
+{
+ struct hast_resource *res = arg;
+ struct g_gate_ctl_io *ggio;
+ struct hio *hio;
+ unsigned int ncomp, rncomp;
+ ssize_t ret;
+
+ /* Local component is 0 for now. */
+ ncomp = 0;
+ /* Remote component is 1 for now. */
+ rncomp = 1;
+
+ for (;;) {
+ pjdlog_debug(2, "local_send: Taking request.");
+ QUEUE_TAKE1(hio, send, ncomp);
+ pjdlog_debug(2, "local_send: (%p) Got request.", hio);
+ ggio = &hio->hio_ggio;
+ switch (ggio->gctl_cmd) {
+ case BIO_READ:
+ ret = pread(res->hr_localfd, ggio->gctl_data,
+ ggio->gctl_length,
+ ggio->gctl_offset + res->hr_localoff);
+ if (ret == ggio->gctl_length)
+ hio->hio_errors[ncomp] = 0;
+ else {
+ /*
+ * If READ failed, try to read from remote node.
+ * The reference is not released here; the request
+ * is simply requeued to the remote component.
+ */
+ QUEUE_INSERT1(hio, send, rncomp);
+ continue;
+ }
+ break;
+ case BIO_WRITE:
+ ret = pwrite(res->hr_localfd, ggio->gctl_data,
+ ggio->gctl_length,
+ ggio->gctl_offset + res->hr_localoff);
+ if (ret < 0)
+ hio->hio_errors[ncomp] = errno;
+ else if (ret != ggio->gctl_length)
+ hio->hio_errors[ncomp] = EIO; /* Short write. */
+ else
+ hio->hio_errors[ncomp] = 0;
+ break;
+ case BIO_DELETE:
+ ret = g_delete(res->hr_localfd,
+ ggio->gctl_offset + res->hr_localoff,
+ ggio->gctl_length);
+ if (ret < 0)
+ hio->hio_errors[ncomp] = errno;
+ else
+ hio->hio_errors[ncomp] = 0;
+ break;
+ case BIO_FLUSH:
+ ret = g_flush(res->hr_localfd);
+ if (ret < 0)
+ hio->hio_errors[ncomp] = errno;
+ else
+ hio->hio_errors[ncomp] = 0;
+ break;
+ }
+ if (refcount_release(&hio->hio_countdown)) {
+ if (ISSYNCREQ(hio)) {
+ mtx_lock(&sync_lock);
+ SYNCREQDONE(hio);
+ mtx_unlock(&sync_lock);
+ cv_signal(&sync_cond);
+ } else {
+ pjdlog_debug(2,
+ "local_send: (%p) Moving request to the done queue.",
+ hio);
+ QUEUE_INSERT2(hio, done);
+ }
+ }
+ }
+ /* NOTREACHED */
+ return (NULL);
+}
+
+/*
+ * Thread sends request to secondary node.
+ *
+ * arg is the struct hast_resource being served. Requests are taken from the
+ * send queue of component 1 and translated into an nv header carrying
+ * "cmd" (HIO_*), "seq", "offset" and "length"; only WRITE carries a data
+ * payload. While holding the read side of hio_remote_lock the request is
+ * put on the recv queue and then sent over res->hr_remoteout.
+ * A request that cannot be sent (header error, no connection, send failure)
+ * goes to the done_queue label instead: its reference is released and, if
+ * it was the last one, a sync request is marked with SYNCREQDONE() and any
+ * other request is moved to the done queue. For a failed non-sync WRITE
+ * activemap_need_sync() is called first and the activemap is flushed when
+ * it returns non-zero.
+ * NOTE(review): on send failure the request is removed from the recv list
+ * only after remote_close() has run; remote_recv_thread also removes
+ * requests from that list when the connection is down - confirm the same
+ * request cannot be removed twice.
+ */
+static void *
+remote_send_thread(void *arg)
+{
+ struct hast_resource *res = arg;
+ struct g_gate_ctl_io *ggio;
+ struct hio *hio;
+ struct nv *nv;
+ unsigned int ncomp;
+ bool wakeup;
+ uint64_t offset, length;
+ uint8_t cmd;
+ void *data;
+
+ /* Remote component is 1 for now. */
+ ncomp = 1;
+
+ for (;;) {
+ pjdlog_debug(2, "remote_send: Taking request.");
+ QUEUE_TAKE1(hio, send, ncomp);
+ pjdlog_debug(2, "remote_send: (%p) Got request.", hio);
+ ggio = &hio->hio_ggio;
+ switch (ggio->gctl_cmd) {
+ case BIO_READ:
+ cmd = HIO_READ;
+ data = NULL;
+ offset = ggio->gctl_offset;
+ length = ggio->gctl_length;
+ break;
+ case BIO_WRITE:
+ cmd = HIO_WRITE;
+ data = ggio->gctl_data;
+ offset = ggio->gctl_offset;
+ length = ggio->gctl_length;
+ break;
+ case BIO_DELETE:
+ cmd = HIO_DELETE;
+ data = NULL;
+ offset = ggio->gctl_offset;
+ length = ggio->gctl_length;
+ break;
+ case BIO_FLUSH:
+ cmd = HIO_FLUSH;
+ data = NULL;
+ offset = 0;
+ length = 0;
+ break;
+ default:
+ assert(!"invalid condition");
+ abort();
+ }
+ nv = nv_alloc();
+ nv_add_uint8(nv, cmd, "cmd");
+ nv_add_uint64(nv, (uint64_t)ggio->gctl_seq, "seq");
+ nv_add_uint64(nv, offset, "offset");
+ nv_add_uint64(nv, length, "length");
+ if (nv_error(nv) != 0) {
+ hio->hio_errors[ncomp] = nv_error(nv);
+ pjdlog_debug(2,
+ "remote_send: (%p) Unable to prepare header to send.",
+ hio);
+ reqlog(LOG_ERR, 0, ggio,
+ "Unable to prepare header to send (%s): ",
+ strerror(nv_error(nv)));
+ /* Move failed request immediately to the done queue. */
+ goto done_queue;
+ }
+ pjdlog_debug(2,
+ "remote_send: (%p) Moving request to the recv queue.",
+ hio);
+ /*
+ * Protect connection from disappearing.
+ */
+ rw_rlock(&hio_remote_lock[ncomp]);
+ if (!ISCONNECTED(res, ncomp)) {
+ rw_unlock(&hio_remote_lock[ncomp]);
+ hio->hio_errors[ncomp] = ENOTCONN;
+ goto done_queue;
+ }
+ /*
+ * Move the request to recv queue before sending it, because
+ * in different order we can get reply before we move request
+ * to recv queue.
+ */
+ mtx_lock(&hio_recv_list_lock[ncomp]);
+ wakeup = TAILQ_EMPTY(&hio_recv_list[ncomp]); /* Signal the receiver only if the list was empty. */
+ TAILQ_INSERT_TAIL(&hio_recv_list[ncomp], hio, hio_next[ncomp]);
+ mtx_unlock(&hio_recv_list_lock[ncomp]);
+ if (hast_proto_send(res, res->hr_remoteout, nv, data,
+ data != NULL ? length : 0) < 0) {
+ hio->hio_errors[ncomp] = errno; /* Saved before the calls below can change errno. */
+ rw_unlock(&hio_remote_lock[ncomp]);
+ remote_close(res, ncomp);
+ pjdlog_debug(2,
+ "remote_send: (%p) Unable to send request.", hio);
+ reqlog(LOG_ERR, 0, ggio,
+ "Unable to send request (%s): ",
+ strerror(hio->hio_errors[ncomp]));
+ /*
+ * Take request back from the receive queue and move
+ * it immediately to the done queue.
+ */
+ mtx_lock(&hio_recv_list_lock[ncomp]);
+ TAILQ_REMOVE(&hio_recv_list[ncomp], hio, hio_next[ncomp]);
+ mtx_unlock(&hio_recv_list_lock[ncomp]);
+ goto done_queue;
+ }
+ rw_unlock(&hio_remote_lock[ncomp]);
+ nv_free(nv);
+ if (wakeup)
+ cv_signal(&hio_recv_list_cond[ncomp]);
+ continue;
+done_queue:
+ nv_free(nv);
+ if (ISSYNCREQ(hio)) {
+ if (!refcount_release(&hio->hio_countdown))
+ continue;
+ mtx_lock(&sync_lock);
+ SYNCREQDONE(hio);
+ mtx_unlock(&sync_lock);
+ cv_signal(&sync_cond);
+ continue;
+ }
+ if (ggio->gctl_cmd == BIO_WRITE) {
+ mtx_lock(&res->hr_amp_lock);
+ if (activemap_need_sync(res->hr_amp, ggio->gctl_offset,
+ ggio->gctl_length)) {
+ (void)hast_activemap_flush(res);
+ }
+ mtx_unlock(&res->hr_amp_lock);
+ }
+ if (!refcount_release(&hio->hio_countdown))
+ continue;
+ pjdlog_debug(2,
+ "remote_send: (%p) Moving request to the done queue.",
+ hio);
+ QUEUE_INSERT2(hio, done);
+ }
+ /* NOTREACHED */
+ return (NULL);
+}
+
+/*
+ * Thread receives answer from secondary node and passes it to ggate_send
+ * thread.
+ *
+ * arg is the struct hast_resource being served.  The thread sleeps until
+ * the recv queue of component 1 is non-empty.  If the remote connection is
+ * down, pending requests are completed one per loop iteration without
+ * touching their error slot.  Otherwise a reply header is read from
+ * res->hr_remotein and matched to a pending request by its "seq" field.
+ * The "error" value of the reply is stored in hio_errors[1]; for a
+ * successful READ the data payload is received into the request buffer.
+ * When refcount_release() reports that this was the last outstanding
+ * component, a synchronization request is marked with SYNCREQDONE() and
+ * sync_cond is signalled, while any other request is moved to the done
+ * queue.
+ */
+static void *
+remote_recv_thread(void *arg)
+{
+	struct hast_resource *res = arg;
+	struct g_gate_ctl_io *ggio;
+	struct hio *hio;
+	struct nv *nv;
+	unsigned int ncomp;
+	uint64_t seq;
+	int error;
+
+	/* Remote component is 1 for now. */
+	ncomp = 1;
+
+	for (;;) {
+		/* Wait until there is anything to receive. */
+		mtx_lock(&hio_recv_list_lock[ncomp]);
+		while (TAILQ_EMPTY(&hio_recv_list[ncomp])) {
+			pjdlog_debug(2, "remote_recv: No requests, waiting.");
+			cv_wait(&hio_recv_list_cond[ncomp],
+			    &hio_recv_list_lock[ncomp]);
+		}
+		mtx_unlock(&hio_recv_list_lock[ncomp]);
+		rw_rlock(&hio_remote_lock[ncomp]);
+		if (!ISCONNECTED(res, ncomp)) {
+			rw_unlock(&hio_remote_lock[ncomp]);
+			/*
+			 * Connection is dead, so move all pending requests to
+			 * the done queue (one-by-one).
+			 */
+			mtx_lock(&hio_recv_list_lock[ncomp]);
+			hio = TAILQ_FIRST(&hio_recv_list[ncomp]);
+			assert(hio != NULL);
+			TAILQ_REMOVE(&hio_recv_list[ncomp], hio,
+			    hio_next[ncomp]);
+			mtx_unlock(&hio_recv_list_lock[ncomp]);
+			goto done_queue;
+		}
+		if (hast_proto_recv_hdr(res->hr_remotein, &nv) < 0) {
+			pjdlog_errno(LOG_ERR,
+			    "Unable to receive reply header");
+			rw_unlock(&hio_remote_lock[ncomp]);
+			remote_close(res, ncomp);
+			continue;
+		}
+		rw_unlock(&hio_remote_lock[ncomp]);
+		seq = nv_get_uint64(nv, "seq");
+		if (seq == 0) {
+			pjdlog_error("Header contains no 'seq' field.");
+			nv_free(nv);
+			continue;
+		}
+		/*
+		 * Find the pending request this reply belongs to.  hio is
+		 * NULL after the loop when no request matches.
+		 */
+		mtx_lock(&hio_recv_list_lock[ncomp]);
+		TAILQ_FOREACH(hio, &hio_recv_list[ncomp], hio_next[ncomp]) {
+			if (hio->hio_ggio.gctl_seq == seq) {
+				TAILQ_REMOVE(&hio_recv_list[ncomp], hio,
+				    hio_next[ncomp]);
+				break;
+			}
+		}
+		mtx_unlock(&hio_recv_list_lock[ncomp]);
+		if (hio == NULL) {
+			pjdlog_error("Found no request matching received 'seq' field (%ju).",
+			    (uintmax_t)seq);
+			nv_free(nv);
+			continue;
+		}
+		error = nv_get_int16(nv, "error");
+		if (error != 0) {
+			/*
+			 * Request failed on remote side.  Record the remote
+			 * error; storing 0 here would report the failed
+			 * request as successful on this component.
+			 */
+			hio->hio_errors[ncomp] = error;
+			nv_free(nv);
+			goto done_queue;
+		}
+		ggio = &hio->hio_ggio;
+		switch (ggio->gctl_cmd) {
+		case BIO_READ:
+			/* Only READ replies carry a data payload. */
+			rw_rlock(&hio_remote_lock[ncomp]);
+			if (!ISCONNECTED(res, ncomp)) {
+				rw_unlock(&hio_remote_lock[ncomp]);
+				nv_free(nv);
+				goto done_queue;
+			}
+			if (hast_proto_recv_data(res, res->hr_remotein, nv,
+			    ggio->gctl_data, ggio->gctl_length) < 0) {
+				hio->hio_errors[ncomp] = errno;
+				pjdlog_errno(LOG_ERR,
+				    "Unable to receive reply data");
+				rw_unlock(&hio_remote_lock[ncomp]);
+				nv_free(nv);
+				remote_close(res, ncomp);
+				goto done_queue;
+			}
+			rw_unlock(&hio_remote_lock[ncomp]);
+			break;
+		case BIO_WRITE:
+		case BIO_DELETE:
+		case BIO_FLUSH:
+			break;
+		default:
+			assert(!"invalid condition");
+			abort();
+		}
+		hio->hio_errors[ncomp] = 0;
+		nv_free(nv);
+done_queue:
+		if (refcount_release(&hio->hio_countdown)) {
+			if (ISSYNCREQ(hio)) {
+				mtx_lock(&sync_lock);
+				SYNCREQDONE(hio);
+				mtx_unlock(&sync_lock);
+				cv_signal(&sync_cond);
+			} else {
+				pjdlog_debug(2,
+				    "remote_recv: (%p) Moving request to the done queue.",
+				    hio);
+				QUEUE_INSERT2(hio, done);
+			}
+		}
+	}
+	/* NOTREACHED */
+	return (NULL);
+}
+
+/*
+ * Thread sends answer to the kernel.
+ *
+ * arg is the struct hast_resource being served. A request taken from the
+ * done queue is reported as successful if at least one component recorded
+ * error 0; otherwise the error of component 0 is returned. For WRITE the
+ * activemap is told about completion only on success, while the range added
+ * to range_regular is always removed and range_sync_cond is signalled if
+ * range_sync_wait is set. If the remote component is not connected and
+ * hr_primary_localcnt still equals hr_secondary_remotecnt, the local count
+ * is incremented and the metadata is written. The request is then completed
+ * with G_GATE_CMD_DONE (a failure terminates the process) and returned to
+ * the free queue.
+ */
+static void *
+ggate_send_thread(void *arg)
+{
+ struct hast_resource *res = arg;
+ struct g_gate_ctl_io *ggio;
+ struct hio *hio;
+ unsigned int ii, ncomp, ncomps;
+
+ ncomps = HAST_NCOMPONENTS;
+
+ for (;;) {
+ pjdlog_debug(2, "ggate_send: Taking request.");
+ QUEUE_TAKE2(hio, done);
+ pjdlog_debug(2, "ggate_send: (%p) Got request.", hio);
+ ggio = &hio->hio_ggio;
+ for (ii = 0; ii < ncomps; ii++) {
+ if (hio->hio_errors[ii] == 0) {
+ /*
+ * One successful request is enough to declare
+ * success.
+ */
+ ggio->gctl_error = 0;
+ break;
+ }
+ }
+ if (ii == ncomps) {
+ /*
+ * None of the requests were successful.
+ * Use first error.
+ */
+ ggio->gctl_error = hio->hio_errors[0];
+ }
+ if (ggio->gctl_error == 0 && ggio->gctl_cmd == BIO_WRITE) {
+ mtx_lock(&res->hr_amp_lock);
+ activemap_write_complete(res->hr_amp,
+ ggio->gctl_offset, ggio->gctl_length);
+ mtx_unlock(&res->hr_amp_lock);
+ }
+ if (ggio->gctl_cmd == BIO_WRITE) {
+ /*
+ * Unlock range we locked.
+ */
+ mtx_lock(&range_lock);
+ rangelock_del(range_regular, ggio->gctl_offset,
+ ggio->gctl_length);
+ if (range_sync_wait)
+ cv_signal(&range_sync_cond);
+ mtx_unlock(&range_lock);
+ /*
+ * Bump local count if this is first write after
+ * connection failure with remote node.
+ */
+ ncomp = 1; /* Remote component. */
+ rw_rlock(&hio_remote_lock[ncomp]);
+ if (!ISCONNECTED(res, ncomp)) {
+ mtx_lock(&metadata_lock);
+ if (res->hr_primary_localcnt ==
+ res->hr_secondary_remotecnt) {
+ res->hr_primary_localcnt++;
+ pjdlog_debug(1,
+ "Increasing localcnt to %ju.",
+ (uintmax_t)res->hr_primary_localcnt);
+ (void)metadata_write(res);
+ }
+ mtx_unlock(&metadata_lock);
+ }
+ rw_unlock(&hio_remote_lock[ncomp]);
+ }
+ if (ioctl(res->hr_ggatefd, G_GATE_CMD_DONE, ggio) < 0)
+ primary_exit(EX_OSERR, "G_GATE_CMD_DONE failed");
+ pjdlog_debug(2,
+ "ggate_send: (%p) Moving request to the free queue.", hio);
+ QUEUE_INSERT2(hio, free);
+ }
+ /* NOTREACHED */
+ return (NULL);
+}
+
+/*
+ * Thread that synchronizes the local and remote components.
+ *
+ * 'arg' is the struct hast_resource being served.  The thread never
+ * returns: it sleeps on sync_cond until sync_inprogress is set, then
+ * repeatedly asks activemap_sync_offset() for the next range.  Each range
+ * is locked against regular I/O, read from the synchronization source
+ * (selected by res->hr_syncsrc) and written to the synchronization target.
+ * 'synced' counts only the bytes of ranges that were both read and written
+ * without error.
+ */
+static void *
+sync_thread(void *arg)
+{
+	struct hast_resource *res = arg;
+	struct hio *hio;
+	struct g_gate_ctl_io *ggio;
+	unsigned int ii, ncomp, ncomps;
+	off_t offset, length, synced;
+	bool dorewind;
+	int syncext;
+
+	ncomps = HAST_NCOMPONENTS;
+	dorewind = true;
+	synced = 0;
+
+	for (;;) {
+		/* Sleep until somebody requests synchronization. */
+		mtx_lock(&sync_lock);
+		while (!sync_inprogress) {
+			dorewind = true;
+			synced = 0;
+			cv_wait(&sync_cond, &sync_lock);
+		}
+		mtx_unlock(&sync_lock);
+		/*
+		 * Obtain offset at which we should synchronize.
+		 * Rewind synchronization if needed.
+		 */
+		mtx_lock(&res->hr_amp_lock);
+		if (dorewind)
+			activemap_sync_rewind(res->hr_amp);
+		offset = activemap_sync_offset(res->hr_amp, &length, &syncext);
+		if (syncext != -1) {
+			/*
+			 * We synchronized entire syncext extent, we can mark
+			 * it as clean now.
+			 */
+			if (activemap_extent_complete(res->hr_amp, syncext))
+				(void)hast_activemap_flush(res);
+		}
+		mtx_unlock(&res->hr_amp_lock);
+		if (dorewind) {
+			dorewind = false;
+			if (offset < 0)
+				pjdlog_info("Nodes are in sync.");
+			else {
+				pjdlog_info("Synchronization started. %ju bytes to go.",
+				    (uintmax_t)(res->hr_extentsize *
+				    activemap_ndirty(res->hr_amp)));
+			}
+		}
+		if (offset < 0) {
+			mtx_lock(&sync_lock);
+			sync_inprogress = false;
+			mtx_unlock(&sync_lock);
+			pjdlog_debug(1, "Nothing to synchronize.");
+			/*
+			 * Synchronization complete, make both localcnt and
+			 * remotecnt equal.
+			 */
+			ncomp = 1;
+			rw_rlock(&hio_remote_lock[ncomp]);
+			if (ISCONNECTED(res, ncomp)) {
+				if (synced > 0) {
+					pjdlog_info("Synchronization complete. "
+					    "%jd bytes synchronized.",
+					    (intmax_t)synced);
+				}
+				mtx_lock(&metadata_lock);
+				res->hr_syncsrc = HAST_SYNCSRC_UNDEF;
+				/*
+				 * NOTE(review): elsewhere in this file
+				 * hr_primary_localcnt is compared against
+				 * hr_secondary_remotecnt; confirm that the
+				 * local/remote pairing below is intended.
+				 */
+				res->hr_primary_localcnt =
+				    res->hr_secondary_localcnt;
+				res->hr_primary_remotecnt =
+				    res->hr_secondary_remotecnt;
+				/* Log the two primary counters just stored. */
+				pjdlog_debug(1,
+				    "Setting localcnt to %ju and remotecnt to %ju.",
+				    (uintmax_t)res->hr_primary_localcnt,
+				    (uintmax_t)res->hr_primary_remotecnt);
+				(void)metadata_write(res);
+				mtx_unlock(&metadata_lock);
+			} else if (synced > 0) {
+				pjdlog_info("Synchronization interrupted. "
+				    "%jd bytes synchronized so far.",
+				    (intmax_t)synced);
+			}
+			rw_unlock(&hio_remote_lock[ncomp]);
+			continue;
+		}
+		pjdlog_debug(2, "sync: Taking free request.");
+		QUEUE_TAKE2(hio, free);
+		pjdlog_debug(2, "sync: (%p) Got free request.", hio);
+		/*
+		 * Lock the range we are going to synchronize. We don't want
+		 * race where someone writes between our read and write.
+		 */
+		for (;;) {
+			mtx_lock(&range_lock);
+			if (rangelock_islocked(range_regular, offset, length)) {
+				pjdlog_debug(2,
+				    "sync: Range offset=%jd length=%jd locked.",
+				    (intmax_t)offset, (intmax_t)length);
+				range_sync_wait = true;
+				cv_wait(&range_sync_cond, &range_lock);
+				range_sync_wait = false;
+				mtx_unlock(&range_lock);
+				continue;
+			}
+			if (rangelock_add(range_sync, offset, length) < 0) {
+				mtx_unlock(&range_lock);
+				pjdlog_debug(2,
+				    "sync: Range offset=%jd length=%jd is already locked, waiting.",
+				    (intmax_t)offset, (intmax_t)length);
+				sleep(1);
+				continue;
+			}
+			mtx_unlock(&range_lock);
+			break;
+		}
+		/*
+		 * First read the data from synchronization source.
+		 */
+		SYNCREQ(hio);
+		ggio = &hio->hio_ggio;
+		ggio->gctl_cmd = BIO_READ;
+		ggio->gctl_offset = offset;
+		ggio->gctl_length = length;
+		ggio->gctl_error = 0;
+		for (ii = 0; ii < ncomps; ii++)
+			hio->hio_errors[ii] = EINVAL;
+		reqlog(LOG_DEBUG, 2, ggio, "sync: (%p) Sending sync request: ",
+		    hio);
+		pjdlog_debug(2, "sync: (%p) Moving request to the send queue.",
+		    hio);
+		mtx_lock(&metadata_lock);
+		if (res->hr_syncsrc == HAST_SYNCSRC_PRIMARY) {
+			/*
+			 * This range is up-to-date on local component,
+			 * so handle request locally.
+			 */
+			 /* Local component is 0 for now. */
+			ncomp = 0;
+		} else /* if (res->hr_syncsrc == HAST_SYNCSRC_SECONDARY) */ {
+			assert(res->hr_syncsrc == HAST_SYNCSRC_SECONDARY);
+			/*
+			 * This range is out-of-date on local component,
+			 * so send request to the remote node.
+			 */
+			 /* Remote component is 1 for now. */
+			ncomp = 1;
+		}
+		mtx_unlock(&metadata_lock);
+		refcount_init(&hio->hio_countdown, 1);
+		QUEUE_INSERT1(hio, send, ncomp);
+
+		/*
+		 * Let's wait for READ to finish.
+		 */
+		mtx_lock(&sync_lock);
+		while (!ISSYNCREQDONE(hio))
+			cv_wait(&sync_cond, &sync_lock);
+		mtx_unlock(&sync_lock);
+
+		if (hio->hio_errors[ncomp] != 0) {
+			pjdlog_error("Unable to read synchronization data: %s.",
+			    strerror(hio->hio_errors[ncomp]));
+			goto free_queue;
+		}
+
+		/*
+		 * We read the data from synchronization source, now write it
+		 * to synchronization target.
+		 */
+		SYNCREQ(hio);
+		ggio->gctl_cmd = BIO_WRITE;
+		for (ii = 0; ii < ncomps; ii++)
+			hio->hio_errors[ii] = EINVAL;
+		reqlog(LOG_DEBUG, 2, ggio, "sync: (%p) Sending sync request: ",
+		    hio);
+		pjdlog_debug(2, "sync: (%p) Moving request to the send queue.",
+		    hio);
+		mtx_lock(&metadata_lock);
+		if (res->hr_syncsrc == HAST_SYNCSRC_PRIMARY) {
+			/*
+			 * This range is up-to-date on local component,
+			 * so we update remote component.
+			 */
+			 /* Remote component is 1 for now. */
+			ncomp = 1;
+		} else /* if (res->hr_syncsrc == HAST_SYNCSRC_SECONDARY) */ {
+			assert(res->hr_syncsrc == HAST_SYNCSRC_SECONDARY);
+			/*
+			 * This range is out-of-date on local component,
+			 * so we update it.
+			 */
+			 /* Local component is 0 for now. */
+			ncomp = 0;
+		}
+		mtx_unlock(&metadata_lock);
+
+		pjdlog_debug(2, "sync: (%p) Moving request to the send queues.",
+		    hio);
+		refcount_init(&hio->hio_countdown, 1);
+		QUEUE_INSERT1(hio, send, ncomp);
+
+		/*
+		 * Let's wait for WRITE to finish.
+		 */
+		mtx_lock(&sync_lock);
+		while (!ISSYNCREQDONE(hio))
+			cv_wait(&sync_cond, &sync_lock);
+		mtx_unlock(&sync_lock);
+
+		if (hio->hio_errors[ncomp] != 0) {
+			pjdlog_error("Unable to write synchronization data: %s.",
+			    strerror(hio->hio_errors[ncomp]));
+			goto free_queue;
+		}
+
+		/*
+		 * Account for the range only here, i.e. when neither the
+		 * read nor the write jumped to free_queue with an error.
+		 */
+		synced += length;
+free_queue:
+		/* Release the range and wake a waiting regular request. */
+		mtx_lock(&range_lock);
+		rangelock_del(range_sync, offset, length);
+		if (range_regular_wait)
+			cv_signal(&range_regular_cond);
+		mtx_unlock(&range_lock);
+
+		pjdlog_debug(2, "sync: (%p) Moving request to the free queue.",
+		    hio);
+		QUEUE_INSERT2(hio, free);
+	}
+	/* NOTREACHED */
+	return (NULL);
+}
+
+/*
+ * Signal handler for SIGINT and SIGTERM.
+ *
+ * Records the termination request in sigexit_received and signals
+ * hio_guard_cond, the condition variable guard_thread() waits on.
+ * Any other signal number trips an assertion.
+ */
+static void
+sighandler(int sig)
+{
+	bool locked;
+
+	if (sig == SIGINT || sig == SIGTERM)
+		sigexit_received = true;
+	else
+		assert(!"invalid condition");
+
+	/*
+	 * XXX: Racy, but if we cannot obtain hio_guard_lock here, we don't
+	 * want to risk deadlock.  Signal the condition either way and only
+	 * drop the lock if we actually got it.
+	 */
+	locked = mtx_trylock(&hio_guard_lock);
+	cv_signal(&hio_guard_cond);
+	if (locked)
+		mtx_unlock(&hio_guard_lock);
+}
+
+/*
+ * Guard thread: checks the remote connection, tries to re-establish it
+ * when it is down, and calls primary_exitx() once sighandler() has set
+ * sigexit_received.  'arg' is the struct hast_resource being served.
+ */
+static void *
+guard_thread(void *arg)
+{
+	struct hast_resource *res = arg;
+	unsigned int ii, ncomps;
+	int timeout;
+
+	ncomps = HAST_NCOMPONENTS;
+	/* There is only one remote component for now. */
+#define	ISREMOTE(no)	((no) == 1)
+
+	for (;;) {
+		if (sigexit_received) {
+			primary_exitx(EX_OK,
+			    "Termination signal received, exiting.");
+		}
+		/*
+		 * As long as every connection is fine, timeout stays 0 and
+		 * we sleep until someone wakes us up.
+		 * If a connection is broken and the reconnect attempt fails,
+		 * we sleep only for RECONNECT_SLEEP seconds so that we can
+		 * retry soon.
+		 */
+		timeout = 0;
+		pjdlog_debug(2, "remote_guard: Checking connections.");
+		mtx_lock(&hio_guard_lock);
+		for (ii = 0; ii < ncomps; ii++) {
+			if (!ISREMOTE(ii))
+				continue;
+			rw_rlock(&hio_remote_lock[ii]);
+			if (ISCONNECTED(res, ii)) {
+				/* Healthy connection, nothing to do. */
+				assert(res->hr_remotein != NULL);
+				assert(res->hr_remoteout != NULL);
+				rw_unlock(&hio_remote_lock[ii]);
+				pjdlog_debug(2,
+				    "remote_guard: Connection to %s is ok.",
+				    res->hr_remoteaddr);
+				continue;
+			}
+			assert(res->hr_remotein == NULL);
+			assert(res->hr_remoteout == NULL);
+			/*
+			 * Upgrade the lock. It doesn't have to be atomic as
+			 * no other thread can change connection status from
+			 * disconnected to connected.
+			 */
+			rw_unlock(&hio_remote_lock[ii]);
+			rw_wlock(&hio_remote_lock[ii]);
+			assert(res->hr_remotein == NULL);
+			assert(res->hr_remoteout == NULL);
+			pjdlog_debug(2,
+			    "remote_guard: Reconnecting to %s.",
+			    res->hr_remoteaddr);
+			init_remote(res);
+			if (!ISCONNECTED(res, ii)) {
+				/* Both connections should be NULL. */
+				assert(res->hr_remotein == NULL);
+				assert(res->hr_remoteout == NULL);
+				pjdlog_debug(2,
+				    "remote_guard: Reconnect to %s failed.",
+				    res->hr_remoteaddr);
+				timeout = RECONNECT_SLEEP;
+			} else {
+				pjdlog_info("Successfully reconnected to %s.",
+				    res->hr_remoteaddr);
+			}
+			rw_unlock(&hio_remote_lock[ii]);
+		}
+		(void)cv_timedwait(&hio_guard_cond, &hio_guard_lock, timeout);
+		mtx_unlock(&hio_guard_lock);
+	}
+#undef	ISREMOTE
+	/* NOTREACHED */
+	return (NULL);
+}
diff --git a/sbin/hastd/proto.c b/sbin/hastd/proto.c
new file mode 100644
index 0000000..103f20c
--- /dev/null
+++ b/sbin/hastd/proto.c
@@ -0,0 +1,261 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/queue.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <stdint.h>
+
+#include "proto.h"
+#include "proto_impl.h"
+
+/* Value kept in pc_magic while a connection object is valid. */
+#define PROTO_CONN_MAGIC 0x907041c
+/*
+ * Protocol-independent connection handle.  It pairs the protocol
+ * implementation (pc_proto) with that implementation's private context
+ * (pc_ctx) and remembers which role the connection was created for.
+ */
+struct proto_conn {
+ int pc_magic; /* PROTO_CONN_MAGIC; zeroed by proto_close() */
+ struct hast_proto *pc_proto; /* protocol that owns pc_ctx */
+ void *pc_ctx; /* opaque per-protocol context */
+ int pc_side; /* one of the PROTO_SIDE_* values below */
+#define PROTO_SIDE_CLIENT 0
+#define PROTO_SIDE_SERVER_LISTEN 1
+#define PROTO_SIDE_SERVER_WORK 2
+};
+
+/* List of all protocols added through proto_register(). */
+static LIST_HEAD(, hast_proto) protos = LIST_HEAD_INITIALIZER(protos);
+
+/*
+ * Add 'proto' to the head of the list of known protocols, so it is
+ * considered by proto_common_setup() when an address is resolved.
+ */
+void
+proto_register(struct hast_proto *proto)
+{
+
+ LIST_INSERT_HEAD(&protos, proto, hp_next);
+}
+
+/*
+ * Shared implementation of proto_client() and proto_server().
+ *
+ * Allocates a connection object and offers 'addr' to every registered
+ * protocol until one of them recognizes it.  'side' must be
+ * PROTO_SIDE_CLIENT or PROTO_SIDE_SERVER_LISTEN and selects which protocol
+ * hook (hp_client or hp_server) is called.
+ *
+ * Returns 0 and stores the new connection in *connp on success.  Returns
+ * -1 on failure with errno set to EINVAL when no protocol recognizes the
+ * address, or to the error reported by the protocol hook.  When malloc()
+ * fails, -1 is returned without touching errno here.
+ */
+static int
+proto_common_setup(const char *addr, struct proto_conn **connp, int side)
+{
+	struct hast_proto *proto;
+	struct proto_conn *conn;
+	void *ctx;
+	int ret;
+
+	assert(side == PROTO_SIDE_CLIENT || side == PROTO_SIDE_SERVER_LISTEN);
+
+	conn = malloc(sizeof(*conn));
+	if (conn == NULL)
+		return (-1);
+
+	/*
+	 * Hook return values:
+	 * ret == 0 - success
+	 * ret == -1 - addr is not for this protocol
+	 * ret > 0 - right protocol, but an error occurred
+	 */
+	LIST_FOREACH(proto, &protos, hp_next) {
+		ret = (side == PROTO_SIDE_CLIENT) ?
+		    proto->hp_client(addr, &ctx) :
+		    proto->hp_server(addr, &ctx);
+		if (ret < 0)
+			continue;
+		break;
+	}
+	/*
+	 * proto == NULL means the list was exhausted (unrecognized address);
+	 * ret is only meaningful when a protocol was found.
+	 */
+	if (proto == NULL || ret > 0) {
+		free(conn);
+		errno = (proto == NULL) ? EINVAL : ret;
+		return (-1);
+	}
+
+	conn->pc_magic = PROTO_CONN_MAGIC;
+	conn->pc_proto = proto;
+	conn->pc_ctx = ctx;
+	conn->pc_side = side;
+	*connp = conn;
+	return (0);
+}
+
+/*
+ * Create a client-side connection object for 'addr'.
+ * Thin wrapper around proto_common_setup() with PROTO_SIDE_CLIENT; the
+ * return value is passed through unchanged.
+ */
+int
+proto_client(const char *addr, struct proto_conn **connp)
+{
+
+ return (proto_common_setup(addr, connp, PROTO_SIDE_CLIENT));
+}
+
+/*
+ * Connect a client-side connection through the protocol's hp_connect
+ * hook.  Returns 0 on success; on failure returns -1 and stores the value
+ * reported by the hook in errno.
+ */
+int
+proto_connect(struct proto_conn *conn)
+{
+	int error;
+
+	assert(conn != NULL);
+	assert(conn->pc_magic == PROTO_CONN_MAGIC);
+	assert(conn->pc_side == PROTO_SIDE_CLIENT);
+	assert(conn->pc_proto != NULL);
+
+	error = conn->pc_proto->hp_connect(conn->pc_ctx);
+	if (error == 0)
+		return (0);
+
+	errno = error;
+	return (-1);
+}
+
+/*
+ * Create a listening, server-side connection object for 'addr'.
+ * Thin wrapper around proto_common_setup() with PROTO_SIDE_SERVER_LISTEN;
+ * the return value is passed through unchanged.
+ */
+int
+proto_server(const char *addr, struct proto_conn **connp)
+{
+
+ return (proto_common_setup(addr, connp, PROTO_SIDE_SERVER_LISTEN));
+}
+
+/*
+ * Accept a connection on the listening connection 'conn'.
+ *
+ * On success returns 0 and stores a newly allocated connection (marked
+ * PROTO_SIDE_SERVER_WORK, sharing conn's protocol) in *newconnp.  On
+ * failure returns -1; when the protocol's hp_accept hook fails, its return
+ * value is stored in errno.
+ */
+int
+proto_accept(struct proto_conn *conn, struct proto_conn **newconnp)
+{
+	struct proto_conn *accepted;
+	int error;
+
+	assert(conn != NULL);
+	assert(conn->pc_magic == PROTO_CONN_MAGIC);
+	assert(conn->pc_side == PROTO_SIDE_SERVER_LISTEN);
+	assert(conn->pc_proto != NULL);
+
+	accepted = malloc(sizeof(*accepted));
+	if (accepted == NULL)
+		return (-1);
+
+	/* The hook fills in the new connection's private context. */
+	error = conn->pc_proto->hp_accept(conn->pc_ctx, &accepted->pc_ctx);
+	if (error != 0) {
+		free(accepted);
+		errno = error;
+		return (-1);
+	}
+
+	accepted->pc_magic = PROTO_CONN_MAGIC;
+	accepted->pc_side = PROTO_SIDE_SERVER_WORK;
+	accepted->pc_proto = conn->pc_proto;
+	*newconnp = accepted;
+
+	return (0);
+}
+
+/*
+ * Send 'size' bytes from 'data' through the protocol's hp_send hook.
+ * Returns 0 on success; on failure returns -1 and stores the value
+ * reported by the hook in errno.
+ */
+int
+proto_send(struct proto_conn *conn, const void *data, size_t size)
+{
+	int error;
+
+	assert(conn != NULL);
+	assert(conn->pc_magic == PROTO_CONN_MAGIC);
+	assert(conn->pc_proto != NULL);
+
+	error = conn->pc_proto->hp_send(conn->pc_ctx, data, size);
+	if (error == 0)
+		return (0);
+
+	errno = error;
+	return (-1);
+}
+
+/*
+ * Receive 'size' bytes into 'data' through the protocol's hp_recv hook.
+ * Returns 0 on success; on failure returns -1 and stores the value
+ * reported by the hook in errno.
+ */
+int
+proto_recv(struct proto_conn *conn, void *data, size_t size)
+{
+	int error;
+
+	assert(conn != NULL);
+	assert(conn->pc_magic == PROTO_CONN_MAGIC);
+	assert(conn->pc_proto != NULL);
+
+	error = conn->pc_proto->hp_recv(conn->pc_ctx, data, size);
+	if (error == 0)
+		return (0);
+
+	errno = error;
+	return (-1);
+}
+
+/*
+ * Return whatever the protocol's hp_descriptor hook reports for this
+ * connection (presumably the underlying file descriptor, as the socketpair
+ * implementation in this change does).
+ */
+int
+proto_descriptor(const struct proto_conn *conn)
+{
+
+ assert(conn != NULL);
+ assert(conn->pc_magic == PROTO_CONN_MAGIC);
+ assert(conn->pc_proto != NULL);
+
+ return (conn->pc_proto->hp_descriptor(conn->pc_ctx));
+}
+
+/*
+ * Ask the protocol's hp_address_match hook whether 'addr' matches this
+ * connection and return its answer.
+ */
+bool
+proto_address_match(const struct proto_conn *conn, const char *addr)
+{
+
+ assert(conn != NULL);
+ assert(conn->pc_magic == PROTO_CONN_MAGIC);
+ assert(conn->pc_proto != NULL);
+
+ return (conn->pc_proto->hp_address_match(conn->pc_ctx, addr));
+}
+
+/*
+ * Forward to the protocol's hp_local_address hook, passing the caller's
+ * buffer 'addr' and its capacity 'size'.
+ */
+void
+proto_local_address(const struct proto_conn *conn, char *addr, size_t size)
+{
+
+ assert(conn != NULL);
+ assert(conn->pc_magic == PROTO_CONN_MAGIC);
+ assert(conn->pc_proto != NULL);
+
+ conn->pc_proto->hp_local_address(conn->pc_ctx, addr, size);
+}
+
+/*
+ * Forward to the protocol's hp_remote_address hook, passing the caller's
+ * buffer 'addr' and its capacity 'size'.
+ */
+void
+proto_remote_address(const struct proto_conn *conn, char *addr, size_t size)
+{
+
+ assert(conn != NULL);
+ assert(conn->pc_magic == PROTO_CONN_MAGIC);
+ assert(conn->pc_proto != NULL);
+
+ conn->pc_proto->hp_remote_address(conn->pc_ctx, addr, size);
+}
+
+/*
+ * Close the connection: let the protocol release its context through
+ * hp_close, then invalidate and free the connection object itself.
+ * 'conn' must not be used after this call.
+ */
+void
+proto_close(struct proto_conn *conn)
+{
+
+ assert(conn != NULL);
+ assert(conn->pc_magic == PROTO_CONN_MAGIC);
+ assert(conn->pc_proto != NULL);
+
+ conn->pc_proto->hp_close(conn->pc_ctx);
+ /* Clear the magic so a stale pointer trips the asserts above. */
+ conn->pc_magic = 0;
+ free(conn);
+}
diff --git a/sbin/hastd/proto.h b/sbin/hastd/proto.h
new file mode 100644
index 0000000..cb196d8
--- /dev/null
+++ b/sbin/hastd/proto.h
@@ -0,0 +1,54 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _PROTO_H_
+#define _PROTO_H_
+
+#include <stdbool.h> /* bool */
+#include <stdlib.h> /* size_t */
+
+/* Opaque connection handle; the layout is private to proto.c. */
+struct proto_conn;
+
+/*
+ * Functions returning int return 0 on success and -1 on failure; on
+ * failure reported by a protocol, errno holds the protocol's error value.
+ */
+/* Create a client-side connection object for 'addr'. */
+int proto_client(const char *addr, struct proto_conn **connp);
+/* Connect a connection created by proto_client(). */
+int proto_connect(struct proto_conn *conn);
+/* Create a listening connection object for 'addr'. */
+int proto_server(const char *addr, struct proto_conn **connp);
+/* Accept a connection on a listening connection. */
+int proto_accept(struct proto_conn *conn, struct proto_conn **newconnp);
+/* Send 'size' bytes from 'data'. */
+int proto_send(struct proto_conn *conn, const void *data, size_t size);
+/* Receive 'size' bytes into 'data'. */
+int proto_recv(struct proto_conn *conn, void *data, size_t size);
+/* Return the value of the protocol's descriptor hook. */
+int proto_descriptor(const struct proto_conn *conn);
+/* Ask the protocol whether 'addr' matches the connection. */
+bool proto_address_match(const struct proto_conn *conn, const char *addr);
+/* Have the protocol describe the local address in addr[size]. */
+void proto_local_address(const struct proto_conn *conn, char *addr,
+ size_t size);
+/* Have the protocol describe the remote address in addr[size]. */
+void proto_remote_address(const struct proto_conn *conn, char *addr,
+ size_t size);
+/* Close the connection and free the connection object. */
+void proto_close(struct proto_conn *conn);
+
+#endif /* !_PROTO_H_ */
diff --git a/sbin/hastd/proto_common.c b/sbin/hastd/proto_common.c
new file mode 100644
index 0000000..22102d8
--- /dev/null
+++ b/sbin/hastd/proto_common.c
@@ -0,0 +1,85 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/socket.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <strings.h>
+
+#include "proto_impl.h"
+
+/*
+ * Maximum number of bytes handed to a single send(2) call.  May be
+ * overridden at build time by defining MAX_SEND_SIZE.
+ */
+#ifndef MAX_SEND_SIZE
+#define	MAX_SEND_SIZE	131072
+#endif
+
+/*
+ * Send exactly 'size' bytes from 'data' over descriptor 'fd', splitting
+ * the transfer into chunks of at most MAX_SEND_SIZE bytes.
+ *
+ * Returns 0 on success (including when size is 0), ENOTCONN if send(2)
+ * reports that nothing was written, or the errno value of the failed
+ * send(2) call.  EAGAIN and EINTR are retried.
+ */
+int
+proto_common_send(int fd, const unsigned char *data, size_t size)
+{
+	ssize_t done;
+	size_t sendsize;
+
+	/*
+	 * Loop only while there is data left, so an empty request is not
+	 * mistaken for a lost connection when send(2) returns 0.
+	 */
+	while (size > 0) {
+		sendsize = size < MAX_SEND_SIZE ? size : MAX_SEND_SIZE;
+		done = send(fd, data, sendsize, MSG_NOSIGNAL);
+		if (done == 0)
+			return (ENOTCONN);
+		if (done < 0) {
+			/* Interrupted by a signal or would block: retry. */
+			if (errno == EAGAIN || errno == EINTR)
+				continue;
+			return (errno);
+		}
+		data += done;
+		size -= (size_t)done;
+	}
+
+	return (0);
+}
+
+/*
+ * Receive exactly 'size' bytes from descriptor 'fd' into 'data'.
+ *
+ * Returns 0 once the whole buffer has been filled (or immediately when
+ * size is 0), ENOTCONN if the peer closed the connection before that, or
+ * the errno value of the failed recv(2) call.  EAGAIN and EINTR are
+ * retried.
+ */
+int
+proto_common_recv(int fd, unsigned char *data, size_t size)
+{
+	ssize_t done;
+
+	/*
+	 * Even with MSG_WAITALL recv(2) may return fewer bytes than
+	 * requested (signal, disconnect, pending error), so keep going
+	 * until the buffer is full rather than reporting a partial read
+	 * as success.
+	 */
+	while (size > 0) {
+		done = recv(fd, data, size, MSG_WAITALL);
+		if (done == 0)
+			return (ENOTCONN);
+		if (done < 0) {
+			if (errno == EAGAIN || errno == EINTR)
+				continue;
+			return (errno);
+		}
+		data += done;
+		size -= (size_t)done;
+	}
+
+	return (0);
+}
diff --git a/sbin/hastd/proto_impl.h b/sbin/hastd/proto_impl.h
new file mode 100644
index 0000000..ea6548d
--- /dev/null
+++ b/sbin/hastd/proto_impl.h
@@ -0,0 +1,75 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _PROTO_IMPL_H_
+#define _PROTO_IMPL_H_
+
+#include <sys/queue.h>
+
+#include <stdbool.h> /* bool */
+#include <stdlib.h> /* size_t */
+
+/* Marks a function to be run as a constructor (GCC/Clang attribute). */
+#define __constructor __attribute__((constructor))
+
+/*
+ * Signatures of the hooks a protocol implementation provides.  The void *
+ * argument is the implementation's private context.  As used by proto.c:
+ * hp_client/hp_server return 0 on success, -1 when the address is not for
+ * this protocol and a positive error value otherwise; the other
+ * int-returning hooks called there (connect, accept, send, recv) return 0
+ * on success and an error value on failure.
+ */
+typedef int hp_client_t(const char *, void **);
+typedef int hp_connect_t(void *);
+typedef int hp_server_t(const char *, void **);
+typedef int hp_accept_t(void *, void **);
+typedef int hp_send_t(void *, const unsigned char *, size_t);
+typedef int hp_recv_t(void *, unsigned char *, size_t);
+typedef int hp_descriptor_t(const void *);
+typedef bool hp_address_match_t(const void *, const char *);
+typedef void hp_local_address_t(const void *, char *, size_t);
+typedef void hp_remote_address_t(const void *, char *, size_t);
+typedef void hp_close_t(void *);
+
+/*
+ * Description of one transport protocol: its name, its hook table and the
+ * linkage used by proto_register() to keep it on the protocol list.
+ */
+struct hast_proto {
+ const char *hp_name;
+ hp_client_t *hp_client;
+ hp_connect_t *hp_connect;
+ hp_server_t *hp_server;
+ hp_accept_t *hp_accept;
+ hp_send_t *hp_send;
+ hp_recv_t *hp_recv;
+ hp_descriptor_t *hp_descriptor;
+ hp_address_match_t *hp_address_match;
+ hp_local_address_t *hp_local_address;
+ hp_remote_address_t *hp_remote_address;
+ hp_close_t *hp_close;
+ LIST_ENTRY(hast_proto) hp_next; /* entry on the protocol list */
+};
+
+/* Add a protocol to the list searched when an address is resolved. */
+void proto_register(struct hast_proto *proto);
+
+/*
+ * Helpers shared by descriptor-based protocols.  Both return 0 on success
+ * and an errno-style value on failure.
+ */
+int proto_common_send(int fd, const unsigned char *data, size_t size);
+int proto_common_recv(int fd, unsigned char *data, size_t size);
+
+#endif /* !_PROTO_IMPL_H_ */
diff --git a/sbin/hastd/proto_socketpair.c b/sbin/hastd/proto_socketpair.c
new file mode 100644
index 0000000..0e2cfa2
--- /dev/null
+++ b/sbin/hastd/proto_socketpair.c
@@ -0,0 +1,272 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/socket.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "hast.h"
+#include "proto_impl.h"
+
+/* Value kept in sp_magic while a socketpair context is valid. */
+#define SP_CTX_MAGIC 0x50c3741
+/*
+ * Private context of a socketpair connection.  The side is undecided
+ * after sp_client(); the first sp_send() makes the caller the client
+ * (using sp_fd[0]), the first sp_recv() makes it the server (using
+ * sp_fd[1]).
+ */
+struct sp_ctx {
+ int sp_magic; /* SP_CTX_MAGIC; zeroed by sp_close() */
+ int sp_fd[2]; /* descriptors filled in by socketpair(2) */
+ int sp_side; /* one of the SP_SIDE_* values below */
+#define SP_SIDE_UNDEF 0
+#define SP_SIDE_CLIENT 1
+#define SP_SIDE_SERVER 2
+};
+
+static void sp_close(void *ctx);
+
+/*
+ * hp_client hook: create a socketpair context for the address
+ * "socketpair://".
+ *
+ * Returns -1 if 'addr' is anything else, 0 on success (storing the new
+ * context in *ctxp, with the side still undecided), or the errno value of
+ * the failed malloc()/socketpair() call.
+ */
+static int
+sp_client(const char *addr, void **ctxp)
+{
+	struct sp_ctx *spctx;
+	int error;
+
+	/* Only the bare "socketpair://" address belongs to this protocol. */
+	if (strcmp(addr, "socketpair://") != 0)
+		return (-1);
+
+	spctx = malloc(sizeof(*spctx));
+	if (spctx == NULL)
+		return (errno);
+
+	if (socketpair(PF_UNIX, SOCK_STREAM, 0, spctx->sp_fd) >= 0) {
+		spctx->sp_side = SP_SIDE_UNDEF;
+		spctx->sp_magic = SP_CTX_MAGIC;
+		*ctxp = spctx;
+		return (0);
+	}
+
+	/* Save errno before free() gets a chance to change it. */
+	error = errno;
+	free(spctx);
+	return (error);
+}
+
+/*
+ * hp_connect hook.  Connecting is not supported on socketpairs; calling
+ * this is a programming error and terminates the process.
+ */
+static int
+sp_connect(void *ctx __unused)
+{
+
+ assert(!"proto_connect() not supported on socketpairs");
+ /* Still fatal when assertions are compiled out. */
+ abort();
+}
+
+/*
+ * hp_server hook.  Listening is not supported on socketpairs; calling
+ * this is a programming error and terminates the process.
+ */
+static int
+sp_server(const char *addr __unused, void **ctxp __unused)
+{
+
+ assert(!"proto_server() not supported on socketpairs");
+ /* Still fatal when assertions are compiled out. */
+ abort();
+}
+
+/*
+ * hp_accept hook.  Accepting is not supported on socketpairs; calling
+ * this is a programming error and terminates the process.
+ */
+static int
+sp_accept(void *ctx __unused, void **newctxp __unused)
+{
+
+	/* Name the operation that was actually attempted. */
+	assert(!"proto_accept() not supported on socketpairs");
+	/* Still fatal when assertions are compiled out. */
+	abort();
+}
+
+/*
+ * hp_send hook: send 'size' bytes from 'data' over the socketpair.
+ *
+ * If the side has not been decided yet, the caller becomes the client:
+ * the other end (sp_fd[1]) is closed and sp_fd[0] is used from now on.
+ * Returns the result of proto_common_send().
+ */
+static int
+sp_send(void *ctx, const unsigned char *data, size_t size)
+{
+	struct sp_ctx *spctx = ctx;
+	int fd;
+
+	assert(spctx != NULL);
+	assert(spctx->sp_magic == SP_CTX_MAGIC);
+
+	switch (spctx->sp_side) {
+	case SP_SIDE_UNDEF:
+		/*
+		 * If the first operation done by the caller is proto_send(),
+		 * we assume this is the client.
+		 */
+		spctx->sp_side = SP_SIDE_CLIENT;
+		/*
+		 * Close other end and forget the descriptor, so the stale
+		 * number cannot be used or closed again by mistake.
+		 */
+		close(spctx->sp_fd[1]);
+		spctx->sp_fd[1] = -1;
+		/* FALLTHROUGH */
+	case SP_SIDE_CLIENT:
+		assert(spctx->sp_fd[0] >= 0);
+		fd = spctx->sp_fd[0];
+		break;
+	case SP_SIDE_SERVER:
+		assert(spctx->sp_fd[1] >= 0);
+		fd = spctx->sp_fd[1];
+		break;
+	default:
+		abort();
+	}
+
+	return (proto_common_send(fd, data, size));
+}
+
+/*
+ * hp_recv hook: receive 'size' bytes into 'data' from the socketpair.
+ *
+ * If the side has not been decided yet, the caller becomes the server:
+ * the other end (sp_fd[0]) is closed and sp_fd[1] is used from now on.
+ * Returns the result of proto_common_recv().
+ */
+static int
+sp_recv(void *ctx, unsigned char *data, size_t size)
+{
+	struct sp_ctx *spctx = ctx;
+	int fd;
+
+	assert(spctx != NULL);
+	assert(spctx->sp_magic == SP_CTX_MAGIC);
+
+	switch (spctx->sp_side) {
+	case SP_SIDE_UNDEF:
+		/*
+		 * If the first operation done by the caller is proto_recv(),
+		 * we assume this is the server.
+		 */
+		spctx->sp_side = SP_SIDE_SERVER;
+		/*
+		 * Close other end and forget the descriptor, so the stale
+		 * number cannot be used or closed again by mistake.
+		 */
+		close(spctx->sp_fd[0]);
+		spctx->sp_fd[0] = -1;
+		/* FALLTHROUGH */
+	case SP_SIDE_SERVER:
+		assert(spctx->sp_fd[1] >= 0);
+		fd = spctx->sp_fd[1];
+		break;
+	case SP_SIDE_CLIENT:
+		assert(spctx->sp_fd[0] >= 0);
+		fd = spctx->sp_fd[0];
+		break;
+	default:
+		abort();
+	}
+
+	return (proto_common_recv(fd, data, size));
+}
+
+/*
+ * hp_descriptor hook: return the descriptor this side of the socketpair
+ * uses (sp_fd[0] for the client, sp_fd[1] for the server).  The side must
+ * already have been decided.
+ */
+static int
+sp_descriptor(const void *ctx)
+{
+	const struct sp_ctx *spctx = ctx;
+	int side;
+
+	assert(spctx != NULL);
+	assert(spctx->sp_magic == SP_CTX_MAGIC);
+	side = spctx->sp_side;
+	assert(side == SP_SIDE_CLIENT || side == SP_SIDE_SERVER);
+
+	if (side == SP_SIDE_CLIENT) {
+		assert(spctx->sp_fd[0] >= 0);
+		return (spctx->sp_fd[0]);
+	}
+	if (side == SP_SIDE_SERVER) {
+		assert(spctx->sp_fd[1] >= 0);
+		return (spctx->sp_fd[1]);
+	}
+
+	/* Undecided or corrupted side with assertions compiled out. */
+	abort();
+}
+
+/*
+ * hp_address_match hook.  Not supported on socketpairs; calling this is a
+ * programming error and terminates the process.
+ */
+static bool
+sp_address_match(const void *ctx __unused, const char *addr __unused)
+{
+
+ assert(!"proto_address_match() not supported on socketpairs");
+ /* Still fatal when assertions are compiled out. */
+ abort();
+}
+
+/*
+ * hp_local_address hook.  Not supported on socketpairs; calling this is a
+ * programming error and terminates the process.
+ */
+static void
+sp_local_address(const void *ctx __unused, char *addr __unused,
+ size_t size __unused)
+{
+
+ assert(!"proto_local_address() not supported on socketpairs");
+ /* Still fatal when assertions are compiled out. */
+ abort();
+}
+
+/*
+ * hp_remote_address hook.  Not supported on socketpairs; calling this is
+ * a programming error and terminates the process.
+ */
+static void
+sp_remote_address(const void *ctx __unused, char *addr __unused,
+ size_t size __unused)
+{
+
+ assert(!"proto_remote_address() not supported on socketpairs");
+ /* Still fatal when assertions are compiled out. */
+ abort();
+}
+
+/*
+ * hp_close hook: close the descriptor(s) this side still owns and free
+ * the context.  With the side undecided both ends are closed; otherwise
+ * only the end used by this side is.
+ */
+static void
+sp_close(void *ctx)
+{
+	struct sp_ctx *spctx = ctx;
+
+	assert(spctx != NULL);
+	assert(spctx->sp_magic == SP_CTX_MAGIC);
+
+	if (spctx->sp_side == SP_SIDE_UNDEF) {
+		close(spctx->sp_fd[0]);
+		close(spctx->sp_fd[1]);
+	} else if (spctx->sp_side == SP_SIDE_CLIENT) {
+		close(spctx->sp_fd[0]);
+	} else if (spctx->sp_side == SP_SIDE_SERVER) {
+		close(spctx->sp_fd[1]);
+	} else {
+		abort();
+	}
+
+	/* Clear the magic so a stale pointer trips the asserts. */
+	spctx->sp_magic = 0;
+	free(spctx);
+}
+
+/*
+ * Hook table of the "socketpair" protocol.  The connect, server, accept
+ * and address hooks are stubs that abort when called.
+ */
+static struct hast_proto sp_proto = {
+ .hp_name = "socketpair",
+ .hp_client = sp_client,
+ .hp_connect = sp_connect,
+ .hp_server = sp_server,
+ .hp_accept = sp_accept,
+ .hp_send = sp_send,
+ .hp_recv = sp_recv,
+ .hp_descriptor = sp_descriptor,
+ .hp_address_match = sp_address_match,
+ .hp_local_address = sp_local_address,
+ .hp_remote_address = sp_remote_address,
+ .hp_close = sp_close
+};
+
+/*
+ * Registers the socketpair protocol.  Declared with the constructor
+ * attribute, so no explicit call is needed.
+ */
+static __constructor void
+sp_ctor(void)
+{
+
+ proto_register(&sp_proto);
+}
diff --git a/sbin/hastd/proto_tcp4.c b/sbin/hastd/proto_tcp4.c
new file mode 100644
index 0000000..2fba996
--- /dev/null
+++ b/sbin/hastd/proto_tcp4.c
@@ -0,0 +1,447 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h> /* MAXHOSTNAMELEN */
+
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <netdb.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "hast.h"
+#include "pjdlog.h"
+#include "proto_impl.h"
+
+/* Per-connection state of the tcp4 protocol. */
+#define TCP4_CTX_MAGIC 0x7c441c
+struct tcp4_ctx {
+ int tc_magic; /* TCP4_CTX_MAGIC while valid; zeroed by tcp4_close(). */
+ struct sockaddr_in tc_sin; /* Address filled in by tcp4_addr(). */
+ int tc_fd; /* Socket descriptor. */
+ int tc_side; /* One of the TCP4_SIDE_* values below. */
+#define TCP4_SIDE_CLIENT 0
+#define TCP4_SIDE_SERVER_LISTEN 1
+#define TCP4_SIDE_SERVER_WORK 2
+};
+
+static void tcp4_close(void *ctx);
+
+/*
+ * Turn a dotted-quad string or a host name into an IPv4 address.
+ *
+ * The numeric form is tried first with inet_addr(3); only when that fails is
+ * the string looked up with gethostbyname(3).  Returns INADDR_NONE when
+ * neither interpretation works.
+ */
+static in_addr_t
+str2ip(const char *str)
+{
+	struct hostent *hp;
+	in_addr_t ip;
+
+	ip = inet_addr(str);
+	if (ip == INADDR_NONE) {
+		/* Not a numeric address, so treat it as a host name. */
+		hp = gethostbyname(str);
+		if (hp == NULL)
+			return (INADDR_NONE);
+		ip = ((struct in_addr *)(void *)hp->h_addr)->s_addr;
+	}
+	return (ip);
+}
+
+/*
+ * Convert a string of decimal digits to a non-negative number.
+ *
+ * Only the characters '0'-'9' are accepted: no sign, no whitespace, no base
+ * prefix, and the string must not be empty.  The value has to fall within
+ * [minnum, maxnum].
+ *
+ * Returns 0 and stores the value in *nump on success.  On failure returns -1
+ * with errno set to EINVAL and leaves *nump untouched.
+ */
+static int
+numfromstr(const char *str, intmax_t minnum, intmax_t maxnum, intmax_t *nump)
+{
+	intmax_t digit, num;
+
+	if (str[0] == '\0')
+		goto invalid;	/* Empty string. */
+	num = 0;
+	for (; *str != '\0'; str++) {
+		if (*str < '0' || *str > '9')
+			goto invalid;	/* Non-digit character. */
+		digit = *str - '0';
+		/*
+		 * Check before accumulating: signed overflow is undefined
+		 * behaviour, so it cannot be detected after the fact.
+		 */
+		if (num > (INTMAX_MAX - digit) / 10)
+			goto invalid;	/* Overflow. */
+		num = num * 10 + digit;
+		if (num > maxnum)
+			goto invalid;	/* Too big. */
+	}
+	if (num < minnum)
+		goto invalid;	/* Too small. */
+	*nump = num;
+	return (0);
+invalid:
+	errno = EINVAL;
+	return (-1);
+}
+
+/*
+ * Parse an address of the form "[tcp4://|tcp://]host[:port]" into *sinp.
+ *
+ * host may be a dotted-quad address or a host name; when no port is given
+ * HASTD_PORT is used.
+ *
+ * Returns 0 on success, -1 when addr is NULL or is not a TCP address (it
+ * starts with '/' or carries some other "scheme://" prefix), and a positive
+ * errno value (EINVAL, ENAMETOOLONG) when the address is malformed or cannot
+ * be resolved.
+ */
+static int
+tcp4_addr(const char *addr, struct sockaddr_in *sinp)
+{
+	char iporhost[MAXHOSTNAMELEN];
+	const char *pp;
+	size_t size;
+	in_addr_t ip;
+
+	if (addr == NULL)
+		return (-1);
+
+	if (strncasecmp(addr, "tcp4://", 7) == 0)
+		addr += 7;
+	else if (strncasecmp(addr, "tcp://", 6) == 0)
+		addr += 6;
+	else if (addr[0] != '/' &&	/* If this is not path... */
+	    strstr(addr, "://") == NULL)/* ...and has no prefix... */
+		;			/* ...tcp4 is the default. */
+	else
+		return (-1);
+
+	/*
+	 * Callers pass malloc'ed or stack storage; clear it so that padding
+	 * (sin_zero) does not reach bind(2)/connect(2) with garbage in it.
+	 */
+	memset(sinp, 0, sizeof(*sinp));
+	sinp->sin_family = AF_INET;
+	sinp->sin_len = sizeof(*sinp);
+	/* Extract optional port. */
+	pp = strrchr(addr, ':');
+	if (pp == NULL) {
+		/* Port not given, use the default. */
+		sinp->sin_port = htons(HASTD_PORT);
+	} else {
+		intmax_t port;
+
+		if (numfromstr(pp + 1, 1, 65535, &port) < 0)
+			return (errno);
+		sinp->sin_port = htons(port);
+	}
+	/* Extract host name or IP address. */
+	if (pp == NULL) {
+		size = sizeof(iporhost);
+		if (strlcpy(iporhost, addr, size) >= size)
+			return (ENAMETOOLONG);
+	} else {
+		/* Copy everything before the ':' plus the terminating NUL. */
+		size = (size_t)(pp - addr + 1);
+		if (size > sizeof(iporhost))
+			return (ENAMETOOLONG);
+		strlcpy(iporhost, addr, size);
+	}
+	/* Convert string (IP address or host name) to in_addr_t. */
+	ip = str2ip(iporhost);
+	if (ip == INADDR_NONE)
+		return (EINVAL);
+	sinp->sin_addr.s_addr = ip;
+
+	return (0);
+}
+
+/*
+ * Allocate a tcp4 context for the given address and create its socket.
+ *
+ * The socket options (TCP_NODELAY and 128kB send/receive buffers) are applied
+ * on a best-effort basis: a failure is logged as a warning and otherwise
+ * ignored.
+ *
+ * Returns 0 and stores the new context in *ctxp on success.  On failure
+ * returns the value from tcp4_addr() or an errno value, and *ctxp is left
+ * untouched.
+ */
+static int
+tcp4_common_setup(const char *addr, void **ctxp, int side)
+{
+	struct tcp4_ctx *tctx;
+	int ret, val;
+
+	tctx = malloc(sizeof(*tctx));
+	if (tctx == NULL)
+		return (errno);
+
+	/* Parse given address. */
+	if ((ret = tcp4_addr(addr, &tctx->tc_sin)) != 0) {
+		free(tctx);
+		return (ret);
+	}
+
+	tctx->tc_fd = socket(AF_INET, SOCK_STREAM, 0);
+	if (tctx->tc_fd == -1) {
+		ret = errno;
+		free(tctx);
+		return (ret);
+	}
+
+	/* Socket settings. */
+	val = 1;
+	if (setsockopt(tctx->tc_fd, IPPROTO_TCP, TCP_NODELAY, &val,
+	    sizeof(val)) == -1) {
+		pjdlog_warning("Unable to set TCP_NODELAY on %s", addr);
+	}
+	val = 131072;
+	if (setsockopt(tctx->tc_fd, SOL_SOCKET, SO_SNDBUF, &val,
+	    sizeof(val)) == -1) {
+		pjdlog_warning("Unable to set send buffer size on %s", addr);
+	}
+	val = 131072;
+	if (setsockopt(tctx->tc_fd, SOL_SOCKET, SO_RCVBUF, &val,
+	    sizeof(val)) == -1) {
+		pjdlog_warning("Unable to set receive buffer size on %s", addr);
+	}
+
+	tctx->tc_side = side;
+	tctx->tc_magic = TCP4_CTX_MAGIC;
+	*ctxp = tctx;
+
+	return (0);
+}
+
+/*
+ * Create a client-side tcp4 context for addr.  The socket is created but not
+ * connected here.  Returns whatever tcp4_common_setup() returns.
+ */
+static int
+tcp4_client(const char *addr, void **ctxp)
+{
+
+ return (tcp4_common_setup(addr, ctxp, TCP4_SIDE_CLIENT));
+}
+
+/*
+ * Connect a client context to the address stored in it.
+ * Returns 0 on success or the errno value left by connect(2).
+ */
+static int
+tcp4_connect(void *ctx)
+{
+ struct tcp4_ctx *tctx = ctx;
+
+ assert(tctx != NULL);
+ assert(tctx->tc_magic == TCP4_CTX_MAGIC);
+ assert(tctx->tc_side == TCP4_SIDE_CLIENT);
+ assert(tctx->tc_fd >= 0);
+
+ if (connect(tctx->tc_fd, (struct sockaddr *)&tctx->tc_sin,
+ sizeof(tctx->tc_sin)) < 0) {
+ return (errno);
+ }
+
+ return (0);
+}
+
+/*
+ * Create a listening tcp4 context bound to addr.
+ *
+ * SO_REUSEADDR is requested on a best-effort basis.  Returns 0 on success;
+ * otherwise the error from tcp4_common_setup(), or the errno value of the
+ * failed bind(2)/listen(2), in which case the context is closed again.
+ */
+static int
+tcp4_server(const char *addr, void **ctxp)
+{
+	struct tcp4_ctx *tctx;
+	int error, on;
+
+	error = tcp4_common_setup(addr, ctxp, TCP4_SIDE_SERVER_LISTEN);
+	if (error != 0)
+		return (error);
+
+	tctx = *ctxp;
+
+	/* Ignore failure. */
+	on = 1;
+	(void)setsockopt(tctx->tc_fd, SOL_SOCKET, SO_REUSEADDR, &on,
+	    sizeof(on));
+
+	/* listen() is attempted only once bind() has succeeded. */
+	if (bind(tctx->tc_fd, (struct sockaddr *)&tctx->tc_sin,
+	    sizeof(tctx->tc_sin)) < 0 ||
+	    listen(tctx->tc_fd, 8) < 0) {
+		error = errno;
+		tcp4_close(tctx);
+		return (error);
+	}
+
+	return (0);
+}
+
+/*
+ * Accept one connection on a listening context.
+ *
+ * A fresh context of side TCP4_SIDE_SERVER_WORK is allocated for the accepted
+ * socket and returned through *newctxp.  The peer address reported by
+ * accept(2) is stored in the new context, so the listening context keeps the
+ * address it was bound to.
+ *
+ * Returns 0 on success or an errno value on failure.
+ */
+static int
+tcp4_accept(void *ctx, void **newctxp)
+{
+	struct tcp4_ctx *tctx = ctx;
+	struct tcp4_ctx *newtctx;
+	socklen_t fromlen;
+	int ret;
+
+	assert(tctx != NULL);
+	assert(tctx->tc_magic == TCP4_CTX_MAGIC);
+	assert(tctx->tc_side == TCP4_SIDE_SERVER_LISTEN);
+	assert(tctx->tc_fd >= 0);
+
+	newtctx = malloc(sizeof(*newtctx));
+	if (newtctx == NULL)
+		return (errno);
+
+	fromlen = sizeof(newtctx->tc_sin);
+	newtctx->tc_fd = accept(tctx->tc_fd,
+	    (struct sockaddr *)&newtctx->tc_sin, &fromlen);
+	if (newtctx->tc_fd < 0) {
+		ret = errno;
+		free(newtctx);
+		return (ret);
+	}
+
+	newtctx->tc_side = TCP4_SIDE_SERVER_WORK;
+	newtctx->tc_magic = TCP4_CTX_MAGIC;
+	*newctxp = newtctx;
+
+	return (0);
+}
+
+/*
+ * Send size bytes from data over the context's socket by delegating to
+ * proto_common_send(); its return value is passed through unchanged.
+ */
+static int
+tcp4_send(void *ctx, const unsigned char *data, size_t size)
+{
+ struct tcp4_ctx *tctx = ctx;
+
+ assert(tctx != NULL);
+ assert(tctx->tc_magic == TCP4_CTX_MAGIC);
+ assert(tctx->tc_fd >= 0);
+
+ return (proto_common_send(tctx->tc_fd, data, size));
+}
+
+/*
+ * Receive size bytes into data from the context's socket by delegating to
+ * proto_common_recv(); its return value is passed through unchanged.
+ */
+static int
+tcp4_recv(void *ctx, unsigned char *data, size_t size)
+{
+ struct tcp4_ctx *tctx = ctx;
+
+ assert(tctx != NULL);
+ assert(tctx->tc_magic == TCP4_CTX_MAGIC);
+ assert(tctx->tc_fd >= 0);
+
+ return (proto_common_recv(tctx->tc_fd, data, size));
+}
+
+/* Return the socket descriptor stored in the context. */
+static int
+tcp4_descriptor(const void *ctx)
+{
+ const struct tcp4_ctx *tctx = ctx;
+
+ assert(tctx != NULL);
+ assert(tctx->tc_magic == TCP4_CTX_MAGIC);
+
+ return (tctx->tc_fd);
+}
+
+/*
+ * Format an IPv4 socket address as "tcp4://a.b.c.d:port" into addr, writing
+ * at most size bytes (snprintf(3) semantics).
+ */
+static void
+sin2str(struct sockaddr_in *sinp, char *addr, size_t size)
+{
+	unsigned int octet[4], port;
+	in_addr_t ip;
+
+	assert(addr != NULL);
+	assert(sinp->sin_family == AF_INET);
+
+	/* Work in host byte order so the shifts pick the octets in order. */
+	ip = ntohl(sinp->sin_addr.s_addr);
+	octet[0] = (ip >> 24) & 0xff;
+	octet[1] = (ip >> 16) & 0xff;
+	octet[2] = (ip >> 8) & 0xff;
+	octet[3] = ip & 0xff;
+	port = ntohs(sinp->sin_port);
+
+	snprintf(addr, size, "tcp4://%u.%u.%u.%u:%u", octet[0], octet[1],
+	    octet[2], octet[3], port);
+}
+
+/*
+ * Tell whether the peer of the context's socket has the IPv4 address that
+ * addr resolves to.  Only the IP addresses are compared; ports are ignored.
+ * Returns false when addr cannot be parsed or getpeername(2) fails.
+ */
+static bool
+tcp4_address_match(const void *ctx, const char *addr)
+{
+	const struct tcp4_ctx *tctx = ctx;
+	struct sockaddr_in expected, peer;
+	socklen_t peerlen;
+
+	assert(tctx != NULL);
+	assert(tctx->tc_magic == TCP4_CTX_MAGIC);
+
+	if (tcp4_addr(addr, &expected) != 0)
+		return (false);
+
+	peerlen = sizeof(peer);
+	if (getpeername(tctx->tc_fd, (struct sockaddr *)&peer, &peerlen) < 0)
+		return (false);
+
+	return (expected.sin_addr.s_addr == peer.sin_addr.s_addr);
+}
+
+/*
+ * Write the local address of the context's socket into addr in
+ * "tcp4://a.b.c.d:port" form, or "N/A" when getsockname(2) fails.
+ */
+static void
+tcp4_local_address(const void *ctx, char *addr, size_t size)
+{
+	const struct tcp4_ctx *tctx = ctx;
+	struct sockaddr_in sin;
+	socklen_t sinlen;
+
+	assert(tctx != NULL);
+	assert(tctx->tc_magic == TCP4_CTX_MAGIC);
+
+	sinlen = sizeof(sin);
+	if (getsockname(tctx->tc_fd, (struct sockaddr *)&sin, &sinlen) >= 0)
+		sin2str(&sin, addr, size);
+	else
+		strlcpy(addr, "N/A", size);
+}
+
+/*
+ * Write the peer address of the context's socket into addr in
+ * "tcp4://a.b.c.d:port" form, or "N/A" when getpeername(2) fails.
+ */
+static void
+tcp4_remote_address(const void *ctx, char *addr, size_t size)
+{
+	const struct tcp4_ctx *tctx = ctx;
+	struct sockaddr_in sin;
+	socklen_t sinlen;
+
+	assert(tctx != NULL);
+	assert(tctx->tc_magic == TCP4_CTX_MAGIC);
+
+	sinlen = sizeof(sin);
+	if (getpeername(tctx->tc_fd, (struct sockaddr *)&sin, &sinlen) >= 0)
+		sin2str(&sin, addr, size);
+	else
+		strlcpy(addr, "N/A", size);
+}
+
+/*
+ * Close the context's socket (when it holds a valid descriptor), clear the
+ * magic so a stale pointer fails the magic assertions, and free the context.
+ */
+static void
+tcp4_close(void *ctx)
+{
+ struct tcp4_ctx *tctx = ctx;
+
+ assert(tctx != NULL);
+ assert(tctx->tc_magic == TCP4_CTX_MAGIC);
+
+ if (tctx->tc_fd >= 0)
+ close(tctx->tc_fd);
+ tctx->tc_magic = 0;
+ free(tctx);
+}
+
+/* Method table for the "tcp4" protocol. */
+static struct hast_proto tcp4_proto = {
+ .hp_name = "tcp4",
+ .hp_client = tcp4_client,
+ .hp_connect = tcp4_connect,
+ .hp_server = tcp4_server,
+ .hp_accept = tcp4_accept,
+ .hp_send = tcp4_send,
+ .hp_recv = tcp4_recv,
+ .hp_descriptor = tcp4_descriptor,
+ .hp_address_match = tcp4_address_match,
+ .hp_local_address = tcp4_local_address,
+ .hp_remote_address = tcp4_remote_address,
+ .hp_close = tcp4_close
+};
+
+/* Marked __constructor; hands the tcp4 method table to proto_register(). */
+static __constructor void
+tcp4_ctor(void)
+{
+
+ proto_register(&tcp4_proto);
+}
diff --git a/sbin/hastd/proto_uds.c b/sbin/hastd/proto_uds.c
new file mode 100644
index 0000000..0fac82f
--- /dev/null
+++ b/sbin/hastd/proto_uds.c
@@ -0,0 +1,330 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/* UDS - UNIX Domain Socket */
+
+#include <sys/un.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "hast.h"
+#include "proto_impl.h"
+
+/* Per-connection state of the UNIX domain socket protocol. */
+#define UDS_CTX_MAGIC 0xd541c
+struct uds_ctx {
+ int uc_magic; /* UDS_CTX_MAGIC while valid; zeroed by uds_close(). */
+ struct sockaddr_un uc_sun; /* Address filled in by uds_addr(). */
+ int uc_fd; /* Socket descriptor. */
+ int uc_side; /* One of the UDS_SIDE_* values below. */
+#define UDS_SIDE_CLIENT 0
+#define UDS_SIDE_SERVER_LISTEN 1
+#define UDS_SIDE_SERVER_WORK 2
+};
+
+static void uds_close(void *ctx);
+
+/*
+ * Parse "[uds://|unix://]/path" into *sunp.
+ *
+ * An address without a prefix is accepted only if it starts with '/' and
+ * contains no "://".  Returns 0 on success, -1 when addr is NULL or does not
+ * name a UNIX domain socket, and ENAMETOOLONG when the path does not fit in
+ * sun_path.
+ */
+static int
+uds_addr(const char *addr, struct sockaddr_un *sunp)
+{
+	size_t skip;
+
+	if (addr == NULL)
+		return (-1);
+
+	if (strncasecmp(addr, "uds://", 6) == 0)
+		skip = 6;
+	else if (strncasecmp(addr, "unix://", 7) == 0)
+		skip = 7;
+	else if (addr[0] != '/' || strstr(addr, "://") != NULL)
+		return (-1);	/* Neither our prefix nor a bare path. */
+	else
+		skip = 0;	/* Bare absolute path, assume it is ours. */
+	addr += skip;
+
+	sunp->sun_family = AF_UNIX;
+	if (strlcpy(sunp->sun_path, addr, sizeof(sunp->sun_path)) >=
+	    sizeof(sunp->sun_path)) {
+		return (ENAMETOOLONG);
+	}
+	sunp->sun_len = SUN_LEN(sunp);
+
+	return (0);
+}
+
+/*
+ * Allocate a UDS context for addr and create its socket.
+ *
+ * Returns 0 and stores the new context in *ctxp on success.  On failure
+ * returns the value from uds_addr() or an errno value, and *ctxp is left
+ * untouched.
+ */
+static int
+uds_common_setup(const char *addr, void **ctxp, int side)
+{
+	struct uds_ctx *uctx;
+	int error;
+
+	uctx = malloc(sizeof(*uctx));
+	if (uctx == NULL)
+		return (errno);
+
+	/* Parse given address. */
+	error = uds_addr(addr, &uctx->uc_sun);
+	if (error != 0)
+		goto fail;
+
+	uctx->uc_fd = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (uctx->uc_fd == -1) {
+		error = errno;
+		goto fail;
+	}
+
+	uctx->uc_magic = UDS_CTX_MAGIC;
+	uctx->uc_side = side;
+	*ctxp = uctx;
+
+	return (0);
+fail:
+	free(uctx);
+	return (error);
+}
+
+/*
+ * Create a client-side UDS context for addr.  The socket is created but not
+ * connected here.  Returns whatever uds_common_setup() returns.
+ */
+static int
+uds_client(const char *addr, void **ctxp)
+{
+
+ return (uds_common_setup(addr, ctxp, UDS_SIDE_CLIENT));
+}
+
+/*
+ * Connect a client context to the socket path stored in it.
+ * Returns 0 on success or the errno value left by connect(2).
+ */
+static int
+uds_connect(void *ctx)
+{
+ struct uds_ctx *uctx = ctx;
+
+ assert(uctx != NULL);
+ assert(uctx->uc_magic == UDS_CTX_MAGIC);
+ assert(uctx->uc_side == UDS_SIDE_CLIENT);
+ assert(uctx->uc_fd >= 0);
+
+ if (connect(uctx->uc_fd, (struct sockaddr *)&uctx->uc_sun,
+ sizeof(uctx->uc_sun)) < 0) {
+ return (errno);
+ }
+
+ return (0);
+}
+
+/*
+ * Create a listening UDS context bound to addr.
+ *
+ * Whatever currently exists at the socket path is unlinked before bind(2).
+ * Returns 0 on success; otherwise the error from uds_common_setup(), or the
+ * errno value of the failed bind(2)/listen(2), in which case the context is
+ * closed again.
+ */
+static int
+uds_server(const char *addr, void **ctxp)
+{
+	struct uds_ctx *uctx;
+	int error;
+
+	error = uds_common_setup(addr, ctxp, UDS_SIDE_SERVER_LISTEN);
+	if (error != 0)
+		return (error);
+
+	uctx = *ctxp;
+
+	unlink(uctx->uc_sun.sun_path);
+	/* listen() is attempted only once bind() has succeeded. */
+	if (bind(uctx->uc_fd, (struct sockaddr *)&uctx->uc_sun,
+	    sizeof(uctx->uc_sun)) < 0 ||
+	    listen(uctx->uc_fd, 8) < 0) {
+		error = errno;
+		uds_close(uctx);
+		return (error);
+	}
+
+	return (0);
+}
+
+/*
+ * Accept one connection on a listening context.
+ *
+ * A fresh context of side UDS_SIDE_SERVER_WORK is allocated for the accepted
+ * socket and returned through *newctxp.  The peer address reported by
+ * accept(2) is stored in the new context, so the listening context keeps the
+ * path it was bound to.
+ *
+ * Returns 0 on success or an errno value on failure.
+ */
+static int
+uds_accept(void *ctx, void **newctxp)
+{
+	struct uds_ctx *uctx = ctx;
+	struct uds_ctx *newuctx;
+	socklen_t fromlen;
+	int ret;
+
+	assert(uctx != NULL);
+	assert(uctx->uc_magic == UDS_CTX_MAGIC);
+	assert(uctx->uc_side == UDS_SIDE_SERVER_LISTEN);
+	assert(uctx->uc_fd >= 0);
+
+	newuctx = malloc(sizeof(*newuctx));
+	if (newuctx == NULL)
+		return (errno);
+
+	/*
+	 * The peer may be unbound, in which case accept(2) fills in little or
+	 * nothing; start from zeroes so sun_path is always a valid string.
+	 */
+	memset(&newuctx->uc_sun, 0, sizeof(newuctx->uc_sun));
+	fromlen = sizeof(newuctx->uc_sun);
+	newuctx->uc_fd = accept(uctx->uc_fd,
+	    (struct sockaddr *)&newuctx->uc_sun, &fromlen);
+	if (newuctx->uc_fd < 0) {
+		ret = errno;
+		free(newuctx);
+		return (ret);
+	}
+
+	newuctx->uc_side = UDS_SIDE_SERVER_WORK;
+	newuctx->uc_magic = UDS_CTX_MAGIC;
+	*newctxp = newuctx;
+
+	return (0);
+}
+
+/*
+ * Send size bytes from data over the context's socket by delegating to
+ * proto_common_send(); its return value is passed through unchanged.
+ */
+static int
+uds_send(void *ctx, const unsigned char *data, size_t size)
+{
+ struct uds_ctx *uctx = ctx;
+
+ assert(uctx != NULL);
+ assert(uctx->uc_magic == UDS_CTX_MAGIC);
+ assert(uctx->uc_fd >= 0);
+
+ return (proto_common_send(uctx->uc_fd, data, size));
+}
+
+/*
+ * Receive size bytes into data from the context's socket by delegating to
+ * proto_common_recv(); its return value is passed through unchanged.
+ */
+static int
+uds_recv(void *ctx, unsigned char *data, size_t size)
+{
+ struct uds_ctx *uctx = ctx;
+
+ assert(uctx != NULL);
+ assert(uctx->uc_magic == UDS_CTX_MAGIC);
+ assert(uctx->uc_fd >= 0);
+
+ return (proto_common_recv(uctx->uc_fd, data, size));
+}
+
+/* Return the socket descriptor stored in the context. */
+static int
+uds_descriptor(const void *ctx)
+{
+ const struct uds_ctx *uctx = ctx;
+
+ assert(uctx != NULL);
+ assert(uctx->uc_magic == UDS_CTX_MAGIC);
+
+ return (uctx->uc_fd);
+}
+
+/*
+ * Address matching is not implemented for UNIX domain sockets.  Reaching this
+ * hook trips the assertion; abort() covers builds without assertions.
+ */
+static bool
+uds_address_match(const void *ctx __unused, const char *addr __unused)
+{
+
+ assert(!"proto_address_match() not supported on UNIX domain sockets");
+ abort();
+}
+
+/*
+ * Write the local address of the context's socket into addr as "uds://path".
+ * "N/A" is written instead when getsockname(2) fails or the socket has an
+ * empty path.
+ */
+static void
+uds_local_address(const void *ctx, char *addr, size_t size)
+{
+	const struct uds_ctx *uctx = ctx;
+	struct sockaddr_un sun;
+	socklen_t sunlen;
+
+	assert(uctx != NULL);
+	assert(uctx->uc_magic == UDS_CTX_MAGIC);
+	assert(addr != NULL);
+
+	sunlen = sizeof(sun);
+	if (getsockname(uctx->uc_fd, (struct sockaddr *)&sun, &sunlen) < 0) {
+		strlcpy(addr, "N/A", size);
+		return;
+	}
+	assert(sun.sun_family == AF_UNIX);
+	if (sun.sun_path[0] != '\0')
+		snprintf(addr, size, "uds://%s", sun.sun_path);
+	else
+		strlcpy(addr, "N/A", size);
+}
+
+/*
+ * Write the peer address of the context's socket into addr as "uds://path".
+ * "N/A" is written instead when getpeername(2) fails or the peer has an
+ * empty path.
+ */
+static void
+uds_remote_address(const void *ctx, char *addr, size_t size)
+{
+	const struct uds_ctx *uctx = ctx;
+	struct sockaddr_un sun;
+	socklen_t sunlen;
+
+	assert(uctx != NULL);
+	assert(uctx->uc_magic == UDS_CTX_MAGIC);
+	assert(addr != NULL);
+
+	sunlen = sizeof(sun);
+	if (getpeername(uctx->uc_fd, (struct sockaddr *)&sun, &sunlen) < 0) {
+		strlcpy(addr, "N/A", size);
+		return;
+	}
+	assert(sun.sun_family == AF_UNIX);
+	if (sun.sun_path[0] != '\0')
+		snprintf(addr, size, "uds://%s", sun.sun_path);
+	else
+		strlcpy(addr, "N/A", size);
+}
+
+/*
+ * Close the descriptor and free a UDS context.
+ *
+ * The socket file is removed only for the listening context, the one whose
+ * bind(2) created it.  A client context merely holds the server's path and
+ * an accepted context never created a file, so neither may unlink anything.
+ */
+static void
+uds_close(void *ctx)
+{
+	struct uds_ctx *uctx = ctx;
+
+	assert(uctx != NULL);
+	assert(uctx->uc_magic == UDS_CTX_MAGIC);
+
+	if (uctx->uc_fd >= 0)
+		close(uctx->uc_fd);
+	if (uctx->uc_side == UDS_SIDE_SERVER_LISTEN)
+		unlink(uctx->uc_sun.sun_path);
+	/* Clear the magic so a stale pointer fails the magic assertions. */
+	uctx->uc_magic = 0;
+	free(uctx);
+}
+
+/*
+ * Method table for the "uds" protocol.  Address matching points at a stub
+ * that aborts.
+ */
+static struct hast_proto uds_proto = {
+ .hp_name = "uds",
+ .hp_client = uds_client,
+ .hp_connect = uds_connect,
+ .hp_server = uds_server,
+ .hp_accept = uds_accept,
+ .hp_send = uds_send,
+ .hp_recv = uds_recv,
+ .hp_descriptor = uds_descriptor,
+ .hp_address_match = uds_address_match,
+ .hp_local_address = uds_local_address,
+ .hp_remote_address = uds_remote_address,
+ .hp_close = uds_close
+};
+
+/* Marked __constructor; hands the uds method table to proto_register(). */
+static __constructor void
+uds_ctor(void)
+{
+
+ proto_register(&uds_proto);
+}
diff --git a/sbin/hastd/rangelock.c b/sbin/hastd/rangelock.c
new file mode 100644
index 0000000..02247d6
--- /dev/null
+++ b/sbin/hastd/rangelock.c
@@ -0,0 +1,137 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/queue.h>
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "rangelock.h"
+
+/*
+ * Set of locked ranges.  rls_magic holds RANGELOCKS_MAGIC from
+ * rangelock_init() until rangelock_free() clears it.
+ */
+#define RANGELOCKS_MAGIC 0x94310c
+struct rangelocks {
+ int rls_magic; /* Magic value. */
+ TAILQ_HEAD(, rlock) rls_locks; /* List of locked ranges. */
+};
+
+/* One locked range, as recorded by rangelock_add(). */
+struct rlock {
+ off_t rl_start; /* Offset the range starts at. */
+ off_t rl_end; /* offset + length, i.e. first byte past the range. */
+ TAILQ_ENTRY(rlock) rl_next; /* Linkage on rls_locks. */
+};
+
+/*
+ * Allocate an empty set of range locks and return it through *rlsp.
+ * Returns 0 on success or -1 when the allocation fails.
+ */
+int
+rangelock_init(struct rangelocks **rlsp)
+{
+	struct rangelocks *newrls;
+
+	assert(rlsp != NULL);
+
+	if ((newrls = malloc(sizeof(*newrls))) == NULL)
+		return (-1);
+
+	newrls->rls_magic = RANGELOCKS_MAGIC;
+	TAILQ_INIT(&newrls->rls_locks);
+	*rlsp = newrls;
+
+	return (0);
+}
+
+/*
+ * Destroy a set of range locks, releasing every range still recorded in it.
+ * The magic is cleared first so a stale pointer fails the magic assertions.
+ */
+void
+rangelock_free(struct rangelocks *rls)
+{
+	struct rlock *rl;
+
+	assert(rls->rls_magic == RANGELOCKS_MAGIC);
+
+	rls->rls_magic = 0;
+
+	while (!TAILQ_EMPTY(&rls->rls_locks)) {
+		rl = TAILQ_FIRST(&rls->rls_locks);
+		TAILQ_REMOVE(&rls->rls_locks, rl, rl_next);
+		free(rl);
+	}
+	free(rls);
+}
+
+/*
+ * Record the range [offset, offset + length) as locked.  No check is made
+ * against ranges that are already recorded.
+ * Returns 0 on success or -1 when the allocation fails.
+ */
+int
+rangelock_add(struct rangelocks *rls, off_t offset, off_t length)
+{
+	struct rlock *newrl;
+
+	assert(rls->rls_magic == RANGELOCKS_MAGIC);
+
+	if ((newrl = malloc(sizeof(*newrl))) == NULL)
+		return (-1);
+
+	newrl->rl_start = offset;
+	newrl->rl_end = offset + length;
+	TAILQ_INSERT_TAIL(&rls->rls_locks, newrl, rl_next);
+
+	return (0);
+}
+
+/*
+ * Remove the first recorded range that matches offset and length exactly.
+ * The range must have been added before; this is asserted.
+ */
+void
+rangelock_del(struct rangelocks *rls, off_t offset, off_t length)
+{
+	struct rlock *rl;
+
+	assert(rls->rls_magic == RANGELOCKS_MAGIC);
+
+	for (rl = TAILQ_FIRST(&rls->rls_locks); rl != NULL;
+	    rl = TAILQ_NEXT(rl, rl_next)) {
+		if (rl->rl_start != offset)
+			continue;
+		if (rl->rl_end != offset + length)
+			continue;
+		break;
+	}
+	assert(rl != NULL);
+	TAILQ_REMOVE(&rls->rls_locks, rl, rl_next);
+	free(rl);
+}
+
+/*
+ * Tell whether any recorded range overlaps [offset, offset + length).
+ */
+bool
+rangelock_islocked(struct rangelocks *rls, off_t offset, off_t length)
+{
+	struct rlock *rl;
+
+	assert(rls->rls_magic == RANGELOCKS_MAGIC);
+
+	TAILQ_FOREACH(rl, &rls->rls_locks, rl_next) {
+		/* The lock starts inside the queried range. */
+		if (rl->rl_start >= offset && rl->rl_start < offset + length)
+			return (true);
+		/* The lock ends inside the queried range. */
+		if (rl->rl_end > offset && rl->rl_end <= offset + length)
+			return (true);
+		/* The lock strictly encloses the queried range. */
+		if (rl->rl_start < offset && rl->rl_end > offset + length)
+			return (true);
+	}
+	return (false);
+}
diff --git a/sbin/hastd/rangelock.h b/sbin/hastd/rangelock.h
new file mode 100644
index 0000000..2ad9895
--- /dev/null
+++ b/sbin/hastd/rangelock.h
@@ -0,0 +1,46 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _RANGELOCK_H_
+#define _RANGELOCK_H_
+
+#include <stdbool.h>
+#include <unistd.h>
+
+/* Opaque set of locked ranges; the layout is private to rangelock.c. */
+struct rangelocks;
+
+/* Allocate an empty set; returns 0, or -1 if the allocation fails. */
+int rangelock_init(struct rangelocks **rlsp);
+/* Free the set together with every range still recorded in it. */
+void rangelock_free(struct rangelocks *rls);
+/* Record [offset, offset + length); returns 0, or -1 if allocation fails. */
+int rangelock_add(struct rangelocks *rls, off_t offset, off_t length);
+/* Remove a range previously added with the same offset and length. */
+void rangelock_del(struct rangelocks *rls, off_t offset, off_t length);
+/* Return true if any recorded range overlaps [offset, offset + length). */
+bool rangelock_islocked(struct rangelocks *rls, off_t offset, off_t length);
+
+#endif /* !_RANGELOCK_H_ */
diff --git a/sbin/hastd/secondary.c b/sbin/hastd/secondary.c
new file mode 100644
index 0000000..6af95b5
--- /dev/null
+++ b/sbin/hastd/secondary.c
@@ -0,0 +1,697 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/bio.h>
+#include <sys/disk.h>
+#include <sys/stat.h>
+
+#include <assert.h>
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libgeom.h>
+#include <pthread.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include <activemap.h>
+#include <nv.h>
+#include <pjdlog.h>
+
+#include "control.h"
+#include "hast.h"
+#include "hast_proto.h"
+#include "hastd.h"
+#include "metadata.h"
+#include "proto.h"
+#include "subr.h"
+#include "synch.h"
+
+/*
+ * One request received from the primary node. A structure travels from the
+ * free list to the disk list (or straight to the send list when its header is
+ * rejected), then to the send list and finally back to the free list.
+ */
+struct hio {
+ /* Not referenced in this file; replies copy "seq" directly from hio_nv. */
+ uint64_t hio_seq;
+ /* Request result: 0 or an errno value, sent back as the "error" field. */
+ int hio_error;
+ /* Request header received by recv_thread() and freed by send_thread(). */
+ struct nv *hio_nv;
+ /* Data buffer of MAXPHYS bytes allocated once in init_environment(). */
+ void *hio_data;
+ /* Values of the "cmd", "offset" and "length" header fields. */
+ uint8_t hio_cmd;
+ uint64_t hio_offset;
+ uint64_t hio_length;
+ /* Linkage used by the free, disk and send lists. */
+ TAILQ_ENTRY(hio) hio_next;
+};
+
+/*
+ * Free list holds unused structures. When the free list is empty, the recv
+ * thread has to wait until some in-progress requests are freed.
+ */
+static TAILQ_HEAD(, hio) hio_free_list;
+static pthread_mutex_t hio_free_list_lock;
+static pthread_cond_t hio_free_list_cond;
+/*
+ * Disk thread (the one that does I/O requests) takes requests from this list.
+ */
+static TAILQ_HEAD(, hio) hio_disk_list;
+static pthread_mutex_t hio_disk_list_lock;
+static pthread_cond_t hio_disk_list_cond;
+/*
+ * Send thread takes requests from this list and sends the replies back to
+ * the primary node. Requests are put here by the disk thread once they are
+ * done, or directly by the recv thread when the request header is invalid.
+ */
+static TAILQ_HEAD(, hio) hio_send_list;
+static pthread_mutex_t hio_send_list_lock;
+static pthread_cond_t hio_send_list_cond;
+
+/*
+ * Maximum number of outstanding I/O requests. This is the number of hio
+ * structures preallocated by init_environment().
+ */
+#define HAST_HIO_MAX 256
+
+/* Worker thread entry points; each receives the struct hast_resource. */
+static void *recv_thread(void *arg);
+static void *disk_thread(void *arg);
+static void *send_thread(void *arg);
+
+/*
+ * Prepare the request lists together with their mutexes and condition
+ * variables, then preallocate HAST_HIO_MAX requests (each with a MAXPHYS
+ * data buffer) and place them on the free list. The process exits with
+ * EX_TEMPFAIL when memory cannot be allocated.
+ */
+static void
+init_environment(void)
+{
+	struct hio *req;
+	unsigned int left;
+
+	/* Lists first, then the synchronization objects guarding them. */
+	TAILQ_INIT(&hio_free_list);
+	TAILQ_INIT(&hio_disk_list);
+	TAILQ_INIT(&hio_send_list);
+	mtx_init(&hio_free_list_lock);
+	mtx_init(&hio_disk_list_lock);
+	mtx_init(&hio_send_list_lock);
+	cv_init(&hio_free_list_cond);
+	cv_init(&hio_disk_list_cond);
+	cv_init(&hio_send_list_cond);
+
+	/* Build the pool of requests. */
+	for (left = HAST_HIO_MAX; left > 0; left--) {
+		req = malloc(sizeof(*req));
+		if (req == NULL) {
+			errx(EX_TEMPFAIL, "cannot allocate %zu bytes of memory "
+			    "for hio request", sizeof(*req));
+		}
+		req->hio_data = malloc(MAXPHYS);
+		if (req->hio_data == NULL) {
+			errx(EX_TEMPFAIL, "cannot allocate %zu bytes of memory "
+			    "for gctl_data", (size_t)MAXPHYS);
+		}
+		req->hio_error = 0;
+		TAILQ_INSERT_HEAD(&hio_free_list, req, hio_next);
+	}
+}
+
+/*
+ * Read the metadata of the local component; exit with EX_NOINPUT when
+ * metadata_read() reports failure.
+ */
+static void
+init_local(struct hast_resource *res)
+{
+	int rv;
+
+	rv = metadata_read(res, true);
+	if (rv < 0)
+		exit(EX_NOINPUT);
+}
+
+/*
+ * Answer the primary's handshake.
+ *
+ * Reads "resuid", "localcnt" and "remotecnt" from nvin, compares the
+ * counters with our own and sends back a header ("datasize", "extentsize",
+ * "localcnt", "remotecnt", "mapsize" and either "syncsrc" or "errmsg")
+ * followed by the activemap. The process exits if the metadata cannot be
+ * written, the activemap cannot be read, the reply cannot be sent or a
+ * split-brain condition is detected.
+ */
+static void
+init_remote(struct hast_resource *res, struct nv *nvin)
+{
+	uint64_t resuid;
+	struct nv *nvout;
+	unsigned char *map;
+	size_t mapsize;
+
+	nvout = nv_alloc();
+	nv_add_int64(nvout, (int64_t)res->hr_datasize, "datasize");
+	nv_add_int32(nvout, (int32_t)res->hr_extentsize, "extentsize");
+	resuid = nv_get_uint64(nvin, "resuid");
+	res->hr_primary_localcnt = nv_get_uint64(nvin, "localcnt");
+	res->hr_primary_remotecnt = nv_get_uint64(nvin, "remotecnt");
+	nv_add_uint64(nvout, res->hr_secondary_localcnt, "localcnt");
+	nv_add_uint64(nvout, res->hr_secondary_remotecnt, "remotecnt");
+	mapsize = activemap_calc_ondisk_size(res->hr_local_mediasize -
+	    METADATA_SIZE, res->hr_extentsize, res->hr_local_sectorsize);
+	map = malloc(mapsize);
+	if (map == NULL) {
+		pjdlog_exitx(EX_TEMPFAIL,
+		    "Unable to allocate memory (%zu bytes) for activemap.",
+		    mapsize);
+	}
+	nv_add_uint32(nvout, (uint32_t)mapsize, "mapsize");
+	/*
+	 * When we work as primary and secondary is missing we will increase
+	 * localcnt in our metadata. When secondary is connected and synced
+	 * we make localcnt be equal to remotecnt, which means nodes are more
+	 * or less in sync.
+	 * Split-brain condition is when both nodes are not able to communicate
+	 * and are both configured as primary nodes. In turn, they can both
+	 * make incompatible changes to the data and we have to detect that.
+	 * Under split-brain condition we will increase our localcnt on first
+	 * write and remote node will increase its localcnt on first write.
+	 * When we connect we can see that primary's localcnt is greater than
+	 * our remotecnt (primary was modified while we weren't watching) and
+	 * our localcnt is greater than primary's remotecnt (we were modified
+	 * while primary wasn't watching).
+	 * There are many possible combinations which are all gathered below.
+	 * Don't pay too much attention to exact numbers, the more important
+	 * is to compare them. We compare secondary's local with primary's
+	 * remote and secondary's remote with primary's local.
+	 * Note that every case where primary's localcnt is smaller than
+	 * secondary's remotecnt and where secondary's localcnt is smaller than
+	 * primary's remotecnt should be impossible in practice. We will perform
+	 * full synchronization then. Those cases are marked with an asterisk.
+	 * Regular synchronization means that only extents marked as dirty are
+	 * synchronized.
+	 *
+	 * SECONDARY METADATA PRIMARY METADATA
+	 * local=3 remote=3 local=2 remote=2* ?! Full sync from secondary.
+	 * local=3 remote=3 local=2 remote=3* ?! Full sync from primary.
+	 * local=3 remote=3 local=2 remote=4* ?! Full sync from primary.
+	 * local=3 remote=3 local=3 remote=2 Primary is out-of-date,
+	 * regular sync from secondary.
+	 * local=3 remote=3 local=3 remote=3 Regular sync just in case.
+	 * local=3 remote=3 local=3 remote=4* ?! Full sync from primary.
+	 * local=3 remote=3 local=4 remote=2 Split-brain condition.
+	 * local=3 remote=3 local=4 remote=3 Secondary out-of-date,
+	 * regular sync from primary.
+	 * local=3 remote=3 local=4 remote=4* ?! Full sync from primary.
+	 */
+	if (res->hr_resuid == 0) {
+		/*
+		 * Provider is used for the first time. Initialize everything.
+		 */
+		assert(res->hr_secondary_localcnt == 0);
+		res->hr_resuid = resuid;
+		if (metadata_write(res) < 0)
+			exit(EX_NOINPUT);
+		memset(map, 0xff, mapsize);
+		nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc");
+	} else if (
+	    /* Is primary out-of-date? */
+	    (res->hr_secondary_localcnt > res->hr_primary_remotecnt &&
+	     res->hr_secondary_remotecnt == res->hr_primary_localcnt) ||
+	    /* Are nodes more or less in sync? */
+	    (res->hr_secondary_localcnt == res->hr_primary_remotecnt &&
+	     res->hr_secondary_remotecnt == res->hr_primary_localcnt) ||
+	    /* Is secondary out-of-date? */
+	    (res->hr_secondary_localcnt == res->hr_primary_remotecnt &&
+	     res->hr_secondary_remotecnt < res->hr_primary_localcnt)) {
+		/*
+		 * Nodes are more or less in sync or one of the nodes is
+		 * out-of-date.
+		 * It doesn't matter at this point which one, we just have to
+		 * send out local bitmap to the remote node.
+		 */
+		if (pread(res->hr_localfd, map, mapsize, METADATA_SIZE) !=
+		    (ssize_t)mapsize) {
+			/*
+			 * The first argument is the process exit code, so
+			 * pass an EX_* value like the other callers do.
+			 */
+			pjdlog_exit(EX_NOINPUT, "Unable to read activemap");
+		}
+		if (res->hr_secondary_localcnt > res->hr_primary_remotecnt &&
+		    res->hr_secondary_remotecnt == res->hr_primary_localcnt) {
+			/* Primary is out-of-date, sync from secondary. */
+			nv_add_uint8(nvout, HAST_SYNCSRC_SECONDARY, "syncsrc");
+		} else {
+			/*
+			 * Secondary is out-of-date or counts match.
+			 * Sync from primary.
+			 */
+			nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc");
+		}
+	} else if (res->hr_secondary_localcnt > res->hr_primary_remotecnt &&
+	    res->hr_primary_localcnt > res->hr_secondary_remotecnt) {
+		/*
+		 * Not good, we have split-brain condition. Only the error
+		 * message is sent, without any activemap data.
+		 */
+		pjdlog_error("Split-brain detected, exiting.");
+		nv_add_string(nvout, "Split-brain condition!", "errmsg");
+		free(map);
+		map = NULL;
+		mapsize = 0;
+	} else /* if (res->hr_secondary_localcnt < res->hr_primary_remotecnt ||
+	    res->hr_primary_localcnt < res->hr_secondary_remotecnt) */ {
+		/*
+		 * This should never happen in practice, but we will perform
+		 * full synchronization.
+		 */
+		assert(res->hr_secondary_localcnt < res->hr_primary_remotecnt ||
+		    res->hr_primary_localcnt < res->hr_secondary_remotecnt);
+		/* mapsize computed above is still valid; mark all dirty. */
+		memset(map, 0xff, mapsize);
+		if (res->hr_secondary_localcnt > res->hr_primary_remotecnt) {
+			/* In this one of five cases sync from secondary. */
+			nv_add_uint8(nvout, HAST_SYNCSRC_SECONDARY, "syncsrc");
+		} else {
+			/* For the rest four cases sync from primary. */
+			nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc");
+		}
+		pjdlog_warning("This should never happen, asking for full synchronization (primary(local=%ju, remote=%ju), secondary(local=%ju, remote=%ju)).",
+		    (uintmax_t)res->hr_primary_localcnt,
+		    (uintmax_t)res->hr_primary_remotecnt,
+		    (uintmax_t)res->hr_secondary_localcnt,
+		    (uintmax_t)res->hr_secondary_remotecnt);
+	}
+	if (hast_proto_send(res, res->hr_remotein, nvout, map, mapsize) < 0) {
+		pjdlog_errno(LOG_WARNING, "Unable to send activemap to %s",
+		    res->hr_remoteaddr);
+		nv_free(nvout);
+		exit(EX_TEMPFAIL);
+	}
+	/* The reply is out; neither the map nor the header is needed again. */
+	free(map);
+	nv_free(nvout);
+	if (res->hr_secondary_localcnt > res->hr_primary_remotecnt &&
+	    res->hr_primary_localcnt > res->hr_secondary_remotecnt) {
+		/* Exit on split-brain. */
+		exit(EX_CONFIG);
+	}
+}
+
+/*
+ * Start a worker process for a resource working as secondary.
+ *
+ * A control channel is created in res->hr_ctrl and the process forks.
+ * The parent closes its copies of the remote connections, stores the worker
+ * PID in res->hr_workerpid and returns. The child closes the pidfile handle,
+ * runs init_local(), init_remote() (which reads the primary's resuid and
+ * counters from nvin) and init_environment(), starts the recv, disk and send
+ * threads and finally calls ctrl_thread() itself.
+ */
+void
+hastd_secondary(struct hast_resource *res, struct nv *nvin)
+{
+ pthread_t td;
+ pid_t pid;
+ int error;
+
+ /*
+ * Create communication channel between parent and child.
+ */
+ if (proto_client("socketpair://", &res->hr_ctrl) < 0) {
+ /* Keep errno from proto_client() for the message logged below. */
+ KEEP_ERRNO((void)pidfile_remove(pfh));
+ pjdlog_exit(EX_OSERR,
+ "Unable to create control sockets between parent and child");
+ }
+
+ pid = fork();
+ if (pid < 0) {
+ KEEP_ERRNO((void)pidfile_remove(pfh));
+ pjdlog_exit(EX_OSERR, "Unable to fork");
+ }
+
+ if (pid > 0) {
+ /* This is parent. */
+ /* The remote connections are used by the child from now on. */
+ proto_close(res->hr_remotein);
+ res->hr_remotein = NULL;
+ proto_close(res->hr_remoteout);
+ res->hr_remoteout = NULL;
+ res->hr_workerpid = pid;
+ return;
+ }
+ /* This is child. */
+ (void)pidfile_close(pfh);
+
+ setproctitle("%s (secondary)", res->hr_name);
+
+ init_local(res);
+ init_remote(res, nvin);
+ init_environment();
+
+ /* The thread handle is reused; the threads are never joined here. */
+ error = pthread_create(&td, NULL, recv_thread, res);
+ assert(error == 0);
+ error = pthread_create(&td, NULL, disk_thread, res);
+ assert(error == 0);
+ error = pthread_create(&td, NULL, send_thread, res);
+ assert(error == 0);
+ /* The initial thread of the child runs the control loop. */
+ (void)ctrl_thread(res);
+}
+
+/*
+ * Log a message about the given request. The caller's formatted text is
+ * followed by a description of the request (command name plus offset and
+ * length where applicable) and the result is handed to pjdlog_common().
+ * The description is skipped when the formatted text does not fit in the
+ * local buffer.
+ */
+static void
+reqlog(int loglevel, int debuglevel, int error, struct hio *hio, const char *fmt, ...)
+{
+	char msg[1024];
+	va_list ap;
+	char *tail;
+	size_t room;
+	int len;
+
+	va_start(ap, fmt);
+	len = vsnprintf(msg, sizeof(msg), fmt, ap);
+	va_end(ap);
+	if ((size_t)len >= sizeof(msg)) {
+		/* No room left (or vsnprintf() failed): log what we have. */
+		pjdlog_common(loglevel, debuglevel, error, "%s", msg);
+		return;
+	}
+
+	tail = msg + len;
+	room = sizeof(msg) - len;
+	switch (hio->hio_cmd) {
+	case HIO_READ:
+		(void)snprintf(tail, room, "READ(%ju, %ju).",
+		    (uintmax_t)hio->hio_offset, (uintmax_t)hio->hio_length);
+		break;
+	case HIO_DELETE:
+		(void)snprintf(tail, room, "DELETE(%ju, %ju).",
+		    (uintmax_t)hio->hio_offset, (uintmax_t)hio->hio_length);
+		break;
+	case HIO_FLUSH:
+		(void)snprintf(tail, room, "FLUSH.");
+		break;
+	case HIO_WRITE:
+		(void)snprintf(tail, room, "WRITE(%ju, %ju).",
+		    (uintmax_t)hio->hio_offset, (uintmax_t)hio->hio_length);
+		break;
+	default:
+		(void)snprintf(tail, room, "UNKNOWN(%u).",
+		    (unsigned int)hio->hio_cmd);
+		break;
+	}
+	pjdlog_common(loglevel, debuglevel, error, "%s", msg);
+}
+
+/*
+ * Decode and validate the request header stored in hio->hio_nv.
+ *
+ * Fills in hio_cmd and, for READ, WRITE and DELETE, hio_offset and
+ * hio_length. These must be non-zero-length, at most MAXPHYS long, aligned to
+ * the local sector size and contained within res->hr_datasize. FLUSH carries
+ * no range and is accepted as is.
+ *
+ * Returns 0 on success. On failure an error is logged, hio->hio_error is set
+ * to EINVAL and that value is returned.
+ */
+static int
+requnpack(struct hast_resource *res, struct hio *hio)
+{
+	uint64_t datasize;
+
+	hio->hio_cmd = nv_get_uint8(hio->hio_nv, "cmd");
+	if (hio->hio_cmd == 0) {
+		pjdlog_error("Header contains no 'cmd' field.");
+		hio->hio_error = EINVAL;
+		goto end;
+	}
+	switch (hio->hio_cmd) {
+	case HIO_FLUSH:
+		/*
+		 * The disk and send threads handle FLUSH, so it must not be
+		 * rejected here. There is no offset or length to validate.
+		 */
+		break;
+	case HIO_READ:
+	case HIO_WRITE:
+	case HIO_DELETE:
+		hio->hio_offset = nv_get_uint64(hio->hio_nv, "offset");
+		if (nv_error(hio->hio_nv) != 0) {
+			pjdlog_error("Header is missing 'offset' field.");
+			hio->hio_error = EINVAL;
+			goto end;
+		}
+		hio->hio_length = nv_get_uint64(hio->hio_nv, "length");
+		if (nv_error(hio->hio_nv) != 0) {
+			pjdlog_error("Header is missing 'length' field.");
+			hio->hio_error = EINVAL;
+			goto end;
+		}
+		if (hio->hio_length == 0) {
+			pjdlog_error("Data length is zero.");
+			hio->hio_error = EINVAL;
+			goto end;
+		}
+		if (hio->hio_length > MAXPHYS) {
+			pjdlog_error("Data length is too large (%ju > %ju).",
+			    (uintmax_t)hio->hio_length, (uintmax_t)MAXPHYS);
+			hio->hio_error = EINVAL;
+			goto end;
+		}
+		if ((hio->hio_offset % res->hr_local_sectorsize) != 0) {
+			pjdlog_error("Offset %ju is not multiple of sector size.",
+			    (uintmax_t)hio->hio_offset);
+			hio->hio_error = EINVAL;
+			goto end;
+		}
+		if ((hio->hio_length % res->hr_local_sectorsize) != 0) {
+			pjdlog_error("Length %ju is not multiple of sector size.",
+			    (uintmax_t)hio->hio_length);
+			hio->hio_error = EINVAL;
+			goto end;
+		}
+		/*
+		 * Compare without adding offset and length first: the sum
+		 * could wrap around for a huge offset and slip past the check.
+		 */
+		datasize = (uint64_t)res->hr_datasize;
+		if (hio->hio_offset > datasize ||
+		    hio->hio_length > datasize - hio->hio_offset) {
+			pjdlog_error("Data offset is too large (%ju > %ju).",
+			    (uintmax_t)(hio->hio_offset + hio->hio_length),
+			    (uintmax_t)res->hr_datasize);
+			hio->hio_error = EINVAL;
+			goto end;
+		}
+		break;
+	default:
+		pjdlog_error("Header contains invalid 'cmd' (%hhu).",
+		    hio->hio_cmd);
+		hio->hio_error = EINVAL;
+		goto end;
+	}
+	hio->hio_error = 0;
+end:
+	return (hio->hio_error);
+}
+
+/*
+ * Thread receives requests from the primary node.
+ *
+ * For every request a free hio is taken (waiting for one if necessary), the
+ * header is received and validated with requnpack(), and for WRITE requests
+ * the data is received into hio_data. Valid requests go to the disk queue;
+ * requests with an invalid header go straight to the send queue so that the
+ * error is reported back. Failure to receive terminates the process.
+ *
+ * NOTE(review): when the header of a WRITE request is rejected its payload
+ * is not read from the connection - confirm that the stream cannot get out
+ * of sync in that case.
+ */
+static void *
+recv_thread(void *arg)
+{
+	struct hast_resource *res = arg;
+	struct hio *hio;
+	bool wakeup;
+
+	for (;;) {
+		pjdlog_debug(2, "recv: Taking free request.");
+		mtx_lock(&hio_free_list_lock);
+		while ((hio = TAILQ_FIRST(&hio_free_list)) == NULL) {
+			pjdlog_debug(2, "recv: No free requests, waiting.");
+			cv_wait(&hio_free_list_cond, &hio_free_list_lock);
+		}
+		TAILQ_REMOVE(&hio_free_list, hio, hio_next);
+		mtx_unlock(&hio_free_list_lock);
+		pjdlog_debug(2, "recv: (%p) Got request.", hio);
+		if (hast_proto_recv_hdr(res->hr_remotein, &hio->hio_nv) < 0) {
+			pjdlog_exit(EX_TEMPFAIL,
+			    "Unable to receive request header");
+		}
+		if (requnpack(res, hio) != 0)
+			goto send_queue;
+		reqlog(LOG_DEBUG, 2, -1, hio,
+		    "recv: (%p) Got request header: ", hio);
+		if (hio->hio_cmd == HIO_WRITE) {
+			if (hast_proto_recv_data(res, res->hr_remotein,
+			    hio->hio_nv, hio->hio_data, MAXPHYS) < 0) {
+				pjdlog_exit(EX_TEMPFAIL,
+				    "Unable to receive request data");
+			}
+		}
+		pjdlog_debug(2, "recv: (%p) Moving request to the disk queue.",
+		    hio);
+		mtx_lock(&hio_disk_list_lock);
+		/* The consumer only sleeps on an empty list. */
+		wakeup = TAILQ_EMPTY(&hio_disk_list);
+		TAILQ_INSERT_TAIL(&hio_disk_list, hio, hio_next);
+		mtx_unlock(&hio_disk_list_lock);
+		if (wakeup)
+			cv_signal(&hio_disk_list_cond);
+		continue;
+send_queue:
+		pjdlog_debug(2, "recv: (%p) Moving request to the send queue.",
+		    hio);
+		mtx_lock(&hio_send_list_lock);
+		wakeup = TAILQ_EMPTY(&hio_send_list);
+		TAILQ_INSERT_TAIL(&hio_send_list, hio, hio_next);
+		mtx_unlock(&hio_send_list_lock);
+		if (wakeup)
+			cv_signal(&hio_send_list_cond);
+	}
+	/* NOTREACHED */
+	return (NULL);
+}
+
+/*
+ * Thread reads from or writes to local component and also handles DELETE and
+ * FLUSH requests.
+ *
+ * Requests are taken from the disk queue, executed against res->hr_localfd
+ * and moved to the send queue with hio_error set to 0 or an errno value.
+ * Before the first request is executed the on-disk activemap is overwritten
+ * with zeros.
+ */
+static void *
+disk_thread(void *arg)
+{
+ struct hast_resource *res = arg;
+ struct hio *hio;
+ ssize_t ret;
+ bool clear_activemap, wakeup;
+
+ clear_activemap = true;
+
+ for (;;) {
+ pjdlog_debug(2, "disk: Taking request.");
+ mtx_lock(&hio_disk_list_lock);
+ while ((hio = TAILQ_FIRST(&hio_disk_list)) == NULL) {
+ pjdlog_debug(2, "disk: No requests, waiting.");
+ cv_wait(&hio_disk_list_cond, &hio_disk_list_lock);
+ }
+ TAILQ_REMOVE(&hio_disk_list, hio, hio_next);
+ mtx_unlock(&hio_disk_list_lock);
+ /*
+ * This loop runs its body at most once per request: it either
+ * clears the flag on success or breaks out on failure, in which
+ * case clearing is attempted again with the next request.
+ */
+ while (clear_activemap) {
+ unsigned char *map;
+ size_t mapsize;
+
+ /*
+ * When first request is received, it means that primary
+ * already received our activemap, merged it and stored
+ * locally. We can now safely clear our activemap.
+ */
+ mapsize =
+ activemap_calc_ondisk_size(res->hr_local_mediasize -
+ METADATA_SIZE, res->hr_extentsize,
+ res->hr_local_sectorsize);
+ map = calloc(1, mapsize);
+ if (map == NULL) {
+ pjdlog_warning("Unable to allocate memory to clear local activemap.");
+ break;
+ }
+ /* The activemap is stored right after the metadata. */
+ if (pwrite(res->hr_localfd, map, mapsize,
+ METADATA_SIZE) != (ssize_t)mapsize) {
+ pjdlog_errno(LOG_WARNING,
+ "Unable to store cleared activemap");
+ free(map);
+ break;
+ }
+ free(map);
+ clear_activemap = false;
+ pjdlog_debug(1, "Local activemap cleared.");
+ }
+ reqlog(LOG_DEBUG, 2, -1, hio, "disk: (%p) Got request: ", hio);
+ /* Handle the actual request. */
+ /*
+ * Request offsets are shifted by res->hr_localoff. A short read
+ * or write is reported as EIO.
+ */
+ switch (hio->hio_cmd) {
+ case HIO_READ:
+ ret = pread(res->hr_localfd, hio->hio_data,
+ hio->hio_length,
+ hio->hio_offset + res->hr_localoff);
+ if (ret < 0)
+ hio->hio_error = errno;
+ else if (ret != (int64_t)hio->hio_length)
+ hio->hio_error = EIO;
+ else
+ hio->hio_error = 0;
+ break;
+ case HIO_WRITE:
+ ret = pwrite(res->hr_localfd, hio->hio_data,
+ hio->hio_length,
+ hio->hio_offset + res->hr_localoff);
+ if (ret < 0)
+ hio->hio_error = errno;
+ else if (ret != (int64_t)hio->hio_length)
+ hio->hio_error = EIO;
+ else
+ hio->hio_error = 0;
+ break;
+ case HIO_DELETE:
+ ret = g_delete(res->hr_localfd,
+ hio->hio_offset + res->hr_localoff,
+ hio->hio_length);
+ if (ret < 0)
+ hio->hio_error = errno;
+ else
+ hio->hio_error = 0;
+ break;
+ case HIO_FLUSH:
+ ret = g_flush(res->hr_localfd);
+ if (ret < 0)
+ hio->hio_error = errno;
+ else
+ hio->hio_error = 0;
+ break;
+ }
+ if (hio->hio_error != 0) {
+ reqlog(LOG_ERR, 0, hio->hio_error, hio,
+ "Request failed: ");
+ }
+ pjdlog_debug(2, "disk: (%p) Moving request to the send queue.",
+ hio);
+ mtx_lock(&hio_send_list_lock);
+ /* Signal only when the list was empty before the insert. */
+ wakeup = TAILQ_EMPTY(&hio_send_list);
+ TAILQ_INSERT_TAIL(&hio_send_list, hio, hio_next);
+ mtx_unlock(&hio_send_list_lock);
+ if (wakeup)
+ cv_signal(&hio_send_list_cond);
+ }
+ /* NOTREACHED */
+ return (NULL);
+}
+
+/*
+ * Thread sends replies back to the primary node.
+ *
+ * Requests are taken from the send queue. The reply header carries the
+ * sequence number copied from the request header and, when the request
+ * failed, the "error" field. Data is attached only to a successful READ.
+ * Afterwards the request header is freed and the hio returns to the free
+ * list. Failure to send terminates the process.
+ */
+static void *
+send_thread(void *arg)
+{
+	struct hast_resource *res = arg;
+	struct nv *nvout;
+	struct hio *hio;
+	void *data;
+	size_t length;
+	bool wakeup;
+
+	for (;;) {
+		pjdlog_debug(2, "send: Taking request.");
+		mtx_lock(&hio_send_list_lock);
+		while ((hio = TAILQ_FIRST(&hio_send_list)) == NULL) {
+			pjdlog_debug(2, "send: No requests, waiting.");
+			cv_wait(&hio_send_list_cond, &hio_send_list_lock);
+		}
+		TAILQ_REMOVE(&hio_send_list, hio, hio_next);
+		mtx_unlock(&hio_send_list_lock);
+		reqlog(LOG_DEBUG, 2, -1, hio, "send: (%p) Got request: ", hio);
+		nvout = nv_alloc();
+		/* Copy sequence number. */
+		nv_add_uint64(nvout, nv_get_uint64(hio->hio_nv, "seq"), "seq");
+		/* We send no data unless this is a successful READ. */
+		data = NULL;
+		length = 0;
+		switch (hio->hio_cmd) {
+		case HIO_READ:
+			if (hio->hio_error == 0) {
+				data = hio->hio_data;
+				length = hio->hio_length;
+			}
+			break;
+		case HIO_DELETE:
+		case HIO_FLUSH:
+		case HIO_WRITE:
+			break;
+		default:
+			/*
+			 * A header with a missing or unknown command is
+			 * rejected by requnpack() and queued here only to
+			 * report the error. An unknown command without an
+			 * error is an internal inconsistency.
+			 */
+			if (hio->hio_error == 0)
+				abort();
+			break;
+		}
+		if (hio->hio_error != 0)
+			nv_add_int16(nvout, hio->hio_error, "error");
+		if (hast_proto_send(res, res->hr_remoteout, nvout, data,
+		    length) < 0) {
+			pjdlog_exit(EX_TEMPFAIL, "Unable to send reply.");
+		}
+		nv_free(nvout);
+		pjdlog_debug(2, "send: (%p) Moving request to the free queue.",
+		    hio);
+		nv_free(hio->hio_nv);
+		hio->hio_error = 0;
+		mtx_lock(&hio_free_list_lock);
+		/* The recv thread only sleeps on an empty free list. */
+		wakeup = TAILQ_EMPTY(&hio_free_list);
+		TAILQ_INSERT_TAIL(&hio_free_list, hio, hio_next);
+		mtx_unlock(&hio_free_list_lock);
+		if (wakeup)
+			cv_signal(&hio_free_list_cond);
+	}
+	/* NOTREACHED */
+	return (NULL);
+}
diff --git a/sbin/hastd/subr.c b/sbin/hastd/subr.c
new file mode 100644
index 0000000..16ea93f
--- /dev/null
+++ b/sbin/hastd/subr.c
@@ -0,0 +1,118 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/disk.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+
+#include <pjdlog.h>
+
+#include "hast.h"
+#include "subr.h"
+
+/*
+ * Open the local component if it is not open yet (read-write when dowrite is
+ * true, read-only otherwise) and fill in res->hr_local_mediasize and
+ * res->hr_local_sectorsize. Character devices are queried with the
+ * DIOCGMEDIASIZE and DIOCGSECTORSIZE ioctls; for regular files the file size
+ * and a fixed sector size of 512 bytes are used.
+ *
+ * Returns 0 on success and -1 on failure, with the reason logged and left
+ * in errno.
+ */
+int
+provinfo(struct hast_resource *res, bool dowrite)
+{
+	struct stat sb;
+	int oflags;
+
+	assert(res->hr_localpath != NULL && res->hr_localpath[0] != '\0');
+
+	if (res->hr_localfd == -1) {
+		oflags = dowrite ? O_RDWR : O_RDONLY;
+		res->hr_localfd = open(res->hr_localpath, oflags);
+		if (res->hr_localfd < 0) {
+			KEEP_ERRNO(pjdlog_errno(LOG_ERR, "Unable to open %s",
+			    res->hr_localpath));
+			return (-1);
+		}
+	}
+	if (fstat(res->hr_localfd, &sb) < 0) {
+		KEEP_ERRNO(pjdlog_errno(LOG_ERR, "Unable to stat %s",
+		    res->hr_localpath));
+		return (-1);
+	}
+
+	if (S_ISREG(sb.st_mode)) {
+		/*
+		 * We also support regular files for which we hardcode
+		 * sector size of 512 bytes.
+		 */
+		res->hr_local_mediasize = sb.st_size;
+		res->hr_local_sectorsize = 512;
+		return (0);
+	}
+	if (!S_ISCHR(sb.st_mode)) {
+		/*
+		 * We support no other file types.
+		 */
+		pjdlog_error("%s is neither GEOM provider nor regular file.",
+		    res->hr_localpath);
+		errno = EFTYPE;
+		return (-1);
+	}
+
+	/*
+	 * If this is character device, it is most likely GEOM provider.
+	 */
+	if (ioctl(res->hr_localfd, DIOCGMEDIASIZE,
+	    &res->hr_local_mediasize) < 0) {
+		KEEP_ERRNO(pjdlog_errno(LOG_ERR,
+		    "Unable obtain provider %s mediasize",
+		    res->hr_localpath));
+		return (-1);
+	}
+	if (ioctl(res->hr_localfd, DIOCGSECTORSIZE,
+	    &res->hr_local_sectorsize) < 0) {
+		KEEP_ERRNO(pjdlog_errno(LOG_ERR,
+		    "Unable obtain provider %s sectorsize",
+		    res->hr_localpath));
+		return (-1);
+	}
+	return (0);
+}
+
+/*
+ * Return the name of the given role: "init", "primary" or "secondary",
+ * and "unknown" for any other value.
+ */
+const char *
+role2str(int role)
+{
+	const char *name;
+
+	switch (role) {
+	case HAST_ROLE_INIT:
+		name = "init";
+		break;
+	case HAST_ROLE_PRIMARY:
+		name = "primary";
+		break;
+	case HAST_ROLE_SECONDARY:
+		name = "secondary";
+		break;
+	default:
+		name = "unknown";
+		break;
+	}
+	return (name);
+}
diff --git a/sbin/hastd/subr.h b/sbin/hastd/subr.h
new file mode 100644
index 0000000..c486f5c
--- /dev/null
+++ b/sbin/hastd/subr.h
@@ -0,0 +1,51 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SUBR_H_
+#define _SUBR_H_
+
+#include <sys/types.h>
+#include <stdbool.h>
+
+#include "hast.h"
+
+/*
+ * Execute the statement 'work' while preserving errno: the value is saved
+ * before and restored after, so work (typically logging or cleanup on an
+ * error path) cannot clobber the error code the caller is about to report.
+ * The code using this macro has to have errno declared, as this header does
+ * not include <errno.h> itself.
+ */
+#define KEEP_ERRNO(work) do { \
+ int _rerrno; \
+ \
+ _rerrno = errno; \
+ work; \
+ errno = _rerrno; \
+} while (0)
+
+/*
+ * Open the local component if needed and obtain its media and sector size.
+ * Returns 0 on success and -1 on failure.
+ */
+int provinfo(struct hast_resource *res, bool dowrite);
+/* Return the name of a HAST_ROLE_* value, "unknown" for other values. */
+const char *role2str(int role);
+
+#endif /* !_SUBR_H_ */
diff --git a/sbin/hastd/synch.h b/sbin/hastd/synch.h
new file mode 100644
index 0000000..7269aea
--- /dev/null
+++ b/sbin/hastd/synch.h
@@ -0,0 +1,162 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SYNCH_H_
+#define _SYNCH_H_
+
+#include <assert.h>
+#include <errno.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <time.h>
+
+/*
+ * Initialize a mutex with default attributes. Failure trips an assertion.
+ */
+static __inline void
+mtx_init(pthread_mutex_t *lock)
+{
+	int rv = pthread_mutex_init(lock, NULL);
+
+	assert(rv == 0);
+}
+/*
+ * Acquire a mutex, blocking until it is available. Failure trips an
+ * assertion.
+ */
+static __inline void
+mtx_lock(pthread_mutex_t *lock)
+{
+	int rv = pthread_mutex_lock(lock);
+
+	assert(rv == 0);
+}
+/*
+ * Try to acquire a mutex without blocking. Returns true when the mutex was
+ * acquired and false otherwise. Any error other than EBUSY trips an
+ * assertion.
+ */
+static __inline bool
+mtx_trylock(pthread_mutex_t *lock)
+{
+	int rv = pthread_mutex_trylock(lock);
+
+	assert(rv == 0 || rv == EBUSY);
+	return (rv == 0);
+}
+/*
+ * Release a mutex. Failure trips an assertion.
+ */
+static __inline void
+mtx_unlock(pthread_mutex_t *lock)
+{
+	int rv = pthread_mutex_unlock(lock);
+
+	assert(rv == 0);
+}
+
+/*
+ * Initialize a read-write lock with default attributes. Failure trips an
+ * assertion.
+ */
+static __inline void
+rw_init(pthread_rwlock_t *lock)
+{
+	int rv = pthread_rwlock_init(lock, NULL);
+
+	assert(rv == 0);
+}
+/*
+ * Acquire a read-write lock for reading. Failure trips an assertion.
+ */
+static __inline void
+rw_rlock(pthread_rwlock_t *lock)
+{
+	int rv = pthread_rwlock_rdlock(lock);
+
+	assert(rv == 0);
+}
+/*
+ * Acquire a read-write lock for writing. Failure trips an assertion.
+ */
+static __inline void
+rw_wlock(pthread_rwlock_t *lock)
+{
+	int rv = pthread_rwlock_wrlock(lock);
+
+	assert(rv == 0);
+}
+/*
+ * Release a read-write lock held for reading or writing. Failure trips an
+ * assertion.
+ */
+static __inline void
+rw_unlock(pthread_rwlock_t *lock)
+{
+	int rv = pthread_rwlock_unlock(lock);
+
+	assert(rv == 0);
+}
+
+/*
+ * Initialize a condition variable that measures timeouts against
+ * CLOCK_MONOTONIC, which is the clock cv_timedwait() uses to compute its
+ * deadline. Failure of any step trips an assertion.
+ */
+static __inline void
+cv_init(pthread_cond_t *cv)
+{
+	pthread_condattr_t attr;
+	int error;
+
+	error = pthread_condattr_init(&attr);
+	assert(error == 0);
+	error = pthread_condattr_setclock(&attr, CLOCK_MONOTONIC);
+	assert(error == 0);
+	error = pthread_cond_init(cv, &attr);
+	assert(error == 0);
+	/*
+	 * The attributes object is not needed once the condition variable
+	 * is initialized; destroying it does not affect the variable.
+	 */
+	error = pthread_condattr_destroy(&attr);
+	assert(error == 0);
+}
+/*
+ * Wait on a condition variable; 'lock' is passed to pthread_cond_wait() as
+ * the associated mutex. Failure trips an assertion.
+ */
+static __inline void
+cv_wait(pthread_cond_t *cv, pthread_mutex_t *lock)
+{
+	int rv = pthread_cond_wait(cv, lock);
+
+	assert(rv == 0);
+}
+/*
+ * Wait on a condition variable for at most 'timeout' seconds, measured on
+ * CLOCK_MONOTONIC. A timeout of 0 means wait without a time limit.
+ * Returns true when the wait timed out and false otherwise.
+ */
+static __inline bool
+cv_timedwait(pthread_cond_t *cv, pthread_mutex_t *lock, int timeout)
+{
+	struct timespec deadline;
+	int rv;
+
+	if (timeout != 0) {
+		rv = clock_gettime(CLOCK_MONOTONIC, &deadline);
+		assert(rv == 0);
+		deadline.tv_sec += timeout;
+		rv = pthread_cond_timedwait(cv, lock, &deadline);
+		assert(rv == 0 || rv == ETIMEDOUT);
+		return (rv == ETIMEDOUT);
+	}
+
+	cv_wait(cv, lock);
+	return (false);
+}
+/*
+ * Signal a condition variable with pthread_cond_signal(). Failure trips an
+ * assertion.
+ */
+static __inline void
+cv_signal(pthread_cond_t *cv)
+{
+	int rv = pthread_cond_signal(cv);
+
+	assert(rv == 0);
+}
+/*
+ * Broadcast on a condition variable with pthread_cond_broadcast(). Failure
+ * trips an assertion.
+ */
+static __inline void
+cv_broadcast(pthread_cond_t *cv)
+{
+	int rv = pthread_cond_broadcast(cv);
+
+	assert(rv == 0);
+}
+#endif /* !_SYNCH_H_ */
diff --git a/sbin/hastd/token.l b/sbin/hastd/token.l
new file mode 100644
index 0000000..7b80384
--- /dev/null
+++ b/sbin/hastd/token.l
@@ -0,0 +1,66 @@
+%{
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include "hast.h"
+
+#include "y.tab.h"
+
+/* Brace nesting level: incremented on '{' and decremented on '}' by the scanner rules. */
+int depth;
+/* Number of newline characters the scanner has consumed so far. */
+int lineno;
+
+/*
+ * Expands to an empty statement.  It is invoked at the start of every rule
+ * action that returns a token.
+ * NOTE(review): presumably a placeholder hook for debug printing - confirm.
+ */
+#define DP do { } while (0)
+%}
+
+%%
+	/*
+	 * Scanner rules for the configuration file.
+	 *
+	 * lex picks the longest match and breaks ties in favour of the rule
+	 * listed first, so the keyword and number rules must stay above the
+	 * generic string rule: an exact keyword or an all-digit word wins
+	 * the tie, while any longer word falls through to STR.
+	 *
+	 * STR hands the parser a strdup() copy of the text in yylval.str; the
+	 * copy is not NULL-checked here.
+	 * NOTE(review): the consumer presumably owns and frees it - confirm.
+	 *
+	 * The comment rule deliberately has no trailing '$'.  In lex, "r$"
+	 * means "r followed by a newline", so "#.*$" would fail to match a
+	 * comment on a final line that lacks a newline, and the comment text
+	 * would then be scanned as configuration tokens.  '.' never matches
+	 * a newline, so "#.*" already stops at the end of the line.
+	 */
+control { DP; return CONTROL; }
+listen { DP; return LISTEN; }
+port { DP; return PORT; }
+replication { DP; return REPLICATION; }
+resource { DP; return RESOURCE; }
+name { DP; return NAME; }
+local { DP; return LOCAL; }
+remote { DP; return REMOTE; }
+on { DP; return ON; }
+fullsync { DP; return FULLSYNC; }
+memsync { DP; return MEMSYNC; }
+async { DP; return ASYNC; }
+[0-9]+ { DP; yylval.num = atoi(yytext); return NUM; }
+[a-zA-Z0-9\.\-_/\:]+ { DP; yylval.str = strdup(yytext); return STR; }
+\{ { DP; depth++; return OB; }
+\} { DP; depth--; return CB; }
+#.* /* ignore comments */;
+\n { lineno++; }
+[ \t]+ /* ignore whitespace */;
+%%
OpenPOWER on IntegriCloud