summaryrefslogtreecommitdiffstats
path: root/sbin/hastd
diff options
context:
space:
mode:
Diffstat (limited to 'sbin/hastd')
-rw-r--r--sbin/hastd/Makefile45
-rw-r--r--sbin/hastd/activemap.c701
-rw-r--r--sbin/hastd/activemap.h69
-rw-r--r--sbin/hastd/control.c511
-rw-r--r--sbin/hastd/control.h49
-rw-r--r--sbin/hastd/crc32.c115
-rw-r--r--sbin/hastd/crc32.h28
-rw-r--r--sbin/hastd/ebuf.c259
-rw-r--r--sbin/hastd/ebuf.h51
-rw-r--r--sbin/hastd/event.c161
-rw-r--r--sbin/hastd/event.h46
-rw-r--r--sbin/hastd/hast.conf.5449
-rw-r--r--sbin/hastd/hast.h264
-rw-r--r--sbin/hastd/hast_checksum.c160
-rw-r--r--sbin/hastd/hast_checksum.h44
-rw-r--r--sbin/hastd/hast_compression.c283
-rw-r--r--sbin/hastd/hast_compression.h44
-rw-r--r--sbin/hastd/hast_proto.c222
-rw-r--r--sbin/hastd/hast_proto.h46
-rw-r--r--sbin/hastd/hastd.8232
-rw-r--r--sbin/hastd/hastd.c1337
-rw-r--r--sbin/hastd/hastd.h54
-rw-r--r--sbin/hastd/hooks.c391
-rw-r--r--sbin/hastd/hooks.h48
-rw-r--r--sbin/hastd/lzf.c406
-rw-r--r--sbin/hastd/lzf.h211
-rw-r--r--sbin/hastd/metadata.c225
-rw-r--r--sbin/hastd/metadata.h48
-rw-r--r--sbin/hastd/nv.c966
-rw-r--r--sbin/hastd/nv.h133
-rw-r--r--sbin/hastd/parse.y1037
-rw-r--r--sbin/hastd/pjdlog.c614
-rw-r--r--sbin/hastd/pjdlog.h117
-rw-r--r--sbin/hastd/primary.c2477
-rw-r--r--sbin/hastd/proto.c446
-rw-r--r--sbin/hastd/proto.h61
-rw-r--r--sbin/hastd/proto_common.c232
-rw-r--r--sbin/hastd/proto_impl.h79
-rw-r--r--sbin/hastd/proto_socketpair.c237
-rw-r--r--sbin/hastd/proto_tcp.c637
-rw-r--r--sbin/hastd/proto_uds.c361
-rw-r--r--sbin/hastd/rangelock.c141
-rw-r--r--sbin/hastd/rangelock.h46
-rw-r--r--sbin/hastd/refcnt.h66
-rw-r--r--sbin/hastd/secondary.c915
-rw-r--r--sbin/hastd/subr.c299
-rw-r--r--sbin/hastd/subr.h56
-rw-r--r--sbin/hastd/synch.h194
-rw-r--r--sbin/hastd/token.l86
49 files changed, 15699 insertions, 0 deletions
diff --git a/sbin/hastd/Makefile b/sbin/hastd/Makefile
new file mode 100644
index 0000000..7ff6ee8
--- /dev/null
+++ b/sbin/hastd/Makefile
@@ -0,0 +1,45 @@
+# $FreeBSD$
+
+.include <bsd.own.mk>
+
+PROG= hastd
+SRCS= activemap.c
+SRCS+= control.c crc32.c
+SRCS+= ebuf.c event.c
+SRCS+= hast_checksum.c hast_compression.c hast_proto.c hastd.c hooks.c
+SRCS+= lzf.c
+SRCS+= metadata.c
+SRCS+= nv.c
+SRCS+= secondary.c
+SRCS+= parse.y pjdlog.c primary.c
+SRCS+= proto.c proto_common.c proto_socketpair.c proto_tcp.c proto_uds.c
+SRCS+= rangelock.c
+SRCS+= subr.c
+SRCS+= token.l
+SRCS+= y.tab.h
+MAN= hastd.8 hast.conf.5
+
+NO_WFORMAT=
+NO_WCAST_ALIGN=
+NO_WMISSING_VARIABLE_DECLARATIONS=
+# Sources include project headers from this directory.
+CFLAGS+=-I${.CURDIR}
+CFLAGS+=-DHAVE_CAPSICUM
+# Compile-time value of PROTO_TCP_DEFAULT_PORT.
+CFLAGS+=-DPROTO_TCP_DEFAULT_PORT=8457
+# INET is always defined; INET6 only when MK_INET6_SUPPORT is not "no".
+CFLAGS+=-DINET
+.if ${MK_INET6_SUPPORT} != "no"
+CFLAGS+=-DINET6
+.endif
+
+# NOTE(review): DPADD lists ${LIBL}, but LDADD has no matching -ll - confirm
+# which of the two is intended.
+DPADD= ${LIBGEOM} ${LIBBSDXML} ${LIBSBUF} ${LIBL} ${LIBPTHREAD} ${LIBUTIL}
+LDADD= -lgeom -lbsdxml -lsbuf -lpthread -lutil
+# libcrypto is linked, and HAVE_CRYPTO defined, only when MK_OPENSSL is not
+# "no".
+.if ${MK_OPENSSL} != "no"
+DPADD+= ${LIBCRYPTO}
+LDADD+= -lcrypto
+CFLAGS+=-DHAVE_CRYPTO
+.endif
+
+# yacc -v writes the y.output description file, hence its CLEANFILES entry.
+YFLAGS+=-v
+
+CLEANFILES=y.tab.c y.tab.h y.output
+
+.include <bsd.prog.mk>
diff --git a/sbin/hastd/activemap.c b/sbin/hastd/activemap.c
new file mode 100644
index 0000000..64b95e3
--- /dev/null
+++ b/sbin/hastd/activemap.c
@@ -0,0 +1,701 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h> /* powerof2() */
+#include <sys/queue.h>
+
+#include <bitstring.h>
+#include <errno.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <pjdlog.h>
+
+#include "activemap.h"
+
+/*
+ * Fall back to the standard assert() when no PJDLOG_ASSERT macro has been
+ * defined by the headers included above.
+ */
+#ifndef PJDLOG_ASSERT
+#include <assert.h>
+#define PJDLOG_ASSERT(...) assert(__VA_ARGS__)
+#endif
+
+#define ACTIVEMAP_MAGIC 0xac71e4
+struct activemap {
+ int am_magic; /* ACTIVEMAP_MAGIC while the structure is
+ valid, 0 after activemap_free(). */
+ off_t am_mediasize; /* Media size in bytes. */
+ uint32_t am_extentsize; /* Extent size in bytes,
+ must be power of 2. */
+ uint8_t am_extentshift;/* 2 ^ am_extentshift == am_extentsize */
+ int am_nextents; /* Number of extents. */
+ size_t am_mapsize; /* Bitmap size in bytes. */
+ uint16_t *am_memtab; /* An array that holds number of pending
+ writes per extent. */
+ bitstr_t *am_diskmap; /* On-disk bitmap of dirty extents. */
+ bitstr_t *am_memmap; /* In-memory bitmap of dirty extents. */
+ size_t am_diskmapsize; /* Map size rounded up to sector size. */
+ uint64_t am_ndirty; /* Number of dirty extents. */
+ bitstr_t *am_syncmap; /* Bitmap of extents to sync. */
+ off_t am_syncoff; /* Next synchronization offset, or
+ -1 to restart from the first extent set
+ in am_syncmap, or -2 when there is
+ nothing to synchronize. */
+ TAILQ_HEAD(skeepdirty, keepdirty) am_keepdirty; /* List of extents that
+ we keep dirty to reduce bitmap
+ updates. */
+ int am_nkeepdirty; /* Number of am_keepdirty elements. */
+ int am_nkeepdirty_limit; /* Maximum number of am_keepdirty
+ elements. */
+};
+
+/*
+ * One element of the am_keepdirty list. keepdirty_add() inserts elements at
+ * the head and evicts from the tail, so the list is ordered from the most
+ * to the least recently written extent.
+ */
+struct keepdirty {
+ int kd_extent; /* Extent number. */
+ TAILQ_ENTRY(keepdirty) kd_next; /* Linkage on am_keepdirty. */
+};
+
+/*
+ * Count the bits set in a 32-bit value.
+ * Used by activemap_init() to derive the extent shift from the extent size.
+ */
+static uint32_t
+bitcount32(uint32_t x)
+{
+	uint32_t nbits;
+
+	/* Every iteration clears the lowest bit that is still set. */
+	for (nbits = 0; x != 0; nbits++)
+		x &= x - 1;
+	return (nbits);
+}
+
+/*
+ * Translate a byte offset on the media into the number of the extent that
+ * contains it. The offset must lie within [0, am_mediasize).
+ */
+static __inline int
+off2ext(const struct activemap *amp, off_t offset)
+{
+ int extent;
+
+ PJDLOG_ASSERT(offset >= 0 && offset < amp->am_mediasize);
+ /*
+ * activemap_init() asserts that the extent size is a power of 2, so
+ * shifting by am_extentshift divides by the extent size.
+ */
+ extent = (offset >> amp->am_extentshift);
+ PJDLOG_ASSERT(extent >= 0 && extent < amp->am_nextents);
+ return (extent);
+}
+
+/*
+ * Translate an extent number into the byte offset of the first byte of that
+ * extent. The extent must lie within [0, am_nextents).
+ */
+static __inline off_t
+ext2off(const struct activemap *amp, int extent)
+{
+ off_t offset;
+
+ PJDLOG_ASSERT(extent >= 0 && extent < amp->am_nextents);
+ /* Widen to off_t before shifting so the result is not computed in int. */
+ offset = ((off_t)extent << amp->am_extentshift);
+ PJDLOG_ASSERT(offset >= 0 && offset < amp->am_mediasize);
+ return (offset);
+}
+
+/*
+ * Return how many requests of at most MAXPHYS bytes it takes to cover the
+ * given extent. Only the last extent can be shorter than am_extentsize.
+ */
+static __inline int
+ext2reqs(const struct activemap *amp, int ext)
+{
+	off_t tail;
+
+	if (ext >= amp->am_nextents - 1) {
+		PJDLOG_ASSERT(ext == amp->am_nextents - 1);
+		/*
+		 * Last extent: it holds whatever is left of the media, or
+		 * a full extent when the media size is an exact multiple.
+		 */
+		tail = amp->am_mediasize % amp->am_extentsize;
+		if (tail == 0)
+			tail = amp->am_extentsize;
+		return (((tail - 1) / MAXPHYS) + 1);
+	}
+
+	/* Any other extent is always full-sized. */
+	return (((amp->am_extentsize - 1) / MAXPHYS) + 1);
+}
+
+/*
+ * Allocate a new activemap and prepare it for media of the given size.
+ *
+ * ampp       - receives the pointer to the new map on success
+ * mediasize  - media size in bytes, must be greater than 0
+ * extentsize - extent size in bytes, must be a power of 2
+ * sectorsize - sector size in bytes, must be a power of 2; the on-disk
+ *              bitmap size is rounded up to a multiple of it
+ * keepdirty  - maximum length of the keepdirty list, must be greater than 0
+ *
+ * Returns 0 on success and -1 if any of the allocations failed. On failure
+ * everything allocated here is released and *ampp is left untouched; errno
+ * is set to ENOMEM when one of the internal buffers could not be allocated.
+ */
+int
+activemap_init(struct activemap **ampp, uint64_t mediasize, uint32_t extentsize,
+    uint32_t sectorsize, uint32_t keepdirty)
+{
+	struct activemap *amp;
+
+	PJDLOG_ASSERT(ampp != NULL);
+	PJDLOG_ASSERT(mediasize > 0);
+	PJDLOG_ASSERT(extentsize > 0);
+	PJDLOG_ASSERT(powerof2(extentsize));
+	PJDLOG_ASSERT(sectorsize > 0);
+	PJDLOG_ASSERT(powerof2(sectorsize));
+	PJDLOG_ASSERT(keepdirty > 0);
+
+	amp = malloc(sizeof(*amp));
+	if (amp == NULL)
+		return (-1);
+
+	/*
+	 * Geometry. extentsize is a power of 2, so the number of bits set
+	 * in extentsize - 1 is its base-2 logarithm.
+	 */
+	amp->am_mediasize = mediasize;
+	amp->am_extentsize = extentsize;
+	amp->am_extentshift = bitcount32(extentsize - 1);
+	amp->am_nextents = ((mediasize - 1) / extentsize) + 1;
+	amp->am_mapsize = sizeof(bitstr_t) * bitstr_size(amp->am_nextents);
+	amp->am_diskmapsize = roundup2(amp->am_mapsize, sectorsize);
+
+	/* Nothing is dirty, nothing to synchronize, empty keepdirty list. */
+	amp->am_ndirty = 0;
+	amp->am_syncoff = -2;
+	TAILQ_INIT(&amp->am_keepdirty);
+	amp->am_nkeepdirty = 0;
+	amp->am_nkeepdirty_limit = keepdirty;
+
+	amp->am_memtab = calloc(amp->am_nextents, sizeof(amp->am_memtab[0]));
+	amp->am_diskmap = calloc(1, amp->am_diskmapsize);
+	amp->am_memmap = bit_alloc(amp->am_nextents);
+	amp->am_syncmap = bit_alloc(amp->am_nextents);
+
+	if (amp->am_memtab != NULL && amp->am_diskmap != NULL &&
+	    amp->am_memmap != NULL && amp->am_syncmap != NULL) {
+		amp->am_magic = ACTIVEMAP_MAGIC;
+		*ampp = amp;
+		return (0);
+	}
+
+	/*
+	 * At least one of the allocations above failed. Release whatever
+	 * we did get; free(NULL) is a no-op.
+	 */
+	free(amp->am_memtab);
+	free(amp->am_diskmap);
+	free(amp->am_memmap);
+	free(amp->am_syncmap);
+	amp->am_magic = 0;
+	free(amp);
+	errno = ENOMEM;
+	return (-1);
+}
+
+/*
+ * Look the given extent up on the keepdirty list.
+ * Returns its list element, or NULL if the extent is not on the list.
+ */
+static struct keepdirty *
+keepdirty_find(struct activemap *amp, int extent)
+{
+	struct keepdirty *kd;
+
+	for (kd = TAILQ_FIRST(&amp->am_keepdirty); kd != NULL;
+	    kd = TAILQ_NEXT(kd, kd_next)) {
+		if (kd->kd_extent == extent)
+			return (kd);
+	}
+	return (NULL);
+}
+
+/*
+ * Note that the given extent has just been written to.
+ *
+ * The keepdirty list is ordered from the most to the least recently written
+ * extent and holds at most am_nkeepdirty_limit elements.
+ *
+ * Returns false if the extent was already on the list (it is only moved to
+ * the head) and true if it was not. True is returned even when the new
+ * element could not be allocated, in which case the extent is simply not
+ * remembered.
+ */
+static bool
+keepdirty_add(struct activemap *amp, int extent)
+{
+	struct keepdirty *kd;
+
+	kd = keepdirty_find(amp, extent);
+	if (kd != NULL) {
+		/*
+		 * Only move element at the beginning.
+		 */
+		TAILQ_REMOVE(&amp->am_keepdirty, kd, kd_next);
+		TAILQ_INSERT_HEAD(&amp->am_keepdirty, kd, kd_next);
+		return (false);
+	}
+	/*
+	 * Add new element, but first remove the most unused one if
+	 * we have too many.
+	 */
+	if (amp->am_nkeepdirty >= amp->am_nkeepdirty_limit) {
+		kd = TAILQ_LAST(&amp->am_keepdirty, skeepdirty);
+		PJDLOG_ASSERT(kd != NULL);
+		TAILQ_REMOVE(&amp->am_keepdirty, kd, kd_next);
+		amp->am_nkeepdirty--;
+		/*
+		 * activemap_init() accepts a limit of 1, in which case the
+		 * list is legitimately empty at this point, so only a
+		 * negative count indicates corruption.
+		 */
+		PJDLOG_ASSERT(amp->am_nkeepdirty >= 0);
+	}
+	/* Reuse the evicted element if there is one, otherwise allocate. */
+	if (kd == NULL)
+		kd = malloc(sizeof(*kd));
+	/* We can ignore allocation failure. */
+	if (kd != NULL) {
+		kd->kd_extent = extent;
+		amp->am_nkeepdirty++;
+		TAILQ_INSERT_HEAD(&amp->am_keepdirty, kd, kd_next);
+	}
+
+	return (true);
+}
+
+/*
+ * Mark every extent that is on the keepdirty list as dirty in the on-disk
+ * bitmap.
+ */
+static void
+keepdirty_fill(struct activemap *amp)
+{
+	struct keepdirty *kd;
+
+	for (kd = TAILQ_FIRST(&amp->am_keepdirty); kd != NULL;
+	    kd = TAILQ_NEXT(kd, kd_next))
+		bit_set(amp->am_diskmap, kd->kd_extent);
+}
+
+/*
+ * Remove and free all elements of the keepdirty list.
+ */
+static void
+keepdirty_free(struct activemap *amp)
+{
+	struct keepdirty *kd;
+
+	while (!TAILQ_EMPTY(&amp->am_keepdirty)) {
+		kd = TAILQ_FIRST(&amp->am_keepdirty);
+		TAILQ_REMOVE(&amp->am_keepdirty, kd, kd_next);
+		free(kd);
+		amp->am_nkeepdirty--;
+	}
+	/* The counter has to agree with the list that is now empty. */
+	PJDLOG_ASSERT(amp->am_nkeepdirty == 0);
+}
+
+/*
+ * Function frees resources allocated by activemap_init() function:
+ * the keepdirty list, the pending-writes table and the three bitmaps.
+ * The magic value is cleared first, so using the structure afterwards
+ * trips the magic assertions of the other functions.
+ *
+ * NOTE(review): the structure itself, which activemap_init() obtains with
+ * malloc(), is not freed here - confirm whether callers release it or
+ * whether it is leaked.
+ */
+void
+activemap_free(struct activemap *amp)
+{
+
+ PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
+
+ amp->am_magic = 0;
+
+ keepdirty_free(amp);
+ free(amp->am_memtab);
+ free(amp->am_diskmap);
+ free(amp->am_memmap);
+ free(amp->am_syncmap);
+}
+
+/*
+ * Function should be called before we handle write requests. It updates
+ * internal structures and returns true if on-disk metadata should be updated.
+ *
+ * Every extent touched by the range [offset, offset + length) gets its
+ * pending-writes counter increased and is moved to (or put at) the head of
+ * the keepdirty list. The result is true when at least one of those extents
+ * was not on the keepdirty list before the call. length must be positive.
+ */
+bool
+activemap_write_start(struct activemap *amp, off_t offset, off_t length)
+{
+ bool modified;
+ off_t end;
+ int ext;
+
+ PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
+ PJDLOG_ASSERT(length > 0);
+
+ modified = false;
+ /* Offset of the last byte written. */
+ end = offset + length - 1;
+
+ for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) {
+ /*
+ * If the number of pending writes is increased from 0,
+ * we have to mark the extent as dirty also in on-disk bitmap.
+ * By returning true we inform the caller that on-disk bitmap
+ * was modified and has to be flushed to disk.
+ */
+ if (amp->am_memtab[ext]++ == 0) {
+ PJDLOG_ASSERT(!bit_test(amp->am_memmap, ext));
+ bit_set(amp->am_memmap, ext);
+ amp->am_ndirty++;
+ }
+ /*
+ * An extent that is already on the keepdirty list does not
+ * cause a flush; keepdirty_add() returns false for it.
+ */
+ if (keepdirty_add(amp, ext))
+ modified = true;
+ }
+
+ return (modified);
+}
+
+/*
+ * Function should be called after receiving write confirmation. It updates
+ * internal structures and returns true if on-disk metadata should be updated.
+ *
+ * Every extent touched by the range [offset, offset + length) gets its
+ * pending-writes counter decreased. The result is true when at least one
+ * counter dropped to 0 for an extent that is not on the keepdirty list.
+ * length must be positive.
+ */
+bool
+activemap_write_complete(struct activemap *amp, off_t offset, off_t length)
+{
+ bool modified;
+ off_t end;
+ int ext;
+
+ PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
+ PJDLOG_ASSERT(length > 0);
+
+ modified = false;
+ /* Offset of the last byte written. */
+ end = offset + length - 1;
+
+ for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) {
+ /*
+ * If the number of pending writes goes down to 0, we have to
+ * mark the extent as clean also in on-disk bitmap.
+ * By returning true we inform the caller that on-disk bitmap
+ * was modified and has to be flushed to disk.
+ */
+ PJDLOG_ASSERT(amp->am_memtab[ext] > 0);
+ PJDLOG_ASSERT(bit_test(amp->am_memmap, ext));
+ if (--amp->am_memtab[ext] == 0) {
+ bit_clear(amp->am_memmap, ext);
+ amp->am_ndirty--;
+ /*
+ * Extents on the keepdirty list stay dirty on disk,
+ * so nothing has to be flushed for them.
+ */
+ if (keepdirty_find(amp, ext) == NULL)
+ modified = true;
+ }
+ }
+
+ return (modified);
+}
+
+/*
+ * Function should be called after finishing synchronization of one extent.
+ * It returns true if on-disk metadata should be updated.
+ *
+ * The pending-writes counter of the extent is decreased by ext2reqs(), the
+ * same amount activemap_copyin(), activemap_merge() and
+ * activemap_need_sync() charge to an extent that has to be synchronized.
+ * If the counter reaches 0 the extent becomes clean in the in-memory bitmap.
+ */
+bool
+activemap_extent_complete(struct activemap *amp, int extent)
+{
+ bool modified;
+ int reqs;
+
+ PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
+ PJDLOG_ASSERT(extent >= 0 && extent < amp->am_nextents);
+
+ modified = false;
+
+ reqs = ext2reqs(amp, extent);
+ PJDLOG_ASSERT(amp->am_memtab[extent] >= reqs);
+ amp->am_memtab[extent] -= reqs;
+ PJDLOG_ASSERT(bit_test(amp->am_memmap, extent));
+ /* A non-zero remainder means regular writes are still pending. */
+ if (amp->am_memtab[extent] == 0) {
+ bit_clear(amp->am_memmap, extent);
+ amp->am_ndirty--;
+ modified = true;
+ }
+
+ return (modified);
+}
+
+/*
+ * Function returns the number of extents currently counted as dirty
+ * (the am_ndirty counter).
+ */
+uint64_t
+activemap_ndirty(const struct activemap *amp)
+{
+
+ PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
+
+ return (amp->am_ndirty);
+}
+
+/*
+ * Tell whether the on-disk bitmap and the in-memory bitmap differ, that is
+ * whether the bitmap should be flushed to the disk. Only the first
+ * am_mapsize bytes of the two bitmaps are compared.
+ */
+bool
+activemap_differ(const struct activemap *amp)
+{
+
+	PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
+
+	if (memcmp(amp->am_diskmap, amp->am_memmap, amp->am_mapsize) == 0)
+		return (false);
+	return (true);
+}
+
+/*
+ * Function returns number of bytes used by bitmap (am_mapsize), without
+ * the rounding up to the sector size.
+ */
+size_t
+activemap_size(const struct activemap *amp)
+{
+
+ PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
+
+ return (amp->am_mapsize);
+}
+
+/*
+ * Function returns number of bytes needed for storing on-disk bitmap.
+ * This is the same as activemap_size(), but rounded up to sector size.
+ * It is also the size of the buffer returned by activemap_bitmap().
+ */
+size_t
+activemap_ondisk_size(const struct activemap *amp)
+{
+
+ PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
+
+ return (amp->am_diskmapsize);
+}
+
+/*
+ * Function copies the given buffer read from disk to the internal bitmap.
+ *
+ * The on-disk, in-memory and synchronization bitmaps are all overwritten
+ * with the first am_mapsize bytes of buf; size has to be at least that big.
+ * Every extent that is dirty in the buffer is then scheduled for
+ * synchronization.
+ */
+void
+activemap_copyin(struct activemap *amp, const unsigned char *buf, size_t size)
+{
+ int ext;
+
+ PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
+ PJDLOG_ASSERT(size >= amp->am_mapsize);
+
+ memcpy(amp->am_diskmap, buf, amp->am_mapsize);
+ memcpy(amp->am_memmap, buf, amp->am_mapsize);
+ memcpy(amp->am_syncmap, buf, amp->am_mapsize);
+
+ /* ext becomes the first dirty extent, or -1 if there is none. */
+ bit_ffs(amp->am_memmap, amp->am_nextents, &ext);
+ if (ext == -1) {
+ /* There are no dirty extents, so we can leave now. */
+ return;
+ }
+ /*
+ * Set synchronization offset to the first dirty extent.
+ */
+ activemap_sync_rewind(amp);
+ /*
+ * We have dirty extents and we want them to stay that way until
+ * we synchronize, so we set number of pending writes to number
+ * of requests needed to synchronize one extent.
+ */
+ amp->am_ndirty = 0;
+ /* Extents before the first dirty one are clean, so start there. */
+ for (; ext < amp->am_nextents; ext++) {
+ if (bit_test(amp->am_memmap, ext)) {
+ amp->am_memtab[ext] = ext2reqs(amp, ext);
+ amp->am_ndirty++;
+ }
+ }
+}
+
+/*
+ * Function merges the given bitmap with existing one.
+ *
+ * Every extent that is dirty in buf but not yet marked in the
+ * synchronization bitmap becomes dirty in all three internal bitmaps and is
+ * scheduled for synchronization. size has to be at least am_mapsize.
+ */
+void
+activemap_merge(struct activemap *amp, const unsigned char *buf, size_t size)
+{
+ /*
+ * The const is dropped only to hand the buffer to the bit_*() macros;
+ * it is read with bit_ffs() and bit_test() and never written.
+ */
+ bitstr_t *remmap = __DECONST(bitstr_t *, buf);
+ int ext;
+
+ PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
+ PJDLOG_ASSERT(size >= amp->am_mapsize);
+
+ /* ext becomes the first remotely dirty extent, or -1 if there is none. */
+ bit_ffs(remmap, amp->am_nextents, &ext);
+ if (ext == -1) {
+ /* There are no dirty extents, so we can leave now. */
+ return;
+ }
+ /*
+ * We have dirty extents and we want them to stay that way until
+ * we synchronize, so we set number of pending writes to number
+ * of requests needed to synchronize one extent.
+ */
+ for (; ext < amp->am_nextents; ext++) {
+ /* Local extent already dirty. */
+ if (bit_test(amp->am_syncmap, ext))
+ continue;
+ /* Remote extent isn't dirty. */
+ if (!bit_test(remmap, ext))
+ continue;
+ bit_set(amp->am_syncmap, ext);
+ bit_set(amp->am_memmap, ext);
+ bit_set(amp->am_diskmap, ext);
+ /* Count the extent only if it had no pending writes before. */
+ if (amp->am_memtab[ext] == 0)
+ amp->am_ndirty++;
+ amp->am_memtab[ext] = ext2reqs(amp, ext);
+ }
+ /*
+ * Set synchronization offset to the first dirty extent.
+ */
+ activemap_sync_rewind(amp);
+}
+
+/*
+ * Refresh the on-disk bitmap and return a pointer to it, ready to be
+ * written to disk. The on-disk bitmap is the in-memory bitmap plus all
+ * extents from the keepdirty list.
+ * If sizep is not NULL, the size of the returned buffer (the bitmap size
+ * rounded up to the sector size) is stored there.
+ */
+const unsigned char *
+activemap_bitmap(struct activemap *amp, size_t *sizep)
+{
+
+	PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
+
+	/* Start from the in-memory state, then add the keepdirty extents. */
+	memcpy(amp->am_diskmap, amp->am_memmap, amp->am_mapsize);
+	keepdirty_fill(amp);
+
+	if (sizep != NULL)
+		*sizep = amp->am_diskmapsize;
+	return ((const unsigned char *)amp->am_diskmap);
+}
+
+/*
+ * Compute how many bytes the bitmap for the given media occupies on disk,
+ * without creating an activemap: one bit per extent, rounded up to a
+ * multiple of the sector size.
+ * extentsize and sectorsize must be powers of 2.
+ */
+size_t
+activemap_calc_ondisk_size(uint64_t mediasize, uint32_t extentsize,
+    uint32_t sectorsize)
+{
+	uint64_t extents, bytes;
+
+	PJDLOG_ASSERT(mediasize > 0);
+	PJDLOG_ASSERT(extentsize > 0);
+	PJDLOG_ASSERT(powerof2(extentsize));
+	PJDLOG_ASSERT(sectorsize > 0);
+	PJDLOG_ASSERT(powerof2(sectorsize));
+
+	/* A partially used last extent still needs its own bit. */
+	extents = ((mediasize - 1) / extentsize) + 1;
+	bytes = sizeof(bitstr_t) * bitstr_size(extents);
+	return (roundup2(bytes, sectorsize));
+}
+
+/*
+ * Restart synchronization.
+ * The synchronization offset is set to -1 (start from the first extent
+ * marked in the synchronization bitmap) or, if no extent is marked, to -2
+ * (there is nothing to synchronize).
+ */
+void
+activemap_sync_rewind(struct activemap *amp)
+{
+	int first;
+
+	PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
+
+	/* first is -1 when no bit is set in the synchronization bitmap. */
+	bit_ffs(amp->am_syncmap, amp->am_nextents, &first);
+	amp->am_syncoff = (first == -1) ? -2 : -1;
+}
+
+/*
+ * Return next offset of where we should synchronize.
+ *
+ * lengthp  - receives the number of bytes to synchronize at the returned
+ *            offset (at most MAXPHYS); not touched when -1 is returned
+ * syncextp - receives the number of the extent whose last chunk was handed
+ *            out by the previous call, or -1 if no extent was finished
+ *
+ * Returns the offset to synchronize next, or -1 when there is nothing
+ * (more) to synchronize.
+ */
+off_t
+activemap_sync_offset(struct activemap *amp, off_t *lengthp, int *syncextp)
+{
+ off_t syncoff, left;
+ int ext;
+
+ PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
+ PJDLOG_ASSERT(lengthp != NULL);
+ PJDLOG_ASSERT(syncextp != NULL);
+
+ *syncextp = -1;
+
+ /* -2 means that there is nothing to synchronize. */
+ if (amp->am_syncoff == -2)
+ return (-1);
+
+ /*
+ * The next chunk would start past the end of the media or in another
+ * extent, so the current extent is done.
+ */
+ if (amp->am_syncoff >= 0 &&
+ (amp->am_syncoff + MAXPHYS >= amp->am_mediasize ||
+ off2ext(amp, amp->am_syncoff) !=
+ off2ext(amp, amp->am_syncoff + MAXPHYS))) {
+ /*
+ * We are about to change extent, so mark previous one as clean.
+ */
+ ext = off2ext(amp, amp->am_syncoff);
+ bit_clear(amp->am_syncmap, ext);
+ *syncextp = ext;
+ amp->am_syncoff = -1;
+ }
+
+ if (amp->am_syncoff == -1) {
+ /*
+ * Let's find first extent to synchronize.
+ */
+ bit_ffs(amp->am_syncmap, amp->am_nextents, &ext);
+ if (ext == -1) {
+ amp->am_syncoff = -2;
+ return (-1);
+ }
+ amp->am_syncoff = ext2off(amp, ext);
+ } else {
+ /*
+ * We don't change extent, so just increase offset.
+ */
+ amp->am_syncoff += MAXPHYS;
+ if (amp->am_syncoff >= amp->am_mediasize) {
+ amp->am_syncoff = -2;
+ return (-1);
+ }
+ }
+
+ syncoff = amp->am_syncoff;
+ /*
+ * Bytes left to the end of the current extent, limited by the end of
+ * the media and by MAXPHYS.
+ */
+ left = ext2off(amp, off2ext(amp, syncoff)) +
+ amp->am_extentsize - syncoff;
+ if (syncoff + left > amp->am_mediasize)
+ left = amp->am_mediasize - syncoff;
+ if (left > MAXPHYS)
+ left = MAXPHYS;
+
+ PJDLOG_ASSERT(left >= 0 && left <= MAXPHYS);
+ PJDLOG_ASSERT(syncoff >= 0 && syncoff < amp->am_mediasize);
+ PJDLOG_ASSERT(syncoff + left >= 0 &&
+ syncoff + left <= amp->am_mediasize);
+
+ *lengthp = left;
+ return (syncoff);
+}
+
+/*
+ * Mark extent(s) containing the given region for synchronization.
+ * Most likely one of the components is unavailable.
+ *
+ * Returns true if at least one extent was newly marked for synchronization
+ * and false if all of them were marked already.
+ *
+ * NOTE(review): unlike activemap_write_start() and
+ * activemap_write_complete(), length > 0 is not asserted here - confirm
+ * whether callers can pass a zero length.
+ */
+bool
+activemap_need_sync(struct activemap *amp, off_t offset, off_t length)
+{
+ bool modified;
+ off_t end;
+ int ext;
+
+ PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
+
+ modified = false;
+ /* Offset of the last byte of the region. */
+ end = offset + length - 1;
+
+ for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) {
+ if (bit_test(amp->am_syncmap, ext)) {
+ /* Already marked for synchronization. */
+ PJDLOG_ASSERT(bit_test(amp->am_memmap, ext));
+ continue;
+ }
+ bit_set(amp->am_syncmap, ext);
+ if (!bit_test(amp->am_memmap, ext)) {
+ bit_set(amp->am_memmap, ext);
+ amp->am_ndirty++;
+ }
+ /*
+ * Added on top of any pending writes, so the extent stays
+ * dirty until activemap_extent_complete() subtracts it.
+ */
+ amp->am_memtab[ext] += ext2reqs(amp, ext);
+ modified = true;
+ }
+
+ return (modified);
+}
+
+/*
+ * Print one bitmap to standard output as a line made of the given label
+ * followed by one '0' or '1' character per extent.
+ */
+static void
+activemap_dump_bits(const char *label, const bitstr_t *map, int nbits)
+{
+	int bit;
+
+	printf("%s", label);
+	for (bit = 0; bit < nbits; bit++)
+		printf("%d", bit_test(map, bit) ? 1 : 0);
+	printf("\n");
+}
+
+/*
+ * Print the in-memory (M), on-disk (D) and synchronization (S) bitmaps to
+ * standard output, one line each.
+ */
+void
+activemap_dump(const struct activemap *amp)
+{
+
+	activemap_dump_bits("M: ", amp->am_memmap, amp->am_nextents);
+	activemap_dump_bits("D: ", amp->am_diskmap, amp->am_nextents);
+	activemap_dump_bits("S: ", amp->am_syncmap, amp->am_nextents);
+}
diff --git a/sbin/hastd/activemap.h b/sbin/hastd/activemap.h
new file mode 100644
index 0000000..42f0221
--- /dev/null
+++ b/sbin/hastd/activemap.h
@@ -0,0 +1,69 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _ACTIVEMAP_H_
+#define _ACTIVEMAP_H_
+
+/*
+ * NOTE(review): the prototypes below use off_t and size_t, but only
+ * <stdbool.h> and <stdint.h> are included here - confirm that every
+ * includer pulls in <sys/types.h> first.
+ */
+#include <stdbool.h>
+#include <stdint.h>
+
+/* Opaque; the layout is private to activemap.c. */
+struct activemap;
+
+/* Creation and destruction. */
+int activemap_init(struct activemap **ampp, uint64_t mediasize,
+ uint32_t extentsize, uint32_t sectorsize, uint32_t keepdirty);
+void activemap_free(struct activemap *amp);
+
+/*
+ * Write and synchronization tracking; the bool results tell whether on-disk
+ * metadata should be updated.
+ */
+bool activemap_write_start(struct activemap *amp, off_t offset, off_t length);
+bool activemap_write_complete(struct activemap *amp, off_t offset,
+ off_t length);
+bool activemap_extent_complete(struct activemap *amp, int extent);
+uint64_t activemap_ndirty(const struct activemap *amp);
+
+/* Access to the bitmap itself. */
+bool activemap_differ(const struct activemap *amp);
+size_t activemap_size(const struct activemap *amp);
+size_t activemap_ondisk_size(const struct activemap *amp);
+void activemap_copyin(struct activemap *amp, const unsigned char *buf,
+ size_t size);
+void activemap_merge(struct activemap *amp, const unsigned char *buf,
+ size_t size);
+const unsigned char *activemap_bitmap(struct activemap *amp, size_t *sizep);
+
+/* On-disk bitmap size for the given geometry; needs no activemap. */
+size_t activemap_calc_ondisk_size(uint64_t mediasize, uint32_t extentsize,
+ uint32_t sectorsize);
+
+/* Iteration over the extents that have to be synchronized. */
+void activemap_sync_rewind(struct activemap *amp);
+off_t activemap_sync_offset(struct activemap *amp, off_t *lengthp,
+ int *syncextp);
+bool activemap_need_sync(struct activemap *amp, off_t offset, off_t length);
+
+/* Prints the three bitmaps to standard output. */
+void activemap_dump(const struct activemap *amp);
+
+#endif /* !_ACTIVEMAP_H_ */
diff --git a/sbin/hastd/control.c b/sbin/hastd/control.c
new file mode 100644
index 0000000..922f507
--- /dev/null
+++ b/sbin/hastd/control.c
@@ -0,0 +1,511 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include <errno.h>
+#include <pthread.h>
+#include <signal.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "hast.h"
+#include "hastd.h"
+#include "hast_checksum.h"
+#include "hast_compression.h"
+#include "hast_proto.h"
+#include "hooks.h"
+#include "nv.h"
+#include "pjdlog.h"
+#include "proto.h"
+#include "subr.h"
+
+#include "control.h"
+
+/*
+ * Forget about the worker process of the given resource: close the
+ * connections kept for it and reset the recorded worker PID to 0.
+ * The control connection is closed unconditionally; the event and data
+ * connections only if they are set.
+ */
+void
+child_cleanup(struct hast_resource *res)
+{
+
+ proto_close(res->hr_ctrl);
+ res->hr_ctrl = NULL;
+ if (res->hr_event != NULL) {
+ proto_close(res->hr_event);
+ res->hr_event = NULL;
+ }
+ if (res->hr_conn != NULL) {
+ proto_close(res->hr_conn);
+ res->hr_conn = NULL;
+ }
+ res->hr_workerpid = 0;
+}
+
+/*
+ * Change the role of one resource and report the outcome in nvout.
+ *
+ * The resource is given either directly (res) or by name, in which case it
+ * is looked up in cfg->hc_resources. no is the index appended to the field
+ * names added to nvout. If the role really changes, a running worker
+ * process is terminated, a new one is started for the primary role and the
+ * "role" hook is executed.
+ *
+ * NOTE(review): control_set_role() calls this function with nvout == NULL,
+ * so the nv_add_*() calls below have to tolerate a NULL nv - confirm
+ * against nv.c.
+ */
+static void
+control_set_role_common(struct hastd_config *cfg, struct nv *nvout,
+ uint8_t role, struct hast_resource *res, const char *name, unsigned int no)
+{
+ int oldrole;
+
+ /* Name is always needed. */
+ if (name != NULL)
+ nv_add_string(nvout, name, "resource%u", no);
+
+ if (res == NULL) {
+ PJDLOG_ASSERT(cfg != NULL);
+ PJDLOG_ASSERT(name != NULL);
+
+ /* res is NULL after the loop if no resource has that name. */
+ TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
+ if (strcmp(res->hr_name, name) == 0)
+ break;
+ }
+ if (res == NULL) {
+ nv_add_int16(nvout, EHAST_NOENTRY, "error%u", no);
+ return;
+ }
+ }
+ PJDLOG_ASSERT(res != NULL);
+
+ /* Send previous role back. */
+ nv_add_string(nvout, role2str(res->hr_role), "role%u", no);
+
+ /* Nothing changed, return here. */
+ if (role == res->hr_role)
+ return;
+
+ /* Log the change with the old role still in the prefix. */
+ pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role));
+ pjdlog_info("Role changed to %s.", role2str(role));
+
+ /* Change role to the new one. */
+ oldrole = res->hr_role;
+ res->hr_role = role;
+ pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role));
+
+ /*
+ * If previous role was primary or secondary we have to kill process
+ * doing that work.
+ */
+ if (res->hr_workerpid != 0) {
+ if (kill(res->hr_workerpid, SIGTERM) == -1) {
+ pjdlog_errno(LOG_WARNING,
+ "Unable to kill worker process %u",
+ (unsigned int)res->hr_workerpid);
+ } else if (waitpid(res->hr_workerpid, NULL, 0) !=
+ res->hr_workerpid) {
+ pjdlog_errno(LOG_WARNING,
+ "Error while waiting for worker process %u",
+ (unsigned int)res->hr_workerpid);
+ } else {
+ pjdlog_debug(1, "Worker process %u stopped.",
+ (unsigned int)res->hr_workerpid);
+ }
+ /* Done even if the worker could not be killed or waited for. */
+ child_cleanup(res);
+ }
+
+ /* Start worker process if we are changing to primary. */
+ if (role == HAST_ROLE_PRIMARY)
+ hastd_primary(res);
+ /* Drop the per-resource log prefix again. */
+ pjdlog_prefix_set("%s", "");
+ hook_exec(res->hr_exec, "role", res->hr_name, role2str(oldrole),
+ role2str(res->hr_role), NULL);
+}
+
+/*
+ * Change the role of the given resource without reporting the result to
+ * anyone: no configuration, no output nv and no resource name are passed
+ * to control_set_role_common().
+ */
+void
+control_set_role(struct hast_resource *res, uint8_t role)
+{
+
+ control_set_role_common(NULL, NULL, role, res, NULL, 0);
+}
+
+/*
+ * Ask the worker process of the given resource for its status over the
+ * hr_ctrl connection and copy the answer into nvout, appending the index
+ * no to every field name.
+ * If the request fails, or the worker reports an error, the error code is
+ * added to nvout instead.
+ *
+ * NOTE(review): the error code is stored under plain "error", whereas
+ * control_status() and control_set_role_common() use the per-resource
+ * "error%u" - confirm this is intended.
+ */
+static void
+control_status_worker(struct hast_resource *res, struct nv *nvout,
+ unsigned int no)
+{
+ struct nv *cnvin, *cnvout;
+ const char *str;
+ int error;
+
+ /* Lets the cleanup code at "end" tell whether a reply was received. */
+ cnvin = NULL;
+
+ /*
+ * Prepare and send command to worker process.
+ */
+ cnvout = nv_alloc();
+ nv_add_uint8(cnvout, CONTROL_STATUS, "cmd");
+ error = nv_error(cnvout);
+ if (error != 0) {
+ pjdlog_common(LOG_ERR, 0, error,
+ "Unable to prepare control header");
+ goto end;
+ }
+ if (hast_proto_send(res, res->hr_ctrl, cnvout, NULL, 0) == -1) {
+ error = errno;
+ pjdlog_errno(LOG_ERR, "Unable to send control header");
+ goto end;
+ }
+
+ /*
+ * Receive response.
+ */
+ if (hast_proto_recv_hdr(res->hr_ctrl, &cnvin) == -1) {
+ error = errno;
+ pjdlog_errno(LOG_ERR, "Unable to receive control header");
+ goto end;
+ }
+
+ /* Error reported by the worker itself. */
+ error = nv_get_int16(cnvin, "error");
+ if (error != 0)
+ goto end;
+
+ if ((str = nv_get_string(cnvin, "status")) == NULL) {
+ error = ENOENT;
+ pjdlog_errno(LOG_ERR, "Field 'status' is missing.");
+ goto end;
+ }
+ /* Copy the reply field by field under per-resource names. */
+ nv_add_string(nvout, str, "status%u", no);
+ nv_add_uint64(nvout, nv_get_uint64(cnvin, "dirty"), "dirty%u", no);
+ nv_add_uint32(nvout, nv_get_uint32(cnvin, "extentsize"),
+ "extentsize%u", no);
+ nv_add_uint32(nvout, nv_get_uint32(cnvin, "keepdirty"),
+ "keepdirty%u", no);
+ nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_read"),
+ "stat_read%u", no);
+ nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_write"),
+ "stat_write%u", no);
+ nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_delete"),
+ "stat_delete%u", no);
+ nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_flush"),
+ "stat_flush%u", no);
+ nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_activemap_update"),
+ "stat_activemap_update%u", no);
+ nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_read_error"),
+ "stat_read_error%u", no);
+ nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_write_error"),
+ "stat_write_error%u", no);
+ nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_delete_error"),
+ "stat_delete_error%u", no);
+ nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_flush_error"),
+ "stat_flush_error%u", no);
+end:
+ /* Common exit: release both nv structures, then report any error. */
+ if (cnvin != NULL)
+ nv_free(cnvin);
+ if (cnvout != NULL)
+ nv_free(cnvout);
+ if (error != 0)
+ nv_add_int16(nvout, error, "error");
+}
+
+/*
+ * Add the status of one resource to nvout.
+ *
+ * The resource is given either directly (res) or by name, in which case it
+ * is looked up in cfg->hc_resources; "error%u" is set to EHAST_NOENTRY if
+ * no resource has that name. no is the index appended to the field names.
+ * The configuration of the resource is always reported; the run-time
+ * status is added only when a worker process exists to be asked for it.
+ */
+static void
+control_status(struct hastd_config *cfg, struct nv *nvout,
+ struct hast_resource *res, const char *name, unsigned int no)
+{
+
+ PJDLOG_ASSERT(cfg != NULL);
+ PJDLOG_ASSERT(nvout != NULL);
+ PJDLOG_ASSERT(name != NULL);
+
+ /* Name is always needed. */
+ nv_add_string(nvout, name, "resource%u", no);
+
+ if (res == NULL) {
+ /* res is NULL after the loop if no resource has that name. */
+ TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
+ if (strcmp(res->hr_name, name) == 0)
+ break;
+ }
+ if (res == NULL) {
+ nv_add_int16(nvout, EHAST_NOENTRY, "error%u", no);
+ return;
+ }
+ }
+ PJDLOG_ASSERT(res != NULL);
+ nv_add_string(nvout, res->hr_provname, "provname%u", no);
+ nv_add_string(nvout, res->hr_localpath, "localpath%u", no);
+ nv_add_string(nvout, res->hr_remoteaddr, "remoteaddr%u", no);
+ /* The source address is reported only when it is not empty. */
+ if (res->hr_sourceaddr[0] != '\0')
+ nv_add_string(nvout, res->hr_sourceaddr, "sourceaddr%u", no);
+ switch (res->hr_replication) {
+ case HAST_REPLICATION_FULLSYNC:
+ nv_add_string(nvout, "fullsync", "replication%u", no);
+ break;
+ case HAST_REPLICATION_MEMSYNC:
+ nv_add_string(nvout, "memsync", "replication%u", no);
+ break;
+ case HAST_REPLICATION_ASYNC:
+ nv_add_string(nvout, "async", "replication%u", no);
+ break;
+ default:
+ nv_add_string(nvout, "unknown", "replication%u", no);
+ break;
+ }
+ nv_add_string(nvout, checksum_name(res->hr_checksum),
+ "checksum%u", no);
+ nv_add_string(nvout, compression_name(res->hr_compression),
+ "compression%u", no);
+ nv_add_string(nvout, role2str(res->hr_role), "role%u", no);
+ nv_add_int32(nvout, res->hr_workerpid, "workerpid%u", no);
+
+ /*
+ * Continue only for a primary or secondary resource that has a worker
+ * process; a primary is expected to always have one.
+ */
+ switch (res->hr_role) {
+ case HAST_ROLE_PRIMARY:
+ PJDLOG_ASSERT(res->hr_workerpid != 0);
+ /* FALLTHROUGH */
+ case HAST_ROLE_SECONDARY:
+ if (res->hr_workerpid != 0)
+ break;
+ /* FALLTHROUGH */
+ default:
+ return;
+ }
+
+ /*
+ * If we are here, it means that we have a worker process, which we
+ * want to ask some questions.
+ */
+ control_status_worker(res, nvout, no);
+}
+
+void
+control_handle(struct hastd_config *cfg)
+{
+ struct proto_conn *conn;
+ struct nv *nvin, *nvout;
+ unsigned int ii;
+ const char *str;
+ uint8_t cmd, role;
+ int error;
+ /*
+ * Serve a single control request: accept one connection on
+ * cfg->hc_controlconn, read the request header, run the command for
+ * each requested resource and send the reply back on the same
+ * connection. The accepted connection is kept in cfg->hc_controlin
+ * for the duration of the call.
+ *
+ * Exit paths: 'fail' adds a top-level "error" field (when error is
+ * set) and still sends the reply; 'close' releases everything
+ * without sending a reply.
+ */
+ if (proto_accept(cfg->hc_controlconn, &conn) == -1) {
+ pjdlog_errno(LOG_ERR, "Unable to accept control connection");
+ return;
+ }
+
+ cfg->hc_controlin = conn;
+ nvin = nvout = NULL;
+ role = HAST_ROLE_UNDEF;
+
+ if (hast_proto_recv_hdr(conn, &nvin) == -1) {
+ pjdlog_errno(LOG_ERR, "Unable to receive control header");
+ nvin = NULL;
+ goto close;
+ }
+
+ /* Obtain command code. 0 means that nv_get_uint8() failed. */
+ cmd = nv_get_uint8(nvin, "cmd");
+ if (cmd == 0) {
+ pjdlog_error("Control header is missing 'cmd' field.");
+ goto close;
+ }
+
+ /* Allocate outgoing nv structure. */
+ nvout = nv_alloc();
+ if (nvout == NULL) {
+ pjdlog_error("Unable to allocate header for control response.");
+ goto close;
+ }
+
+ error = 0;
+ /* At least one resource name (possibly "all") must be given. */
+ str = nv_get_string(nvin, "resource0");
+ if (str == NULL) {
+ pjdlog_error("Control header is missing 'resource0' field.");
+ error = EHAST_INVALID;
+ goto fail;
+ }
+ if (cmd == HASTCTL_CMD_SETROLE) {
+ role = nv_get_uint8(nvin, "role");
+ switch (role) {
+ case HAST_ROLE_INIT:
+ case HAST_ROLE_PRIMARY:
+ case HAST_ROLE_SECONDARY:
+ break;
+ default:
+ pjdlog_error("Invalid role received (%hhu).", role);
+ error = EHAST_INVALID;
+ goto fail;
+ }
+ }
+ if (strcmp(str, "all") == 0) {
+ struct hast_resource *res;
+
+ /* All configured resources. */
+
+ ii = 0;
+ TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
+ switch (cmd) {
+ case HASTCTL_CMD_SETROLE:
+ control_set_role_common(cfg, nvout, role, res,
+ res->hr_name, ii++);
+ break;
+ case HASTCTL_CMD_STATUS:
+ control_status(cfg, nvout, res, res->hr_name,
+ ii++);
+ break;
+ default:
+ pjdlog_error("Invalid command received (%hhu).",
+ cmd);
+ error = EHAST_UNIMPLEMENTED;
+ goto fail;
+ }
+ }
+ } else {
+ /* Only selected resources. */
+
+ for (ii = 0; ; ii++) {
+ str = nv_get_string(nvin, "resource%u", ii);
+ if (str == NULL)
+ break;
+ switch (cmd) {
+ case HASTCTL_CMD_SETROLE:
+ control_set_role_common(cfg, nvout, role, NULL,
+ str, ii);
+ break;
+ case HASTCTL_CMD_STATUS:
+ control_status(cfg, nvout, NULL, str, ii);
+ break;
+ default:
+ pjdlog_error("Invalid command received (%hhu).",
+ cmd);
+ error = EHAST_UNIMPLEMENTED;
+ goto fail;
+ }
+ }
+ }
+ if (nv_error(nvout) != 0)
+ goto close;
+fail:
+ if (error != 0)
+ nv_add_int16(nvout, error, "error");
+
+ if (hast_proto_send(NULL, conn, nvout, NULL, 0) == -1)
+ pjdlog_errno(LOG_ERR, "Unable to send control response");
+close:
+ if (nvin != NULL)
+ nv_free(nvin);
+ if (nvout != NULL)
+ nv_free(nvout);
+ proto_close(conn);
+ cfg->hc_controlin = NULL;
+}
+
+/*
+ * Thread handles control requests from the parent.
+ *
+ * 'arg' is the struct hast_resource served by this worker. The thread loops
+ * forever: it reads a request header from res->hr_ctrl, builds an answer
+ * for the 'cmd' it carries (CONTROL_STATUS or CONTROL_RELOAD; anything else
+ * is answered with EINVAL) and sends it back on res->hr_ctrl.
+ * If a request cannot be received the thread exits; unless sigexit_received
+ * is set it first logs the error and sends SIGTERM to its own process.
+ */
+void *
+ctrl_thread(void *arg)
+{
+ struct hast_resource *res = arg;
+ struct nv *nvin, *nvout;
+ uint8_t cmd;
+
+ for (;;) {
+ if (hast_proto_recv_hdr(res->hr_ctrl, &nvin) == -1) {
+ if (sigexit_received)
+ pthread_exit(NULL);
+ pjdlog_errno(LOG_ERR,
+ "Unable to receive control message");
+ kill(getpid(), SIGTERM);
+ pthread_exit(NULL);
+ }
+ cmd = nv_get_uint8(nvin, "cmd");
+ if (cmd == 0) {
+ pjdlog_error("Control message is missing 'cmd' field.");
+ nv_free(nvin);
+ continue;
+ }
+ nvout = nv_alloc();
+ switch (cmd) {
+ case CONTROL_STATUS:
+ if (res->hr_remotein != NULL &&
+ res->hr_remoteout != NULL) {
+ nv_add_string(nvout, "complete", "status");
+ } else {
+ nv_add_string(nvout, "degraded", "status");
+ }
+ nv_add_uint32(nvout, (uint32_t)res->hr_extentsize,
+ "extentsize");
+ if (res->hr_role == HAST_ROLE_PRIMARY) {
+ nv_add_uint32(nvout,
+ (uint32_t)res->hr_keepdirty, "keepdirty");
+ nv_add_uint64(nvout,
+ (uint64_t)(activemap_ndirty(res->hr_amp) *
+ res->hr_extentsize), "dirty");
+ } else {
+ nv_add_uint32(nvout, (uint32_t)0, "keepdirty");
+ nv_add_uint64(nvout, (uint64_t)0, "dirty");
+ }
+ nv_add_uint64(nvout, res->hr_stat_read, "stat_read");
+ nv_add_uint64(nvout, res->hr_stat_write, "stat_write");
+ nv_add_uint64(nvout, res->hr_stat_delete,
+ "stat_delete");
+ nv_add_uint64(nvout, res->hr_stat_flush, "stat_flush");
+ nv_add_uint64(nvout, res->hr_stat_activemap_update,
+ "stat_activemap_update");
+ nv_add_uint64(nvout, res->hr_stat_read_error,
+ "stat_read_error");
+ nv_add_uint64(nvout, res->hr_stat_write_error +
+ res->hr_stat_activemap_write_error,
+ "stat_write_error");
+ nv_add_uint64(nvout, res->hr_stat_delete_error,
+ "stat_delete_error");
+ nv_add_uint64(nvout, res->hr_stat_flush_error +
+ res->hr_stat_activemap_flush_error,
+ "stat_flush_error");
+ nv_add_int16(nvout, 0, "error");
+ break;
+ case CONTROL_RELOAD:
+ /*
+ * When parent receives SIGHUP and discovers that
+ * something related to us has changed, it sends reload
+ * message to us.
+ */
+ PJDLOG_ASSERT(res->hr_role == HAST_ROLE_PRIMARY);
+ primary_config_reload(res, nvin);
+ nv_add_int16(nvout, 0, "error");
+ break;
+ default:
+ nv_add_int16(nvout, EINVAL, "error");
+ break;
+ }
+ nv_free(nvin);
+ if (nv_error(nvout) != 0) {
+ pjdlog_error("Unable to create answer on control message.");
+ nv_free(nvout);
+ continue;
+ }
+ if (hast_proto_send(NULL, res->hr_ctrl, nvout, NULL, 0) == -1) {
+ pjdlog_errno(LOG_ERR,
+ "Unable to send reply to control message");
+ }
+ nv_free(nvout);
+ }
+ /* NOTREACHED */
+ return (NULL);
+}
diff --git a/sbin/hastd/control.h b/sbin/hastd/control.h
new file mode 100644
index 0000000..0795c70
--- /dev/null
+++ b/sbin/hastd/control.h
@@ -0,0 +1,49 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _CONTROL_H_
+#define _CONTROL_H_
+
+#define CONTROL_STATUS 10 /* 'cmd' for ctrl_thread(): report worker status */
+#define CONTROL_RELOAD 11 /* 'cmd' for ctrl_thread(): reload primary config */
+
+struct hastd_config;
+struct hast_resource;
+
+void child_cleanup(struct hast_resource *res);
+
+void control_set_role(struct hast_resource *res, uint8_t role);
+
+void control_handle(struct hastd_config *cfg);
+
+void *ctrl_thread(void *arg);
+
+#endif /* !_CONTROL_H_ */
diff --git a/sbin/hastd/crc32.c b/sbin/hastd/crc32.c
new file mode 100644
index 0000000..e8bc74a
--- /dev/null
+++ b/sbin/hastd/crc32.c
@@ -0,0 +1,115 @@
+/*-
+ * COPYRIGHT (C) 1986 Gary S. Brown. You may use this program, or
+ * code or tables extracted from it, as desired without restriction.
+ */
+
+/*
+ * First, the polynomial itself and its table of feedback terms. The
+ * polynomial is
+ * X^32+X^26+X^23+X^22+X^16+X^12+X^11+X^10+X^8+X^7+X^5+X^4+X^2+X^1+X^0
+ *
+ * Note that we take it "backwards" and put the highest-order term in
+ * the lowest-order bit. The X^32 term is "implied"; the LSB is the
+ * X^31 term, etc. The X^0 term (usually shown as "+1") results in
+ * the MSB being 1
+ *
+ * Note that the usual hardware shift register implementation, which
+ * is what we're using (we're merely optimizing it by doing eight-bit
+ * chunks at a time) shifts bits into the lowest-order term. In our
+ * implementation, that means shifting towards the right. Why do we
+ * do it this way? Because the calculated CRC must be transmitted in
+ * order from highest-order term to lowest-order term. UARTs transmit
+ * characters in order from LSB to MSB. By storing the CRC this way
+ * we hand it to the UART in the order low-byte to high-byte; the UART
+ * sends each low-bit to high-bit; and the result is transmission bit
+ * by bit from highest- to lowest-order term without requiring any bit
+ * shuffling on our part. Reception works similarly
+ *
+ * The feedback terms table consists of 256, 32-bit entries. Notes
+ *
+ * The table can be generated at runtime if desired; code to do so
+ * is shown later. It might not be obvious, but the feedback
+ * terms simply represent the results of eight shift/xor operations
+ * for all combinations of data and CRC register values
+ *
+ * The values must be right-shifted by eight bits by the "updcrc"
+ * logic; the shift must be unsigned (bring in zeroes). On some
+ * hardware you could probably optimize the shift in assembler by
+ * using byte-swap instructions
+ * polynomial $edb88320
+ *
+ *
+ * CRC32 code derived from work by Gary S. Brown.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <stdint.h>
+
+#include <crc32.h>
+
+uint32_t crc32_tab[] = {
+ 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
+ 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
+ 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2,
+ 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
+ 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
+ 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
+ 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c,
+ 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
+ 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
+ 0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
+ 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106,
+ 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
+ 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
+ 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
+ 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950,
+ 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
+ 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
+ 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
+ 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa,
+ 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
+ 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
+ 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
+ 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84,
+ 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
+ 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
+ 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
+ 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e,
+ 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
+ 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
+ 0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
+ 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28,
+ 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
+ 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
+ 0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
+ 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
+ 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
+ 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
+ 0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
+ 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc,
+ 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
+ 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
+ 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
+ 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
+};
+
+/*
+ * A function that calculates the CRC-32 based on the table above is
+ * given below for documentation purposes. An equivalent implementation
+ * of this function that's actually used in the kernel can be found
+ * in sys/libkern.h, where it can be inlined. hastd itself uses the
+ * inline crc32() defined in crc32.h.
+ *
+ * uint32_t
+ * crc32(const void *buf, size_t size)
+ * {
+ * const uint8_t *p = buf;
+ * uint32_t crc;
+ *
+ * crc = ~0U;
+ * while (size--)
+ * crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8);
+ * return crc ^ ~0U;
+ * }
+ */
diff --git a/sbin/hastd/crc32.h b/sbin/hastd/crc32.h
new file mode 100644
index 0000000..3812a83
--- /dev/null
+++ b/sbin/hastd/crc32.h
@@ -0,0 +1,28 @@
+/*-
+ * COPYRIGHT (C) 1986 Gary S. Brown. You may use this program, or
+ * code or tables extracted from it, as desired without restriction.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _CRC32_H_
+#define _CRC32_H_
+
+#include <stdint.h> /* uint32_t */
+#include <stdlib.h> /* size_t */
+
+extern uint32_t crc32_tab[];
+
+static __inline uint32_t
+crc32(const void *buf, size_t size)
+{
+ const uint8_t *p = buf;
+ uint32_t crc;
+ /*
+ * Compute the CRC-32 of 'size' bytes at 'buf', consuming one byte per
+ * step through crc32_tab. The register starts as all ones and the
+ * result is inverted again on return, so size 0 yields 0.
+ */
+ crc = ~0U;
+ while (size--)
+ crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8);
+ return (crc ^ ~0U);
+}
+
+#endif /* !_CRC32_H_ */
diff --git a/sbin/hastd/ebuf.c b/sbin/hastd/ebuf.c
new file mode 100644
index 0000000..1ae2a26
--- /dev/null
+++ b/sbin/hastd/ebuf.c
@@ -0,0 +1,259 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include <pjdlog.h>
+
+#include "ebuf.h"
+
+#ifndef PJDLOG_ASSERT
+#include <assert.h>
+#define PJDLOG_ASSERT(...) assert(__VA_ARGS__)
+#endif
+
+#define EBUF_MAGIC 0xeb0f41c
+struct ebuf {
+ /*
+ * Magic to assert the caller uses valid structure: EBUF_MAGIC while
+ * the buffer is live, reset to 0 by ebuf_free().
+ */
+ int eb_magic;
+ /* Address where we did the allocation. */
+ unsigned char *eb_start;
+ /* Allocation end address (eb_start plus the allocated size). */
+ unsigned char *eb_end;
+ /* Start of real data; room before it is used by ebuf_add_head(). */
+ unsigned char *eb_used;
+ /* Size of real data in bytes, counted from eb_used. */
+ size_t eb_size;
+};
+
+static int ebuf_head_extend(struct ebuf *eb, size_t size);
+static int ebuf_tail_extend(struct ebuf *eb, size_t size);
+
+struct ebuf *
+ebuf_alloc(size_t size)
+{
+ struct ebuf *eb;
+ int rerrno;
+ /*
+ * Create an empty buffer backed by size + PAGE_SIZE bytes of storage.
+ * Returns NULL if either allocation fails; errno from the failed
+ * malloc(3) is preserved across the cleanup.
+ */
+ eb = malloc(sizeof(*eb));
+ if (eb == NULL)
+ return (NULL);
+ size += PAGE_SIZE;
+ eb->eb_start = malloc(size);
+ if (eb->eb_start == NULL) {
+ rerrno = errno;
+ free(eb);
+ errno = rerrno;
+ return (NULL);
+ }
+ eb->eb_end = eb->eb_start + size;
+ /*
+ * We set start address for real data not at the first entry, because
+ * we want to be able to add data at the front.
+ */
+ eb->eb_used = eb->eb_start + PAGE_SIZE / 4;
+ eb->eb_size = 0;
+ eb->eb_magic = EBUF_MAGIC;
+
+ return (eb);
+}
+
+void
+ebuf_free(struct ebuf *eb)
+{
+ /* Release the data storage and the ebuf structure itself. */
+ PJDLOG_ASSERT(eb != NULL && eb->eb_magic == EBUF_MAGIC);
+ /* Clear the magic so that later use of this ebuf trips the asserts. */
+ eb->eb_magic = 0;
+
+ free(eb->eb_start);
+ free(eb);
+}
+
+int
+ebuf_add_head(struct ebuf *eb, const void *data, size_t size)
+{
+ /*
+ * Prepend 'size' bytes to the stored data, growing the buffer first
+ * when there is not enough free room in front of it.
+ * Returns 0 on success, -1 if the buffer could not be extended.
+ */
+ PJDLOG_ASSERT(eb != NULL && eb->eb_magic == EBUF_MAGIC);
+
+ if (size > (size_t)(eb->eb_used - eb->eb_start)) {
+ /*
+ * We can't add more entries at the front, so we have to extend
+ * our buffer.
+ */
+ if (ebuf_head_extend(eb, size) == -1)
+ return (-1);
+ }
+ PJDLOG_ASSERT(size <= (size_t)(eb->eb_used - eb->eb_start));
+
+ eb->eb_size += size;
+ eb->eb_used -= size;
+ /*
+ * If data is NULL the caller just wants to reserve place.
+ */
+ if (data != NULL)
+ bcopy(data, eb->eb_used, size);
+
+ return (0);
+}
+
+int
+ebuf_add_tail(struct ebuf *eb, const void *data, size_t size)
+{
+ /*
+ * Append 'size' bytes to the stored data, growing the buffer first
+ * when there is not enough free room behind it.
+ * Returns 0 on success, -1 if the buffer could not be extended.
+ */
+ PJDLOG_ASSERT(eb != NULL && eb->eb_magic == EBUF_MAGIC);
+
+ if (size > (size_t)(eb->eb_end - (eb->eb_used + eb->eb_size))) {
+ /*
+ * We can't add more entries at the back, so we have to extend
+ * our buffer.
+ */
+ if (ebuf_tail_extend(eb, size) == -1)
+ return (-1);
+ }
+ PJDLOG_ASSERT(size <=
+ (size_t)(eb->eb_end - (eb->eb_used + eb->eb_size)));
+
+ /*
+ * If data is NULL the caller just wants to reserve space.
+ */
+ if (data != NULL)
+ bcopy(data, eb->eb_used + eb->eb_size, size);
+ eb->eb_size += size;
+
+ return (0);
+}
+
+void
+ebuf_del_head(struct ebuf *eb, size_t size)
+{
+ /* Drop 'size' bytes from the front of the data; nothing is freed. */
+ PJDLOG_ASSERT(eb != NULL && eb->eb_magic == EBUF_MAGIC);
+ PJDLOG_ASSERT(size <= eb->eb_size);
+
+ eb->eb_used += size;
+ eb->eb_size -= size;
+}
+
+void
+ebuf_del_tail(struct ebuf *eb, size_t size)
+{
+ /* Drop 'size' bytes from the end of the data; nothing is freed. */
+ PJDLOG_ASSERT(eb != NULL && eb->eb_magic == EBUF_MAGIC);
+ PJDLOG_ASSERT(size <= eb->eb_size);
+
+ eb->eb_size -= size;
+}
+
+/*
+ * Return pointer to the data and data size.
+ * The size is stored in *sizep unless sizep is NULL. The returned pointer
+ * is NULL when the buffer currently holds no data.
+ */
+void *
+ebuf_data(struct ebuf *eb, size_t *sizep)
+{
+
+ PJDLOG_ASSERT(eb != NULL && eb->eb_magic == EBUF_MAGIC);
+
+ if (sizep != NULL)
+ *sizep = eb->eb_size;
+ return (eb->eb_size > 0 ? eb->eb_used : NULL);
+}
+
+/*
+ * Return data size, i.e. the number of bytes currently stored in the
+ * buffer (not the size of the underlying allocation).
+ */
+size_t
+ebuf_size(struct ebuf *eb)
+{
+
+ PJDLOG_ASSERT(eb != NULL && eb->eb_magic == EBUF_MAGIC);
+
+ return (eb->eb_size);
+}
+
+/*
+ * Function adds size + (PAGE_SIZE / 4) bytes at the front of the buffer.
+ *
+ * A new, larger allocation is made, the existing data is copied into it
+ * with the extra room placed in front of it, and the old allocation is
+ * released. Returns 0 on success or -1 if the allocation fails, in which
+ * case the buffer is left untouched.
+ */
+static int
+ebuf_head_extend(struct ebuf *eb, size_t size)
+{
+ unsigned char *newstart, *newused;
+ size_t newsize;
+
+ PJDLOG_ASSERT(eb != NULL && eb->eb_magic == EBUF_MAGIC);
+
+ newsize = eb->eb_end - eb->eb_start + (PAGE_SIZE / 4) + size;
+
+ newstart = malloc(newsize);
+ if (newstart == NULL)
+ return (-1);
+ /* Keep the old headroom and add the requested space before it. */
+ newused =
+ newstart + (PAGE_SIZE / 4) + size + (eb->eb_used - eb->eb_start);
+
+ bcopy(eb->eb_used, newused, eb->eb_size);
+
+ /* The data now lives in the new buffer; do not leak the old one. */
+ free(eb->eb_start);
+
+ eb->eb_start = newstart;
+ eb->eb_used = newused;
+ eb->eb_end = newstart + newsize;
+
+ return (0);
+}
+
+/*
+ * Function adds size + ((3 * PAGE_SIZE) / 4) bytes at the back.
+ *
+ * The allocation is grown with realloc(3), so it may move; all pointers
+ * kept in the ebuf are rebased onto the new block. Returns 0 on success
+ * or -1 if realloc(3) fails, in which case the buffer is left untouched.
+ */
+static int
+ebuf_tail_extend(struct ebuf *eb, size_t size)
+{
+ unsigned char *newstart;
+ size_t newsize, usedoff;
+
+ PJDLOG_ASSERT(eb != NULL && eb->eb_magic == EBUF_MAGIC);
+
+ newsize = eb->eb_end - eb->eb_start + size + ((3 * PAGE_SIZE) / 4);
+ /*
+ * Remember where the data starts as an offset: once realloc(3) has
+ * moved the block, the old pointers must not be used any more, not
+ * even for pointer arithmetic.
+ */
+ usedoff = eb->eb_used - eb->eb_start;
+
+ newstart = realloc(eb->eb_start, newsize);
+ if (newstart == NULL)
+ return (-1);
+
+ eb->eb_used = newstart + usedoff;
+ eb->eb_start = newstart;
+ eb->eb_end = newstart + newsize;
+
+ return (0);
+}
diff --git a/sbin/hastd/ebuf.h b/sbin/hastd/ebuf.h
new file mode 100644
index 0000000..06275e7
--- /dev/null
+++ b/sbin/hastd/ebuf.h
@@ -0,0 +1,51 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _EBUF_H_
+#define _EBUF_H_
+
+#include <stdlib.h> /* size_t */
+
+struct ebuf;
+
+struct ebuf *ebuf_alloc(size_t size);
+void ebuf_free(struct ebuf *eb);
+
+int ebuf_add_head(struct ebuf *eb, const void *data, size_t size);
+int ebuf_add_tail(struct ebuf *eb, const void *data, size_t size);
+
+void ebuf_del_head(struct ebuf *eb, size_t size);
+void ebuf_del_tail(struct ebuf *eb, size_t size);
+
+void *ebuf_data(struct ebuf *eb, size_t *sizep);
+size_t ebuf_size(struct ebuf *eb);
+
+#endif /* !_EBUF_H_ */
diff --git a/sbin/hastd/event.c b/sbin/hastd/event.c
new file mode 100644
index 0000000..ef65df1
--- /dev/null
+++ b/sbin/hastd/event.c
@@ -0,0 +1,161 @@
+/*-
+ * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <errno.h>
+
+#include "hast.h"
+#include "hast_proto.h"
+#include "hooks.h"
+#include "nv.h"
+#include "pjdlog.h"
+#include "proto.h"
+#include "subr.h"
+
+#include "event.h"
+
+void
+event_send(const struct hast_resource *res, int event)
+{
+ struct nv *nvin, *nvout;
+ int error;
+ /*
+ * Send 'event' (EVENT_MIN..EVENT_MAX) over res->hr_event and wait for
+ * the answer. Failures are only logged; nothing is returned.
+ */
+ PJDLOG_ASSERT(res != NULL);
+ PJDLOG_ASSERT(event >= EVENT_MIN && event <= EVENT_MAX);
+
+ nvin = nvout = NULL;
+
+ /*
+ * Prepare and send event to parent process.
+ */
+ nvout = nv_alloc();
+ nv_add_uint8(nvout, (uint8_t)event, "event");
+ error = nv_error(nvout);
+ if (error != 0) {
+ pjdlog_common(LOG_ERR, 0, error,
+ "Unable to prepare event header");
+ goto done;
+ }
+ if (hast_proto_send(res, res->hr_event, nvout, NULL, 0) == -1) {
+ pjdlog_errno(LOG_ERR, "Unable to send event header");
+ goto done;
+ }
+ if (hast_proto_recv_hdr(res->hr_event, &nvin) == -1) {
+ pjdlog_errno(LOG_ERR, "Unable to receive event header");
+ goto done;
+ }
+ /*
+ * The content of the answer is ignored. We wait for it only so that
+ * we do not exit too quickly after sending the event.
+ */
+done:
+ if (nvin != NULL)
+ nv_free(nvin);
+ if (nvout != NULL)
+ nv_free(nvout);
+}
+
+int
+event_recv(const struct hast_resource *res)
+{
+ struct nv *nvin, *nvout;
+ const char *evstr;
+ uint8_t event;
+ int error;
+ /*
+ * Receive one event header from res->hr_event, run the resource's
+ * hook (res->hr_exec) with the event name and the resource name, and
+ * acknowledge the event with an "error" = 0 reply.
+ * Returns 0 on success and -1 on any failure, including a missing or
+ * unknown event number.
+ */
+ PJDLOG_ASSERT(res != NULL);
+
+ nvin = nvout = NULL;
+
+ if (hast_proto_recv_hdr(res->hr_event, &nvin) == -1) {
+ /*
+ * First error log as debug. This is because worker process
+ * most likely exited.
+ */
+ pjdlog_common(LOG_DEBUG, 1, errno,
+ "Unable to receive event header");
+ goto fail;
+ }
+
+ event = nv_get_uint8(nvin, "event");
+ if (event == EVENT_NONE) {
+ pjdlog_error("Event header is missing 'event' field.");
+ goto fail;
+ }
+ /* Translate the event number into the name passed to the hook. */
+ switch (event) {
+ case EVENT_CONNECT:
+ evstr = "connect";
+ break;
+ case EVENT_DISCONNECT:
+ evstr = "disconnect";
+ break;
+ case EVENT_SYNCSTART:
+ evstr = "syncstart";
+ break;
+ case EVENT_SYNCDONE:
+ evstr = "syncdone";
+ break;
+ case EVENT_SYNCINTR:
+ evstr = "syncintr";
+ break;
+ case EVENT_SPLITBRAIN:
+ evstr = "split-brain";
+ break;
+ default:
+ pjdlog_error("Event header contain invalid event number (%hhu).",
+ event);
+ goto fail;
+ }
+ /* Tag log messages with resource and role while the hook is started. */
+ pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role));
+ hook_exec(res->hr_exec, evstr, res->hr_name, NULL);
+ pjdlog_prefix_set("%s", "");
+
+ nvout = nv_alloc();
+ nv_add_int16(nvout, 0, "error");
+ error = nv_error(nvout);
+ if (error != 0) {
+ pjdlog_common(LOG_ERR, 0, error,
+ "Unable to prepare event header");
+ goto fail;
+ }
+ if (hast_proto_send(res, res->hr_event, nvout, NULL, 0) == -1) {
+ pjdlog_errno(LOG_ERR, "Unable to send event header");
+ goto fail;
+ }
+ nv_free(nvin);
+ nv_free(nvout);
+ return (0);
+fail:
+ if (nvin != NULL)
+ nv_free(nvin);
+ if (nvout != NULL)
+ nv_free(nvout);
+ return (-1);
+}
diff --git a/sbin/hastd/event.h b/sbin/hastd/event.h
new file mode 100644
index 0000000..1614bf1
--- /dev/null
+++ b/sbin/hastd/event.h
@@ -0,0 +1,46 @@
+/*-
+ * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _EVENT_H_
+#define _EVENT_H_
+
+#define EVENT_NONE 0 /* not a real event; event_recv() rejects it */
+#define EVENT_CONNECT 1 /* hook is run with "connect" */
+#define EVENT_DISCONNECT 2 /* hook is run with "disconnect" */
+#define EVENT_SYNCSTART 3 /* hook is run with "syncstart" */
+#define EVENT_SYNCDONE 4 /* hook is run with "syncdone" */
+#define EVENT_SYNCINTR 5 /* hook is run with "syncintr" */
+#define EVENT_SPLITBRAIN 6 /* hook is run with "split-brain" */
+
+#define EVENT_MIN EVENT_CONNECT
+#define EVENT_MAX EVENT_SPLITBRAIN
+
+void event_send(const struct hast_resource *res, int event);
+int event_recv(const struct hast_resource *res);
+
+#endif /* !_EVENT_H_ */
diff --git a/sbin/hastd/hast.conf.5 b/sbin/hastd/hast.conf.5
new file mode 100644
index 0000000..3d921e4
--- /dev/null
+++ b/sbin/hastd/hast.conf.5
@@ -0,0 +1,449 @@
+.\" Copyright (c) 2010 The FreeBSD Foundation
+.\" Copyright (c) 2010-2012 Pawel Jakub Dawidek <pawel@dawidek.net>
+.\" All rights reserved.
+.\"
+.\" This documentation was written by Pawel Jakub Dawidek under sponsorship from
+.\" the FreeBSD Foundation.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd January 25, 2012
+.Dt HAST.CONF 5
+.Os
+.Sh NAME
+.Nm hast.conf
+.Nd configuration file for the
+.Xr hastd 8
+daemon and the
+.Xr hastctl 8
+utility
+.Sh DESCRIPTION
+The
+.Nm
+file is used by both
+.Xr hastd 8
+daemon
+and
+.Xr hastctl 8
+control utility.
+Configuration file is designed in a way that exactly the same file can be
+(and should be) used on both HAST nodes.
+Every line starting with # is treated as a comment and ignored.
+.Sh CONFIGURATION FILE SYNTAX
+The general syntax of the
+.Nm
+file is as follows:
+.Bd -literal -offset indent
+# Global section
+control <addr>
+listen <addr>
+replication <mode>
+checksum <algorithm>
+compression <algorithm>
+timeout <seconds>
+exec <path>
+metaflush on | off
+pidfile <path>
+
+on <node> {
+ # Node section
+ control <addr>
+ listen <addr>
+ pidfile <path>
+}
+
+on <node> {
+ # Node section
+ control <addr>
+ listen <addr>
+ pidfile <path>
+}
+
+resource <name> {
+ # Resource section
+ replication <mode>
+ checksum <algorithm>
+ compression <algorithm>
+ name <name>
+ local <path>
+ timeout <seconds>
+ exec <path>
+ metaflush on | off
+
+ on <node> {
+ # Resource-node section
+ name <name>
+ # Required
+ local <path>
+ metaflush on | off
+ # Required
+ remote <addr>
+ source <addr>
+ }
+ on <node> {
+ # Resource-node section
+ name <name>
+ # Required
+ local <path>
+ metaflush on | off
+ # Required
+ remote <addr>
+ source <addr>
+ }
+}
+.Ed
+.Pp
+Most of the various available configuration parameters are optional.
+If a parameter is not defined in a particular section, it will be
+inherited from the parent section.
+For example, if the
+.Ic listen
+parameter is not defined in the node section, it will be inherited from
+the global section.
+In case the global section does not define the
+.Ic listen
+parameter at all, the default value will be used.
+.Sh CONFIGURATION FILE DESCRIPTION
+The
+.Aq node
+argument can be replaced either by a full hostname as obtained by
+.Xr gethostname 3 ,
+only first part of the hostname, by node's UUID as found in the
+.Va kern.hostuuid
+.Xr sysctl 8
+variable
+or by node's hostid as found in the
+.Va kern.hostid
+.Xr sysctl 8
+variable.
+.Pp
+The following statements are available:
+.Bl -tag -width ".Ic xxxx"
+.It Ic control Aq addr
+.Pp
+Address for communication with
+.Xr hastctl 8 .
+Each of the following examples defines the same control address:
+.Bd -literal -offset indent
+uds:///var/run/hastctl
+unix:///var/run/hastctl
+/var/run/hastctl
+.Ed
+.Pp
+The default value is
+.Pa uds:///var/run/hastctl .
+.It Ic pidfile Aq path
+.Pp
+File in which to store the process ID of the main
+.Xr hastd 8
+process.
+.Pp
+The default value is
+.Pa /var/run/hastd.pid .
+.It Ic listen Aq addr
+.Pp
+Address to listen on in form of:
+.Bd -literal -offset indent
+protocol://protocol-specific-address
+.Ed
+.Pp
+Each of the following examples defines the same listen address:
+.Bd -literal -offset indent
+0.0.0.0
+0.0.0.0:8457
+tcp://0.0.0.0
+tcp://0.0.0.0:8457
+tcp4://0.0.0.0
+tcp4://0.0.0.0:8457
+.Ed
+.Pp
+Multiple listen addresses can be specified.
+By default
+.Nm hastd
+listens on
+.Pa tcp4://0.0.0.0:8457
+and
+.Pa tcp6://[::]:8457
+if kernel supports IPv4 and IPv6 respectively.
+.It Ic replication Aq mode
+.Pp
+Replication mode should be one of the following:
+.Bl -tag -width ".Ic xxxx"
+.It Ic memsync
+.Pp
+Report the write operation as completed when local write completes and
+when the remote node acknowledges the data receipt, but before it
+actually stores the data.
+The data on remote node will be stored directly after sending
+acknowledgement.
+This mode is intended to reduce latency, but still provides very good
+reliability.
+The only situation where some small amount of data could be lost is when
+the data is stored on primary node and sent to the secondary.
+Secondary node then acknowledges data receipt and primary reports
+success to an application.
+However, it may happen that the secondary goes down before the received
+data is really stored locally.
+Before secondary node returns, primary node dies entirely.
+When the secondary node comes back to life it becomes the new primary.
+Unfortunately, some small amount of data which was reported to the
+application as stored has been lost.
+The risk of such a situation is very small.
+The
+.Ic memsync
+replication mode is the default.
+.It Ic fullsync
+.Pp
+Mark the write operation as completed when local as well as remote
+write completes.
+This is the safest and the slowest replication mode.
+.It Ic async
+.Pp
+The write operation is reported as complete right after the local write
+completes.
+This is the fastest and the most dangerous replication mode.
+This mode should be used when replicating to a distant node where
+latency is too high for other modes.
+.El
+.It Ic checksum Aq algorithm
+.Pp
+Checksum algorithm should be one of the following:
+.Bl -tag -width ".Ic sha256"
+.It Ic none
+No checksum will be calculated for the data being sent over the network.
+This is the default setting.
+.It Ic crc32
+CRC32 checksum will be calculated.
+.It Ic sha256
+SHA256 checksum will be calculated.
+.El
+.It Ic compression Aq algorithm
+.Pp
+Compression algorithm should be one of the following:
+.Bl -tag -width ".Ic none"
+.It Ic none
+Data sent over the network will not be compressed.
+.It Ic hole
+Only blocks that contain all zeros will be compressed.
+This is very useful for initial synchronization where potentially many blocks
+are still all zeros.
+There should be no measurable performance overhead when this algorithm is being
+used.
+This is the default setting.
+.It Ic lzf
+The LZF algorithm by Marc Alexander Lehmann will be used to compress the data
+sent over the network.
+LZF is a very fast, general-purpose compression algorithm.
+.El
+.It Ic timeout Aq seconds
+.Pp
+Connection timeout in seconds.
+The default value is
+.Va 20 .
+.It Ic exec Aq path
+.Pp
+Execute the given program on various HAST events.
+Below is the list of currently implemented events and arguments the given
+program is executed with:
+.Bl -tag -width ".Ic xxxx"
+.It Ic "<path> role <resource> <oldrole> <newrole>"
+.Pp
+Executed on both primary and secondary nodes when resource role is changed.
+.Pp
+.It Ic "<path> connect <resource>"
+.Pp
+Executed on both primary and secondary nodes when connection for the given
+resource between the nodes is established.
+.Pp
+.It Ic "<path> disconnect <resource>"
+.Pp
+Executed on both primary and secondary nodes when connection for the given
+resource between the nodes is lost.
+.Pp
+.It Ic "<path> syncstart <resource>"
+.Pp
+Executed on primary node when synchronization process of secondary node is
+started.
+.Pp
+.It Ic "<path> syncdone <resource>"
+.Pp
+Executed on primary node when synchronization process of secondary node is
+completed successfully.
+.Pp
+.It Ic "<path> syncintr <resource>"
+.Pp
+Executed on primary node when synchronization process of secondary node is
+interrupted, most likely due to secondary node outage or connection failure
+between the nodes.
+.Pp
+.It Ic "<path> split-brain <resource>"
+.Pp
+Executed on both primary and secondary nodes when split-brain condition is
+detected.
+.Pp
+.El
+The
+.Aq path
+argument should contain full path to executable program.
+If the given program exits with code different than
+.Va 0 ,
+.Nm hastd
+will log it as an error.
+.Pp
+The
+.Aq resource
+argument is resource name from the configuration file.
+.Pp
+The
+.Aq oldrole
+argument is previous resource role (before the change).
+It can be one of:
+.Ar init ,
+.Ar secondary ,
+.Ar primary .
+.Pp
+The
+.Aq newrole
+argument is current resource role (after the change).
+It can be one of:
+.Ar init ,
+.Ar secondary ,
+.Ar primary .
+.Pp
+.It Ic metaflush on | off
+.Pp
+When set to
+.Va on ,
+flush write cache of the local provider after every metadata (activemap) update.
+Flushing write cache ensures that provider will not reorder writes and that
+metadata will be properly updated before real data is stored.
+If the local provider does not support flushing write cache (it returns
+.Er EOPNOTSUPP
+on the
+.Cm BIO_FLUSH
+request),
+.Nm hastd
+will disable
+.Ic metaflush
+automatically.
+The default value is
+.Va on .
+.Pp
+.It Ic name Aq name
+.Pp
+GEOM provider name that will appear as
+.Pa /dev/hast/<name> .
+If name is not defined, resource name will be used as provider name.
+.It Ic local Aq path
+.Pp
+Path to the local component which will be used as backend provider for
+the resource.
+This can be either GEOM provider or regular file.
+.It Ic remote Aq addr
+.Pp
+Address of the remote
+.Nm hastd
+daemon.
+Format is the same as for the
+.Ic listen
+statement.
+When operating as a primary node this address will be used to connect to
+the secondary node.
+When operating as a secondary node only connections from this address
+will be accepted.
+.Pp
+A special value of
+.Va none
+can be used when the remote address is not yet known (e.g., the other node is not
+set up yet).
+.It Ic source Aq addr
+.Pp
+Local address to bind to before connecting to the remote
+.Nm hastd
+daemon.
+Format is the same as for the
+.Ic listen
+statement.
+.El
+.Sh FILES
+.Bl -tag -width ".Pa /var/run/hastctl" -compact
+.It Pa /etc/hast.conf
+The default
+.Xr hastctl 8
+and
+.Xr hastd 8
+configuration file.
+.It Pa /var/run/hastctl
+Control socket used by the
+.Xr hastctl 8
+control utility to communicate with the
+.Xr hastd 8
+daemon.
+.El
+.Sh EXAMPLES
+The example configuration file can look as follows:
+.Bd -literal -offset indent
+listen tcp://0.0.0.0
+
+on hasta {
+ listen tcp://2001:db8::1/64
+}
+on hastb {
+ listen tcp://2001:db8::2/64
+}
+
+resource shared {
+ local /dev/da0
+
+ on hasta {
+ remote tcp://10.0.0.2
+ }
+ on hastb {
+ remote tcp://10.0.0.1
+ }
+}
+resource tank {
+ on hasta {
+ local /dev/mirror/tanka
+ source tcp://10.0.0.1
+ remote tcp://10.0.0.2
+ }
+ on hastb {
+ local /dev/mirror/tankb
+ source tcp://10.0.0.2
+ remote tcp://10.0.0.1
+ }
+}
+.Ed
+.Sh SEE ALSO
+.Xr gethostname 3 ,
+.Xr geom 4 ,
+.Xr hastctl 8 ,
+.Xr hastd 8
+.Sh AUTHORS
+The
+.Nm
+was written by
+.An Pawel Jakub Dawidek Aq pjd@FreeBSD.org
+under sponsorship of the FreeBSD Foundation.
diff --git a/sbin/hastd/hast.h b/sbin/hastd/hast.h
new file mode 100644
index 0000000..65c24f8
--- /dev/null
+++ b/sbin/hastd/hast.h
@@ -0,0 +1,264 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HAST_H_
+#define _HAST_H_
+
+#include <sys/queue.h>
+#include <sys/socket.h>
+
+#include <arpa/inet.h>
+
+#include <netinet/in.h>
+
+#include <limits.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+#include <activemap.h>
+
+#include "proto.h"
+
+/*
+ * Version history:
+ * 0 - initial version
+ * 1 - HIO_KEEPALIVE added
+ * 2 - "memsync" and "received" attributes added for memsync mode
+ */
+#define HAST_PROTO_VERSION 2 /* Newest entry of the version history above. */
+
+#define EHAST_OK 0 /* EHAST_*: status codes; their consumers are outside this header. */
+#define EHAST_NOENTRY 1
+#define EHAST_INVALID 2
+#define EHAST_NOMEMORY 3
+#define EHAST_UNIMPLEMENTED 4
+
+#define HASTCTL_CMD_UNKNOWN 0 /* NOTE(review): HASTCTL_CMD_* presumably are hastctl(8) request codes -- confirm in control.c. */
+#define HASTCTL_CMD_SETROLE 1
+#define HASTCTL_CMD_STATUS 2
+
+#define HAST_ROLE_UNDEF 0 /* HAST_ROLE_*: values of hr_role and hr_previous_role. */
+#define HAST_ROLE_INIT 1
+#define HAST_ROLE_PRIMARY 2
+#define HAST_ROLE_SECONDARY 3
+
+#define HAST_SYNCSRC_UNDEF 0 /* NOTE(review): HAST_SYNCSRC_* presumably are the values of hr_syncsrc -- confirm. */
+#define HAST_SYNCSRC_PRIMARY 1
+#define HAST_SYNCSRC_SECONDARY 2
+
+#define HIO_UNDEF 0 /* NOTE(review): HIO_* presumably are I/O request kinds mirroring BIO_* -- confirm. */
+#define HIO_READ 1
+#define HIO_WRITE 2
+#define HIO_DELETE 3
+#define HIO_FLUSH 4
+#define HIO_KEEPALIVE 5 /* Added in protocol version 1 (see version history). */
+
+#define HAST_USER "hast" /* NOTE(review): presumably the unprivileged user the daemon runs as -- confirm in subr.c. */
+#define HAST_TIMEOUT 20 /* Seconds; hast.conf(5) documents 20 as the default timeout. */
+#define HAST_CONFIG "/etc/hast.conf" /* Default configuration file. */
+#define HAST_CONTROL "/var/run/hastctl" /* Default control socket path. */
+#define HASTD_LISTEN_TCP4 "tcp4://0.0.0.0:8457" /* Default IPv4 listen address. */
+#define HASTD_LISTEN_TCP6 "tcp6://[::]:8457" /* Default IPv6 listen address. */
+#define HASTD_PIDFILE "/var/run/hastd.pid" /* Default PID file. */
+
+/* Default extent size. */
+#define HAST_EXTENTSIZE 2097152 /* 2 * 1024 * 1024 */
+/* Default maximum number of extents that are kept dirty. */
+#define HAST_KEEPDIRTY 64
+
+#define HAST_ADDRSIZE 1024 /* Size of the address buffers (hl_addr, hc_controladdr, hr_remoteaddr, hr_sourceaddr). */
+#define HAST_TOKEN_SIZE 16 /* Size of hr_token, in bytes. */
+
+/* Number of seconds to sleep between reconnect retries or keepalive packets. */
+#define HAST_KEEPALIVE 10
+
+struct hastd_listen { /* One address the daemon listens on. */
+ /* Address to listen on. */
+ char hl_addr[HAST_ADDRSIZE];
+ /* Protocol-specific data. */
+ struct proto_conn *hl_conn;
+ TAILQ_ENTRY(hastd_listen) hl_next; /* Linkage for hastd_config.hc_listen. */
+};
+
+struct hastd_config { /* Whole configuration; the type returned by yy_config_parse(). */
+ /* Address to communicate with hastctl(8). */
+ char hc_controladdr[HAST_ADDRSIZE];
+ /* Protocol-specific data. */
+ struct proto_conn *hc_controlconn;
+ /* Incoming control connection. */
+ struct proto_conn *hc_controlin;
+ /* PID file path. */
+ char hc_pidfile[PATH_MAX];
+ /* List of addresses to listen on. */
+ TAILQ_HEAD(, hastd_listen) hc_listen; /* Elements are linked through hl_next. */
+ /* List of resources. */
+ TAILQ_HEAD(, hast_resource) hc_resources; /* Elements are linked through hr_next. */
+};
+
+#define HAST_REPLICATION_FULLSYNC 0 /* HAST_REPLICATION_*: values of hr_replication and hr_original_replication. */
+#define HAST_REPLICATION_MEMSYNC 1
+#define HAST_REPLICATION_ASYNC 2
+
+#define HAST_COMPRESSION_NONE 0 /* HAST_COMPRESSION_*: values of hr_compression; compression_name() yields "none", "hole", "lzf". */
+#define HAST_COMPRESSION_HOLE 1
+#define HAST_COMPRESSION_LZF 2
+
+#define HAST_CHECKSUM_NONE 0 /* HAST_CHECKSUM_*: values of hr_checksum; checksum_name() yields "none", "crc32", "sha256". */
+#define HAST_CHECKSUM_CRC32 1
+#define HAST_CHECKSUM_SHA256 2
+
+/*
+ * Structure that describes a single resource.
+ */
+struct hast_resource {
+ /* Resource name. */
+ char hr_name[NAME_MAX];
+ /* Negotiated replication mode (HAST_REPLICATION_*). */
+ int hr_replication;
+ /* Configured replication mode (HAST_REPLICATION_*). */
+ int hr_original_replication;
+ /* Provider name that will appear in /dev/hast/. */
+ char hr_provname[NAME_MAX];
+ /* Synchronization extent size. */
+ int hr_extentsize;
+ /* Maximum number of extents that are kept dirty. */
+ int hr_keepdirty;
+ /* Path to a program to execute on various events. */
+ char hr_exec[PATH_MAX];
+ /* Compression algorithm (HAST_COMPRESSION_*). */
+ int hr_compression;
+ /* Checksum algorithm (HAST_CHECKSUM_*). */
+ int hr_checksum;
+ /* Protocol version. */
+ int hr_version;
+
+ /* Path to local component. */
+ char hr_localpath[PATH_MAX];
+ /* Descriptor to access local component. */
+ int hr_localfd;
+ /* Offset into local component. */
+ off_t hr_localoff;
+ /* Size of usable space. */
+ off_t hr_datasize;
+ /* Size of entire local provider. */
+ off_t hr_local_mediasize;
+ /* Sector size of local provider. */
+ unsigned int hr_local_sectorsize;
+ /* Is flushing write cache supported by the local provider? */
+ bool hr_localflush;
+ /* Flush write cache on metadata updates? ("metaflush" in hast.conf(5)) */
+ int hr_metaflush;
+
+ /* Descriptor for /dev/ggctl communication. */
+ int hr_ggatefd;
+ /* Unit number for ggate communication. */
+ int hr_ggateunit;
+
+ /* Address of the remote component. */
+ char hr_remoteaddr[HAST_ADDRSIZE];
+ /* Local address to bind to for outgoing connections. */
+ char hr_sourceaddr[HAST_ADDRSIZE];
+ /* Connection for incoming data. */
+ struct proto_conn *hr_remotein;
+ /* Connection for outgoing data. */
+ struct proto_conn *hr_remoteout;
+ /* Token to verify that both the in and out connections come from
+ the same node (not necessarily from the same address). */
+ unsigned char hr_token[HAST_TOKEN_SIZE];
+ /* Connection timeout in seconds. */
+ int hr_timeout;
+
+ /* Resource unique identifier. */
+ uint64_t hr_resuid;
+ /* Primary's local modification count. */
+ uint64_t hr_primary_localcnt;
+ /* Primary's remote modification count. */
+ uint64_t hr_primary_remotecnt;
+ /* Secondary's local modification count. */
+ uint64_t hr_secondary_localcnt;
+ /* Secondary's remote modification count. */
+ uint64_t hr_secondary_remotecnt;
+ /* Synchronization source (presumably HAST_SYNCSRC_* - confirm). */
+ uint8_t hr_syncsrc;
+
+ /* Resource role: HAST_ROLE_{INIT,PRIMARY,SECONDARY}. */
+ int hr_role;
+ /* Previous resource role: HAST_ROLE_{INIT,PRIMARY,SECONDARY}. */
+ int hr_previous_role;
+ /* PID of child worker process. 0 - no child. */
+ pid_t hr_workerpid;
+ /* Control commands from parent to child. */
+ struct proto_conn *hr_ctrl;
+ /* Events from child to parent. */
+ struct proto_conn *hr_event;
+ /* Connection requests from child to parent. */
+ struct proto_conn *hr_conn;
+
+ /* Activemap structure. */
+ struct activemap *hr_amp;
+ /* Lock used to synchronize access to hr_amp. */
+ pthread_mutex_t hr_amp_lock;
+ /* Lock used to synchronize access to hr_amp diskmap. */
+ pthread_mutex_t hr_amp_diskmap_lock;
+
+ /* Number of BIO_READ requests. */
+ uint64_t hr_stat_read;
+ /* Number of BIO_WRITE requests. */
+ uint64_t hr_stat_write;
+ /* Number of BIO_DELETE requests. */
+ uint64_t hr_stat_delete;
+ /* Number of BIO_FLUSH requests. */
+ uint64_t hr_stat_flush;
+ /* Number of activemap updates. */
+ uint64_t hr_stat_activemap_update;
+ /* Number of local read errors. */
+ uint64_t hr_stat_read_error;
+ /* Number of local write errors. */
+ uint64_t hr_stat_write_error;
+ /* Number of local delete errors. */
+ uint64_t hr_stat_delete_error;
+ /* Number of flush errors. */
+ uint64_t hr_stat_flush_error;
+ /* Number of activemap write errors. */
+ uint64_t hr_stat_activemap_write_error;
+ /* Number of activemap flush errors. */
+ uint64_t hr_stat_activemap_flush_error;
+
+ /* Next resource (list linkage, e.g. for hastd_config.hc_resources). */
+ TAILQ_ENTRY(hast_resource) hr_next;
+};
+
+struct hastd_config *yy_config_parse(const char *config, bool exitonerror); /* NOTE(review): presumably parses the file named by config; failure behaviour depends on exitonerror -- confirm in parse.y. */
+void yy_config_free(struct hastd_config *config); /* NOTE(review): presumably releases a configuration obtained from yy_config_parse() -- confirm in parse.y. */
+
+#endif /* !_HAST_H_ */
diff --git a/sbin/hastd/hast_checksum.c b/sbin/hastd/hast_checksum.c
new file mode 100644
index 0000000..795744e
--- /dev/null
+++ b/sbin/hastd/hast_checksum.c
@@ -0,0 +1,160 @@
+/*-
+ * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <errno.h>
+#include <string.h>
+#include <strings.h>
+
+#ifdef HAVE_CRYPTO
+#include <openssl/sha.h>
+#endif
+
+#include <crc32.h>
+#include <hast.h>
+#include <nv.h>
+#include <pjdlog.h>
+
+#include "hast_checksum.h"
+
+#ifdef HAVE_CRYPTO
+#define MAX_HASH_SIZE SHA256_DIGEST_LENGTH /* Sized for the SHA256 digest written by hast_sha256_checksum(). */
+#else
+#define MAX_HASH_SIZE 4 /* Only CRC32 is compiled in; it stores a uint32_t. */
+#endif
+
+/*
+ * Compute the CRC32 of 'size' bytes starting at 'data'.
+ *
+ * The 32-bit result is copied into 'hash' in host byte order and its
+ * length (sizeof(uint32_t)) is stored in '*hsizep'.  'hash' must have
+ * room for at least that many bytes.
+ */
+static void
+hast_crc32_checksum(const unsigned char *data, size_t size,
+    unsigned char *hash, size_t *hsizep)
+{
+	const uint32_t sum = crc32(data, size);
+
+	/* XXXPJD: Do we have to use htole32() on crc first? */
+	memcpy(hash, &sum, sizeof(sum));
+	*hsizep = sizeof(sum);
+}
+
+#ifdef HAVE_CRYPTO
+/*
+ * Compute the SHA256 digest of 'size' bytes starting at 'data'.
+ *
+ * The digest is written to 'hash', which must have room for
+ * SHA256_DIGEST_LENGTH bytes; that length is stored in '*hsizep'.
+ */
+static void
+hast_sha256_checksum(const unsigned char *data, size_t size,
+    unsigned char *hash, size_t *hsizep)
+{
+	SHA256_CTX sha;
+
+	*hsizep = SHA256_DIGEST_LENGTH;
+
+	SHA256_Init(&sha);
+	SHA256_Update(&sha, data, size);
+	SHA256_Final(hash, &sha);
+}
+#endif /* HAVE_CRYPTO */
+
+/*
+ * Translate a HAST_CHECKSUM_* constant into its textual name.
+ * Every other value is reported as "unknown".
+ */
+const char *
+checksum_name(int num)
+{
+	static const char *const names[] = {
+		[HAST_CHECKSUM_NONE] = "none",
+		[HAST_CHECKSUM_CRC32] = "crc32",
+		[HAST_CHECKSUM_SHA256] = "sha256",
+	};
+
+	if (num < 0 || (size_t)num >= sizeof(names) / sizeof(names[0]) ||
+	    names[num] == NULL)
+		return ("unknown");
+	return (names[num]);
+}
+
+/*
+ * Send-side checksum stage.
+ *
+ * When the resource is configured with a checksum algorithm, hash the
+ * buffer described by '*datap'/'*sizep' and record both the algorithm name
+ * ("checksum") and the digest ("hash") in 'nv'.  The data itself is never
+ * modified, which is why 'freedatap' is unused.
+ *
+ * Returns 0 on success (including HAST_CHECKSUM_NONE, where nothing is
+ * added) or -1 with errno set to the nv error.  An algorithm that is not
+ * compiled in aborts the process.
+ */
+int
+checksum_send(const struct hast_resource *res, struct nv *nv, void **datap,
+    size_t *sizep, bool *freedatap __unused)
+{
+	unsigned char digest[MAX_HASH_SIZE];
+	size_t dlen;
+
+	/* Checksumming disabled: leave the header untouched. */
+	if (res->hr_checksum == HAST_CHECKSUM_NONE)
+		return (0);
+
+	switch (res->hr_checksum) {
+	case HAST_CHECKSUM_CRC32:
+		hast_crc32_checksum(*datap, *sizep, digest, &dlen);
+		break;
+#ifdef HAVE_CRYPTO
+	case HAST_CHECKSUM_SHA256:
+		hast_sha256_checksum(*datap, *sizep, digest, &dlen);
+		break;
+#endif
+	default:
+		PJDLOG_ABORT("Invalid checksum: %d.", res->hr_checksum);
+	}
+
+	nv_add_string(nv, checksum_name(res->hr_checksum), "checksum");
+	nv_add_uint8_array(nv, digest, dlen, "hash");
+	if (nv_error(nv) == 0)
+		return (0);
+
+	errno = nv_error(nv);
+	return (-1);
+}
+
+/*
+ * Receive-side checksum stage.
+ *
+ * If the header 'nv' names a checksum algorithm, recompute the hash of the
+ * received buffer ('*datap', '*sizep') and compare it with the "hash"
+ * array carried in the header.  The data is only read, never replaced.
+ *
+ * Returns 0 when no checksum was sent or when the hashes match, and -1
+ * (after logging the reason) when the hash is missing, the algorithm is
+ * not known, or the hash size or contents differ.
+ */
+int
+checksum_recv(const struct hast_resource *res __unused, struct nv *nv,
+    void **datap, size_t *sizep, bool *freedatap __unused)
+{
+	unsigned char local[MAX_HASH_SIZE];
+	const unsigned char *remote;
+	size_t locallen, remotelen;
+	const char *algo;
+
+	algo = nv_get_string(nv, "checksum");
+	if (algo == NULL) {
+		/* No checksum. */
+		return (0);
+	}
+
+	remote = nv_get_uint8_array(nv, &remotelen, "hash");
+	if (remote == NULL) {
+		/* Hash not found. */
+		pjdlog_error("Hash is missing.");
+		return (-1);
+	}
+
+	if (strcmp(algo, "crc32") == 0) {
+		hast_crc32_checksum(*datap, *sizep, local, &locallen);
+#ifdef HAVE_CRYPTO
+	} else if (strcmp(algo, "sha256") == 0) {
+		hast_sha256_checksum(*datap, *sizep, local, &locallen);
+#endif
+	} else {
+		/* Unknown checksum algorithm. */
+		pjdlog_error("Unknown checksum algorithm '%s'.", algo);
+		return (-1);
+	}
+
+	if (remotelen != locallen) {
+		/* Different hash size. */
+		pjdlog_error("Invalid hash size (%zu) for %s, should be %zu.",
+		    remotelen, algo, locallen);
+		return (-1);
+	}
+	if (memcmp(remote, local, locallen) != 0) {
+		/* Hash mismatch. */
+		pjdlog_error("Hash mismatch.");
+		return (-1);
+	}
+
+	return (0);
+}
diff --git a/sbin/hastd/hast_checksum.h b/sbin/hastd/hast_checksum.h
new file mode 100644
index 0000000..9799828
--- /dev/null
+++ b/sbin/hastd/hast_checksum.h
@@ -0,0 +1,44 @@
+/*-
+ * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HAST_CHECKSUM_H_
+#define _HAST_CHECKSUM_H_
+
+#include <stdlib.h> /* size_t */
+
+#include <hast.h>
+#include <nv.h>
+
+const char *checksum_name(int num); /* Name of a HAST_CHECKSUM_* value, "unknown" for anything else. */
+
+int checksum_send(const struct hast_resource *res, struct nv *nv,
+ void **datap, size_t *sizep, bool *freedatap); /* 0 on success, -1 with errno set to the nv error. */
+int checksum_recv(const struct hast_resource *res, struct nv *nv,
+ void **datap, size_t *sizep, bool *freedatap); /* 0 if no checksum was sent or it matches, -1 otherwise. */
+
+#endif /* !_HAST_CHECKSUM_H_ */
diff --git a/sbin/hastd/hast_compression.c b/sbin/hastd/hast_compression.c
new file mode 100644
index 0000000..f524eb1
--- /dev/null
+++ b/sbin/hastd/hast_compression.c
@@ -0,0 +1,283 @@
+/*-
+ * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/endian.h>
+
+#include <errno.h>
+#include <string.h>
+#include <strings.h>
+
+#include <hast.h>
+#include <lzf.h>
+#include <nv.h>
+#include <pjdlog.h>
+
+#include "hast_compression.h"
+
+/*
+ * Return true if the 'size' bytes at 'data' are all zero.
+ *
+ * 'size' must be a multiple of sizeof(uint64_t); this is asserted.  The
+ * buffer is read as an array of uint64_t, so 'data' is expected to be
+ * suitably aligned for that access.
+ *
+ * An empty buffer contains no non-zero byte and yields true.  Handling it
+ * up front also keeps the probes below from indexing before or past the
+ * buffer when there is not a single word to look at.
+ */
+static bool
+allzeros(const void *data, size_t size)
+{
+	const uint64_t *p = data;
+	size_t i, nwords;
+	uint64_t v;
+
+	PJDLOG_ASSERT((size % sizeof(*p)) == 0);
+
+	nwords = size / sizeof(*p);
+	if (nwords == 0)
+		return (true);
+
+	/*
+	 * This is the fastest method I found for checking if the given
+	 * buffer contain all zeros.
+	 * Because inside the loop we don't check at every step, we would
+	 * get an answer only after walking through entire buffer.
+	 * To return early if the buffer doesn't contain all zeros, we probe
+	 * 8 bytes at the beginning, in the middle and at the end of the buffer
+	 * first.
+	 */
+	if ((p[0] | p[nwords >> 1] | p[nwords - 1]) != 0)
+		return (false);
+
+	/* The index has the same width as the bound, so it cannot wrap. */
+	v = 0;
+	for (i = 0; i < nwords; i++)
+		v |= p[i];
+	return (v == 0);
+}
+
+/*
+ * "hole" compression: a buffer that consists only of zeros is replaced by
+ * a 4-byte little-endian header holding its original length.
+ *
+ * On success a newly allocated header is returned (the caller frees it)
+ * and '*sizep' becomes the header size.  NULL is returned, with '*sizep'
+ * left untouched, when the buffer is not all zeros or when the header
+ * cannot be allocated.
+ */
+static void *
+hast_hole_compress(const unsigned char *data, size_t *sizep)
+{
+	uint32_t lesize;
+	void *hdr;
+
+	if (!allzeros(data, *sizep))
+		return (NULL);
+
+	hdr = malloc(sizeof(lesize));
+	if (hdr == NULL) {
+		pjdlog_warning("Unable to compress (no memory: %zu).",
+		    (size_t)*sizep);
+		return (NULL);
+	}
+
+	lesize = htole32((uint32_t)*sizep);
+	memcpy(hdr, &lesize, sizeof(lesize));
+	*sizep = sizeof(lesize);
+
+	return (hdr);
+}
+
+/*
+ * Undo "hole" compression: 'data' must be exactly the 4-byte
+ * little-endian length header; the result is a newly allocated,
+ * zero-filled buffer of that length.
+ *
+ * On success the buffer is returned (the caller frees it) and '*sizep'
+ * is set to its length.  NULL is returned after logging an error when
+ * the input has the wrong size or the allocation fails.
+ */
+static void *
+hast_hole_decompress(const unsigned char *data, size_t *sizep)
+{
+	uint32_t holesize;
+	void *zeros;
+
+	if (*sizep != sizeof(holesize)) {
+		pjdlog_error("Unable to decompress (invalid size: %zu).",
+		    *sizep);
+		return (NULL);
+	}
+
+	memcpy(&holesize, data, sizeof(holesize));
+	holesize = le32toh(holesize);
+
+	/* Allocate the buffer already cleared. */
+	zeros = calloc(1, holesize);
+	if (zeros == NULL) {
+		pjdlog_error("Unable to decompress (no memory: %zu).",
+		    (size_t)holesize);
+		return (NULL);
+	}
+	*sizep = holesize;
+
+	return (zeros);
+}
+
+/* Minimum block size to try to compress. */
+#define HAST_LZF_COMPRESS_MIN 1024
+
+/*
+ * LZF compression.
+ *
+ * The output consists of a 4-byte little-endian header with the original
+ * length, followed by the LZF stream.  The room offered to lzf_compress()
+ * is the original length minus HAST_LZF_COMPRESS_MIN, so a result is only
+ * accepted when it is at least that much smaller than the input.
+ *
+ * On success the newly allocated output is returned (the caller frees it)
+ * and '*sizep' becomes its total length.  NULL is returned, with '*sizep'
+ * left untouched, when the input is not larger than
+ * HAST_LZF_COMPRESS_MIN, when memory cannot be allocated, or when
+ * lzf_compress() reports failure by returning 0.
+ */
+static void *
+hast_lzf_compress(const unsigned char *data, size_t *sizep)
+{
+	unsigned char *out;
+	uint32_t origsize, leorig;
+	size_t bufsize, clen;
+
+	origsize = *sizep;
+	if (origsize <= HAST_LZF_COMPRESS_MIN)
+		return (NULL);
+
+	bufsize = sizeof(origsize) + origsize - HAST_LZF_COMPRESS_MIN;
+	out = malloc(bufsize);
+	if (out == NULL) {
+		pjdlog_warning("Unable to compress (no memory: %zu).",
+		    bufsize);
+		return (NULL);
+	}
+
+	/* The stream goes right behind the length header. */
+	clen = lzf_compress(data, *sizep, out + sizeof(origsize),
+	    bufsize - sizeof(origsize));
+	if (clen == 0) {
+		free(out);
+		return (NULL);
+	}
+
+	leorig = htole32(origsize);
+	memcpy(out, &leorig, sizeof(leorig));
+	*sizep = sizeof(origsize) + clen;
+
+	return (out);
+}
+
+/*
+ * Undo LZF compression.
+ *
+ * 'data' is expected to hold a 4-byte little-endian header with the
+ * original length followed by the LZF stream, as produced by
+ * hast_lzf_compress().  The buffer comes from the other node, so its
+ * contents are validated and rejected with an error instead of being
+ * asserted: a malformed packet must fail the request, not take the
+ * process down.
+ *
+ * On success a newly allocated buffer with the original data is returned
+ * (the caller frees it) and '*sizep' is set to its length.  NULL is
+ * returned after logging an error when
+ *  - the input is too short to hold the header plus any payload,
+ *  - the recorded original length is not larger than
+ *    HAST_LZF_COMPRESS_MIN (the compressor never emits such blocks),
+ *  - memory cannot be allocated, or
+ *  - the stream does not decompress to exactly the recorded length.
+ */
+static void *
+hast_lzf_decompress(const unsigned char *data, size_t *sizep)
+{
+	unsigned char *newbuf;
+	uint32_t origsize;
+	size_t newsize;
+
+	if (*sizep <= sizeof(origsize)) {
+		pjdlog_error("Unable to decompress (invalid size: %zu).",
+		    *sizep);
+		return (NULL);
+	}
+
+	bcopy(data, &origsize, sizeof(origsize));
+	origsize = le32toh(origsize);
+	if (origsize <= HAST_LZF_COMPRESS_MIN) {
+		pjdlog_error("Unable to decompress (invalid original size: %zu).",
+		    (size_t)origsize);
+		return (NULL);
+	}
+
+	newbuf = malloc(origsize);
+	if (newbuf == NULL) {
+		pjdlog_error("Unable to decompress (no memory: %zu).",
+		    (size_t)origsize);
+		return (NULL);
+	}
+	newsize = lzf_decompress(data + sizeof(origsize),
+	    *sizep - sizeof(origsize), newbuf, origsize);
+	/*
+	 * lzf_decompress() returning 0 is a failure; origsize is non-zero
+	 * here, so the single comparison covers both a failed and a short
+	 * decompression.
+	 */
+	if (newsize != origsize) {
+		free(newbuf);
+		pjdlog_error("Unable to decompress.");
+		return (NULL);
+	}
+
+	*sizep = newsize;
+	return (newbuf);
+}
+
+/*
+ * Translate a HAST_COMPRESSION_* constant into its textual name.
+ * Every other value is reported as "unknown".
+ */
+const char *
+compression_name(int num)
+{
+	static const char *const names[] = {
+		[HAST_COMPRESSION_NONE] = "none",
+		[HAST_COMPRESSION_HOLE] = "hole",
+		[HAST_COMPRESSION_LZF] = "lzf",
+	};
+
+	if (num < 0 || (size_t)num >= sizeof(names) / sizeof(names[0]) ||
+	    names[num] == NULL)
+		return ("unknown");
+	return (names[num]);
+}
+
+/*
+ * Send-side compression stage.
+ *
+ * Depending on the resource's configured algorithm, try to replace the
+ * buffer described by '*datap'/'*sizep' with a compressed copy.  "hole"
+ * compression is attempted first for both the "hole" and "lzf" settings;
+ * LZF itself is only tried when the data is not all zeros.  The name of
+ * the algorithm that was actually applied is recorded in 'nv' under
+ * "compression".
+ *
+ * When the data is replaced, the previous buffer is freed if '*freedatap'
+ * says it was owned, and '*freedatap' is set to true for the new one.
+ *
+ * Returns 0 on success, which includes the cases where compression is
+ * disabled or the data could not be compressed (the buffer is then left
+ * as it was), and -1 with errno set to the nv error otherwise.  An invalid
+ * configured algorithm aborts the process.
+ */
+int
+compression_send(const struct hast_resource *res, struct nv *nv, void **datap,
+    size_t *sizep, bool *freedatap)
+{
+	unsigned char *packed;
+	size_t packedsize;
+	int algo;
+
+	algo = res->hr_compression;
+	switch (algo) {
+	case HAST_COMPRESSION_NONE:
+		return (0);
+	case HAST_COMPRESSION_HOLE:
+	case HAST_COMPRESSION_LZF:
+		break;
+	default:
+		PJDLOG_ABORT("Invalid compression: %d.", res->hr_compression);
+	}
+
+	packedsize = *sizep;
+
+	/* Try 'hole' compression first. */
+	packed = hast_hole_compress(*datap, &packedsize);
+	if (packed != NULL)
+		algo = HAST_COMPRESSION_HOLE;
+	else if (algo == HAST_COMPRESSION_LZF)
+		packed = hast_lzf_compress(*datap, &packedsize);
+
+	if (packed == NULL) {
+		/* Unable to compress the data. */
+		return (0);
+	}
+
+	nv_add_string(nv, compression_name(algo), "compression");
+	if (nv_error(nv) != 0) {
+		free(packed);
+		errno = nv_error(nv);
+		return (-1);
+	}
+
+	/* Hand the compressed copy to the caller in place of the input. */
+	if (*freedatap)
+		free(*datap);
+	*freedatap = true;
+	*datap = packed;
+	*sizep = packedsize;
+
+	return (0);
+}
+
+/*
+ * Receive-side compression stage.
+ *
+ * If the header 'nv' names a compression algorithm, replace the received
+ * buffer ('*datap', '*sizep') with its decompressed form.  The previous
+ * buffer is freed if '*freedatap' says it was owned, and '*freedatap' is
+ * set to true for the new one.
+ *
+ * Returns 0 when no compression was used or when decompression succeeded,
+ * and -1 when the algorithm is not known or decompression failed; in that
+ * case the caller's buffer is left untouched.
+ */
+int
+compression_recv(const struct hast_resource *res __unused, struct nv *nv,
+    void **datap, size_t *sizep, bool *freedatap)
+{
+	void *(*decompress)(const unsigned char *, size_t *);
+	unsigned char *plain;
+	const char *algo;
+	size_t plainsize;
+
+	algo = nv_get_string(nv, "compression");
+	if (algo == NULL) {
+		/* No compression. */
+		return (0);
+	}
+
+	if (strcmp(algo, "hole") == 0) {
+		decompress = hast_hole_decompress;
+	} else if (strcmp(algo, "lzf") == 0) {
+		decompress = hast_lzf_decompress;
+	} else {
+		/* Unknown compression algorithm. */
+		pjdlog_error("Unknown compression algorithm '%s'.", algo);
+		return (-1);
+	}
+
+	plainsize = *sizep;
+	plain = decompress(*datap, &plainsize);
+	if (plain == NULL)
+		return (-1);
+
+	if (*freedatap)
+		free(*datap);
+	*freedatap = true;
+	*datap = plain;
+	*sizep = plainsize;
+
+	return (0);
+}
diff --git a/sbin/hastd/hast_compression.h b/sbin/hastd/hast_compression.h
new file mode 100644
index 0000000..eabdfb2
--- /dev/null
+++ b/sbin/hastd/hast_compression.h
@@ -0,0 +1,44 @@
+/*-
+ * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HAST_COMPRESSION_H_
+#define _HAST_COMPRESSION_H_
+
+#include <stdlib.h> /* size_t */
+
+#include <hast.h>
+#include <nv.h>
+
+/*
+ * Return the name of the given HAST_COMPRESSION_* value; this is the string
+ * compression_send() stores in the "compression" header field.
+ */
+const char *compression_name(int num);
+
+/*
+ * Pipeline stage run before data is sent: may replace *datap/*sizep with
+ * a newly allocated compressed buffer, in which case *freedatap is set to
+ * true and the algorithm name is added to nv.
+ * Returns 0 on success (including "not compressed") and -1 on error.
+ */
+int compression_send(const struct hast_resource *res, struct nv *nv,
+ void **datap, size_t *sizep, bool *freedatap);
+/*
+ * Pipeline stage run after data is received: decompresses according to
+ * the "compression" field of nv, if present, replacing *datap/*sizep with
+ * a newly allocated buffer and setting *freedatap to true.
+ * Returns 0 on success and -1 on error.
+ */
+int compression_recv(const struct hast_resource *res, struct nv *nv,
+ void **datap, size_t *sizep, bool *freedatap);
+
+#endif /* !_HAST_COMPRESSION_H_ */
diff --git a/sbin/hastd/hast_proto.c b/sbin/hastd/hast_proto.c
new file mode 100644
index 0000000..dd41fb1
--- /dev/null
+++ b/sbin/hastd/hast_proto.c
@@ -0,0 +1,222 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/endian.h>
+
+#include <errno.h>
+#include <strings.h>
+
+#include <hast.h>
+#include <ebuf.h>
+#include <nv.h>
+#include <pjdlog.h>
+#include <proto.h>
+
+#ifdef HAVE_CRYPTO
+#include "hast_checksum.h"
+#endif
+#include "hast_compression.h"
+#include "hast_proto.h"
+
+/*
+ * Fixed-size header placed in front of the serialized nv headers of every
+ * message (see hast_proto_send() and hast_proto_recv_hdr()).
+ */
+struct hast_main_header {
+ /* Protocol version. */
+ uint8_t version;
+ /* Size of nv headers; converted with htole32()/le32toh() on the wire. */
+ uint32_t size;
+} __packed;
+
+/*
+ * Signatures of the per-stage send and receive handlers.
+ * Arguments are: the resource, the nv headers, a pointer to the data
+ * pointer, a pointer to the data size and a pointer to the flag telling
+ * whether the data buffer has to be freed by the caller.
+ * hast_proto_recv_data() treats a return value of -1 as failure.
+ */
+typedef int hps_send_t(const struct hast_resource *, struct nv *nv, void **,
+ size_t *, bool *);
+typedef int hps_recv_t(const struct hast_resource *, struct nv *nv, void **,
+ size_t *, bool *);
+
+/* One data transformation stage applied to payloads on send and receive. */
+struct hast_pipe_stage {
+ /* Stage name. */
+ const char *hps_name;
+ /* Handler run on outgoing data. */
+ hps_send_t *hps_send;
+ /* Handler run on incoming data. */
+ hps_recv_t *hps_recv;
+};
+
+/*
+ * Stages applied to the payload. hast_proto_send() walks the table from
+ * the first entry to the last, hast_proto_recv_data() from the last entry
+ * to the first. The checksum stage exists only when built with HAVE_CRYPTO.
+ */
+static struct hast_pipe_stage pipeline[] = {
+ { "compression", compression_send, compression_recv },
+#ifdef HAVE_CRYPTO
+ { "checksum", checksum_send, checksum_recv }
+#endif
+};
+
+/*
+ * Serialize the given nv structure and transmit it via conn, optionally
+ * followed by a payload.
+ * Headers travel in the nv structure, the payload is passed separately in
+ * data/size and may be absent altogether (data is NULL then).
+ * A present payload is first handed to the send handler of every pipeline
+ * stage and its final size is recorded in nv as "size".
+ * Returns 0 on success and -1 on failure.
+ */
+int
+hast_proto_send(const struct hast_resource *res, struct proto_conn *conn,
+    struct nv *nv, const void *data, size_t size)
+{
+	const size_t nstages = sizeof(pipeline) / sizeof(pipeline[0]);
+	struct hast_main_header mainhdr;
+	struct ebuf *hdrbuf;
+	void *payload, *wire;
+	size_t stage, wiresize;
+	bool ownpayload;
+	int error;
+
+	error = -1;
+	ownpayload = false;
+	/* Stages take a non-const pointer as they may replace the buffer. */
+	payload = (void *)(uintptr_t)data;
+
+	if (data != NULL) {
+		for (stage = 0; stage < nstages; stage++) {
+			/* Stage failures are deliberately not checked here. */
+			(void)pipeline[stage].hps_send(res, nv, &payload,
+			    &size, &ownpayload);
+		}
+		nv_add_uint32(nv, size, "size");
+		if (nv_error(nv) != 0) {
+			errno = nv_error(nv);
+			goto done;
+		}
+	}
+
+	hdrbuf = nv_hton(nv);
+	if (hdrbuf == NULL)
+		goto done;
+
+	/* Without a resource fall back to the protocol version we speak. */
+	if (res != NULL)
+		mainhdr.version = res->hr_version;
+	else
+		mainhdr.version = HAST_PROTO_VERSION;
+	mainhdr.size = htole32((uint32_t)ebuf_size(hdrbuf));
+	if (ebuf_add_head(hdrbuf, &mainhdr, sizeof(mainhdr)) == -1)
+		goto done;
+
+	/* Main header and nv headers go first, the payload follows. */
+	wire = ebuf_data(hdrbuf, &wiresize);
+	if (proto_send(conn, wire, wiresize) == -1)
+		goto done;
+	if (data != NULL && proto_send(conn, payload, size) == -1)
+		goto done;
+
+	error = 0;
+done:
+	if (ownpayload)
+		free(payload);
+	return (error);
+}
+
+/*
+ * Receive the main header and the nv headers that follow it from conn.
+ * On success the parsed headers are returned through nvp and 0 is returned.
+ * On failure -1 is returned; errno is set to ERPCMISMATCH when the sender
+ * uses a protocol version newer than HAST_PROTO_VERSION.
+ */
+int
+hast_proto_recv_hdr(const struct proto_conn *conn, struct nv **nvp)
+{
+	struct hast_main_header mainhdr;
+	struct ebuf *rawhdrs;
+	struct nv *parsed;
+	uint32_t hdrsize;
+	void *dst;
+
+	/* Nothing is allocated yet, so early failures can return directly. */
+	if (proto_recv(conn, &mainhdr, sizeof(mainhdr)) == -1)
+		return (-1);
+	if (mainhdr.version > HAST_PROTO_VERSION) {
+		errno = ERPCMISMATCH;
+		return (-1);
+	}
+	hdrsize = le32toh(mainhdr.size);
+
+	rawhdrs = ebuf_alloc(hdrsize);
+	if (rawhdrs == NULL)
+		return (-1);
+	/* Reserve room for the nv headers and read them in place. */
+	if (ebuf_add_tail(rawhdrs, NULL, hdrsize) == -1)
+		goto fail;
+	dst = ebuf_data(rawhdrs, NULL);
+	PJDLOG_ASSERT(dst != NULL);
+	if (proto_recv(conn, dst, hdrsize) == -1)
+		goto fail;
+	parsed = nv_ntoh(rawhdrs);
+	if (parsed == NULL)
+		goto fail;
+
+	*nvp = parsed;
+	return (0);
+fail:
+	ebuf_free(rawhdrs);
+	return (-1);
+}
+
+/*
+ * Receive the payload announced by the "size" field of nv into data, which
+ * can hold up to size bytes, and run it through the receive handler of
+ * every pipeline stage in reverse order.
+ * Returns 0 on success and -1 on failure; errno is set to EINVAL when the
+ * payload (before or after the pipeline) does not fit into the buffer.
+ */
+int
+hast_proto_recv_data(const struct hast_resource *res, struct proto_conn *conn,
+    struct nv *nv, void *data, size_t size)
+{
+	const size_t nstages = sizeof(pipeline) / sizeof(pipeline[0]);
+	size_t payloadsize, stage;
+	bool ownpayload;
+	void *payload;
+	int error;
+
+	PJDLOG_ASSERT(data != NULL);
+	PJDLOG_ASSERT(size > 0);
+
+	error = -1;
+	ownpayload = false;
+	payload = data;
+
+	payloadsize = nv_get_uint32(nv, "size");
+	if (payloadsize > size) {
+		errno = EINVAL;
+		goto done;
+	}
+	if (payloadsize == 0) {
+		/*
+		 * Nothing to receive; reset the nv error state (presumably
+		 * set by the lookup of a missing "size" field).
+		 */
+		(void)nv_set_error(nv, 0);
+		error = 0;
+		goto done;
+	}
+
+	if (proto_recv(conn, data, payloadsize) == -1)
+		goto done;
+	for (stage = nstages; stage > 0; stage--) {
+		if (pipeline[stage - 1].hps_recv(res, nv, &payload,
+		    &payloadsize, &ownpayload) == -1) {
+			goto done;
+		}
+	}
+	/* The stages may have grown the data beyond the caller's buffer. */
+	if (payloadsize > size) {
+		errno = EINVAL;
+		goto done;
+	}
+	/* A stage replaced the buffer, copy the result back to the caller. */
+	if (payload != data)
+		bcopy(payload, data, payloadsize);
+
+	error = 0;
+done:
+	if (ownpayload)
+		free(payload);
+	return (error);
+}
diff --git a/sbin/hastd/hast_proto.h b/sbin/hastd/hast_proto.h
new file mode 100644
index 0000000..49f3b56
--- /dev/null
+++ b/sbin/hastd/hast_proto.h
@@ -0,0 +1,46 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HAST_PROTO_H_
+#define _HAST_PROTO_H_
+
+#include <stdlib.h> /* size_t */
+
+#include <nv.h>
+#include <proto.h>
+
+/*
+ * Send the nv headers via conn, followed by size bytes of data unless data
+ * is NULL. Returns 0 on success and -1 on failure.
+ */
+int hast_proto_send(const struct hast_resource *res, struct proto_conn *conn,
+ struct nv *nv, const void *data, size_t size);
+/*
+ * Receive message headers from conn and return them through nvp.
+ * Returns 0 on success and -1 on failure.
+ */
+int hast_proto_recv_hdr(const struct proto_conn *conn, struct nv **nvp);
+/*
+ * Receive the payload described by previously received nv headers into
+ * data, a buffer of size bytes. Returns 0 on success and -1 on failure.
+ */
+int hast_proto_recv_data(const struct hast_resource *res,
+ struct proto_conn *conn, struct nv *nv, void *data, size_t size);
+
+#endif /* !_HAST_PROTO_H_ */
diff --git a/sbin/hastd/hastd.8 b/sbin/hastd/hastd.8
new file mode 100644
index 0000000..017e895
--- /dev/null
+++ b/sbin/hastd/hastd.8
@@ -0,0 +1,232 @@
+.\" Copyright (c) 2010 The FreeBSD Foundation
+.\" All rights reserved.
+.\"
+.\" This software was developed by Pawel Jakub Dawidek under sponsorship from
+.\" the FreeBSD Foundation.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd February 1, 2010
+.Dt HASTD 8
+.Os
+.Sh NAME
+.Nm hastd
+.Nd "Highly Available Storage daemon"
+.Sh SYNOPSIS
+.Nm
+.Op Fl dFh
+.Op Fl c Ar config
+.Op Fl P Ar pidfile
+.Sh DESCRIPTION
+The
+.Nm
+daemon is responsible for managing highly available GEOM providers.
+.Pp
+.Nm
+allows the transparent storage of data on two physically separated machines
+connected over a TCP/IP network.
+Only one machine (cluster node) can actively use storage provided by
+.Nm .
+This machine is called primary.
+The
+.Nm
+daemon operates on block level, which makes it transparent to file
+systems and applications.
+.Pp
+There is one main
+.Nm
+daemon which starts a new worker process as soon as the role for the given
+resource is changed to primary, or as soon as the role for the given
+resource is changed to secondary and the remote (primary) node
+successfully connects to it.
+Every worker process gets a new process title (see
+.Xr setproctitle 3 ) ,
+which describes its role and resource it controls.
+The exact format is:
+.Bd -literal -offset indent
+hastd: <resource name> (<role>)
+.Ed
+.Pp
+If (and only if)
+.Nm
+operates in primary role for the given resource, a corresponding
+.Pa /dev/hast/<name>
+disk-like device (GEOM provider) is created.
+File systems and applications can use this provider to send I/O
+requests to.
+Every write, delete and flush operation
+.Dv ( BIO_WRITE , BIO_DELETE , BIO_FLUSH )
+is sent to the local component and replicated on the remote (secondary) node
+if it is available.
+Read operations
+.Dv ( BIO_READ )
+are handled locally unless an I/O error occurs or the local version of the data
+is not up-to-date yet (synchronization is in progress).
+.Pp
+The
+.Nm
+daemon uses the GEOM Gate class to receive I/O requests from the
+in-kernel GEOM infrastructure.
+The
+.Nm geom_gate.ko
+module is loaded automatically if the kernel was not compiled with the
+following option:
+.Bd -ragged -offset indent
+.Cd "options GEOM_GATE"
+.Ed
+.Pp
+The connection between two
+.Nm
+daemons is always initiated from the one running as primary to the one
+running as secondary.
+When the primary
+.Nm
+is unable to connect or the connection fails, it will try to re-establish
+the connection every few seconds.
+Once the connection is established, the primary
+.Nm
+will synchronize every extent that was modified during connection outage
+to the secondary
+.Nm .
+.Pp
+It is possible that in the case of a connection outage between the nodes the
+.Nm
+primary role for the given resource will be configured on both nodes.
+This in turn leads to incompatible data modifications.
+Such a condition is called a split-brain and cannot be automatically
+resolved by the
+.Nm
+daemon, as doing so would most likely lead to data corruption or loss of
+important changes.
+Even though it cannot be fixed by
+.Nm
+itself, it will be detected and a further connection between independently
+modified nodes will not be possible.
+Once this situation is manually resolved by an administrator, the resource
+on one of the nodes can be initialized (erasing local data), which makes
+a connection to the remote node possible again.
+Connection of the freshly initialized component will trigger full resource
+synchronization.
+.Pp
+A
+.Nm
+daemon never picks its role automatically.
+The role has to be configured with the
+.Xr hastctl 8
+control utility by additional software like
+.Nm ucarp
+or
+.Nm heartbeat
+that can reliably manage role separation and switch secondary node to
+primary role in case of the primary's failure.
+.Pp
+The
+.Nm
+daemon can be started with the following command line arguments:
+.Bl -tag -width ".Fl P Ar pidfile"
+.It Fl c Ar config
+Specify an alternative location for the configuration file.
+The default location is
+.Pa /etc/hast.conf .
+.It Fl d
+Print or log debugging information.
+This option can be specified multiple times to raise the verbosity
+level.
+.It Fl F
+Start the
+.Nm
+daemon in the foreground.
+By default
+.Nm
+starts in the background.
+.It Fl h
+Print the
+.Nm
+usage message.
+.It Fl P Ar pidfile
+Specify an alternative location for the file where the main process PID
+will be stored.
+The default location is
+.Pa /var/run/hastd.pid .
+.El
+.Sh FILES
+.Bl -tag -width ".Pa /var/run/hastctl" -compact
+.It Pa /etc/hast.conf
+The configuration file for
+.Nm
+and
+.Xr hastctl 8 .
+.It Pa /var/run/hastctl
+Control socket used by the
+.Xr hastctl 8
+control utility to communicate with
+.Nm .
+.It Pa /var/run/hastd.pid
+The default location of the
+.Nm
+PID file.
+.El
+.Sh EXIT STATUS
+Exit status is 0 on success, or one of the values described in
+.Xr sysexits 3
+on failure.
+.Sh EXAMPLES
+Launch
+.Nm
+on both nodes.
+Set role for resource
+.Nm shared
+to primary on
+.Nm nodeA
+and to secondary on
+.Nm nodeB .
+Create file system on
+.Pa /dev/hast/shared
+provider and mount it.
+.Bd -literal -offset indent
+nodeB# hastd
+nodeB# hastctl role secondary shared
+
+nodeA# hastd
+nodeA# hastctl role primary shared
+nodeA# newfs -U /dev/hast/shared
+nodeA# mount -o noatime /dev/hast/shared /shared
+.Ed
+.Sh SEE ALSO
+.Xr sysexits 3 ,
+.Xr geom 4 ,
+.Xr hast.conf 5 ,
+.Xr ggatec 8 ,
+.Xr ggated 8 ,
+.Xr ggatel 8 ,
+.Xr hastctl 8 ,
+.Xr mount 8 ,
+.Xr newfs 8 ,
+.Xr g_bio 9
+.Sh AUTHORS
+The
+.Nm
+was developed by
+.An Pawel Jakub Dawidek Aq pjd@FreeBSD.org
+under sponsorship of the FreeBSD Foundation.
diff --git a/sbin/hastd/hastd.c b/sbin/hastd/hastd.c
new file mode 100644
index 0000000..06b38e9
--- /dev/null
+++ b/sbin/hastd/hastd.c
@@ -0,0 +1,1337 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * Copyright (c) 2010-2011 Pawel Jakub Dawidek <pawel@dawidek.net>
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/linker.h>
+#include <sys/module.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+
+#include <err.h>
+#include <errno.h>
+#include <libutil.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sysexits.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <activemap.h>
+#include <pjdlog.h>
+
+#include "control.h"
+#include "event.h"
+#include "hast.h"
+#include "hast_proto.h"
+#include "hastd.h"
+#include "hooks.h"
+#include "subr.h"
+
+/* Path to the configuration file; re-read from this path on reload. */
+const char *cfgpath = HAST_CONFIG;
+/* Currently active hastd configuration. */
+static struct hastd_config *cfg;
+/* Was SIGINT or SIGTERM signal received? */
+bool sigexit_received = false;
+/*
+ * Path to the pidfile. While it is NULL, the pidfile path from the
+ * configuration file is honoured on reload.
+ */
+static const char *pidfile;
+/* Handle of the currently open pidfile. */
+struct pidfh *pfh;
+/* Do we run in the foreground? */
+static bool foreground;
+
+/* How often to check for hooks that have been running for too long. */
+#define REPORT_INTERVAL 5
+
+/*
+ * Print the command line synopsis and exit with the EX_USAGE status.
+ */
+static void
+usage(void)
+{
+
+ errx(EX_USAGE, "[-dFh] [-c config] [-P pidfile]");
+}
+
+/*
+ * Make sure the g_gate kernel module is available, loading geom_gate if
+ * necessary. Exits with EX_OSERR when the module cannot be loaded, unless
+ * the failure was EEXIST.
+ */
+static void
+g_gate_load(void)
+{
+
+	/* Already present in the kernel, nothing to do. */
+	if (modfind("g_gate") != -1)
+		return;
+	/* Not present in kernel, try loading it. */
+	if (kldload("geom_gate") != -1 && modfind("g_gate") != -1)
+		return;
+	/* EEXIST is not treated as a failure. */
+	if (errno == EEXIST)
+		return;
+	pjdlog_exit(EX_OSERR, "Unable to load geom_gate module");
+}
+
+/*
+ * Close all connections and free all state that does not belong to the
+ * given resource: every other resource, the control connections, all
+ * listen sockets, the pidfile handle, hooks and logging.
+ * The resource passed as res is left on the list untouched.
+ * NOTE(review): presumably called in a worker process right after fork;
+ * confirm against the callers.
+ */
+void
+descriptors_cleanup(struct hast_resource *res)
+{
+	struct hast_resource *cur, *nextres;
+	struct hastd_listen *lst, *nextlst;
+
+	TAILQ_FOREACH_SAFE(cur, &cfg->hc_resources, hr_next, nextres) {
+		if (cur != res) {
+			/* Foreign resource: drop its connections and state. */
+			if (cur->hr_remotein != NULL)
+				proto_close(cur->hr_remotein);
+			if (cur->hr_remoteout != NULL)
+				proto_close(cur->hr_remoteout);
+			if (cur->hr_ctrl != NULL)
+				proto_close(cur->hr_ctrl);
+			if (cur->hr_event != NULL)
+				proto_close(cur->hr_event);
+			if (cur->hr_conn != NULL)
+				proto_close(cur->hr_conn);
+			TAILQ_REMOVE(&cfg->hc_resources, cur, hr_next);
+			free(cur);
+		} else {
+			/*
+			 * Our own resource: only a secondary may already
+			 * have remote connections.
+			 */
+			PJDLOG_VERIFY(res->hr_role == HAST_ROLE_SECONDARY ||
+			    (res->hr_remotein == NULL &&
+			    res->hr_remoteout == NULL));
+		}
+	}
+
+	if (cfg->hc_controlin != NULL)
+		proto_close(cfg->hc_controlin);
+	proto_close(cfg->hc_controlconn);
+
+	/* Empty the list of listen sockets. */
+	TAILQ_FOREACH_SAFE(lst, &cfg->hc_listen, hl_next, nextlst) {
+		TAILQ_REMOVE(&cfg->hc_listen, lst, hl_next);
+		if (lst->hl_conn != NULL)
+			proto_close(lst->hl_conn);
+		free(lst);
+	}
+
+	(void)pidfile_close(pfh);
+	hook_fini();
+	pjdlog_fini();
+}
+
+/*
+ * Return a human-readable description of the file type encoded in mode,
+ * or "unknown" if the type is not recognized.
+ */
+static const char *
+dtype2str(mode_t mode)
+{
+
+	if (S_ISBLK(mode))
+		return ("block device");
+	if (S_ISCHR(mode))
+		return ("character device");
+	if (S_ISDIR(mode))
+		return ("directory");
+	if (S_ISFIFO(mode))
+		return ("pipe or FIFO");
+	if (S_ISLNK(mode))
+		return ("symbolic link");
+	if (S_ISREG(mode))
+		return ("regular file");
+	if (S_ISSOCK(mode))
+		return ("socket");
+	if (S_ISWHT(mode))
+		return ("whiteout");
+	return ("unknown");
+}
+
+/*
+ * Verify that the set of open file descriptors is exactly the one expected
+ * for the given resource and abort with a descriptive message otherwise.
+ *
+ * Expected to be open: stdin, stdout, stderr, the event and ctrl sockets,
+ * the conn socket when the role is primary, and the remote in/out sockets
+ * when the role is secondary. For a secondary, a non-NULL conn must be
+ * closed. Every other descriptor must be closed.
+ *
+ * pjdlogmode is the mode passed to pjdlog_init() when something has to be
+ * logged.
+ */
+void
+descriptors_assert(const struct hast_resource *res, int pjdlogmode)
+{
+ char msg[256];
+ struct stat sb;
+ long maxfd;
+ bool isopen;
+ mode_t mode;
+ int fd;
+
+ /*
+ * At this point descriptor to syslog socket is closed, so if we want
+ * to log assertion message, we have to first store it in 'msg' local
+ * buffer and then open syslog socket and log it.
+ */
+ msg[0] = '\0';
+
+ maxfd = sysconf(_SC_OPEN_MAX);
+ if (maxfd == -1) {
+ /* Log a warning and fall back to a fixed upper bound. */
+ pjdlog_init(pjdlogmode);
+ pjdlog_prefix_set("[%s] (%s) ", res->hr_name,
+ role2str(res->hr_role));
+ pjdlog_errno(LOG_WARNING, "sysconf(_SC_OPEN_MAX) failed");
+ pjdlog_fini();
+ maxfd = 16384;
+ }
+ for (fd = 0; fd <= maxfd; fd++) {
+ /* Find out whether the descriptor is open and what it refers to. */
+ if (fstat(fd, &sb) == 0) {
+ isopen = true;
+ mode = sb.st_mode;
+ } else if (errno == EBADF) {
+ isopen = false;
+ mode = 0;
+ } else {
+ (void)snprintf(msg, sizeof(msg),
+ "Unable to fstat descriptor %d: %s", fd,
+ strerror(errno));
+ break;
+ }
+ /*
+ * Compare with what is expected for this descriptor; the first
+ * violation found is recorded in 'msg' and ends the scan.
+ */
+ if (fd == STDIN_FILENO || fd == STDOUT_FILENO ||
+ fd == STDERR_FILENO) {
+ if (!isopen) {
+ (void)snprintf(msg, sizeof(msg),
+ "Descriptor %d (%s) is closed, but should be open.",
+ fd, (fd == STDIN_FILENO ? "stdin" :
+ (fd == STDOUT_FILENO ? "stdout" : "stderr")));
+ break;
+ }
+ } else if (fd == proto_descriptor(res->hr_event)) {
+ if (!isopen) {
+ (void)snprintf(msg, sizeof(msg),
+ "Descriptor %d (event) is closed, but should be open.",
+ fd);
+ break;
+ }
+ if (!S_ISSOCK(mode)) {
+ (void)snprintf(msg, sizeof(msg),
+ "Descriptor %d (event) is %s, but should be %s.",
+ fd, dtype2str(mode), dtype2str(S_IFSOCK));
+ break;
+ }
+ } else if (fd == proto_descriptor(res->hr_ctrl)) {
+ if (!isopen) {
+ (void)snprintf(msg, sizeof(msg),
+ "Descriptor %d (ctrl) is closed, but should be open.",
+ fd);
+ break;
+ }
+ if (!S_ISSOCK(mode)) {
+ (void)snprintf(msg, sizeof(msg),
+ "Descriptor %d (ctrl) is %s, but should be %s.",
+ fd, dtype2str(mode), dtype2str(S_IFSOCK));
+ break;
+ }
+ } else if (res->hr_role == HAST_ROLE_PRIMARY &&
+ fd == proto_descriptor(res->hr_conn)) {
+ if (!isopen) {
+ (void)snprintf(msg, sizeof(msg),
+ "Descriptor %d (conn) is closed, but should be open.",
+ fd);
+ break;
+ }
+ if (!S_ISSOCK(mode)) {
+ (void)snprintf(msg, sizeof(msg),
+ "Descriptor %d (conn) is %s, but should be %s.",
+ fd, dtype2str(mode), dtype2str(S_IFSOCK));
+ break;
+ }
+ } else if (res->hr_role == HAST_ROLE_SECONDARY &&
+ res->hr_conn != NULL &&
+ fd == proto_descriptor(res->hr_conn)) {
+ /* Unlike for primary, conn has to be closed for secondary. */
+ if (isopen) {
+ (void)snprintf(msg, sizeof(msg),
+ "Descriptor %d (conn) is open, but should be closed.",
+ fd);
+ break;
+ }
+ } else if (res->hr_role == HAST_ROLE_SECONDARY &&
+ fd == proto_descriptor(res->hr_remotein)) {
+ if (!isopen) {
+ (void)snprintf(msg, sizeof(msg),
+ "Descriptor %d (remote in) is closed, but should be open.",
+ fd);
+ break;
+ }
+ if (!S_ISSOCK(mode)) {
+ (void)snprintf(msg, sizeof(msg),
+ "Descriptor %d (remote in) is %s, but should be %s.",
+ fd, dtype2str(mode), dtype2str(S_IFSOCK));
+ break;
+ }
+ } else if (res->hr_role == HAST_ROLE_SECONDARY &&
+ fd == proto_descriptor(res->hr_remoteout)) {
+ if (!isopen) {
+ (void)snprintf(msg, sizeof(msg),
+ "Descriptor %d (remote out) is closed, but should be open.",
+ fd);
+ break;
+ }
+ if (!S_ISSOCK(mode)) {
+ (void)snprintf(msg, sizeof(msg),
+ "Descriptor %d (remote out) is %s, but should be %s.",
+ fd, dtype2str(mode), dtype2str(S_IFSOCK));
+ break;
+ }
+ } else {
+ /* Any descriptor not recognized above must not be open. */
+ if (isopen) {
+ (void)snprintf(msg, sizeof(msg),
+ "Descriptor %d is open (%s), but should be closed.",
+ fd, dtype2str(mode));
+ break;
+ }
+ }
+ }
+ /* A violation was found: initialize logging and abort with it. */
+ if (msg[0] != '\0') {
+ pjdlog_init(pjdlogmode);
+ pjdlog_prefix_set("[%s] (%s) ", res->hr_name,
+ role2str(res->hr_role));
+ PJDLOG_ABORT("%s", msg);
+ }
+}
+
+/*
+ * Log how the worker process with the given pid terminated, based on the
+ * wait status: graceful exit (debug level), death by signal, or any other
+ * ungraceful exit (both error level).
+ */
+static void
+child_exit_log(unsigned int pid, int status)
+{
+	int exitcode;
+
+	/* -1 stands for "did not exit normally". */
+	exitcode = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
+
+	if (exitcode == 0) {
+		pjdlog_debug(1, "Worker process exited gracefully (pid=%u).",
+		    pid);
+		return;
+	}
+	if (WIFSIGNALED(status)) {
+		pjdlog_error("Worker process killed (pid=%u, signal=%d).",
+		    pid, WTERMSIG(status));
+		return;
+	}
+	pjdlog_error("Worker process exited ungracefully (pid=%u, exitcode=%d).",
+	    pid, exitcode);
+}
+
+/*
+ * Reap all terminated child processes without blocking.
+ * A child that is the worker of some resource is logged and cleaned up;
+ * a primary worker is restarted when it was killed by a signal or exited
+ * with EX_TEMPFAIL, otherwise the resource role goes back to init.
+ * Any other child is handed over to hook_check_one().
+ */
+static void
+child_exit(void)
+{
+	struct hast_resource *owner;
+	bool restart;
+	int wstatus;
+	pid_t child;
+
+	for (;;) {
+		child = wait3(&wstatus, WNOHANG, NULL);
+		if (child <= 0)
+			break;
+
+		/* Find resource related to the process that just exited. */
+		TAILQ_FOREACH(owner, &cfg->hc_resources, hr_next) {
+			if (owner->hr_workerpid == child)
+				break;
+		}
+		if (owner == NULL) {
+			/*
+			 * This can happen when new connection arrives and we
+			 * cancel child responsible for the old one or if this
+			 * was hook which we executed.
+			 */
+			hook_check_one(child, wstatus);
+			continue;
+		}
+
+		pjdlog_prefix_set("[%s] (%s) ", owner->hr_name,
+		    role2str(owner->hr_role));
+		child_exit_log(child, wstatus);
+		child_cleanup(owner);
+		if (owner->hr_role == HAST_ROLE_PRIMARY) {
+			/*
+			 * Restart child process if it was killed by signal
+			 * or exited because of temporary problem.
+			 */
+			restart = WIFSIGNALED(wstatus) ||
+			    (WIFEXITED(wstatus) &&
+			    WEXITSTATUS(wstatus) == EX_TEMPFAIL);
+			if (restart) {
+				sleep(1);
+				pjdlog_info("Restarting worker process.");
+				hastd_primary(owner);
+			} else {
+				owner->hr_role = HAST_ROLE_INIT;
+				pjdlog_info("Changing resource role back to %s.",
+				    role2str(owner->hr_role));
+			}
+		}
+		pjdlog_prefix_set("%s", "");
+	}
+}
+
+/*
+ * Tell whether the differences between the current (res0) and the new
+ * (res1) configuration of the same resource require the resource to be
+ * restarted.
+ * A changed provider name or local path always does. The remaining
+ * settings matter here only while the current role is init or secondary.
+ */
+static bool
+resource_needs_restart(const struct hast_resource *res0,
+    const struct hast_resource *res1)
+{
+
+	PJDLOG_ASSERT(strcmp(res0->hr_name, res1->hr_name) == 0);
+
+	if (strcmp(res0->hr_provname, res1->hr_provname) != 0 ||
+	    strcmp(res0->hr_localpath, res1->hr_localpath) != 0) {
+		return (true);
+	}
+
+	if (res0->hr_role != HAST_ROLE_INIT &&
+	    res0->hr_role != HAST_ROLE_SECONDARY) {
+		return (false);
+	}
+
+	/*
+	 * When metaflush has changed we don't really need restart,
+	 * but it is just easier this way.
+	 */
+	return (strcmp(res0->hr_remoteaddr, res1->hr_remoteaddr) != 0 ||
+	    strcmp(res0->hr_sourceaddr, res1->hr_sourceaddr) != 0 ||
+	    res0->hr_replication != res1->hr_replication ||
+	    res0->hr_checksum != res1->hr_checksum ||
+	    res0->hr_compression != res1->hr_compression ||
+	    res0->hr_timeout != res1->hr_timeout ||
+	    strcmp(res0->hr_exec, res1->hr_exec) != 0 ||
+	    res0->hr_metaflush != res1->hr_metaflush);
+}
+
+/*
+ * Tell whether the differences between the current (res0) and the new
+ * (res1) configuration of the same resource require a reload.
+ * Only resources whose current role is primary are ever reloaded; name,
+ * provider name and local path are expected to be equal already.
+ */
+static bool
+resource_needs_reload(const struct hast_resource *res0,
+    const struct hast_resource *res1)
+{
+
+	PJDLOG_ASSERT(strcmp(res0->hr_name, res1->hr_name) == 0);
+	PJDLOG_ASSERT(strcmp(res0->hr_provname, res1->hr_provname) == 0);
+	PJDLOG_ASSERT(strcmp(res0->hr_localpath, res1->hr_localpath) == 0);
+
+	if (res0->hr_role != HAST_ROLE_PRIMARY)
+		return (false);
+
+	/* Any difference in the settings below calls for a reload. */
+	return (strcmp(res0->hr_remoteaddr, res1->hr_remoteaddr) != 0 ||
+	    strcmp(res0->hr_sourceaddr, res1->hr_sourceaddr) != 0 ||
+	    res0->hr_replication != res1->hr_replication ||
+	    res0->hr_checksum != res1->hr_checksum ||
+	    res0->hr_compression != res1->hr_compression ||
+	    res0->hr_timeout != res1->hr_timeout ||
+	    strcmp(res0->hr_exec, res1->hr_exec) != 0 ||
+	    res0->hr_metaflush != res1->hr_metaflush);
+}
+
+/*
+ * Send a CONTROL_RELOAD message carrying the settings of the given primary
+ * resource over its control connection and wait for the reply.
+ * Failures are only logged; nothing is reported back to the caller.
+ */
+static void
+resource_reload(const struct hast_resource *res)
+{
+	struct nv *request, *reply;
+	bool sendfailed;
+	int error;
+
+	PJDLOG_ASSERT(res->hr_role == HAST_ROLE_PRIMARY);
+
+	/* Build the request. */
+	request = nv_alloc();
+	nv_add_uint8(request, CONTROL_RELOAD, "cmd");
+	nv_add_string(request, res->hr_remoteaddr, "remoteaddr");
+	nv_add_string(request, res->hr_sourceaddr, "sourceaddr");
+	nv_add_int32(request, (int32_t)res->hr_replication, "replication");
+	nv_add_int32(request, (int32_t)res->hr_checksum, "checksum");
+	nv_add_int32(request, (int32_t)res->hr_compression, "compression");
+	nv_add_int32(request, (int32_t)res->hr_timeout, "timeout");
+	nv_add_string(request, res->hr_exec, "exec");
+	nv_add_int32(request, (int32_t)res->hr_metaflush, "metaflush");
+	if (nv_error(request) != 0) {
+		nv_free(request);
+		pjdlog_error("Unable to allocate header for reload message.");
+		return;
+	}
+
+	/* Send it; log before freeing so that errno is still intact. */
+	sendfailed =
+	    (hast_proto_send(res, res->hr_ctrl, request, NULL, 0) == -1);
+	if (sendfailed)
+		pjdlog_errno(LOG_ERR, "Unable to send reload message");
+	nv_free(request);
+	if (sendfailed)
+		return;
+
+	/* Receive response. */
+	if (hast_proto_recv_hdr(res->hr_ctrl, &reply) == -1) {
+		pjdlog_errno(LOG_ERR, "Unable to receive reload reply");
+		return;
+	}
+	error = nv_get_int16(reply, "error");
+	nv_free(reply);
+	if (error != 0)
+		pjdlog_common(LOG_ERR, 0, error, "Reload failed");
+}
+
+/*
+ * Re-read the configuration file (cfgpath) and apply it to the running daemon.
+ *
+ * The work is split in two phases. In the first phase everything that can
+ * fail is prepared without touching the current configuration (cfg): the
+ * file is parsed, a new control socket is created if the control address
+ * differs, every listen address either inherits the socket of the matching
+ * current entry or gets a new one, and a new pidfile may be created.
+ * A fatal error in this phase jumps to the "failed" label, which releases
+ * only what this call created and leaves cfg as it was.
+ *
+ * In the second phase ("No failures from now on") the prepared objects are
+ * moved into cfg and resources are removed, added, restarted or reloaded
+ * depending on how their configuration changed.
+ */
+static void
+hastd_reload(void)
+{
+ struct hastd_config *newcfg;
+ struct hast_resource *nres, *cres, *tres;
+ struct hastd_listen *nlst, *clst;
+ struct pidfh *newpfh;
+ unsigned int nlisten;
+ uint8_t role;
+ pid_t otherpid;
+
+ pjdlog_info("Reloading configuration...");
+
+ newpfh = NULL;
+
+ newcfg = yy_config_parse(cfgpath, false);
+ if (newcfg == NULL)
+ goto failed;
+
+ /*
+ * Check if control address has changed.
+ * If it has, open the new socket now; the old one stays in use until
+ * the switch in the second phase.
+ */
+ if (strcmp(cfg->hc_controladdr, newcfg->hc_controladdr) != 0) {
+ if (proto_server(newcfg->hc_controladdr,
+ &newcfg->hc_controlconn) == -1) {
+ pjdlog_errno(LOG_ERR,
+ "Unable to listen on control address %s",
+ newcfg->hc_controladdr);
+ goto failed;
+ }
+ }
+ /*
+ * Check if any listen address has changed.
+ * nlisten counts the new entries that end up with a usable socket.
+ */
+ nlisten = 0;
+ TAILQ_FOREACH(nlst, &newcfg->hc_listen, hl_next) {
+ TAILQ_FOREACH(clst, &cfg->hc_listen, hl_next) {
+ if (strcmp(nlst->hl_addr, clst->hl_addr) == 0)
+ break;
+ }
+ if (clst != NULL && clst->hl_conn != NULL) {
+ /* Same address is already open: share the socket. */
+ pjdlog_info("Keep listening on address %s.",
+ nlst->hl_addr);
+ nlst->hl_conn = clst->hl_conn;
+ nlisten++;
+ } else if (proto_server(nlst->hl_addr, &nlst->hl_conn) == 0) {
+ pjdlog_info("Listening on new address %s.",
+ nlst->hl_addr);
+ nlisten++;
+ } else {
+ /* A single address failing is only a warning. */
+ pjdlog_errno(LOG_WARNING,
+ "Unable to listen on address %s", nlst->hl_addr);
+ }
+ }
+ if (nlisten == 0) {
+ pjdlog_error("No addresses to listen on.");
+ goto failed;
+ }
+ /*
+ * Check if pidfile's path has changed.
+ * This is done only when running in the background and when the
+ * pidfile variable is NULL. Problems with the new pidfile are logged
+ * as warnings and do not abort the reload.
+ */
+ if (!foreground && pidfile == NULL &&
+ strcmp(cfg->hc_pidfile, newcfg->hc_pidfile) != 0) {
+ newpfh = pidfile_open(newcfg->hc_pidfile, 0600, &otherpid);
+ if (newpfh == NULL) {
+ if (errno == EEXIST) {
+ pjdlog_errno(LOG_WARNING,
+ "Another hastd is already running, pidfile: %s, pid: %jd.",
+ newcfg->hc_pidfile, (intmax_t)otherpid);
+ } else {
+ pjdlog_errno(LOG_WARNING,
+ "Unable to open or create pidfile %s",
+ newcfg->hc_pidfile);
+ }
+ } else if (pidfile_write(newpfh) == -1) {
+ /* Write PID to a file. */
+ pjdlog_errno(LOG_WARNING,
+ "Unable to write PID to file %s",
+ newcfg->hc_pidfile);
+ } else {
+ pjdlog_debug(1, "PID stored in %s.",
+ newcfg->hc_pidfile);
+ }
+ }
+
+ /* No failures from now on. */
+
+ /*
+ * Switch to new control socket.
+ * hc_controlconn is cleared in newcfg so that the connection now owned
+ * by cfg is no longer referenced from newcfg when it is freed below.
+ */
+ if (newcfg->hc_controlconn != NULL) {
+ pjdlog_info("Control socket changed from %s to %s.",
+ cfg->hc_controladdr, newcfg->hc_controladdr);
+ proto_close(cfg->hc_controlconn);
+ cfg->hc_controlconn = newcfg->hc_controlconn;
+ newcfg->hc_controlconn = NULL;
+ strlcpy(cfg->hc_controladdr, newcfg->hc_controladdr,
+ sizeof(cfg->hc_controladdr));
+ }
+ /*
+ * Switch to new pidfile.
+ */
+ if (newpfh != NULL) {
+ pjdlog_info("Pidfile changed from %s to %s.", cfg->hc_pidfile,
+ newcfg->hc_pidfile);
+ (void)pidfile_remove(pfh);
+ pfh = newpfh;
+ (void)strlcpy(cfg->hc_pidfile, newcfg->hc_pidfile,
+ sizeof(cfg->hc_pidfile));
+ }
+ /*
+ * Switch to new listen addresses. Close all that were removed.
+ * Sockets of addresses that are still configured were handed over to
+ * the matching new entry above, so only the old list entry is freed.
+ */
+ while ((clst = TAILQ_FIRST(&cfg->hc_listen)) != NULL) {
+ TAILQ_FOREACH(nlst, &newcfg->hc_listen, hl_next) {
+ if (strcmp(nlst->hl_addr, clst->hl_addr) == 0)
+ break;
+ }
+ if (nlst == NULL && clst->hl_conn != NULL) {
+ proto_close(clst->hl_conn);
+ pjdlog_info("No longer listening on address %s.",
+ clst->hl_addr);
+ }
+ TAILQ_REMOVE(&cfg->hc_listen, clst, hl_next);
+ free(clst);
+ }
+ /* The current list is empty now; move all new entries onto it. */
+ TAILQ_CONCAT(&cfg->hc_listen, &newcfg->hc_listen, hl_next);
+
+ /*
+ * Stop and remove resources that were removed from the configuration.
+ */
+ TAILQ_FOREACH_SAFE(cres, &cfg->hc_resources, hr_next, tres) {
+ TAILQ_FOREACH(nres, &newcfg->hc_resources, hr_next) {
+ if (strcmp(cres->hr_name, nres->hr_name) == 0)
+ break;
+ }
+ if (nres == NULL) {
+ control_set_role(cres, HAST_ROLE_INIT);
+ TAILQ_REMOVE(&cfg->hc_resources, cres, hr_next);
+ pjdlog_info("Resource %s removed.", cres->hr_name);
+ free(cres);
+ }
+ }
+ /*
+ * Move new resources to the current configuration.
+ * After this loop newcfg holds only resources that also exist in cfg.
+ */
+ TAILQ_FOREACH_SAFE(nres, &newcfg->hc_resources, hr_next, tres) {
+ TAILQ_FOREACH(cres, &cfg->hc_resources, hr_next) {
+ if (strcmp(cres->hr_name, nres->hr_name) == 0)
+ break;
+ }
+ if (cres == NULL) {
+ TAILQ_REMOVE(&newcfg->hc_resources, nres, hr_next);
+ TAILQ_INSERT_TAIL(&cfg->hc_resources, nres, hr_next);
+ pjdlog_info("Resource %s added.", nres->hr_name);
+ }
+ }
+ /*
+ * Deal with modified resources.
+ * Depending on what has changed exactly we might want to perform
+ * different actions.
+ *
+ * We do full resource restart in the following situations:
+ * Resource role is INIT or SECONDARY.
+ * Resource role is PRIMARY and path to local component or provider
+ * name has changed.
+ * In case of PRIMARY, the worker process will be killed and restarted,
+ * which also means removing /dev/hast/<name> provider and
+ * recreating it.
+ *
+ * We do just reload (send SIGHUP to worker process) if we act as
+ * PRIMARY, but only if remote address, source address, replication
+ * mode, timeout, execution path or metaflush has changed.
+ * For those, there is no need to restart worker process.
+ * If PRIMARY receives SIGHUP, it will reconnect if remote address or
+ * source address has changed or it will set new timeout if only timeout
+ * has changed or it will update metaflush if only metaflush has
+ * changed.
+ */
+ TAILQ_FOREACH_SAFE(nres, &newcfg->hc_resources, hr_next, tres) {
+ TAILQ_FOREACH(cres, &cfg->hc_resources, hr_next) {
+ if (strcmp(cres->hr_name, nres->hr_name) == 0)
+ break;
+ }
+ PJDLOG_ASSERT(cres != NULL);
+ if (resource_needs_restart(cres, nres)) {
+ pjdlog_info("Resource %s configuration was modified, restarting it.",
+ cres->hr_name);
+ /*
+ * Replace the old resource with the new one and bring
+ * the new one up in the role the old one had.
+ */
+ role = cres->hr_role;
+ control_set_role(cres, HAST_ROLE_INIT);
+ TAILQ_REMOVE(&cfg->hc_resources, cres, hr_next);
+ free(cres);
+ TAILQ_REMOVE(&newcfg->hc_resources, nres, hr_next);
+ TAILQ_INSERT_TAIL(&cfg->hc_resources, nres, hr_next);
+ control_set_role(nres, role);
+ } else if (resource_needs_reload(cres, nres)) {
+ pjdlog_info("Resource %s configuration was modified, reloading it.",
+ cres->hr_name);
+ /* Copy the reloadable settings into the live resource. */
+ strlcpy(cres->hr_remoteaddr, nres->hr_remoteaddr,
+ sizeof(cres->hr_remoteaddr));
+ strlcpy(cres->hr_sourceaddr, nres->hr_sourceaddr,
+ sizeof(cres->hr_sourceaddr));
+ cres->hr_replication = nres->hr_replication;
+ cres->hr_checksum = nres->hr_checksum;
+ cres->hr_compression = nres->hr_compression;
+ cres->hr_timeout = nres->hr_timeout;
+ strlcpy(cres->hr_exec, nres->hr_exec,
+ sizeof(cres->hr_exec));
+ cres->hr_metaflush = nres->hr_metaflush;
+ /* Notify the worker only if one is running. */
+ if (cres->hr_workerpid != 0)
+ resource_reload(cres);
+ }
+ }
+
+ yy_config_free(newcfg);
+ pjdlog_info("Configuration reloaded successfully.");
+ return;
+failed:
+ if (newcfg != NULL) {
+ if (newcfg->hc_controlconn != NULL)
+ proto_close(newcfg->hc_controlconn);
+ while ((nlst = TAILQ_FIRST(&newcfg->hc_listen)) != NULL) {
+ if (nlst->hl_conn != NULL) {
+ TAILQ_FOREACH(clst, &cfg->hc_listen, hl_next) {
+ if (strcmp(nlst->hl_addr,
+ clst->hl_addr) == 0) {
+ break;
+ }
+ }
+ /*
+ * Close only sockets opened by this reload;
+ * inherited ones are still used by cfg.
+ */
+ if (clst == NULL || clst->hl_conn == NULL)
+ proto_close(nlst->hl_conn);
+ }
+ TAILQ_REMOVE(&newcfg->hc_listen, nlst, hl_next);
+ free(nlst);
+ }
+ yy_config_free(newcfg);
+ }
+ /*
+ * NOTE(review): newpfh is assigned only after the last "goto failed"
+ * above, so this check looks like a safeguard for future changes.
+ */
+ if (newpfh != NULL)
+ (void)pidfile_remove(newpfh);
+ pjdlog_warning("Configuration not reloaded.");
+}
+
+/*
+ * Ask every running worker process to terminate by sending it SIGTERM.
+ * Resources without a worker (hr_workerpid == 0) are skipped. A failure to
+ * deliver the signal is only logged; the function does not wait for the
+ * workers to exit.
+ */
+static void
+terminate_workers(void)
+{
+	struct hast_resource *res;
+
+	pjdlog_info("Termination signal received, exiting.");
+	TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
+		if (res->hr_workerpid == 0)
+			continue;
+		/*
+		 * The PID is cast explicitly so that the argument type matches
+		 * the %u conversion, as done elsewhere in this file.
+		 */
+		pjdlog_info("Terminating worker process (resource=%s, role=%s, pid=%u).",
+		    res->hr_name, role2str(res->hr_role),
+		    (unsigned int)res->hr_workerpid);
+		if (kill(res->hr_workerpid, SIGTERM) == 0)
+			continue;
+		pjdlog_errno(LOG_WARNING,
+		    "Unable to send signal to worker process (resource=%s, role=%s, pid=%u).",
+		    res->hr_name, role2str(res->hr_role),
+		    (unsigned int)res->hr_workerpid);
+	}
+}
+
+/*
+ * Accept a connection on the listen socket lst and perform the initial
+ * handshake with the remote node.
+ *
+ * The remote address must match hr_remoteaddr of at least one resource before
+ * any data is read. The received header must name a configured resource that
+ * the remote address may access and for which we act as secondary.
+ *
+ * Two kinds of connections are handled:
+ * - No "token" in the header: initial connection. Any existing worker or
+ * half-open connection of the resource is cancelled, a fresh random token
+ * is generated and sent back, and the connection is stored in hr_remotein.
+ * - "token" present: it must match hr_token; the connection is stored in
+ * hr_remoteout and hastd_secondary() is called for the resource.
+ *
+ * Errors detected before nverr is allocated only close the connection
+ * ("close" label); later errors are also reported to the remote node in an
+ * "errmsg" field ("fail" label).
+ */
+static void
+listen_accept(struct hastd_listen *lst)
+{
+ struct hast_resource *res;
+ struct proto_conn *conn;
+ struct nv *nvin, *nvout, *nverr;
+ const char *resname;
+ const unsigned char *token;
+ char laddr[256], raddr[256];
+ uint8_t version;
+ size_t size;
+ pid_t pid;
+ int status;
+
+ proto_local_address(lst->hl_conn, laddr, sizeof(laddr));
+ pjdlog_debug(1, "Accepting connection to %s.", laddr);
+
+ if (proto_accept(lst->hl_conn, &conn) == -1) {
+ pjdlog_errno(LOG_ERR, "Unable to accept connection %s", laddr);
+ return;
+ }
+
+ proto_local_address(conn, laddr, sizeof(laddr));
+ proto_remote_address(conn, raddr, sizeof(raddr));
+ pjdlog_info("Connection from %s to %s.", raddr, laddr);
+
+ /* Error in setting timeout is not critical, but why should it fail? */
+ if (proto_timeout(conn, HAST_TIMEOUT) == -1)
+ pjdlog_errno(LOG_WARNING, "Unable to set connection timeout");
+
+ /* All three start as NULL so the cleanup code can tell what to free. */
+ nvin = nvout = nverr = NULL;
+
+ /*
+ * Before receiving any data see if the remote host has access to any
+ * resource.
+ */
+ TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
+ if (proto_address_match(conn, res->hr_remoteaddr))
+ break;
+ }
+ if (res == NULL) {
+ pjdlog_error("Client %s isn't known.", raddr);
+ goto close;
+ }
+ /* Ok, remote host can access at least one resource. */
+
+ if (hast_proto_recv_hdr(conn, &nvin) == -1) {
+ pjdlog_errno(LOG_ERR, "Unable to receive header from %s",
+ raddr);
+ goto close;
+ }
+
+ resname = nv_get_string(nvin, "resource");
+ if (resname == NULL) {
+ pjdlog_error("No 'resource' field in the header received from %s.",
+ raddr);
+ goto close;
+ }
+ pjdlog_debug(2, "%s: resource=%s", raddr, resname);
+ version = nv_get_uint8(nvin, "version");
+ pjdlog_debug(2, "%s: version=%hhu", raddr, version);
+ if (version == 0) {
+ /*
+ * If no version is sent, it means this is protocol version 1.
+ */
+ version = 1;
+ }
+ if (version > HAST_PROTO_VERSION) {
+ pjdlog_info("Remote protocol version %hhu is not supported, falling back to version %hhu.",
+ version, (unsigned char)HAST_PROTO_VERSION);
+ version = HAST_PROTO_VERSION;
+ }
+ pjdlog_debug(1, "Negotiated protocol version %hhu.", version);
+ token = nv_get_uint8_array(nvin, &size, "token");
+ /*
+ * NULL token means that this is first connection.
+ * res still points at the resource found by the address check above;
+ * only the size of its hr_token field is used here.
+ */
+ if (token != NULL && size != sizeof(res->hr_token)) {
+ pjdlog_error("Received token of invalid size from %s (expected %zu, got %zu).",
+ raddr, sizeof(res->hr_token), size);
+ goto close;
+ }
+
+ /*
+ * From now on we want to send errors to the remote node.
+ */
+ nverr = nv_alloc();
+
+ /* Find resource related to this connection. */
+ TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
+ if (strcmp(resname, res->hr_name) == 0)
+ break;
+ }
+ /* Have we found the resource? */
+ if (res == NULL) {
+ pjdlog_error("No resource '%s' as requested by %s.",
+ resname, raddr);
+ nv_add_stringf(nverr, "errmsg", "Resource not configured.");
+ goto fail;
+ }
+
+ /* Now that we know resource name setup log prefix. */
+ pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role));
+
+ /* Does the remote host have access to this resource? */
+ if (!proto_address_match(conn, res->hr_remoteaddr)) {
+ pjdlog_error("Client %s has no access to the resource.", raddr);
+ nv_add_stringf(nverr, "errmsg", "No access to the resource.");
+ goto fail;
+ }
+ /* Is the resource marked as secondary? */
+ if (res->hr_role != HAST_ROLE_SECONDARY) {
+ pjdlog_warning("We act as %s for the resource and not as %s as requested by %s.",
+ role2str(res->hr_role), role2str(HAST_ROLE_SECONDARY),
+ raddr);
+ nv_add_stringf(nverr, "errmsg",
+ "Remote node acts as %s for the resource and not as %s.",
+ role2str(res->hr_role), role2str(HAST_ROLE_SECONDARY));
+ if (res->hr_role == HAST_ROLE_PRIMARY) {
+ /*
+ * If we act as primary request the other side to wait
+ * for us a bit, as we might be finishing cleanups.
+ */
+ nv_add_uint8(nverr, 1, "wait");
+ }
+ goto fail;
+ }
+ /* Does token (if exists) match? */
+ if (token != NULL && memcmp(token, res->hr_token,
+ sizeof(res->hr_token)) != 0) {
+ pjdlog_error("Token received from %s doesn't match.", raddr);
+ nv_add_stringf(nverr, "errmsg", "Token doesn't match.");
+ goto fail;
+ }
+ /*
+ * If there is no token, but we have half-open connection
+ * (only remotein) or full connection (worker process is running)
+ * we have to cancel those and accept the new connection.
+ */
+ if (token == NULL) {
+ PJDLOG_ASSERT(res->hr_remoteout == NULL);
+ pjdlog_debug(1, "Initial connection from %s.", raddr);
+ if (res->hr_workerpid != 0) {
+ PJDLOG_ASSERT(res->hr_remotein == NULL);
+ pjdlog_debug(1,
+ "Worker process exists (pid=%u), stopping it.",
+ (unsigned int)res->hr_workerpid);
+ /* Stop child process. */
+ if (kill(res->hr_workerpid, SIGINT) == -1) {
+ pjdlog_errno(LOG_ERR,
+ "Unable to stop worker process (pid=%u)",
+ (unsigned int)res->hr_workerpid);
+ /*
+ * Other than logging the problem we
+ * ignore it - nothing smart to do.
+ */
+ }
+ /* Wait for it to exit. */
+ else if ((pid = waitpid(res->hr_workerpid,
+ &status, 0)) != res->hr_workerpid) {
+ /* We can only log the problem. */
+ pjdlog_errno(LOG_ERR,
+ "Waiting for worker process (pid=%u) failed",
+ (unsigned int)res->hr_workerpid);
+ } else {
+ child_exit_log(res->hr_workerpid, status);
+ }
+ /* Cleanup runs no matter how stopping the worker went. */
+ child_cleanup(res);
+ } else if (res->hr_remotein != NULL) {
+ char oaddr[256];
+
+ proto_remote_address(res->hr_remotein, oaddr,
+ sizeof(oaddr));
+ pjdlog_debug(1,
+ "Canceling half-open connection from %s on connection from %s.",
+ oaddr, raddr);
+ proto_close(res->hr_remotein);
+ res->hr_remotein = NULL;
+ }
+ }
+
+ /*
+ * Checks and cleanups are done.
+ */
+
+ if (token == NULL) {
+ /* First connection: reply with the version and a new token. */
+ res->hr_version = version;
+ arc4random_buf(res->hr_token, sizeof(res->hr_token));
+ nvout = nv_alloc();
+ nv_add_uint8(nvout, version, "version");
+ nv_add_uint8_array(nvout, res->hr_token,
+ sizeof(res->hr_token), "token");
+ if (nv_error(nvout) != 0) {
+ pjdlog_common(LOG_ERR, 0, nv_error(nvout),
+ "Unable to prepare return header for %s", raddr);
+ nv_add_stringf(nverr, "errmsg",
+ "Remote node was unable to prepare return header: %s.",
+ strerror(nv_error(nvout)));
+ goto fail;
+ }
+ if (hast_proto_send(res, conn, nvout, NULL, 0) == -1) {
+ /* Save errno before the logging call below. */
+ int error = errno;
+
+ pjdlog_errno(LOG_ERR, "Unable to send response to %s",
+ raddr);
+ nv_add_stringf(nverr, "errmsg",
+ "Remote node was unable to send response: %s.",
+ strerror(error));
+ goto fail;
+ }
+ res->hr_remotein = conn;
+ pjdlog_debug(1, "Incoming connection from %s configured.",
+ raddr);
+ } else {
+ /* Connection carrying a valid token. */
+ res->hr_remoteout = conn;
+ pjdlog_debug(1, "Outgoing connection to %s configured.", raddr);
+ hastd_secondary(res, nvin);
+ }
+ /* Success: conn is kept in the resource, only the nv lists go. */
+ nv_free(nvin);
+ nv_free(nvout);
+ nv_free(nverr);
+ pjdlog_prefix_set("%s", "");
+ return;
+fail:
+ /* Try to tell the remote node what went wrong, then fall through. */
+ if (nv_error(nverr) != 0) {
+ pjdlog_common(LOG_ERR, 0, nv_error(nverr),
+ "Unable to prepare error header for %s", raddr);
+ goto close;
+ }
+ if (hast_proto_send(NULL, conn, nverr, NULL, 0) == -1) {
+ pjdlog_errno(LOG_ERR, "Unable to send error to %s", raddr);
+ goto close;
+ }
+close:
+ if (nvin != NULL)
+ nv_free(nvin);
+ if (nvout != NULL)
+ nv_free(nvout);
+ if (nverr != NULL)
+ nv_free(nverr);
+ proto_close(conn);
+ pjdlog_prefix_set("%s", "");
+}
+
+/*
+ * Serve a connection request received from the worker of a primary resource.
+ *
+ * A 16-bit command is read from res->hr_conn, an outgoing connection to
+ * hr_remoteaddr is created (bound to hr_sourceaddr when that is not empty)
+ * and connected. The result is sent back as a 16-bit value: 0 on success,
+ * otherwise the errno of the failed step. On success the new connection is
+ * then passed over res->hr_conn with proto_connection_send().
+ *
+ * The log prefix is set to the resource for the duration of the call and is
+ * reset on every exit path.
+ */
+static void
+connection_migrate(struct hast_resource *res)
+{
+	struct proto_conn *conn = NULL;
+	int16_t val = 0;
+
+	pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role));
+
+	PJDLOG_ASSERT(res->hr_role == HAST_ROLE_PRIMARY);
+
+	if (proto_recv(res->hr_conn, &val, sizeof(val)) == -1) {
+		pjdlog_errno(LOG_WARNING,
+		    "Unable to receive connection command");
+		/* No reply is sent, but the log prefix still has to be reset. */
+		goto done;
+	}
+	if (proto_client(res->hr_sourceaddr[0] != '\0' ? res->hr_sourceaddr : NULL,
+	    res->hr_remoteaddr, &conn) == -1) {
+		val = errno;
+		pjdlog_errno(LOG_WARNING,
+		    "Unable to create outgoing connection to %s",
+		    res->hr_remoteaddr);
+		goto out;
+	}
+	if (proto_connect(conn, -1) == -1) {
+		val = errno;
+		pjdlog_errno(LOG_WARNING, "Unable to connect to %s",
+		    res->hr_remoteaddr);
+		proto_close(conn);
+		goto out;
+	}
+	val = 0;
+out:
+	/* Report the result first; the connection follows only on success. */
+	if (proto_send(res->hr_conn, &val, sizeof(val)) == -1) {
+		pjdlog_errno(LOG_WARNING,
+		    "Unable to send reply to connection request");
+	}
+	if (val == 0 && proto_connection_send(res->hr_conn, conn) == -1)
+		pjdlog_errno(LOG_WARNING, "Unable to send connection");
+done:
+	pjdlog_prefix_set("%s", "");
+}
+
+/*
+ * Poll for pending SIGHUP, SIGINT, SIGTERM and SIGCHLD without blocking and
+ * handle each signal that is found:
+ * - SIGINT/SIGTERM: mark that exit was requested, terminate the workers,
+ *   close the control connection and exit with EX_OK;
+ * - SIGCHLD: call child_exit();
+ * - SIGHUP: reload the configuration.
+ * The function returns once sigtimedwait(2) reports no more signals.
+ */
+static void
+check_signals(void)
+{
+	struct timespec nowait;
+	sigset_t mask;
+	int signo;
+
+	PJDLOG_VERIFY(sigemptyset(&mask) == 0);
+	PJDLOG_VERIFY(sigaddset(&mask, SIGHUP) == 0);
+	PJDLOG_VERIFY(sigaddset(&mask, SIGINT) == 0);
+	PJDLOG_VERIFY(sigaddset(&mask, SIGTERM) == 0);
+	PJDLOG_VERIFY(sigaddset(&mask, SIGCHLD) == 0);
+
+	/* A zero timeout makes sigtimedwait(2) return immediately. */
+	nowait.tv_sec = 0;
+	nowait.tv_nsec = 0;
+
+	for (;;) {
+		signo = sigtimedwait(&mask, NULL, &nowait);
+		if (signo == -1)
+			break;
+
+		if (signo == SIGINT || signo == SIGTERM) {
+			sigexit_received = true;
+			terminate_workers();
+			proto_close(cfg->hc_controlconn);
+			exit(EX_OK);
+		} else if (signo == SIGCHLD) {
+			child_exit();
+		} else if (signo == SIGHUP) {
+			hastd_reload();
+		} else {
+			PJDLOG_ABORT("Unexpected signal (%d).", signo);
+		}
+	}
+}
+
+/*
+ * Main event loop of the daemon; it never returns.
+ *
+ * Every iteration handles pending signals, waits with select(2) for activity
+ * on the control socket, the listen sockets and the per-resource event and
+ * connection-request descriptors, and then dispatches to control_handle(),
+ * listen_accept(), event_recv() and connection_migrate(). hook_check() is
+ * called when at least REPORT_INTERVAL seconds passed since the last call.
+ */
+static void
+main_loop(void)
+{
+	struct hast_resource *res;
+	struct hastd_listen *lst;
+	struct timeval seltimeout;
+	int fd, maxfd, ret, selerrno;
+	time_t lastcheck, now;
+	fd_set rfds;
+
+	lastcheck = time(NULL);
+
+	for (;;) {
+		check_signals();
+
+		/* Setup descriptors for select(2). */
+		FD_ZERO(&rfds);
+		maxfd = fd = proto_descriptor(cfg->hc_controlconn);
+		PJDLOG_ASSERT(fd >= 0);
+		FD_SET(fd, &rfds);
+		TAILQ_FOREACH(lst, &cfg->hc_listen, hl_next) {
+			if (lst->hl_conn == NULL)
+				continue;
+			fd = proto_descriptor(lst->hl_conn);
+			PJDLOG_ASSERT(fd >= 0);
+			FD_SET(fd, &rfds);
+			maxfd = fd > maxfd ? fd : maxfd;
+		}
+		TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
+			if (res->hr_event == NULL)
+				continue;
+			fd = proto_descriptor(res->hr_event);
+			PJDLOG_ASSERT(fd >= 0);
+			FD_SET(fd, &rfds);
+			maxfd = fd > maxfd ? fd : maxfd;
+			if (res->hr_role == HAST_ROLE_PRIMARY) {
+				/* Only primary workers asks for connections. */
+				PJDLOG_ASSERT(res->hr_conn != NULL);
+				fd = proto_descriptor(res->hr_conn);
+				PJDLOG_ASSERT(fd >= 0);
+				FD_SET(fd, &rfds);
+				maxfd = fd > maxfd ? fd : maxfd;
+			} else {
+				PJDLOG_ASSERT(res->hr_conn == NULL);
+			}
+		}
+
+		PJDLOG_ASSERT(maxfd + 1 <= (int)FD_SETSIZE);
+		/*
+		 * select(2) is allowed to modify the timeout, so it has to be
+		 * set up again before every call.
+		 */
+		seltimeout.tv_sec = REPORT_INTERVAL;
+		seltimeout.tv_usec = 0;
+		ret = select(maxfd + 1, &rfds, NULL, NULL, &seltimeout);
+		/*
+		 * Remember errno of select(2) now: hook_check() below calls
+		 * kill(2), which may overwrite it.
+		 */
+		selerrno = errno;
+		now = time(NULL);
+		if (lastcheck + REPORT_INTERVAL <= now) {
+			hook_check();
+			lastcheck = now;
+		}
+		if (ret == 0) {
+			/*
+			 * select(2) timed out, so there should be no
+			 * descriptors to check.
+			 */
+			continue;
+		} else if (ret == -1) {
+			if (selerrno == EINTR)
+				continue;
+			/* pjdlog_exit() should report the error of select(2). */
+			errno = selerrno;
+			KEEP_ERRNO((void)pidfile_remove(pfh));
+			pjdlog_exit(EX_OSERR, "select() failed");
+		}
+
+		/*
+		 * Check for signals before we do anything to update our
+		 * info about terminated workers in the meantime.
+		 */
+		check_signals();
+
+		if (FD_ISSET(proto_descriptor(cfg->hc_controlconn), &rfds))
+			control_handle(cfg);
+		TAILQ_FOREACH(lst, &cfg->hc_listen, hl_next) {
+			if (lst->hl_conn == NULL)
+				continue;
+			if (FD_ISSET(proto_descriptor(lst->hl_conn), &rfds))
+				listen_accept(lst);
+		}
+		TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
+			if (res->hr_event == NULL)
+				continue;
+			if (FD_ISSET(proto_descriptor(res->hr_event), &rfds)) {
+				if (event_recv(res) == 0)
+					continue;
+				/* The worker process exited? */
+				proto_close(res->hr_event);
+				res->hr_event = NULL;
+				if (res->hr_conn != NULL) {
+					proto_close(res->hr_conn);
+					res->hr_conn = NULL;
+				}
+				continue;
+			}
+			if (res->hr_role == HAST_ROLE_PRIMARY) {
+				PJDLOG_ASSERT(res->hr_conn != NULL);
+				if (FD_ISSET(proto_descriptor(res->hr_conn),
+				    &rfds)) {
+					connection_migrate(res);
+				}
+			} else {
+				PJDLOG_ASSERT(res->hr_conn == NULL);
+			}
+		}
+	}
+}
+
+/*
+ * Signal handler that intentionally does nothing; the signal number is
+ * ignored.
+ */
+static void
+dummy_sighandler(int sig __unused)
+{
+}
+
+/*
+ * Daemon entry point.
+ *
+ * Options: -c <path> sets the configuration file, -d raises the debug level
+ * (may be repeated), -F keeps the daemon in the foreground, -P <path> sets
+ * the pidfile, -h (or any unknown option) calls usage().
+ *
+ * Start-up order: parse options, initialise logging, make the configuration
+ * path absolute, parse the configuration, open the pidfile, set up signal
+ * dispositions and block the handled signals, open the control and listen
+ * sockets, daemonize unless -F was given, write the pidfile, initialise the
+ * hooks and enter main_loop().
+ */
+int
+main(int argc, char *argv[])
+{
+ struct hastd_listen *lst;
+ pid_t otherpid;
+ int debuglevel;
+ sigset_t mask;
+
+ foreground = false;
+ debuglevel = 0;
+
+ for (;;) {
+ int ch;
+
+ ch = getopt(argc, argv, "c:dFhP:");
+ if (ch == -1)
+ break;
+ switch (ch) {
+ case 'c':
+ cfgpath = optarg;
+ break;
+ case 'd':
+ debuglevel++;
+ break;
+ case 'F':
+ foreground = true;
+ break;
+ case 'P':
+ pidfile = optarg;
+ break;
+ case 'h':
+ default:
+ usage();
+ }
+ }
+ argc -= optind;
+ argv += optind;
+
+ pjdlog_init(PJDLOG_MODE_STD);
+ pjdlog_debug_set(debuglevel);
+
+ g_gate_load();
+
+ /*
+ * When path to the configuration file is relative, obtain full path,
+ * so we can always find the file, even after daemonizing and changing
+ * working directory to /.
+ */
+ if (cfgpath[0] != '/') {
+ const char *newcfgpath;
+
+ /* The buffer allocated by realpath(3) is never freed here. */
+ newcfgpath = realpath(cfgpath, NULL);
+ if (newcfgpath == NULL) {
+ pjdlog_exit(EX_CONFIG,
+ "Unable to obtain full path of %s", cfgpath);
+ }
+ cfgpath = newcfgpath;
+ }
+
+ cfg = yy_config_parse(cfgpath, true);
+ PJDLOG_ASSERT(cfg != NULL);
+
+ /* A pidfile given with -P overrides the one from the configuration. */
+ if (pidfile != NULL) {
+ if (strlcpy(cfg->hc_pidfile, pidfile,
+ sizeof(cfg->hc_pidfile)) >= sizeof(cfg->hc_pidfile)) {
+ pjdlog_exitx(EX_CONFIG, "Pidfile path is too long.");
+ }
+ }
+
+ /* In foreground mode a pidfile is used only when -P was given. */
+ if (pidfile != NULL || !foreground) {
+ pfh = pidfile_open(cfg->hc_pidfile, 0600, &otherpid);
+ if (pfh == NULL) {
+ if (errno == EEXIST) {
+ pjdlog_exitx(EX_TEMPFAIL,
+ "Another hastd is already running, pidfile: %s, pid: %jd.",
+ cfg->hc_pidfile, (intmax_t)otherpid);
+ }
+ /*
+ * If we cannot create pidfile for other reasons,
+ * only warn.
+ */
+ pjdlog_errno(LOG_WARNING,
+ "Unable to open or create pidfile %s",
+ cfg->hc_pidfile);
+ }
+ }
+
+ /*
+ * Restore default actions for interesting signals in case parent
+ * process (like init(8)) decided to ignore some of them (like SIGHUP).
+ */
+ PJDLOG_VERIFY(signal(SIGHUP, SIG_DFL) != SIG_ERR);
+ PJDLOG_VERIFY(signal(SIGINT, SIG_DFL) != SIG_ERR);
+ PJDLOG_VERIFY(signal(SIGTERM, SIG_DFL) != SIG_ERR);
+ /*
+ * Because SIGCHLD is ignored by default, setup dummy handler for it,
+ * so we can mask it.
+ */
+ PJDLOG_VERIFY(signal(SIGCHLD, dummy_sighandler) != SIG_ERR);
+
+ /*
+ * Block the handled signals; check_signals() picks them up
+ * synchronously with sigtimedwait(2).
+ */
+ PJDLOG_VERIFY(sigemptyset(&mask) == 0);
+ PJDLOG_VERIFY(sigaddset(&mask, SIGHUP) == 0);
+ PJDLOG_VERIFY(sigaddset(&mask, SIGINT) == 0);
+ PJDLOG_VERIFY(sigaddset(&mask, SIGTERM) == 0);
+ PJDLOG_VERIFY(sigaddset(&mask, SIGCHLD) == 0);
+ PJDLOG_VERIFY(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
+
+ /* Listen on control address. */
+ if (proto_server(cfg->hc_controladdr, &cfg->hc_controlconn) == -1) {
+ KEEP_ERRNO((void)pidfile_remove(pfh));
+ pjdlog_exit(EX_OSERR, "Unable to listen on control address %s",
+ cfg->hc_controladdr);
+ }
+ /* Listen for remote connections. */
+ TAILQ_FOREACH(lst, &cfg->hc_listen, hl_next) {
+ if (proto_server(lst->hl_addr, &lst->hl_conn) == -1) {
+ KEEP_ERRNO((void)pidfile_remove(pfh));
+ pjdlog_exit(EX_OSERR, "Unable to listen on address %s",
+ lst->hl_addr);
+ }
+ }
+
+ if (!foreground) {
+ if (daemon(0, 0) == -1) {
+ KEEP_ERRNO((void)pidfile_remove(pfh));
+ pjdlog_exit(EX_OSERR, "Unable to daemonize");
+ }
+
+ /* Start logging to syslog. */
+ pjdlog_mode_set(PJDLOG_MODE_SYSLOG);
+ }
+ /* The PID is written after daemon(3), i.e. once it is final. */
+ if (pidfile != NULL || !foreground) {
+ /* Write PID to a file. */
+ if (pidfile_write(pfh) == -1) {
+ pjdlog_errno(LOG_WARNING,
+ "Unable to write PID to a file %s",
+ cfg->hc_pidfile);
+ } else {
+ pjdlog_debug(1, "PID stored in %s.", cfg->hc_pidfile);
+ }
+ }
+
+ pjdlog_info("Started successfully, running protocol version %d.",
+ HAST_PROTO_VERSION);
+
+ pjdlog_debug(1, "Listening on control address %s.",
+ cfg->hc_controladdr);
+ TAILQ_FOREACH(lst, &cfg->hc_listen, hl_next)
+ pjdlog_info("Listening on address %s.", lst->hl_addr);
+
+ hook_init();
+
+ main_loop();
+
+ /* main_loop() loops forever, so this is not expected to be reached. */
+ exit(0);
+}
diff --git a/sbin/hastd/hastd.h b/sbin/hastd/hastd.h
new file mode 100644
index 0000000..d23e855
--- /dev/null
+++ b/sbin/hastd/hastd.h
@@ -0,0 +1,54 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HASTD_H_
+#define _HASTD_H_
+
+#include <sys/param.h>
+#include <libutil.h>
+
+#include <nv.h>
+
+#include "hast.h"
+
+/* Path to the configuration file; set from the -c option in main(). */
+extern const char *cfgpath;
+/* Set to true in check_signals() when SIGINT or SIGTERM was received. */
+extern bool sigexit_received;
+/* Handle of the daemon's pidfile; opened in main(), replaced on reload. */
+extern struct pidfh *pfh;
+
+void descriptors_cleanup(struct hast_resource *res);
+void descriptors_assert(const struct hast_resource *res, int pjdlogmode);
+
+void hastd_primary(struct hast_resource *res);
+/* Called from listen_accept() with the header received from the remote. */
+void hastd_secondary(struct hast_resource *res, struct nv *nvin);
+
+void primary_config_reload(struct hast_resource *res, struct nv *nv);
+
+#endif /* !_HASTD_H_ */
diff --git a/sbin/hastd/hooks.c b/sbin/hastd/hooks.c
new file mode 100644
index 0000000..b1886ca
--- /dev/null
+++ b/sbin/hastd/hooks.c
@@ -0,0 +1,391 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <sys/wait.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <libgen.h>
+#include <paths.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syslog.h>
+#include <unistd.h>
+
+#include <pjdlog.h>
+
+#include "hooks.h"
+#include "subr.h"
+#include "synch.h"
+
+/* Report long-running processes no more often than this many seconds. */
+#define REPORT_INTERVAL 60
+
+/* Are we initialized? */
+static bool hooks_initialized = false;
+
+/*
+ * Keep all processes we forked on a global queue, so we can report nicely
+ * when they finish or report that they are running for a long time.
+ */
+#define HOOKPROC_MAGIC_ALLOCATED 0x80090ca
+#define HOOKPROC_MAGIC_ONLIST 0x80090c0
+/* Bookkeeping record for one forked hook process. */
+struct hookproc {
+ /* Magic: HOOKPROC_MAGIC_ALLOCATED or HOOKPROC_MAGIC_ONLIST. */
+ int hp_magic;
+ /* PID of a forked child; 0 until the child has been forked. */
+ pid_t hp_pid;
+ /* When was the record created (just before the fork)? */
+ time_t hp_birthtime;
+ /* When did we last report this process as long-running? */
+ time_t hp_lastreport;
+ /* Path to executable and all the arguments we passed. */
+ char hp_comm[PATH_MAX];
+ /* Linkage on the global hookprocs list. */
+ TAILQ_ENTRY(hookproc) hp_next;
+};
+static TAILQ_HEAD(, hookproc) hookprocs;
+static pthread_mutex_t hookprocs_lock;
+
+static void hook_remove(struct hookproc *hp);
+static void hook_free(struct hookproc *hp);
+
+/*
+ * Prepare file descriptors of a freshly forked hook process.
+ *
+ * In PJDLOG_MODE_STD only the descriptors above stdin/stdout/stderr are
+ * closed. In any other logging mode all descriptors are closed and the three
+ * standard ones are reopened on /dev/null (stdin for reading, stdout and
+ * stderr for writing). Failures are logged as warnings only.
+ */
+static void
+descriptors(void)
+{
+	int nullfd;
+
+	if (pjdlog_mode_get() == PJDLOG_MODE_STD) {
+		/* Keep the standard descriptors, close everything above. */
+		closefrom(MAX(MAX(STDIN_FILENO, STDOUT_FILENO),
+		    STDERR_FILENO) + 1);
+		return;
+	}
+
+	/* Close everything, the standard descriptors included. */
+	closefrom(0);
+
+	/* stdin: read side of /dev/null. */
+	nullfd = open(_PATH_DEVNULL, O_RDONLY);
+	if (nullfd != -1) {
+		if (nullfd != STDIN_FILENO) {
+			if (dup2(nullfd, STDIN_FILENO) == -1) {
+				pjdlog_errno(LOG_WARNING,
+				    "Unable to duplicate descriptor for stdin");
+			}
+			close(nullfd);
+		}
+	} else {
+		pjdlog_errno(LOG_WARNING, "Unable to open %s for reading",
+		    _PATH_DEVNULL);
+	}
+
+	/* stdout and stderr: write side of /dev/null. */
+	nullfd = open(_PATH_DEVNULL, O_WRONLY);
+	if (nullfd == -1) {
+		pjdlog_errno(LOG_WARNING, "Unable to open %s for writing",
+		    _PATH_DEVNULL);
+		return;
+	}
+	if (nullfd != STDOUT_FILENO && dup2(nullfd, STDOUT_FILENO) == -1) {
+		pjdlog_errno(LOG_WARNING,
+		    "Unable to duplicate descriptor for stdout");
+	}
+	if (nullfd != STDERR_FILENO && dup2(nullfd, STDERR_FILENO) == -1) {
+		pjdlog_errno(LOG_WARNING,
+		    "Unable to duplicate descriptor for stderr");
+	}
+	/* Drop the temporary descriptor unless it is one of the targets. */
+	if (!(nullfd == STDOUT_FILENO || nullfd == STDERR_FILENO))
+		close(nullfd);
+}
+
+/*
+ * Initialise the hook subsystem: an empty list of hook processes and the
+ * mutex protecting it. Must not be called when already initialised.
+ */
+void
+hook_init(void)
+{
+
+	PJDLOG_ASSERT(!hooks_initialized);
+
+	TAILQ_INIT(&hookprocs);
+	mtx_init(&hookprocs_lock);
+
+	hooks_initialized = true;
+}
+
+/*
+ * Shut the hook subsystem down: forget all tracked hook processes (the
+ * records are freed, the processes themselves are not touched), destroy the
+ * lock and mark the subsystem as uninitialised.
+ */
+void
+hook_fini(void)
+{
+	struct hookproc *cur;
+
+	PJDLOG_ASSERT(hooks_initialized);
+
+	mtx_lock(&hookprocs_lock);
+	/* Always take the head, as hook_remove() unlinks it from the list. */
+	for (cur = TAILQ_FIRST(&hookprocs); cur != NULL;
+	    cur = TAILQ_FIRST(&hookprocs)) {
+		PJDLOG_ASSERT(cur->hp_magic == HOOKPROC_MAGIC_ONLIST);
+		PJDLOG_ASSERT(cur->hp_pid > 0);
+
+		hook_remove(cur);
+		hook_free(cur);
+	}
+	mtx_unlock(&hookprocs_lock);
+
+	mtx_destroy(&hookprocs_lock);
+	TAILQ_INIT(&hookprocs);
+	hooks_initialized = false;
+}
+
+/*
+ * Allocate a record for a hook that is about to be executed.
+ *
+ * hp_comm is filled with the path followed by the arguments from args[1] on
+ * (args[0] would only repeat the executable name). Returns NULL, after
+ * logging an error, when memory cannot be allocated or when the resulting
+ * command line does not fit in hp_comm.
+ */
+static struct hookproc *
+hook_alloc(const char *path, char **args)
+{
+	struct hookproc *rec;
+	char **argp;
+
+	rec = malloc(sizeof(*rec));
+	if (rec == NULL) {
+		pjdlog_error("Unable to allocate %zu bytes of memory for a hook.",
+		    sizeof(*rec));
+		return (NULL);
+	}
+
+	rec->hp_pid = 0;
+	rec->hp_birthtime = rec->hp_lastreport = time(NULL);
+	(void)strlcpy(rec->hp_comm, path, sizeof(rec->hp_comm));
+	/* Skip args[0], the name of the executable is already there. */
+	for (argp = &args[1]; *argp != NULL; argp++) {
+		(void)snprlcat(rec->hp_comm, sizeof(rec->hp_comm), " %s",
+		    *argp);
+	}
+	/* A completely filled buffer is treated as truncation. */
+	if (strlen(rec->hp_comm) >= sizeof(rec->hp_comm) - 1) {
+		pjdlog_error("Exec path too long, correct configuration file.");
+		free(rec);
+		return (NULL);
+	}
+	rec->hp_magic = HOOKPROC_MAGIC_ALLOCATED;
+	return (rec);
+}
+
+/*
+ * Record pid as the process of hook hp and append hp to the list of running
+ * hooks. hp must come from hook_alloc() and must not have a PID yet.
+ */
+static void
+hook_add(struct hookproc *hp, pid_t pid)
+{
+
+	PJDLOG_ASSERT(hp->hp_magic == HOOKPROC_MAGIC_ALLOCATED);
+	PJDLOG_ASSERT(hp->hp_pid == 0);
+
+	hp->hp_pid = pid;
+
+	mtx_lock(&hookprocs_lock);
+	TAILQ_INSERT_TAIL(&hookprocs, hp, hp_next);
+	hp->hp_magic = HOOKPROC_MAGIC_ONLIST;
+	mtx_unlock(&hookprocs_lock);
+}
+
+/*
+ * Unlink hp from the list of running hooks. The caller must hold
+ * hookprocs_lock. The record itself stays allocated.
+ */
+static void
+hook_remove(struct hookproc *hp)
+{
+
+	PJDLOG_ASSERT(hp->hp_magic == HOOKPROC_MAGIC_ONLIST);
+	PJDLOG_ASSERT(hp->hp_pid > 0);
+	PJDLOG_ASSERT(mtx_owned(&hookprocs_lock));
+
+	/* Back to the "allocated, not on the list" state. */
+	hp->hp_magic = HOOKPROC_MAGIC_ALLOCATED;
+	TAILQ_REMOVE(&hookprocs, hp, hp_next);
+}
+
+/*
+ * Release a hook record that is not on the list. The record must have a PID
+ * assigned. The magic is cleared before the memory is freed.
+ */
+static void
+hook_free(struct hookproc *hp)
+{
+
+	PJDLOG_ASSERT(hp->hp_magic == HOOKPROC_MAGIC_ALLOCATED);
+	PJDLOG_ASSERT(hp->hp_pid > 0);
+
+	/* Invalidate the magic so a stale pointer fails the assertions. */
+	hp->hp_magic = 0;
+	free(hp);
+}
+
+/*
+ * Look up the hook record with the given PID on the list of running hooks.
+ * The caller must hold hookprocs_lock. Returns NULL when no record matches.
+ */
+static struct hookproc *
+hook_find(pid_t pid)
+{
+	struct hookproc *cur;
+
+	PJDLOG_ASSERT(mtx_owned(&hookprocs_lock));
+
+	TAILQ_FOREACH(cur, &hookprocs, hp_next) {
+		PJDLOG_ASSERT(cur->hp_magic == HOOKPROC_MAGIC_ONLIST);
+		PJDLOG_ASSERT(cur->hp_pid > 0);
+
+		if (cur->hp_pid == pid)
+			return (cur);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Account for a terminated child process.
+ *
+ * If pid belongs to a hook we started, the hook is taken off the list, the
+ * way it finished is logged based on the wait status and its record is
+ * freed. A PID that is not on the hook list is only reported at debug
+ * level 1.
+ */
+void
+hook_check_one(pid_t pid, int status)
+{
+	struct hookproc *hp;
+
+	mtx_lock(&hookprocs_lock);
+	hp = hook_find(pid);
+	if (hp == NULL) {
+		mtx_unlock(&hookprocs_lock);
+		pjdlog_debug(1, "Unknown process pid=%u", (unsigned int)pid);
+		return;
+	}
+	hook_remove(hp);
+	mtx_unlock(&hookprocs_lock);
+	/*
+	 * hp is no longer on the list, so it is used without the lock.
+	 * The PID is cast explicitly to match the %u conversion.
+	 */
+	if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
+		pjdlog_debug(1, "Hook exited gracefully (pid=%u, cmd=[%s]).",
+		    (unsigned int)pid, hp->hp_comm);
+	} else if (WIFSIGNALED(status)) {
+		pjdlog_error("Hook was killed (pid=%u, signal=%d, cmd=[%s]).",
+		    (unsigned int)pid, WTERMSIG(status), hp->hp_comm);
+	} else {
+		pjdlog_error("Hook exited ungracefully (pid=%u, exitcode=%d, cmd=[%s]).",
+		    (unsigned int)pid,
+		    WIFEXITED(status) ? WEXITSTATUS(status) : -1,
+		    hp->hp_comm);
+	}
+	hook_free(hp);
+}
+
+/*
+ * Walk the list of running hooks and
+ * - drop hooks whose process no longer exists (kill(2) with signal 0 fails
+ *   with ESRCH), logging a warning;
+ * - warn about hooks that are still running when at least REPORT_INTERVAL
+ *   seconds have passed since they were started or last reported.
+ */
+void
+hook_check(void)
+{
+	struct hookproc *hp, *hp2;
+	time_t now;
+
+	PJDLOG_ASSERT(hooks_initialized);
+
+	pjdlog_debug(2, "Checking hooks.");
+
+	/*
+	 * Report about processes that are running for a long time.
+	 */
+	now = time(NULL);
+	mtx_lock(&hookprocs_lock);
+	/* The _SAFE variant is needed as entries are removed in the loop. */
+	TAILQ_FOREACH_SAFE(hp, &hookprocs, hp_next, hp2) {
+		PJDLOG_ASSERT(hp->hp_magic == HOOKPROC_MAGIC_ONLIST);
+		PJDLOG_ASSERT(hp->hp_pid > 0);
+
+		/*
+		 * If the process doesn't exist we somehow missed it.
+		 * Not much can be done except for logging this situation.
+		 */
+		if (kill(hp->hp_pid, 0) == -1 && errno == ESRCH) {
+			pjdlog_warning("Hook disappeared (pid=%u, cmd=[%s]).",
+			    (unsigned int)hp->hp_pid, hp->hp_comm);
+			hook_remove(hp);
+			hook_free(hp);
+			continue;
+		}
+
+		/*
+		 * Skip processes that were started or reported less than
+		 * REPORT_INTERVAL seconds ago.
+		 */
+		if (now - hp->hp_lastreport < REPORT_INTERVAL)
+			continue;
+
+		/*
+		 * Hook is running for too long, report it.
+		 */
+		pjdlog_warning("Hook is running for %ju seconds (pid=%u, cmd=[%s]).",
+		    (uintmax_t)(now - hp->hp_birthtime),
+		    (unsigned int)hp->hp_pid, hp->hp_comm);
+		hp->hp_lastreport = now;
+	}
+	mtx_unlock(&hookprocs_lock);
+}
+
+/*
+ * Variadic front end of hook_execv(): run the hook at path with the
+ * arguments that follow it.
+ */
+void
+hook_exec(const char *path, ...)
+{
+	va_list args;
+
+	va_start(args, path);
+	hook_execv(path, args);
+	va_end(args);
+}
+
+/*
+ * Execute the hook at path in a forked child process.
+ *
+ * ap supplies the NULL-terminated list of string arguments; argv[0] of the
+ * hook is the basename of path. The function asserts that the arguments fit
+ * in the local args[] array. Nothing is done when path is NULL or empty.
+ *
+ * The child sets up its descriptors, unblocks all signals and calls
+ * execv(3). The parent records the child on the list of running hooks and
+ * returns without waiting for it. Failures are logged.
+ */
+void
+hook_execv(const char *path, va_list ap)
+{
+	struct hookproc *hp;
+	char *args[64];
+	unsigned int ii;
+	sigset_t mask;
+	pid_t pid;
+
+	PJDLOG_ASSERT(hooks_initialized);
+
+	if (path == NULL || path[0] == '\0')
+		return;
+
+	memset(args, 0, sizeof(args));
+	args[0] = basename(path);
+	for (ii = 1; ii < sizeof(args) / sizeof(args[0]); ii++) {
+		args[ii] = va_arg(ap, char *);
+		if (args[ii] == NULL)
+			break;
+	}
+	/* The loop must have stopped on the terminating NULL. */
+	PJDLOG_ASSERT(ii < sizeof(args) / sizeof(args[0]));
+
+	hp = hook_alloc(path, args);
+	if (hp == NULL)
+		return;
+
+	pjdlog_debug(1, "Executing hook: %s", hp->hp_comm);
+
+	pid = fork();
+	switch (pid) {
+	case -1:	/* Error. */
+		pjdlog_errno(LOG_ERR, "Unable to fork to execute %s", path);
+		/*
+		 * The record never got a PID, so hook_free() cannot be used
+		 * here: it asserts hp_pid > 0. Release the record directly.
+		 */
+		PJDLOG_ASSERT(hp->hp_magic == HOOKPROC_MAGIC_ALLOCATED);
+		hp->hp_magic = 0;
+		free(hp);
+		return;
+	case 0:		/* Child. */
+		descriptors();
+		PJDLOG_VERIFY(sigemptyset(&mask) == 0);
+		PJDLOG_VERIFY(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
+		/*
+		 * Dummy handler set for SIGCHLD in the parent will be reset
+		 * to the default action on execv(3) below, so there is no
+		 * need to do anything with it.
+		 */
+		execv(path, args);
+		pjdlog_errno(LOG_ERR, "Unable to execute %s", path);
+		exit(EX_SOFTWARE);
+	default:	/* Parent. */
+		hook_add(hp, pid);
+		break;
+	}
+}
diff --git a/sbin/hastd/hooks.h b/sbin/hastd/hooks.h
new file mode 100644
index 0000000..4ce435e
--- /dev/null
+++ b/sbin/hastd/hooks.h
@@ -0,0 +1,48 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HOOKS_H_
+#define _HOOKS_H_
+
+#include <sys/types.h>
+
+#include <stdarg.h>
+#include <stdbool.h>
+
+void hook_init(void);
+void hook_fini(void);
+void hook_check_one(pid_t pid, int status);
+void hook_check(void);
+/*
+ * Run the hook program at path.  The trailing arguments are handed to
+ * hook_execv(), which reads them as char * values up to a NULL pointer.
+ */
+void hook_exec(const char *path, ...);
+/*
+ * Same as hook_exec(), but the NULL-terminated char * arguments are taken
+ * from ap.  Does nothing when path is NULL or an empty string.
+ */
+void hook_execv(const char *path, va_list ap);
+
+#endif /* !_HOOKS_H_ */
diff --git a/sbin/hastd/lzf.c b/sbin/hastd/lzf.c
new file mode 100644
index 0000000..cca6a17
--- /dev/null
+++ b/sbin/hastd/lzf.c
@@ -0,0 +1,406 @@
+/*
+ * Copyright (c) 2000-2008 Marc Alexander Lehmann <schmorp@schmorp.de>
+ *
+ * Redistribution and use in source and binary forms, with or without modifica-
+ * tion, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
+ * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+ * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
+ * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * the GNU General Public License ("GPL") version 2 or any later version,
+ * in which case the provisions of the GPL are applicable instead of
+ * the above. If you wish to allow the use of your version of this file
+ * only under the terms of the GPL and not to allow others to use your
+ * version of this file under the BSD license, indicate your decision
+ * by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL. If you do not delete the
+ * provisions above, a recipient may use your version of this file under
+ * either the BSD or the GPL.
+ */
+
+#include "lzf.h"
+
+/* Number of slots in the hash table (LZF_STATE has 1 << HLOG entries). */
+#define HSIZE (1 << (HLOG))
+
+/*
+ * don't play with this unless you benchmark!
+ * decompression is not dependent on the hash function
+ * the hashing function might seem strange, just believe me
+ * it works ;)
+ *
+ * FRST(p) seeds the hash value from p[0] and p[1].
+ * NEXT(v,p) shifts the previous value left by 8 bits and ors in p[2].
+ * IDX(h) reduces a hash value to a table index by masking with HSIZE - 1;
+ * which variant is used depends on ULTRA_FAST / VERY_FAST.
+ * All three can be overridden by defining FRST beforehand.
+ */
+#ifndef FRST
+# define FRST(p) (((p[0]) << 8) | p[1])
+# define NEXT(v,p) (((v) << 8) | p[2])
+# if ULTRA_FAST
+# define IDX(h) ((( h >> (3*8 - HLOG)) - h ) & (HSIZE - 1))
+# elif VERY_FAST
+# define IDX(h) ((( h >> (3*8 - HLOG)) - h*5) & (HSIZE - 1))
+# else
+# define IDX(h) ((((h ^ (h << 5)) >> (3*8 - HLOG)) - h*5) & (HSIZE - 1))
+# endif
+#endif
+/*
+ * IDX works because it is very similar to a multiplicative hash, e.g.
+ * ((h * 57321 >> (3*8 - HLOG)) & (HSIZE - 1))
+ * the latter is also quite fast on newer CPUs, and compresses similarly.
+ *
+ * the next one is also quite good, albeit slow ;)
+ * (int)(cos(h & 0xffffff) * 1e6)
+ */
+
+#if 0
+/* original lzv-like hash function, much worse and thus slower */
+# define FRST(p) (p[0] << 5) ^ p[1]
+# define NEXT(v,p) ((v) << 5) ^ p[2]
+# define IDX(h) ((h) & (HSIZE - 1))
+#endif
+
+/* A literal run is closed once it holds MAX_LIT bytes. */
+#define MAX_LIT (1 << 5)
+/* A match is only used when its offset (off) is below MAX_OFF. */
+#define MAX_OFF (1 << 13)
+/* Upper bound applied to the match length (maxlen) in lzf_compress. */
+#define MAX_REF ((1 << 8) + (1 << 3))
+
+/*
+ * expect(expr,value) expands to __builtin_expect when __GNUC__ >= 3 and
+ * to the bare expression otherwise.  The same test also redefines the
+ * "inline" keyword: to itself with __GNUC__ >= 3, to "static" otherwise.
+ */
+#if __GNUC__ >= 3
+# define expect(expr,value) __builtin_expect ((expr),(value))
+# define inline inline
+#else
+# define expect(expr,value) (expr)
+# define inline static
+#endif
+
+/* Truth tests annotated as expected to be false / true respectively. */
+#define expect_false(expr) expect ((expr) != 0, 0)
+#define expect_true(expr) expect ((expr) != 0, 1)
+
+/*
+ * compressed format
+ *
+ * 000LLLLL <L+1> ; literal
+ * LLLooooo oooooooo ; backref L
+ * 111ooooo LLLLLLLL oooooooo ; backref L+7
+ *
+ */
+
+/*
+ * Compress in_len bytes at in_data into out_data, writing at most out_len
+ * bytes.
+ *
+ * Returns the number of bytes stored in out_data, or 0 when in_len or
+ * out_len is zero or when the output buffer turns out to be too small.
+ *
+ * The input is scanned with a three byte rolling hash (FRST/NEXT/IDX).
+ * Each hash slot remembers the most recent input position that produced
+ * that hash; when the remembered position holds the same three bytes and
+ * is close enough, a back reference is emitted, otherwise the byte is
+ * appended to the current literal run.  See the "compressed format"
+ * comment above for the encoding.
+ *
+ * With LZF_STATE_ARG enabled the hash table is passed in as an extra
+ * argument; otherwise it lives on the stack.
+ */
+unsigned int
+lzf_compress (const void *const in_data, unsigned int in_len,
+              void *out_data, unsigned int out_len
+#if LZF_STATE_ARG
+              , LZF_STATE htab
+#endif
+              )
+{
+#if !LZF_STATE_ARG
+  LZF_STATE htab;
+#endif
+  const u8 **hslot;
+  const u8 *ip = (const u8 *)in_data;
+  u8 *op = (u8 *)out_data;
+  const u8 *in_end = ip + in_len;
+  u8 *out_end = op + out_len;
+  const u8 *ref;
+
+  /* off requires a type wide enough to hold a general pointer difference.
+   * ISO C doesn't have that (size_t might not be enough and ptrdiff_t only
+   * works for differences within a single object). We also assume that
+   * no bit pattern traps. Since the only platform that is both non-POSIX
+   * and fails to support both assumptions is windows 64 bit, we make a
+   * special workaround for it.
+   */
+#if defined (WIN32) && defined (_M_X64)
+  unsigned _int64 off; /* workaround for missing POSIX compliance */
+#else
+  unsigned long off;
+#endif
+  unsigned int hval;
+  int lit;
+
+  if (!in_len || !out_len)
+    return 0;
+
+#if INIT_HTAB
+  memset (htab, 0, sizeof (htab));
+# if 0
+  for (hslot = htab; hslot < htab + HSIZE; hslot++)
+    *hslot++ = ip;
+# endif
+#endif
+
+  lit = 0; op++; /* start run */
+
+  /*
+   * FRST reads ip[0] and ip[1], so it may only be evaluated when at least
+   * two input bytes exist.  Likewise in_end - 2 is only a valid pointer
+   * when in_len >= 2.  Inputs of one or two bytes never enter the match
+   * loop (hval stays unused) and are emitted as literals by the tail loop.
+   */
+  hval = in_len >= 2 ? FRST (ip) : 0;
+  while (in_len > 2 && ip < in_end - 2)
+    {
+      hval = NEXT (hval, ip);
+      hslot = htab + IDX (hval);
+      ref = *hslot; *hslot = ip;
+
+      if (1
+#if INIT_HTAB
+          && ref < ip /* the next test will actually take care of this, but this is faster */
+#endif
+          && (off = ip - ref - 1) < MAX_OFF
+          && ip + 4 < in_end
+          && ref > (const u8 *)in_data
+#if STRICT_ALIGN
+          && ref[0] == ip[0]
+          && ref[1] == ip[1]
+          && ref[2] == ip[2]
+#else
+          && *(const u16 *)ref == *(const u16 *)ip
+          && ref[2] == ip[2]
+#endif
+        )
+        {
+          /* match found at *ref++ */
+          unsigned int len = 2;
+          unsigned int maxlen = in_end - ip - len;
+          maxlen = maxlen > MAX_REF ? MAX_REF : maxlen;
+
+          if (expect_false (op + 3 + 1 >= out_end)) /* first a faster conservative test */
+            if (op - !lit + 3 + 1 >= out_end) /* second the exact but rare test */
+              return 0;
+
+          op [- lit - 1] = lit - 1; /* stop run */
+          op -= !lit; /* undo run if length is zero */
+
+          /* extend the match; the first 16 comparisons are unrolled */
+          for (;;)
+            {
+              if (expect_true (maxlen > 16))
+                {
+                  len++; if (ref [len] != ip [len]) break;
+                  len++; if (ref [len] != ip [len]) break;
+                  len++; if (ref [len] != ip [len]) break;
+                  len++; if (ref [len] != ip [len]) break;
+
+                  len++; if (ref [len] != ip [len]) break;
+                  len++; if (ref [len] != ip [len]) break;
+                  len++; if (ref [len] != ip [len]) break;
+                  len++; if (ref [len] != ip [len]) break;
+
+                  len++; if (ref [len] != ip [len]) break;
+                  len++; if (ref [len] != ip [len]) break;
+                  len++; if (ref [len] != ip [len]) break;
+                  len++; if (ref [len] != ip [len]) break;
+
+                  len++; if (ref [len] != ip [len]) break;
+                  len++; if (ref [len] != ip [len]) break;
+                  len++; if (ref [len] != ip [len]) break;
+                  len++; if (ref [len] != ip [len]) break;
+                }
+
+              do
+                len++;
+              while (len < maxlen && ref[len] == ip[len]);
+
+              break;
+            }
+
+          len -= 2; /* len is now #octets - 1 */
+          ip++;
+
+          if (len < 7)
+            {
+              *op++ = (off >> 8) + (len << 5);
+            }
+          else
+            {
+              *op++ = (off >> 8) + (  7 << 5);
+              *op++ = len - 7;
+            }
+
+          *op++ = off;
+          lit = 0; op++; /* start run */
+
+          ip += len + 1;
+
+          if (expect_false (ip >= in_end - 2))
+            break;
+
+#if ULTRA_FAST || VERY_FAST
+          --ip;
+# if VERY_FAST && !ULTRA_FAST
+          --ip;
+# endif
+          hval = FRST (ip);
+
+          hval = NEXT (hval, ip);
+          htab[IDX (hval)] = ip;
+          ip++;
+
+# if VERY_FAST && !ULTRA_FAST
+          hval = NEXT (hval, ip);
+          htab[IDX (hval)] = ip;
+          ip++;
+# endif
+#else
+          ip -= len + 1;
+
+          do
+            {
+              hval = NEXT (hval, ip);
+              htab[IDX (hval)] = ip;
+              ip++;
+            }
+          while (len--);
+#endif
+        }
+      else
+        {
+          /* one more literal byte we must copy */
+          if (expect_false (op >= out_end))
+            return 0;
+
+          lit++; *op++ = *ip++;
+
+          if (expect_false (lit == MAX_LIT))
+            {
+              op [- lit - 1] = lit - 1; /* stop run */
+              lit = 0; op++; /* start run */
+            }
+        }
+    }
+
+  if (op + 3 > out_end) /* at most 3 bytes can be missing here */
+    return 0;
+
+  /* copy whatever input is left as literals */
+  while (ip < in_end)
+    {
+      lit++; *op++ = *ip++;
+
+      if (expect_false (lit == MAX_LIT))
+        {
+          op [- lit - 1] = lit - 1; /* stop run */
+          lit = 0; op++; /* start run */
+        }
+    }
+
+  op [- lit - 1] = lit - 1; /* end run */
+  op -= !lit; /* undo run if length is zero */
+
+  return op - (u8 *)out_data;
+}
+
+/*
+ * SET_ERRNO(n) stores n in errno, or expands to nothing when AVOID_ERRNO
+ * is enabled.
+ */
+#if AVOID_ERRNO
+# define SET_ERRNO(n)
+#else
+# include <errno.h>
+# define SET_ERRNO(n) errno = (n)
+#endif
+
+/*
+ * lzf_movsb(dst, src, len) issues "rep movsb" on i386/amd64 with GCC 3 or
+ * newer.  dst, src and len are both inputs and outputs of the asm
+ * statement, so all three are modified by the copy; lzf_decompress relies
+ * on that to advance its pointers.  When the macro is not defined the
+ * callers fall back to plain byte copy loops.
+ */
+#if (__i386 || __amd64) && __GNUC__ >= 3
+# define lzf_movsb(dst, src, len) \
+ asm ("rep movsb" \
+ : "=D" (dst), "=S" (src), "=c" (len) \
+ : "0" (dst), "1" (src), "2" (len));
+#endif
+
+/*
+ * Decompress in_len bytes at in_data into out_data, writing at most
+ * out_len bytes.
+ *
+ * Returns the number of bytes produced.  Returns 0 and sets errno (unless
+ * AVOID_ERRNO is enabled) to E2BIG when the output buffer is too small,
+ * or to EINVAL when the input is empty, when a back reference points
+ * before out_data, or - with CHECK_INPUT - when the input ends in the
+ * middle of a token.
+ *
+ * Each token starts with a control byte.  Values below 32 introduce a
+ * literal run of ctrl + 1 bytes.  Anything else is a back reference: the
+ * top three bits hold the length (7 means one extra length byte follows),
+ * the low five bits and the next input byte form the offset, and
+ * length + 2 bytes are copied from already produced output.
+ */
+unsigned int
+lzf_decompress (const void *const in_data, unsigned int in_len,
+                void *out_data, unsigned int out_len)
+{
+  u8 const *ip = (const u8 *)in_data;
+  u8 *op = (u8 *)out_data;
+  u8 const *const in_end = ip + in_len;
+  u8 *const out_end = op + out_len;
+
+  /*
+   * The loop below fetches a control byte before it compares ip against
+   * in_end, so an empty input would be read past its end.
+   */
+  if (in_len == 0)
+    {
+      SET_ERRNO (EINVAL);
+      return 0;
+    }
+
+  do
+    {
+      unsigned int ctrl = *ip++;
+
+      if (ctrl < (1 << 5)) /* literal run */
+        {
+          ctrl++;
+
+          if (op + ctrl > out_end)
+            {
+              SET_ERRNO (E2BIG);
+              return 0;
+            }
+
+#if CHECK_INPUT
+          if (ip + ctrl > in_end)
+            {
+              SET_ERRNO (EINVAL);
+              return 0;
+            }
+#endif
+
+#ifdef lzf_movsb
+          lzf_movsb (op, ip, ctrl);
+#else
+          do
+            *op++ = *ip++;
+          while (--ctrl);
+#endif
+        }
+      else /* back reference */
+        {
+          unsigned int len = ctrl >> 5;
+
+          /* high part of the offset; the low byte is subtracted below */
+          u8 *ref = op - ((ctrl & 0x1f) << 8) - 1;
+
+#if CHECK_INPUT
+          if (ip >= in_end)
+            {
+              SET_ERRNO (EINVAL);
+              return 0;
+            }
+#endif
+          if (len == 7)
+            {
+              len += *ip++;
+#if CHECK_INPUT
+              if (ip >= in_end)
+                {
+                  SET_ERRNO (EINVAL);
+                  return 0;
+                }
+#endif
+            }
+
+          ref -= *ip++;
+
+          if (op + len + 2 > out_end)
+            {
+              SET_ERRNO (E2BIG);
+              return 0;
+            }
+
+          if (ref < (u8 *)out_data)
+            {
+              SET_ERRNO (EINVAL);
+              return 0;
+            }
+
+#ifdef lzf_movsb
+          len += 2;
+          lzf_movsb (op, ref, len);
+#else
+          *op++ = *ref++;
+          *op++ = *ref++;
+
+          do
+            *op++ = *ref++;
+          while (--len);
+#endif
+        }
+    }
+  while (ip < in_end);
+
+  return op - (u8 *)out_data;
+}
+
diff --git a/sbin/hastd/lzf.h b/sbin/hastd/lzf.h
new file mode 100644
index 0000000..d9563ef
--- /dev/null
+++ b/sbin/hastd/lzf.h
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2000-2008 Marc Alexander Lehmann <schmorp@schmorp.de>
+ *
+ * Redistribution and use in source and binary forms, with or without modifica-
+ * tion, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
+ * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+ * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
+ * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * the GNU General Public License ("GPL") version 2 or any later version,
+ * in which case the provisions of the GPL are applicable instead of
+ * the above. If you wish to allow the use of your version of this file
+ * only under the terms of the GPL and not to allow others to use your
+ * version of this file under the BSD license, indicate your decision
+ * by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL. If you do not delete the
+ * provisions above, a recipient may use your version of this file under
+ * either the BSD or the GPL.
+ */
+
+#ifndef LZF_H
+#define LZF_H
+
+/***********************************************************************
+**
+** lzf -- an extremely fast/free compression/decompression-method
+** http://liblzf.plan9.de/
+**
+** This algorithm is believed to be patent-free.
+**
+***********************************************************************/
+
+#define LZF_VERSION 0x0105 /* 1.5, API version */
+
+/*
+ * Compress in_len bytes stored at the memory block starting at
+ * in_data and write the result to out_data, up to a maximum length
+ * of out_len bytes.
+ *
+ * If the output buffer is not large enough or any error occurs return 0,
+ * otherwise return the number of bytes used, which might be considerably
+ * more than in_len (but less than 104% of the original size), so it
+ * makes sense to always use (out_len == in_len - 1), to ensure _some_
+ * compression, and store the data uncompressed otherwise (with a flag, of
+ * course).
+ *
+ * lzf_compress might use different algorithms on different systems and
+ * even different runs, thus might result in different compressed strings
+ * depending on the phase of the moon or similar factors. However, all
+ * these strings are architecture-independent and will result in the
+ * original data when decompressed using lzf_decompress.
+ *
+ * The buffers must not be overlapping.
+ *
+ * If the option LZF_STATE_ARG is enabled, an extra argument must be
+ * supplied which is not reflected in this header file. Refer to lzfP.h
+ * and lzf_c.c.
+ *
+ */
+unsigned int
+lzf_compress (const void *const in_data, unsigned int in_len,
+ void *out_data, unsigned int out_len);
+
+/*
+ * Decompress data compressed with some version of the lzf_compress
+ * function and stored at location in_data and length in_len. The result
+ * will be stored at out_data up to a maximum of out_len characters.
+ *
+ * If the output buffer is not large enough to hold the decompressed
+ * data, a 0 is returned and errno is set to E2BIG. Otherwise the number
+ * of decompressed bytes (i.e. the original length of the data) is
+ * returned.
+ *
+ * If an error in the compressed data is detected, a zero is returned and
+ * errno is set to EINVAL.
+ *
+ * This function is very fast, about as fast as a copying loop.
+ */
+unsigned int
+lzf_decompress (const void *const in_data, unsigned int in_len,
+ void *out_data, unsigned int out_len);
+
+/*
+ * Size of hashtable is (1 << HLOG) * sizeof (char *)
+ * decompression is independent of the hash table size
+ * the difference between 15 and 14 is very small
+ * for small blocks (and 14 is usually a bit faster).
+ * For a low-memory/faster configuration, use HLOG == 13;
+ * For best compression, use 15 or 16 (or more, up to 23).
+ */
+#ifndef HLOG
+# define HLOG 16
+#endif
+
+/*
+ * Sacrifice very little compression quality in favour of compression speed.
+ * This gives almost the same compression as the default code, and is
+ * (very roughly) 15% faster. This is the preferred mode of operation.
+ */
+#ifndef VERY_FAST
+# define VERY_FAST 1
+#endif
+
+/*
+ * Sacrifice some more compression quality in favour of compression speed.
+ * (roughly 1-2% worse compression for large blocks and
+ * 9-10% for small, redundant, blocks and >>20% better speed in both cases)
+ * In short: when in need for speed, enable this for binary data,
+ * possibly disable this for text data.
+ */
+#ifndef ULTRA_FAST
+# define ULTRA_FAST 0
+#endif
+
+/*
+ * Unconditionally aligning does not cost very much, so do it if unsure
+ */
+#ifndef STRICT_ALIGN
+# define STRICT_ALIGN !(defined(__i386) || defined (__amd64))
+#endif
+
+/*
+ * You may choose to pre-set the hash table (might be faster on some
+ * modern cpus and large (>>64k) blocks, and also makes compression
+ * deterministic/repeatable when the configuration otherwise is the same).
+ */
+#ifndef INIT_HTAB
+# define INIT_HTAB 1
+#endif
+
+/*
+ * Avoid assigning values to errno variable? for some embedding purposes
+ * (linux kernel for example), this is necessary. NOTE: this breaks
+ * the documentation in lzf.h.
+ */
+#ifndef AVOID_ERRNO
+# define AVOID_ERRNO 0
+#endif
+
+/*
+ * Whether to pass the LZF_STATE variable as argument, or allocate it
+ * on the stack. For small-stack environments, define this to 1.
+ * NOTE: this breaks the prototype in lzf.h.
+ */
+#ifndef LZF_STATE_ARG
+# define LZF_STATE_ARG 0
+#endif
+
+/*
+ * Whether to add extra checks for input validity in lzf_decompress
+ * and return EINVAL if the input stream has been corrupted. This
+ * only shields against overflowing the input buffer and will not
+ * detect most corrupted streams.
+ * This check is not normally noticeable on modern hardware
+ * (<1% slowdown), but might slow down older cpus considerably.
+ */
+#ifndef CHECK_INPUT
+# define CHECK_INPUT 1
+#endif
+
+/*****************************************************************************/
+/* nothing should be changed below */
+
+/* Byte type used for the input and output buffers. */
+typedef unsigned char u8;
+
+/*
+ * Compressor hash table: 1 << HLOG pointers into the input buffer,
+ * indexed by IDX() in lzf_compress.
+ */
+typedef const u8 *LZF_STATE[1 << (HLOG)];
+
+#if !STRICT_ALIGN
+/* for unaligned accesses we need a 16 bit datatype. */
+# include <limits.h>
+# if USHRT_MAX == 65535
+ typedef unsigned short u16;
+# elif UINT_MAX == 65535
+ typedef unsigned int u16;
+# else
+# undef STRICT_ALIGN
+# define STRICT_ALIGN 1
+# endif
+#endif
+
+#if ULTRA_FAST
+# if defined(VERY_FAST)
+# undef VERY_FAST
+# endif
+#endif
+
+#if INIT_HTAB
+# ifdef __cplusplus
+# include <cstring>
+# else
+# include <string.h>
+# endif
+#endif
+
+#endif
diff --git a/sbin/hastd/metadata.c b/sbin/hastd/metadata.c
new file mode 100644
index 0000000..6d9f366
--- /dev/null
+++ b/sbin/hastd/metadata.c
@@ -0,0 +1,225 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <errno.h>
+#include <fcntl.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include <ebuf.h>
+#include <nv.h>
+#include <pjdlog.h>
+#include <subr.h>
+
+#include "metadata.h"
+
+/*
+ * Read the METADATA_SIZE byte metadata block from offset 0 of the local
+ * provider and load it into res.
+ *
+ * When res->hr_localfd is -1 the provider is first set up with provinfo()
+ * and, if openrw is true, locked exclusively with flock(); a provider that
+ * does not support locking only produces a warning.
+ *
+ * Returns 0 on success.  On failure returns -1 with errno describing the
+ * problem, and closes the descriptor again if it was opened by this call.
+ * Note that fields of res may already have been updated when a failure is
+ * detected late (nv_error() check).
+ */
+int
+metadata_read(struct hast_resource *res, bool openrw)
+{
+	unsigned char *buf;
+	struct ebuf *eb;
+	struct nv *nv;
+	ssize_t done;
+	const char *str;
+	int rerrno;
+	bool opened_here;
+
+	opened_here = false;
+	rerrno = 0;
+
+	/*
+	 * Is this first metadata_read() call for this resource?
+	 */
+	if (res->hr_localfd == -1) {
+		if (provinfo(res, openrw) == -1) {
+			rerrno = errno;
+			goto fail;
+		}
+		opened_here = true;
+		pjdlog_debug(1, "Obtained info about %s.", res->hr_localpath);
+		if (openrw) {
+			if (flock(res->hr_localfd, LOCK_EX | LOCK_NB) == 0) {
+				pjdlog_debug(1, "Locked %s.",
+				    res->hr_localpath);
+			} else if (errno == EOPNOTSUPP) {
+				/* Not fatal: carry on without the lock. */
+				pjdlog_warning("Unable to lock %s (operation not supported), but continuing.",
+				    res->hr_localpath);
+			} else {
+				rerrno = errno;
+				pjdlog_errno(LOG_ERR,
+				    "Unable to lock %s",
+				    res->hr_localpath);
+				goto fail;
+			}
+		}
+	}
+
+	eb = ebuf_alloc(METADATA_SIZE);
+	if (eb == NULL) {
+		rerrno = errno;
+		pjdlog_errno(LOG_ERR,
+		    "Unable to allocate memory to read metadata");
+		goto fail;
+	}
+	if (ebuf_add_tail(eb, NULL, METADATA_SIZE) == -1) {
+		rerrno = errno;
+		pjdlog_errno(LOG_ERR,
+		    "Unable to allocate memory to read metadata");
+		ebuf_free(eb);
+		goto fail;
+	}
+	buf = ebuf_data(eb, NULL);
+	PJDLOG_ASSERT(buf != NULL);
+	done = pread(res->hr_localfd, buf, METADATA_SIZE, 0);
+	if (done != METADATA_SIZE) {
+		/*
+		 * A short read does not set errno; report it as an I/O
+		 * error rather than logging and returning a stale value.
+		 */
+		rerrno = (done == -1) ? errno : EIO;
+		errno = rerrno;
+		pjdlog_errno(LOG_ERR, "Unable to read metadata");
+		ebuf_free(eb);
+		goto fail;
+	}
+	nv = nv_ntoh(eb);
+	if (nv == NULL) {
+		rerrno = errno;
+		pjdlog_errno(LOG_ERR, "Metadata read from %s is invalid",
+		    res->hr_localpath);
+		ebuf_free(eb);
+		goto fail;
+	}
+
+	str = nv_get_string(nv, "resource");
+	if (str != NULL && strcmp(str, res->hr_name) != 0) {
+		pjdlog_error("Provider %s is not part of resource %s.",
+		    res->hr_localpath, res->hr_name);
+		nv_free(nv);
+		/* Make sure the caller does not see -1 with errno == 0. */
+		rerrno = EINVAL;
+		goto fail;
+	}
+
+	res->hr_datasize = nv_get_uint64(nv, "datasize");
+	res->hr_extentsize = (int)nv_get_uint32(nv, "extentsize");
+	res->hr_keepdirty = (int)nv_get_uint32(nv, "keepdirty");
+	res->hr_localoff = nv_get_uint64(nv, "offset");
+	res->hr_resuid = nv_get_uint64(nv, "resuid");
+	if (res->hr_role != HAST_ROLE_PRIMARY) {
+		/* Secondary or init role. */
+		res->hr_secondary_localcnt = nv_get_uint64(nv, "localcnt");
+		res->hr_secondary_remotecnt = nv_get_uint64(nv, "remotecnt");
+	}
+	if (res->hr_role != HAST_ROLE_SECONDARY) {
+		/* Primary or init role. */
+		res->hr_primary_localcnt = nv_get_uint64(nv, "localcnt");
+		res->hr_primary_remotecnt = nv_get_uint64(nv, "remotecnt");
+	}
+	str = nv_get_string(nv, "prevrole");
+	if (str != NULL) {
+		if (strcmp(str, "primary") == 0)
+			res->hr_previous_role = HAST_ROLE_PRIMARY;
+		else if (strcmp(str, "secondary") == 0)
+			res->hr_previous_role = HAST_ROLE_SECONDARY;
+	}
+
+	/* Errors from the nv_get_*() calls above are collected here. */
+	if (nv_error(nv) != 0) {
+		errno = rerrno = nv_error(nv);
+		pjdlog_errno(LOG_ERR, "Unable to read metadata from %s",
+		    res->hr_localpath);
+		nv_free(nv);
+		goto fail;
+	}
+	nv_free(nv);
+	return (0);
+fail:
+	if (opened_here) {
+		close(res->hr_localfd);
+		res->hr_localfd = -1;
+	}
+	errno = rerrno;
+	return (-1);
+}
+
+/*
+ * Serialize the metadata of res and write it as one zero-padded
+ * METADATA_SIZE byte block at offset 0 of res->hr_localfd.
+ *
+ * Which localcnt/remotecnt pair is stored depends on the current role
+ * (primary and init use the primary counters, secondary the secondary
+ * ones).  res->hr_previous_role is set to the current role once the
+ * metadata has been assembled.
+ *
+ * Returns 0 on success and -1 on failure; failures are logged.
+ */
+int
+metadata_write(struct hast_resource *res)
+{
+	struct ebuf *eb;
+	struct nv *nv;
+	unsigned char *buf, *ptr;
+	size_t size;
+	ssize_t done;
+	int ret;
+
+	/* Zero-filled so that the tail beyond the encoded data is defined. */
+	buf = calloc(1, METADATA_SIZE);
+	if (buf == NULL) {
+		pjdlog_error("Unable to allocate %zu bytes for metadata.",
+		    (size_t)METADATA_SIZE);
+		return (-1);
+	}
+
+	ret = -1;
+
+	nv = nv_alloc();
+	nv_add_string(nv, res->hr_name, "resource");
+	nv_add_uint64(nv, (uint64_t)res->hr_datasize, "datasize");
+	nv_add_uint32(nv, (uint32_t)res->hr_extentsize, "extentsize");
+	nv_add_uint32(nv, (uint32_t)res->hr_keepdirty, "keepdirty");
+	nv_add_uint64(nv, (uint64_t)res->hr_localoff, "offset");
+	nv_add_uint64(nv, res->hr_resuid, "resuid");
+	if (res->hr_role == HAST_ROLE_PRIMARY ||
+	    res->hr_role == HAST_ROLE_INIT) {
+		nv_add_uint64(nv, res->hr_primary_localcnt, "localcnt");
+		nv_add_uint64(nv, res->hr_primary_remotecnt, "remotecnt");
+	} else /* if (res->hr_role == HAST_ROLE_SECONDARY) */ {
+		PJDLOG_ASSERT(res->hr_role == HAST_ROLE_SECONDARY);
+		nv_add_uint64(nv, res->hr_secondary_localcnt, "localcnt");
+		nv_add_uint64(nv, res->hr_secondary_remotecnt, "remotecnt");
+	}
+	nv_add_string(nv, role2str(res->hr_role), "prevrole");
+	/* Errors from the nv_add_*() calls above are collected here. */
+	if (nv_error(nv) != 0) {
+		pjdlog_error("Unable to create metadata.");
+		goto end;
+	}
+	res->hr_previous_role = res->hr_role;
+	eb = nv_hton(nv);
+	PJDLOG_ASSERT(eb != NULL);
+	ptr = ebuf_data(eb, &size);
+	PJDLOG_ASSERT(ptr != NULL);
+	PJDLOG_ASSERT(size < METADATA_SIZE);
+	memcpy(buf, ptr, size);
+	done = pwrite(res->hr_localfd, buf, METADATA_SIZE, 0);
+	if (done != METADATA_SIZE) {
+		/*
+		 * A short write does not set errno; report it as an I/O
+		 * error instead of logging a stale value.
+		 */
+		if (done != -1)
+			errno = EIO;
+		pjdlog_errno(LOG_ERR, "Unable to write metadata");
+		goto end;
+	}
+	ret = 0;
+end:
+	free(buf);
+	nv_free(nv);
+	return (ret);
+}
diff --git a/sbin/hastd/metadata.h b/sbin/hastd/metadata.h
new file mode 100644
index 0000000..83d35f4
--- /dev/null
+++ b/sbin/hastd/metadata.h
@@ -0,0 +1,48 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _METADATA_H_
+#define _METADATA_H_
+
+#include <stdbool.h>
+
+#include <hast.h>
+
+/*
+ * Maximum size of metadata.
+ * XXX: We should take sector size into account.
+ */
+#define METADATA_SIZE 4096
+
+/*
+ * Load the metadata block of the local provider into res, setting the
+ * provider up first (and locking it when openrw is true) if
+ * res->hr_localfd is -1.  Returns 0 on success, -1 with errno on failure.
+ */
+int metadata_read(struct hast_resource *res, bool openrw);
+/*
+ * Write the metadata of res to offset 0 of res->hr_localfd.
+ * Returns 0 on success, -1 on failure.
+ */
+int metadata_write(struct hast_resource *res);
+
+#endif /* !_METADATA_H_ */
diff --git a/sbin/hastd/nv.c b/sbin/hastd/nv.c
new file mode 100644
index 0000000..8dcf697
--- /dev/null
+++ b/sbin/hastd/nv.c
@@ -0,0 +1,966 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/endian.h>
+
+#include <bitstring.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <ebuf.h>
+#include <pjdlog.h>
+
+#include "nv.h"
+
+#ifndef PJDLOG_ASSERT
+#include <assert.h>
+#define PJDLOG_ASSERT(...) assert(__VA_ARGS__)
+#endif
+#ifndef PJDLOG_ABORT
+#define PJDLOG_ABORT(...) abort()
+#endif
+
+#define NV_TYPE_NONE 0
+
+#define NV_TYPE_INT8 1
+#define NV_TYPE_UINT8 2
+#define NV_TYPE_INT16 3
+#define NV_TYPE_UINT16 4
+#define NV_TYPE_INT32 5
+#define NV_TYPE_UINT32 6
+#define NV_TYPE_INT64 7
+#define NV_TYPE_UINT64 8
+#define NV_TYPE_INT8_ARRAY 9
+#define NV_TYPE_UINT8_ARRAY 10
+#define NV_TYPE_INT16_ARRAY 11
+#define NV_TYPE_UINT16_ARRAY 12
+#define NV_TYPE_INT32_ARRAY 13
+#define NV_TYPE_UINT32_ARRAY 14
+#define NV_TYPE_INT64_ARRAY 15
+#define NV_TYPE_UINT64_ARRAY 16
+#define NV_TYPE_STRING 17
+
+/*
+ * nvh_type packs two fields into one byte: the low seven bits hold one of
+ * the NV_TYPE_* codes, the top bit records the byte order of the entry.
+ */
+#define NV_TYPE_MASK 0x7f
+/* Inclusive range of valid type codes, checked when validating entries. */
+#define NV_TYPE_FIRST NV_TYPE_INT8
+#define NV_TYPE_LAST NV_TYPE_STRING
+
+/*
+ * Byte-order flag.  "Network" order is little-endian in this file: entries
+ * are converted with the le*toh()/htole*() family.
+ */
+#define NV_ORDER_NETWORK 0x00
+#define NV_ORDER_HOST 0x80
+
+#define NV_ORDER_MASK 0x80
+
+/* Marker stored in nv_magic; NV_CHECK() asserts on it. */
+#define NV_MAGIC 0xaea1e
+/* Name/value container: a buffer of packed entries plus an error state. */
+struct nv {
+ int nv_magic; /* NV_MAGIC while valid; reset to 0 before free(3). */
+ int nv_error; /* Error code, 0 if none; mostly set only while still 0. */
+ struct ebuf *nv_ebuf; /* Buffer holding the serialized entries. */
+};
+
+/*
+ * Header of a single entry as laid out in the buffer.  It is followed by
+ * the name, padded to a multiple of 8 bytes, and then by the data, padded
+ * the same way.
+ */
+struct nvhdr {
+	uint8_t nvh_type;	/* NV_TYPE_* code ORed with an NV_ORDER_* flag. */
+	uint8_t nvh_namesize;	/* Name length including terminating '\0'. */
+	uint32_t nvh_dsize;	/* Data size in bytes, padding not included. */
+	char nvh_name[0];
+} __packed;
+/* Pointer to the first data byte of the entry. */
+#define NVH_DATA(nvh) ((unsigned char *)(nvh) + NVH_HSIZE(nvh))
+/* Size of the header together with the padded name. */
+#define NVH_HSIZE(nvh) \
+	(sizeof(struct nvhdr) + roundup2((nvh)->nvh_namesize, 8))
+/* Data size in host byte order, whatever order the entry is stored in. */
+#define NVH_DSIZE(nvh) \
+	(((nvh)->nvh_type & NV_ORDER_MASK) == NV_ORDER_HOST ? \
+	(nvh)->nvh_dsize : \
+	le32toh((nvh)->nvh_dsize))
+/*
+ * Total size of the entry.  The data size is widened to size_t before it
+ * is rounded up, so that where size_t is wider than 32 bits a value close
+ * to UINT32_MAX cannot wrap around to a small number.
+ */
+#define NVH_SIZE(nvh) \
+	(NVH_HSIZE(nvh) + roundup2((size_t)NVH_DSIZE(nvh), 8))
+
+/* Assert that nv is non-NULL and carries the NV_MAGIC marker. */
+#define NV_CHECK(nv) do { \
+ PJDLOG_ASSERT((nv) != NULL); \
+ PJDLOG_ASSERT((nv)->nv_magic == NV_MAGIC); \
+} while (0)
+
+static void nv_add(struct nv *nv, const unsigned char *value, size_t vsize,
+ int type, const char *name);
+static void nv_addv(struct nv *nv, const unsigned char *value, size_t vsize,
+ int type, const char *namefmt, va_list nameap);
+static struct nvhdr *nv_find(struct nv *nv, int type, const char *namefmt,
+ va_list nameap);
+static void nv_swap(struct nvhdr *nvh, bool tohost);
+
+/*
+ * Create an empty nv structure backed by a freshly allocated ebuf.
+ * NULL is returned when either allocation fails.
+ */
+struct nv *
+nv_alloc(void)
+{
+	struct nv *newnv;
+	struct ebuf *eb;
+
+	if ((newnv = malloc(sizeof(*newnv))) == NULL)
+		return (NULL);
+	if ((eb = ebuf_alloc(0)) == NULL) {
+		free(newnv);
+		return (NULL);
+	}
+	newnv->nv_ebuf = eb;
+	newnv->nv_error = 0;
+	newnv->nv_magic = NV_MAGIC;
+	return (newnv);
+}
+
+/*
+ * Release the nv structure together with its buffer.
+ * Passing NULL is allowed and does nothing.
+ */
+void
+nv_free(struct nv *nv)
+{
+
+	if (nv != NULL) {
+		NV_CHECK(nv);
+
+		/* Clear the marker so that a stale pointer trips NV_CHECK(). */
+		nv->nv_magic = 0;
+		ebuf_free(nv->nv_ebuf);
+		free(nv);
+	}
+}
+
+/*
+ * Report the error recorded in the nv structure.
+ * A NULL nv is reported as ENOMEM.
+ */
+int
+nv_error(const struct nv *nv)
+{
+
+	if (nv != NULL) {
+		NV_CHECK(nv);
+		return (nv->nv_error);
+	}
+	return (ENOMEM);
+}
+
+/*
+ * Replace the error stored in the nv structure and hand back the one
+ * that was stored before.  A NULL nv yields ENOMEM and nothing is stored.
+ */
+int
+nv_set_error(struct nv *nv, int error)
+{
+	int olderr;
+
+	if (nv != NULL) {
+		NV_CHECK(nv);
+
+		olderr = nv->nv_error;
+		nv->nv_error = error;
+		return (olderr);
+	}
+	return (ENOMEM);
+}
+
+/*
+ * Validate correctness of the entire nv structure and all its elements.
+ * If extrap is not NULL, store number of extra bytes at the end of the buffer.
+ *
+ * Returns 0 on success.  On failure returns -1, sets errno and records the
+ * error in the nv structure (ENOMEM for a NULL nv, EINVAL for malformed
+ * content).  The buffer may come from the network, so every length field is
+ * checked against the remaining buffer size before it is used.
+ */
+int
+nv_validate(struct nv *nv, size_t *extrap)
+{
+	struct nvhdr *nvh;
+	unsigned char *data, *ptr;
+	size_t dsize, hsize, nvsize, size, vsize;
+	int error;
+
+	if (nv == NULL) {
+		errno = ENOMEM;
+		return (-1);
+	}
+
+	NV_CHECK(nv);
+	PJDLOG_ASSERT(nv->nv_error == 0);
+
+	/* TODO: Check that names are unique? */
+
+	error = 0;
+	ptr = ebuf_data(nv->nv_ebuf, &size);
+	while (size > 0) {
+		/*
+		 * Zeros at the end of the buffer are acceptable.
+		 */
+		if (ptr[0] == '\0')
+			break;
+		/*
+		 * Minimum size at this point is size of nvhdr structure, one
+		 * character long name plus terminating '\0'.
+		 */
+		if (size < sizeof(*nvh) + 2) {
+			error = EINVAL;
+			break;
+		}
+		nvh = (struct nvhdr *)ptr;
+		/*
+		 * A zero name size would make the checks below look at
+		 * nvh_name[-1] and run strlen(3) over unterminated data.
+		 */
+		if (nvh->nvh_namesize == 0) {
+			error = EINVAL;
+			break;
+		}
+		hsize = NVH_HSIZE(nvh);
+		if (size < hsize) {
+			error = EINVAL;
+			break;
+		}
+		if (nvh->nvh_name[nvh->nvh_namesize - 1] != '\0') {
+			error = EINVAL;
+			break;
+		}
+		if (strlen(nvh->nvh_name) !=
+		    (size_t)(nvh->nvh_namesize - 1)) {
+			error = EINVAL;
+			break;
+		}
+		if ((nvh->nvh_type & NV_TYPE_MASK) < NV_TYPE_FIRST ||
+		    (nvh->nvh_type & NV_TYPE_MASK) > NV_TYPE_LAST) {
+			error = EINVAL;
+			break;
+		}
+		dsize = NVH_DSIZE(nvh);
+		if (dsize == 0) {
+			error = EINVAL;
+			break;
+		}
+		/*
+		 * Bound the data size by what is left in the buffer before
+		 * rounding it up; a size close to UINT32_MAX could otherwise
+		 * wrap around and slip through the check that follows.
+		 */
+		if (dsize > size - hsize) {
+			error = EINVAL;
+			break;
+		}
+		nvsize = hsize + roundup2(dsize, 8);
+		if (size < nvsize) {
+			error = EINVAL;
+			break;
+		}
+		/*
+		 * The cases below fall through on purpose: the first one
+		 * reached sets the element size, the later ones leave it.
+		 */
+		vsize = 0;
+		switch (nvh->nvh_type & NV_TYPE_MASK) {
+		case NV_TYPE_INT8:
+		case NV_TYPE_UINT8:
+			if (vsize == 0)
+				vsize = 1;
+			/* FALLTHROUGH */
+		case NV_TYPE_INT16:
+		case NV_TYPE_UINT16:
+			if (vsize == 0)
+				vsize = 2;
+			/* FALLTHROUGH */
+		case NV_TYPE_INT32:
+		case NV_TYPE_UINT32:
+			if (vsize == 0)
+				vsize = 4;
+			/* FALLTHROUGH */
+		case NV_TYPE_INT64:
+		case NV_TYPE_UINT64:
+			if (vsize == 0)
+				vsize = 8;
+			/* A scalar must be exactly one element long. */
+			if (dsize != vsize)
+				error = EINVAL;
+			break;
+		case NV_TYPE_INT8_ARRAY:
+		case NV_TYPE_UINT8_ARRAY:
+			break;
+		case NV_TYPE_INT16_ARRAY:
+		case NV_TYPE_UINT16_ARRAY:
+			if (vsize == 0)
+				vsize = 2;
+			/* FALLTHROUGH */
+		case NV_TYPE_INT32_ARRAY:
+		case NV_TYPE_UINT32_ARRAY:
+			if (vsize == 0)
+				vsize = 4;
+			/* FALLTHROUGH */
+		case NV_TYPE_INT64_ARRAY:
+		case NV_TYPE_UINT64_ARRAY:
+			if (vsize == 0)
+				vsize = 8;
+			/* An array must hold a whole number of elements. */
+			if ((dsize % vsize) != 0)
+				error = EINVAL;
+			break;
+		case NV_TYPE_STRING:
+			/* Exactly one '\0', and it must be the last byte. */
+			data = NVH_DATA(nvh);
+			if (data[dsize - 1] != '\0' ||
+			    strlen((char *)data) != dsize - 1)
+				error = EINVAL;
+			break;
+		default:
+			PJDLOG_ABORT("invalid condition");
+		}
+		if (error != 0)
+			break;
+		ptr += nvsize;
+		size -= nvsize;
+	}
+	if (error != 0) {
+		errno = error;
+		if (nv->nv_error == 0)
+			nv->nv_error = error;
+		return (-1);
+	}
+	if (extrap != NULL)
+		*extrap = size;
+	return (0);
+}
+
+/*
+ * Switch every entry of the nv structure to network byte order and hand
+ * back the underlying ebuf.  The ebuf stays owned by the nv structure.
+ */
+struct ebuf *
+nv_hton(struct nv *nv)
+{
+	unsigned char *cur;
+	struct nvhdr *nvh;
+	size_t size, step;
+
+	NV_CHECK(nv);
+	PJDLOG_ASSERT(nv->nv_error == 0);
+
+	for (cur = ebuf_data(nv->nv_ebuf, &size); size > 0;
+	    cur += step, size -= step) {
+		/*
+		 * Minimum size at this point is size of nvhdr structure,
+		 * one character long name plus terminating '\0'.
+		 */
+		PJDLOG_ASSERT(size >= sizeof(*nvh) + 2);
+		nvh = (struct nvhdr *)cur;
+		PJDLOG_ASSERT(NVH_SIZE(nvh) <= size);
+		nv_swap(nvh, false);
+		/* NVH_SIZE() follows the order flag nv_swap() just updated. */
+		step = NVH_SIZE(nvh);
+	}
+
+	return (nv->nv_ebuf);
+}
+
+/*
+ * Wrap an ebuf received from the network in a new nv structure.
+ * The content is validated; on failure NULL is returned with errno set
+ * and the ebuf is left untouched for the caller.
+ */
+struct nv *
+nv_ntoh(struct ebuf *eb)
+{
+	struct nv *newnv;
+	size_t trailing;
+	int saved_errno;
+
+	PJDLOG_ASSERT(eb != NULL);
+
+	if ((newnv = malloc(sizeof(*newnv))) == NULL)
+		return (NULL);
+	newnv->nv_magic = NV_MAGIC;
+	newnv->nv_ebuf = eb;
+	newnv->nv_error = 0;
+
+	if (nv_validate(newnv, &trailing) != -1) {
+		/*
+		 * Remove extra zeros at the end of the buffer.
+		 */
+		ebuf_del_tail(eb, trailing);
+		return (newnv);
+	}
+
+	/* Keep the validation error across free(3). */
+	saved_errno = errno;
+	newnv->nv_magic = 0;
+	free(newnv);
+	errno = saved_errno;
+	return (NULL);
+}
+
+/*
+ * Generate nv_add_<type>() for each fixed-width integer type.  The value
+ * is stored as raw bytes; the entry name is formatted from namefmt.
+ */
+#define NV_DEFINE_ADD(type, TYPE) \
+void \
+nv_add_##type(struct nv *nv, type##_t value, const char *namefmt, ...) \
+{ \
+	va_list ap; \
+ \
+	va_start(ap, namefmt); \
+	nv_addv(nv, (unsigned char *)&value, sizeof(value), \
+	    NV_TYPE_##TYPE, namefmt, ap); \
+	va_end(ap); \
+}
+
+NV_DEFINE_ADD(int8, INT8)
+NV_DEFINE_ADD(uint8, UINT8)
+NV_DEFINE_ADD(int16, INT16)
+NV_DEFINE_ADD(uint16, UINT16)
+NV_DEFINE_ADD(int32, INT32)
+NV_DEFINE_ADD(uint32, UINT32)
+NV_DEFINE_ADD(int64, INT64)
+NV_DEFINE_ADD(uint64, UINT64)
+
+#undef NV_DEFINE_ADD
+
+/*
+ * Generate nv_add_<type>_array() for each fixed-width integer type.
+ * nsize is the number of elements, not the number of bytes.
+ */
+#define NV_DEFINE_ADD_ARRAY(type, TYPE) \
+void \
+nv_add_##type##_array(struct nv *nv, const type##_t *value, \
+    size_t nsize, const char *namefmt, ...) \
+{ \
+	va_list ap; \
+ \
+	va_start(ap, namefmt); \
+	nv_addv(nv, (const unsigned char *)value, sizeof(*value) * nsize, \
+	    NV_TYPE_##TYPE##_ARRAY, namefmt, ap); \
+	va_end(ap); \
+}
+
+NV_DEFINE_ADD_ARRAY(int8, INT8)
+NV_DEFINE_ADD_ARRAY(uint8, UINT8)
+NV_DEFINE_ADD_ARRAY(int16, INT16)
+NV_DEFINE_ADD_ARRAY(uint16, UINT16)
+NV_DEFINE_ADD_ARRAY(int32, INT32)
+NV_DEFINE_ADD_ARRAY(uint32, UINT32)
+NV_DEFINE_ADD_ARRAY(int64, INT64)
+NV_DEFINE_ADD_ARRAY(uint64, UINT64)
+
+#undef NV_DEFINE_ADD_ARRAY
+
+/*
+ * Add a string entry; the entry name is formatted from namefmt.
+ */
+void
+nv_add_string(struct nv *nv, const char *value, const char *namefmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, namefmt);
+	/* The terminating '\0' is stored together with the string. */
+	nv_addv(nv, (const unsigned char *)value, strlen(value) + 1,
+	    NV_TYPE_STRING, namefmt, ap);
+	va_end(ap);
+}
+
+/*
+ * Add a string entry whose value is formatted from valuefmt.
+ * Here the name is taken verbatim, it is not a format string.
+ */
+void
+nv_add_stringf(struct nv *nv, const char *name, const char *valuefmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, valuefmt);
+	nv_add_stringv(nv, name, valuefmt, ap);
+	va_end(ap);
+}
+
+/*
+ * va_list flavour of nv_add_stringf(): format the value into a temporary
+ * string and add it under the given (verbatim) name.
+ * If the value cannot be formatted, ENOMEM is recorded in the nv structure
+ * unless another error is already stored there.
+ */
+void
+nv_add_stringv(struct nv *nv, const char *name, const char *valuefmt,
+    va_list valueap)
+{
+	char *value;
+	int len;
+
+	len = vasprintf(&value, valuefmt, valueap);
+	if (len == -1) {
+		/*
+		 * nv may be NULL here, just like in nv_add(); follow what
+		 * nv_add() does in that case instead of dereferencing it.
+		 */
+		if (nv == NULL)
+			errno = ENOMEM;
+		else if (nv->nv_error == 0)
+			nv->nv_error = ENOMEM;
+		return;
+	}
+	/* Store the terminating '\0' as well. */
+	nv_add(nv, (const unsigned char *)value, (size_t)len + 1,
+	    NV_TYPE_STRING, name);
+	free(value);
+}
+
+/*
+ * Generate nv_get_<type>() for each fixed-width integer type.
+ * 0 is returned when nv_find() does not deliver the entry; the reason is
+ * then available through errno and the error stored in the nv structure.
+ */
+#define NV_DEFINE_GET(type, TYPE) \
+type##_t \
+nv_get_##type(struct nv *nv, const char *namefmt, ...) \
+{ \
+	type##_t value; \
+	struct nvhdr *nvh; \
+	va_list ap; \
+ \
+	va_start(ap, namefmt); \
+	nvh = nv_find(nv, NV_TYPE_##TYPE, namefmt, ap); \
+	va_end(ap); \
+	if (nvh == NULL) \
+		return (0); \
+	PJDLOG_ASSERT((nvh->nvh_type & NV_ORDER_MASK) == NV_ORDER_HOST);\
+	PJDLOG_ASSERT(sizeof(value) == nvh->nvh_dsize); \
+	/* Copy byte-wise instead of dereferencing a casted pointer. */ \
+	memcpy(&value, NVH_DATA(nvh), sizeof(value)); \
+ \
+	return (value); \
+}
+
+NV_DEFINE_GET(int8, INT8)
+NV_DEFINE_GET(uint8, UINT8)
+NV_DEFINE_GET(int16, INT16)
+NV_DEFINE_GET(uint16, UINT16)
+NV_DEFINE_GET(int32, INT32)
+NV_DEFINE_GET(uint32, UINT32)
+NV_DEFINE_GET(int64, INT64)
+NV_DEFINE_GET(uint64, UINT64)
+
+#undef NV_DEFINE_GET
+
+/*
+ * Generate nv_get_<type>_array() for each fixed-width integer type.
+ * The returned pointer refers to the data inside the nv buffer; if sizep
+ * is not NULL it receives the number of elements.  NULL is returned when
+ * nv_find() does not deliver the entry.
+ */
+#define NV_DEFINE_GET_ARRAY(type, TYPE) \
+const type##_t * \
+nv_get_##type##_array(struct nv *nv, size_t *sizep, \
+    const char *namefmt, ...) \
+{ \
+	struct nvhdr *nvh; \
+	va_list ap; \
+ \
+	va_start(ap, namefmt); \
+	nvh = nv_find(nv, NV_TYPE_##TYPE##_ARRAY, namefmt, ap); \
+	va_end(ap); \
+	if (nvh == NULL) \
+		return (NULL); \
+ \
+	PJDLOG_ASSERT((nvh->nvh_type & NV_ORDER_MASK) == NV_ORDER_HOST);\
+	PJDLOG_ASSERT((nvh->nvh_dsize % sizeof(type##_t)) == 0); \
+	if (sizep != NULL) \
+		*sizep = nvh->nvh_dsize / sizeof(type##_t); \
+ \
+	return ((type##_t *)(void *)NVH_DATA(nvh)); \
+}
+
+NV_DEFINE_GET_ARRAY(int8, INT8)
+NV_DEFINE_GET_ARRAY(uint8, UINT8)
+NV_DEFINE_GET_ARRAY(int16, INT16)
+NV_DEFINE_GET_ARRAY(uint16, UINT16)
+NV_DEFINE_GET_ARRAY(int32, INT32)
+NV_DEFINE_GET_ARRAY(uint32, UINT32)
+NV_DEFINE_GET_ARRAY(int64, INT64)
+NV_DEFINE_GET_ARRAY(uint64, UINT64)
+
+#undef NV_DEFINE_GET_ARRAY
+
+/*
+ * Look up a string entry; the entry name is formatted from namefmt.
+ * The returned pointer refers to the data inside the nv buffer.  NULL is
+ * returned when nv_find() does not deliver the entry; the reason is then
+ * available through errno and the error stored in the nv structure.
+ */
+const char *
+nv_get_string(struct nv *nv, const char *namefmt, ...)
+{
+	struct nvhdr *nvh;
+	va_list nameap;
+	const char *str;
+
+	va_start(nameap, namefmt);
+	nvh = nv_find(nv, NV_TYPE_STRING, namefmt, nameap);
+	va_end(nameap);
+	if (nvh == NULL)
+		return (NULL);
+	PJDLOG_ASSERT((nvh->nvh_type & NV_ORDER_MASK) == NV_ORDER_HOST);
+	PJDLOG_ASSERT(nvh->nvh_dsize >= 1);
+	/* NVH_DATA() yields unsigned char *, so convert it explicitly. */
+	str = (const char *)NVH_DATA(nvh);
+	PJDLOG_ASSERT(str[nvh->nvh_dsize - 1] == '\0');
+	PJDLOG_ASSERT(strlen(str) == nvh->nvh_dsize - 1);
+	return (str);
+}
+
+/*
+ * Check whether an entry of any type exists under the formatted name.
+ * This is only a probe: errno and the error stored in the nv structure
+ * are put back to what they were before the lookup.
+ */
+static bool
+nv_vexists(struct nv *nv, const char *namefmt, va_list nameap)
+{
+	int saved_errno, saved_nverror;
+	bool found;
+
+	if (nv == NULL)
+		return (false);
+
+	saved_errno = errno;
+	saved_nverror = nv->nv_error;
+
+	found = (nv_find(nv, NV_TYPE_NONE, namefmt, nameap) != NULL);
+
+	nv->nv_error = saved_nverror;
+	errno = saved_errno;
+
+	return (found);
+}
+
+/*
+ * Tell whether an entry of any type exists under the formatted name.
+ * A NULL nv never contains anything.
+ */
+bool
+nv_exists(struct nv *nv, const char *namefmt, ...)
+{
+	va_list ap;
+	bool found;
+
+	va_start(ap, namefmt);
+	found = nv_vexists(nv, namefmt, ap);
+	va_end(ap);
+
+	return (found);
+}
+
+/*
+ * Assert that an entry of any type exists under the formatted name.
+ * The lookup goes through nv_vexists(), so errno and the error stored in
+ * the nv structure are left as they were.
+ */
+void
+nv_assert(struct nv *nv, const char *namefmt, ...)
+{
+ va_list nameap;
+
+ va_start(nameap, namefmt);
+ PJDLOG_ASSERT(nv_vexists(nv, namefmt, nameap));
+ va_end(nameap);
+}
+
+/*
+ * Dump content of the nv structure.
+ */
+void
+nv_dump(struct nv *nv)
+{
+ struct nvhdr *nvh;
+ unsigned char *data, *ptr;
+ size_t dsize, size;
+ unsigned int ii;
+ bool swap;
+
+ if (nv_validate(nv, NULL) == -1) {
+ printf("error: %d\n", errno);
+ return;
+ }
+
+ NV_CHECK(nv);
+ PJDLOG_ASSERT(nv->nv_error == 0);
+
+ ptr = ebuf_data(nv->nv_ebuf, &size);
+ while (size > 0) {
+ PJDLOG_ASSERT(size >= sizeof(*nvh) + 2);
+ nvh = (struct nvhdr *)ptr;
+ PJDLOG_ASSERT(size >= NVH_SIZE(nvh));
+ swap = ((nvh->nvh_type & NV_ORDER_MASK) == NV_ORDER_NETWORK);
+ dsize = NVH_DSIZE(nvh);
+ data = NVH_DATA(nvh);
+ printf(" %s", nvh->nvh_name);
+ switch (nvh->nvh_type & NV_TYPE_MASK) {
+ case NV_TYPE_INT8:
+ printf("(int8): %jd", (intmax_t)(*(int8_t *)data));
+ break;
+ case NV_TYPE_UINT8:
+ printf("(uint8): %ju", (uintmax_t)(*(uint8_t *)data));
+ break;
+ case NV_TYPE_INT16:
+ printf("(int16): %jd", swap ?
+ (intmax_t)le16toh(*(int16_t *)(void *)data) :
+ (intmax_t)*(int16_t *)(void *)data);
+ break;
+ case NV_TYPE_UINT16:
+ printf("(uint16): %ju", swap ?
+ (uintmax_t)le16toh(*(uint16_t *)(void *)data) :
+ (uintmax_t)*(uint16_t *)(void *)data);
+ break;
+ case NV_TYPE_INT32:
+ printf("(int32): %jd", swap ?
+ (intmax_t)le32toh(*(int32_t *)(void *)data) :
+ (intmax_t)*(int32_t *)(void *)data);
+ break;
+ case NV_TYPE_UINT32:
+ printf("(uint32): %ju", swap ?
+ (uintmax_t)le32toh(*(uint32_t *)(void *)data) :
+ (uintmax_t)*(uint32_t *)(void *)data);
+ break;
+ case NV_TYPE_INT64:
+ printf("(int64): %jd", swap ?
+ (intmax_t)le64toh(*(int64_t *)(void *)data) :
+ (intmax_t)*(int64_t *)(void *)data);
+ break;
+ case NV_TYPE_UINT64:
+ printf("(uint64): %ju", swap ?
+ (uintmax_t)le64toh(*(uint64_t *)(void *)data) :
+ (uintmax_t)*(uint64_t *)(void *)data);
+ break;
+ case NV_TYPE_INT8_ARRAY:
+ printf("(int8 array):");
+ for (ii = 0; ii < dsize; ii++)
+ printf(" %jd", (intmax_t)((int8_t *)data)[ii]);
+ break;
+ case NV_TYPE_UINT8_ARRAY:
+ printf("(uint8 array):");
+ for (ii = 0; ii < dsize; ii++)
+ printf(" %ju", (uintmax_t)((uint8_t *)data)[ii]);
+ break;
+ case NV_TYPE_INT16_ARRAY:
+ printf("(int16 array):");
+ for (ii = 0; ii < dsize / 2; ii++) {
+ printf(" %jd", swap ?
+ (intmax_t)le16toh(((int16_t *)(void *)data)[ii]) :
+ (intmax_t)((int16_t *)(void *)data)[ii]);
+ }
+ break;
+ case NV_TYPE_UINT16_ARRAY:
+ printf("(uint16 array):");
+ for (ii = 0; ii < dsize / 2; ii++) {
+ printf(" %ju", swap ?
+ (uintmax_t)le16toh(((uint16_t *)(void *)data)[ii]) :
+ (uintmax_t)((uint16_t *)(void *)data)[ii]);
+ }
+ break;
+ case NV_TYPE_INT32_ARRAY:
+ printf("(int32 array):");
+ for (ii = 0; ii < dsize / 4; ii++) {
+ printf(" %jd", swap ?
+ (intmax_t)le32toh(((int32_t *)(void *)data)[ii]) :
+ (intmax_t)((int32_t *)(void *)data)[ii]);
+ }
+ break;
+ case NV_TYPE_UINT32_ARRAY:
+ printf("(uint32 array):");
+ for (ii = 0; ii < dsize / 4; ii++) {
+ printf(" %ju", swap ?
+ (uintmax_t)le32toh(((uint32_t *)(void *)data)[ii]) :
+ (uintmax_t)((uint32_t *)(void *)data)[ii]);
+ }
+ break;
+ case NV_TYPE_INT64_ARRAY:
+ printf("(int64 array):");
+ for (ii = 0; ii < dsize / 8; ii++) {
+ printf(" %ju", swap ?
+ (uintmax_t)le64toh(((uint64_t *)(void *)data)[ii]) :
+ (uintmax_t)((uint64_t *)(void *)data)[ii]);
+ }
+ break;
+ case NV_TYPE_UINT64_ARRAY:
+ printf("(uint64 array):");
+ for (ii = 0; ii < dsize / 8; ii++) {
+ printf(" %ju", swap ?
+ (uintmax_t)le64toh(((uint64_t *)(void *)data)[ii]) :
+ (uintmax_t)((uint64_t *)(void *)data)[ii]);
+ }
+ break;
+ case NV_TYPE_STRING:
+ printf("(string): %s", (char *)data);
+ break;
+ default:
+ PJDLOG_ABORT("invalid condition");
+ }
+ printf("\n");
+ ptr += NVH_SIZE(nvh);
+ size -= NVH_SIZE(nvh);
+ }
+}
+
+/*
+ * Local routines below.
+ */
+
+/*
+ * Append one entry to the nv buffer: the header with the padded name
+ * first, then the data, then zero padding up to a multiple of 8 bytes.
+ * The entry is stored in host byte order.
+ *
+ * Nothing is returned; failures are recorded in the nv structure unless
+ * an earlier error is already stored there.  For a NULL nv only errno is
+ * set (to ENOMEM).
+ */
+static void
+nv_add(struct nv *nv, const unsigned char *value, size_t vsize, int type,
+    const char *name)
+{
+	static const unsigned char align[7];
+	struct nvhdr *nvh;
+	size_t namesize;
+
+	if (nv == NULL) {
+		errno = ENOMEM;
+		return;
+	}
+
+	NV_CHECK(nv);
+
+	namesize = strlen(name) + 1;
+	/*
+	 * nvh_namesize is a single byte; refuse names that do not fit in
+	 * it instead of storing a truncated length.
+	 */
+	if (namesize > UINT8_MAX) {
+		if (nv->nv_error == 0)
+			nv->nv_error = ENAMETOOLONG;
+		return;
+	}
+
+	/*
+	 * calloc(3) keeps the padding after the name zeroed, so that no
+	 * uninitialized heap bytes are copied into the buffer.
+	 */
+	nvh = calloc(1, sizeof(*nvh) + roundup2(namesize, 8));
+	if (nvh == NULL) {
+		if (nv->nv_error == 0)
+			nv->nv_error = ENOMEM;
+		return;
+	}
+	nvh->nvh_type = NV_ORDER_HOST | type;
+	nvh->nvh_namesize = (uint8_t)namesize;
+	nvh->nvh_dsize = (uint32_t)vsize;
+	bcopy(name, nvh->nvh_name, namesize);
+
+	/* Add header first. */
+	if (ebuf_add_tail(nv->nv_ebuf, nvh, NVH_HSIZE(nvh)) == -1) {
+		PJDLOG_ASSERT(errno != 0);
+		if (nv->nv_error == 0)
+			nv->nv_error = errno;
+		free(nvh);
+		return;
+	}
+	free(nvh);
+	/* Add the actual data. */
+	if (ebuf_add_tail(nv->nv_ebuf, value, vsize) == -1) {
+		PJDLOG_ASSERT(errno != 0);
+		if (nv->nv_error == 0)
+			nv->nv_error = errno;
+		return;
+	}
+	/* Align the data (if needed). */
+	vsize = roundup2(vsize, 8) - vsize;
+	if (vsize == 0)
+		return;
+	PJDLOG_ASSERT(vsize > 0 && vsize <= sizeof(align));
+	if (ebuf_add_tail(nv->nv_ebuf, align, vsize) == -1) {
+		PJDLOG_ASSERT(errno != 0);
+		if (nv->nv_error == 0)
+			nv->nv_error = errno;
+		return;
+	}
+}
+
+/*
+ * Format the entry name from namefmt/nameap and pass everything on to
+ * nv_add().  The formatted name must be between 1 and 254 characters
+ * long; anything else trips the assertion.
+ */
+static void
+nv_addv(struct nv *nv, const unsigned char *value, size_t vsize, int type,
+ const char *namefmt, va_list nameap)
+{
+ char name[255];
+ size_t namesize;
+
+ /* A negative vsnprintf(3) result becomes huge here and fails the assert. */
+ namesize = vsnprintf(name, sizeof(name), namefmt, nameap);
+ PJDLOG_ASSERT(namesize > 0 && namesize < sizeof(name));
+
+ nv_add(nv, value, vsize, type, name);
+}
+
+/*
+ * Find the entry whose name matches the formatted name.
+ * Entries are switched to host byte order as the search walks over them.
+ * With type NV_TYPE_NONE any type matches; otherwise a name match of a
+ * different type fails with EINVAL.  A missing name fails with ENOENT and
+ * a NULL nv with ENOMEM.  On failure NULL is returned and errno is set;
+ * EINVAL and ENOENT are also recorded in the nv structure unless an
+ * earlier error is stored there.
+ */
+static struct nvhdr *
+nv_find(struct nv *nv, int type, const char *namefmt, va_list nameap)
+{
+	char name[255];
+	struct nvhdr *nvh;
+	unsigned char *cur;
+	size_t size, namesize;
+
+	if (nv == NULL) {
+		errno = ENOMEM;
+		return (NULL);
+	}
+
+	NV_CHECK(nv);
+
+	namesize = vsnprintf(name, sizeof(name), namefmt, nameap);
+	PJDLOG_ASSERT(namesize > 0 && namesize < sizeof(name));
+
+	for (cur = ebuf_data(nv->nv_ebuf, &size); size > 0; ) {
+		PJDLOG_ASSERT(size >= sizeof(*nvh) + 2);
+		nvh = (struct nvhdr *)cur;
+		PJDLOG_ASSERT(size >= NVH_SIZE(nvh));
+		nv_swap(nvh, true);
+		if (strcmp(nvh->nvh_name, name) != 0) {
+			/* Not this one, step over it. */
+			cur += NVH_SIZE(nvh);
+			size -= NVH_SIZE(nvh);
+			continue;
+		}
+		if (type == NV_TYPE_NONE ||
+		    (nvh->nvh_type & NV_TYPE_MASK) == type)
+			return (nvh);
+		/* The name is there, but under a different type. */
+		errno = EINVAL;
+		if (nv->nv_error == 0)
+			nv->nv_error = EINVAL;
+		return (NULL);
+	}
+	errno = ENOENT;
+	if (nv->nv_error == 0)
+		nv->nv_error = ENOENT;
+	return (NULL);
+}
+
+/*
+ * Convert one entry between host and network (little-endian) byte order.
+ * tohost selects the direction.  An entry that is already in the wanted
+ * order is left alone.  Both the data size in the header and every
+ * multi-byte element of the data are converted, and the order flag in
+ * nvh_type is updated.
+ *
+ * Elements are moved with memcpy(3): data starts right after a 6-byte
+ * header plus the padded name, so it is not naturally aligned for 16, 32
+ * or 64-bit accesses through a casted pointer.
+ */
+static void
+nv_swap(struct nvhdr *nvh, bool tohost)
+{
+	unsigned char *data, *end, *p;
+	size_t vsize;
+	uint16_t v16;
+	uint32_t v32;
+	uint64_t v64;
+
+	data = NVH_DATA(nvh);
+	if (tohost) {
+		if ((nvh->nvh_type & NV_ORDER_MASK) == NV_ORDER_HOST)
+			return;
+		/* Convert the size first, the end pointer needs host order. */
+		nvh->nvh_dsize = le32toh(nvh->nvh_dsize);
+		end = data + nvh->nvh_dsize;
+		nvh->nvh_type &= ~NV_ORDER_MASK;
+		nvh->nvh_type |= NV_ORDER_HOST;
+	} else {
+		if ((nvh->nvh_type & NV_ORDER_MASK) == NV_ORDER_NETWORK)
+			return;
+		/* Take the end pointer while the size is in host order. */
+		end = data + nvh->nvh_dsize;
+		nvh->nvh_dsize = htole32(nvh->nvh_dsize);
+		nvh->nvh_type &= ~NV_ORDER_MASK;
+		nvh->nvh_type |= NV_ORDER_NETWORK;
+	}
+
+	switch (nvh->nvh_type & NV_TYPE_MASK) {
+	case NV_TYPE_INT8:
+	case NV_TYPE_UINT8:
+	case NV_TYPE_INT8_ARRAY:
+	case NV_TYPE_UINT8_ARRAY:
+	case NV_TYPE_STRING:
+		/* Single bytes have no byte order. */
+		return;
+	case NV_TYPE_INT16:
+	case NV_TYPE_UINT16:
+	case NV_TYPE_INT16_ARRAY:
+	case NV_TYPE_UINT16_ARRAY:
+		vsize = 2;
+		break;
+	case NV_TYPE_INT32:
+	case NV_TYPE_UINT32:
+	case NV_TYPE_INT32_ARRAY:
+	case NV_TYPE_UINT32_ARRAY:
+		vsize = 4;
+		break;
+	case NV_TYPE_INT64:
+	case NV_TYPE_UINT64:
+	case NV_TYPE_INT64_ARRAY:
+	case NV_TYPE_UINT64_ARRAY:
+		vsize = 8;
+		break;
+	default:
+		PJDLOG_ABORT("unrecognized type");
+		return;
+	}
+
+	/* Only whole elements are touched; never step past the data. */
+	for (p = data; (size_t)(end - p) >= vsize; p += vsize) {
+		switch (vsize) {
+		case 2:
+			memcpy(&v16, p, sizeof(v16));
+			v16 = tohost ? le16toh(v16) : htole16(v16);
+			memcpy(p, &v16, sizeof(v16));
+			break;
+		case 4:
+			memcpy(&v32, p, sizeof(v32));
+			v32 = tohost ? le32toh(v32) : htole32(v32);
+			memcpy(p, &v32, sizeof(v32));
+			break;
+		case 8:
+			memcpy(&v64, p, sizeof(v64));
+			v64 = tohost ? le64toh(v64) : htole64(v64);
+			memcpy(p, &v64, sizeof(v64));
+			break;
+		default:
+			PJDLOG_ABORT("invalid condition");
+		}
+	}
+}
diff --git a/sbin/hastd/nv.h b/sbin/hastd/nv.h
new file mode 100644
index 0000000..d49fa5d
--- /dev/null
+++ b/sbin/hastd/nv.h
@@ -0,0 +1,133 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NV_H_
+#define _NV_H_
+
+#include <sys/cdefs.h>
+
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <ebuf.h>
+
+struct nv;
+
+/* Allocate an empty nv structure; NULL on allocation failure. */
+struct nv *nv_alloc(void);
+/* Free the nv structure and its buffer; NULL is accepted. */
+void nv_free(struct nv *nv);
+/* Error stored in the nv structure; ENOMEM for a NULL nv. */
+int nv_error(const struct nv *nv);
+/* Store a new error and return the previous one; ENOMEM for a NULL nv. */
+int nv_set_error(struct nv *nv, int error);
+/*
+ * Check the whole buffer.  Returns 0, or -1 with errno set.  If extrap is
+ * not NULL it receives the number of extra bytes at the end of the buffer.
+ */
+int nv_validate(struct nv *nv, size_t *extrap);
+
+/* Convert all entries to network byte order; returns the nv's own ebuf. */
+struct ebuf *nv_hton(struct nv *nv);
+/*
+ * Build a validated nv structure around a received ebuf.  Returns NULL
+ * with errno set on failure; the ebuf is not freed in that case.
+ */
+struct nv *nv_ntoh(struct ebuf *eb);
+
+/*
+ * nv_add_*() family: append one entry.  Unless noted otherwise the entry
+ * name is built printf-style from namefmt and must come out 1 to 254
+ * characters long (asserted).  For the array variants size is the number
+ * of elements.  nv_add_stringf() and nv_add_stringv() take the name
+ * verbatim and format the value instead.  Nothing is returned; failures
+ * are recorded in the nv structure, see nv_error().
+ */
+void nv_add_int8(struct nv *nv, int8_t value, const char *namefmt, ...)
+ __printflike(3, 4);
+void nv_add_uint8(struct nv *nv, uint8_t value, const char *namefmt, ...)
+ __printflike(3, 4);
+void nv_add_int16(struct nv *nv, int16_t value, const char *namefmt, ...)
+ __printflike(3, 4);
+void nv_add_uint16(struct nv *nv, uint16_t value, const char *namefmt, ...)
+ __printflike(3, 4);
+void nv_add_int32(struct nv *nv, int32_t value, const char *namefmt, ...)
+ __printflike(3, 4);
+void nv_add_uint32(struct nv *nv, uint32_t value, const char *namefmt, ...)
+ __printflike(3, 4);
+void nv_add_int64(struct nv *nv, int64_t value, const char *namefmt, ...)
+ __printflike(3, 4);
+void nv_add_uint64(struct nv *nv, uint64_t value, const char *namefmt, ...)
+ __printflike(3, 4);
+void nv_add_int8_array(struct nv *nv, const int8_t *value, size_t size,
+ const char *namefmt, ...) __printflike(4, 5);
+void nv_add_uint8_array(struct nv *nv, const uint8_t *value, size_t size,
+ const char *namefmt, ...) __printflike(4, 5);
+void nv_add_int16_array(struct nv *nv, const int16_t *value, size_t size,
+ const char *namefmt, ...) __printflike(4, 5);
+void nv_add_uint16_array(struct nv *nv, const uint16_t *value, size_t size,
+ const char *namefmt, ...) __printflike(4, 5);
+void nv_add_int32_array(struct nv *nv, const int32_t *value, size_t size,
+ const char *namefmt, ...) __printflike(4, 5);
+void nv_add_uint32_array(struct nv *nv, const uint32_t *value, size_t size,
+ const char *namefmt, ...) __printflike(4, 5);
+void nv_add_int64_array(struct nv *nv, const int64_t *value, size_t size,
+ const char *namefmt, ...) __printflike(4, 5);
+void nv_add_uint64_array(struct nv *nv, const uint64_t *value, size_t size,
+ const char *namefmt, ...) __printflike(4, 5);
+void nv_add_string(struct nv *nv, const char *value, const char *namefmt, ...)
+ __printflike(3, 4);
+void nv_add_stringf(struct nv *nv, const char *name, const char *valuefmt, ...)
+ __printflike(3, 4);
+void nv_add_stringv(struct nv *nv, const char *name, const char *valuefmt,
+ va_list valueap) __printflike(3, 0);
+
+/*
+ * nv_get_*() family: look up an entry by its formatted name and type.
+ * If the name is missing (ENOENT) or stored under another type (EINVAL),
+ * errno is set, the error is recorded in the nv structure unless one is
+ * already stored, and the scalar getters return 0 while the array and
+ * string getters return NULL.  Returned pointers refer to data inside
+ * the nv buffer.  For arrays *sizep, if not NULL, receives the number of
+ * elements.
+ */
+int8_t nv_get_int8(struct nv *nv, const char *namefmt, ...)
+ __printflike(2, 3);
+uint8_t nv_get_uint8(struct nv *nv, const char *namefmt, ...)
+ __printflike(2, 3);
+int16_t nv_get_int16(struct nv *nv, const char *namefmt, ...)
+ __printflike(2, 3);
+uint16_t nv_get_uint16(struct nv *nv, const char *namefmt, ...)
+ __printflike(2, 3);
+int32_t nv_get_int32(struct nv *nv, const char *namefmt, ...)
+ __printflike(2, 3);
+uint32_t nv_get_uint32(struct nv *nv, const char *namefmt, ...)
+ __printflike(2, 3);
+int64_t nv_get_int64(struct nv *nv, const char *namefmt, ...)
+ __printflike(2, 3);
+uint64_t nv_get_uint64(struct nv *nv, const char *namefmt, ...)
+ __printflike(2, 3);
+const int8_t *nv_get_int8_array(struct nv *nv, size_t *sizep,
+ const char *namefmt, ...) __printflike(3, 4);
+const uint8_t *nv_get_uint8_array(struct nv *nv, size_t *sizep,
+ const char *namefmt, ...) __printflike(3, 4);
+const int16_t *nv_get_int16_array(struct nv *nv, size_t *sizep,
+ const char *namefmt, ...) __printflike(3, 4);
+const uint16_t *nv_get_uint16_array(struct nv *nv, size_t *sizep,
+ const char *namefmt, ...) __printflike(3, 4);
+const int32_t *nv_get_int32_array(struct nv *nv, size_t *sizep,
+ const char *namefmt, ...) __printflike(3, 4);
+const uint32_t *nv_get_uint32_array(struct nv *nv, size_t *sizep,
+ const char *namefmt, ...) __printflike(3, 4);
+const int64_t *nv_get_int64_array(struct nv *nv, size_t *sizep,
+ const char *namefmt, ...) __printflike(3, 4);
+const uint64_t *nv_get_uint64_array(struct nv *nv, size_t *sizep,
+ const char *namefmt, ...) __printflike(3, 4);
+const char *nv_get_string(struct nv *nv, const char *namefmt, ...)
+ __printflike(2, 3);
+
+/* True if an entry of any type has the formatted name; false for NULL nv. */
+bool nv_exists(struct nv *nv, const char *namefmt, ...) __printflike(2, 3);
+/* Assert that an entry of any type has the formatted name. */
+void nv_assert(struct nv *nv, const char *namefmt, ...) __printflike(2, 3);
+/* Validate the nv structure and print its entries with printf(3). */
+void nv_dump(struct nv *nv);
+
+#endif /* !_NV_H_ */
diff --git a/sbin/hastd/parse.y b/sbin/hastd/parse.y
new file mode 100644
index 0000000..6bfb537
--- /dev/null
+++ b/sbin/hastd/parse.y
@@ -0,0 +1,1037 @@
+%{
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h> /* MAXHOSTNAMELEN */
+#include <sys/queue.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+
+#include <arpa/inet.h>
+
+#include <err.h>
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include <pjdlog.h>
+
+#include "hast.h"
+
+/*
+ * Nesting depth of the statement being parsed (0 = global, 1 = resource or
+ * node section, 2 = node section inside a resource) and current line number.
+ * Both are defined outside this file - presumably by the lexer; confirm.
+ */
+extern int depth;
+extern int lineno;
+
+/* Lexer input stream and text of the current token. */
+extern FILE *yyin;
+extern char *yytext;
+
+/* Configuration being built by yy_config_parse(). */
+static struct hastd_config *lconfig;
+/* Resource currently being parsed, or NULL outside a resource section. */
+static struct hast_resource *curres;
+/*
+ * mynode: we are inside an "on <node>" section that matches this host.
+ * hadmynode: the current resource contained such a section.
+ */
+static bool mynode, hadmynode;
+
+/*
+ * Values given at global level (depth 0).  yy_config_parse() seeds them with
+ * the built-in defaults and later uses them for anything not set at a more
+ * specific level.
+ */
+static char depth0_control[HAST_ADDRSIZE];
+static char depth0_pidfile[PATH_MAX];
+static char depth0_listen_tcp4[HAST_ADDRSIZE];
+static char depth0_listen_tcp6[HAST_ADDRSIZE];
+static TAILQ_HEAD(, hastd_listen) depth0_listen;
+static int depth0_replication;
+static int depth0_checksum;
+static int depth0_compression;
+static int depth0_timeout;
+static char depth0_exec[PATH_MAX];
+static int depth0_metaflush;
+
+/*
+ * Values given at resource level (depth 1); reset at the start of every
+ * resource so that "not set" can be told apart from "set".
+ */
+static char depth1_provname[PATH_MAX];
+static char depth1_localpath[PATH_MAX];
+static int depth1_metaflush;
+
+/* Provided by the generated lexer/parser. */
+extern void yyerror(const char *);
+extern int yylex(void);
+extern void yyrestart(FILE *);
+
+/* Helpers defined after the grammar. */
+static int isitme(const char *name);
+static bool family_supported(int family);
+static int node_names(char **namesp);
+%}
+
+/* Keywords, option values and punctuation returned by the lexer. */
+%token CONTROL PIDFILE LISTEN REPLICATION CHECKSUM COMPRESSION METAFLUSH
+%token TIMEOUT EXEC RESOURCE NAME LOCAL REMOTE SOURCE ON OFF
+%token FULLSYNC MEMSYNC ASYNC NONE CRC32 SHA256 HOLE LZF
+%token NUM STR OB CB
+
+/* Semantic value types of the non-terminals that carry a value. */
+%type <str> remote_str
+%type <num> replication_type
+%type <num> checksum_type
+%type <num> compression_type
+%type <num> boolean
+
+/* Semantic value: a number, or a string that the grammar actions free(). */
+%union
+{
+ int num;
+ char *str;
+}
+
+/* NUM and STR are the only terminals that carry a value. */
+%token <num> NUM
+%token <str> STR
+
+%%
+
+/* A configuration file is a possibly empty sequence of statements. */
+statements:
+ |
+ statements statement
+ ;
+
+/* Everything that may appear at the top (global) level. */
+statement:
+ control_statement
+ |
+ pidfile_statement
+ |
+ listen_statement
+ |
+ replication_statement
+ |
+ checksum_statement
+ |
+ compression_statement
+ |
+ timeout_statement
+ |
+ exec_statement
+ |
+ metaflush_statement
+ |
+ node_statement
+ |
+ resource_statement
+ ;
+
+control_statement:	CONTROL STR
+	{
+		char *dst = NULL;
+		size_t dstsize = 0;
+
+		/*
+		 * Global level sets the default; a node section sets the live
+		 * value, but only when the section is about this host.
+		 */
+		if (depth == 0) {
+			dst = depth0_control;
+			dstsize = sizeof(depth0_control);
+		} else if (depth == 1) {
+			if (mynode) {
+				dst = lconfig->hc_controladdr;
+				dstsize = sizeof(lconfig->hc_controladdr);
+			}
+		} else {
+			PJDLOG_ABORT("control at wrong depth level");
+		}
+		if (dst != NULL && strlcpy(dst, $2, dstsize) >= dstsize) {
+			pjdlog_error("control argument is too long.");
+			free($2);
+			return (1);
+		}
+		free($2);
+	}
+	;
+
+pidfile_statement:	PIDFILE STR
+	{
+		char *dst = NULL;
+		size_t dstsize = 0;
+
+		/*
+		 * Global level sets the default; a node section sets the live
+		 * value, but only when the section is about this host.
+		 */
+		if (depth == 0) {
+			dst = depth0_pidfile;
+			dstsize = sizeof(depth0_pidfile);
+		} else if (depth == 1) {
+			if (mynode) {
+				dst = lconfig->hc_pidfile;
+				dstsize = sizeof(lconfig->hc_pidfile);
+			}
+		} else {
+			PJDLOG_ABORT("pidfile at wrong depth level");
+		}
+		if (dst != NULL && strlcpy(dst, $2, dstsize) >= dstsize) {
+			pjdlog_error("pidfile argument is too long.");
+			free($2);
+			return (1);
+		}
+		free($2);
+	}
+	;
+
+listen_statement:	LISTEN STR
+	{
+		struct hastd_listen *entry;
+
+		/* The entry is built first, whatever the depth turns out to be. */
+		entry = calloc(1, sizeof(*entry));
+		if (entry == NULL) {
+			pjdlog_error("Unable to allocate memory for listen address.");
+			free($2);
+			return (1);
+		}
+		if (strlcpy(entry->hl_addr, $2, sizeof(entry->hl_addr)) >=
+		    sizeof(entry->hl_addr)) {
+			pjdlog_error("listen argument is too long.");
+			free($2);
+			free(entry);
+			return (1);
+		}
+		/*
+		 * Addresses given globally or in this host's node section are
+		 * queued on depth0_listen; those of other nodes are dropped.
+		 */
+		if (depth == 0 || (depth == 1 && mynode)) {
+			TAILQ_INSERT_TAIL(&depth0_listen, entry, hl_next);
+		} else if (depth == 1) {
+			free(entry);
+		} else {
+			PJDLOG_ABORT("listen at wrong depth level");
+		}
+		free($2);
+	}
+	;
+
+replication_statement:	REPLICATION replication_type
+	{
+		/* Global default at depth 0, per-resource value at depth 1. */
+		if (depth == 0) {
+			depth0_replication = $2;
+		} else if (depth == 1) {
+			PJDLOG_ASSERT(curres != NULL);
+			curres->hr_replication = $2;
+			curres->hr_original_replication = $2;
+		} else {
+			PJDLOG_ABORT("replication at wrong depth level");
+		}
+	}
+	;
+
+/* Maps the replication keyword onto its HAST_REPLICATION_* constant. */
+replication_type:
+	FULLSYNC	{ $$ = HAST_REPLICATION_FULLSYNC; }
+	|
+	MEMSYNC		{ $$ = HAST_REPLICATION_MEMSYNC; }
+	|
+	ASYNC		{ $$ = HAST_REPLICATION_ASYNC; }
+	;
+
+checksum_statement:	CHECKSUM checksum_type
+	{
+		/* Global default at depth 0, per-resource value at depth 1. */
+		if (depth == 0) {
+			depth0_checksum = $2;
+		} else if (depth == 1) {
+			PJDLOG_ASSERT(curres != NULL);
+			curres->hr_checksum = $2;
+		} else {
+			PJDLOG_ABORT("checksum at wrong depth level");
+		}
+	}
+	;
+
+/* Maps the checksum keyword onto its HAST_CHECKSUM_* constant. */
+checksum_type:
+	NONE		{ $$ = HAST_CHECKSUM_NONE; }
+	|
+	CRC32		{ $$ = HAST_CHECKSUM_CRC32; }
+	|
+	SHA256		{ $$ = HAST_CHECKSUM_SHA256; }
+	;
+
+compression_statement:	COMPRESSION compression_type
+	{
+		/* Global default at depth 0, per-resource value at depth 1. */
+		if (depth == 0) {
+			depth0_compression = $2;
+		} else if (depth == 1) {
+			PJDLOG_ASSERT(curres != NULL);
+			curres->hr_compression = $2;
+		} else {
+			PJDLOG_ABORT("compression at wrong depth level");
+		}
+	}
+	;
+
+/* Maps the compression keyword onto its HAST_COMPRESSION_* constant. */
+compression_type:
+	NONE		{ $$ = HAST_COMPRESSION_NONE; }
+	|
+	HOLE		{ $$ = HAST_COMPRESSION_HOLE; }
+	|
+	LZF		{ $$ = HAST_COMPRESSION_LZF; }
+	;
+
+timeout_statement:	TIMEOUT NUM
+	{
+		/* Only strictly positive timeouts are accepted. */
+		if ($2 <= 0) {
+			pjdlog_error("Negative or zero timeout.");
+			return (1);
+		}
+		if (depth == 0) {
+			depth0_timeout = $2;
+		} else if (depth == 1) {
+			PJDLOG_ASSERT(curres != NULL);
+			curres->hr_timeout = $2;
+		} else {
+			PJDLOG_ABORT("timeout at wrong depth level");
+		}
+	}
+	;
+
+exec_statement:	EXEC STR
+	{
+		char *dst = NULL;
+		size_t dstsize = 0;
+
+		/* Global default at depth 0, per-resource path at depth 1. */
+		if (depth == 0) {
+			dst = depth0_exec;
+			dstsize = sizeof(depth0_exec);
+		} else if (depth == 1) {
+			PJDLOG_ASSERT(curres != NULL);
+			dst = curres->hr_exec;
+			dstsize = sizeof(curres->hr_exec);
+		} else {
+			PJDLOG_ABORT("exec at wrong depth level");
+		}
+		if (dst != NULL && strlcpy(dst, $2, dstsize) >= dstsize) {
+			pjdlog_error("Exec path is too long.");
+			free($2);
+			return (1);
+		}
+		free($2);
+	}
+	;
+
+metaflush_statement:	METAFLUSH boolean
+	{
+		/*
+		 * Metaflush may be given at three levels: global (0),
+		 * resource (1) and this host's node section of a resource (2).
+		 */
+		if (depth == 0) {
+			depth0_metaflush = $2;
+		} else if (depth == 1) {
+			PJDLOG_ASSERT(curres != NULL);
+			depth1_metaflush = $2;
+		} else if (depth == 2) {
+			if (mynode) {
+				PJDLOG_ASSERT(curres != NULL);
+				curres->hr_metaflush = $2;
+			}
+		} else {
+			PJDLOG_ABORT("metaflush at wrong depth level");
+		}
+	}
+	;
+
+/* "on" yields 1, "off" yields 0. */
+boolean:
+	ON		{ $$ = 1; }
+	|
+	OFF		{ $$ = 0; }
+	;
+
+node_statement:	ON node_start OB node_entries CB
+	{
+		/* Leaving the node section. */
+		mynode = false;
+	}
+	;
+
+node_start:	STR
+	{
+		int me;
+
+		/* Remember whether the section that follows is about us. */
+		me = isitme($1);
+		if (me == -1) {
+			free($1);
+			return (1);
+		}
+		if (me == 1)
+			mynode = true;
+		else if (me != 0)
+			PJDLOG_ABORT("invalid isitme() return value");
+		free($1);
+	}
+	;
+
+/* Possibly empty body of a global-level "on <node> { ... }" section. */
+node_entries:
+ |
+ node_entries node_entry
+ ;
+
+/* Statements that are allowed inside a global-level node section. */
+node_entry:
+ control_statement
+ |
+ pidfile_statement
+ |
+ listen_statement
+ ;
+
+resource_statement:	RESOURCE resource_start OB resource_entries CB
+	{
+		/*
+		 * The whole resource section has been read: validate it, fill
+		 * node-level gaps from resource-level settings and put it on
+		 * the resource list.  On every failure the resource is freed
+		 * here, because it is not on any list yet and nobody else
+		 * could release it.
+		 */
+		if (curres != NULL) {
+			/*
+			 * There must be a section for this node, at least
+			 * with remote address configuration.
+			 */
+			if (!hadmynode) {
+				char *names;
+
+				if (node_names(&names) == 0) {
+					pjdlog_error("No resource %s configuration for this node (acceptable node names: %s).",
+					    curres->hr_name, names);
+				}
+				free(curres);
+				curres = NULL;
+				return (1);
+			}
+
+			/*
+			 * Let's see if there are some resource-level settings
+			 * that we can use for node-level settings.
+			 */
+			if (curres->hr_provname[0] == '\0' &&
+			    depth1_provname[0] != '\0') {
+				/*
+				 * Provider name is not set at node-level,
+				 * but is set at resource-level, use it.
+				 */
+				strlcpy(curres->hr_provname, depth1_provname,
+				    sizeof(curres->hr_provname));
+			}
+			if (curres->hr_localpath[0] == '\0' &&
+			    depth1_localpath[0] != '\0') {
+				/*
+				 * Path to local provider is not set at
+				 * node-level, but is set at resource-level,
+				 * use it.
+				 */
+				strlcpy(curres->hr_localpath, depth1_localpath,
+				    sizeof(curres->hr_localpath));
+			}
+			if (curres->hr_metaflush == -1 && depth1_metaflush != -1) {
+				/*
+				 * Metaflush is not set at node-level,
+				 * but is set at resource-level, use it.
+				 */
+				curres->hr_metaflush = depth1_metaflush;
+			}
+
+			/*
+			 * If provider name is not given, use resource name
+			 * as provider name.
+			 */
+			if (curres->hr_provname[0] == '\0') {
+				strlcpy(curres->hr_provname, curres->hr_name,
+				    sizeof(curres->hr_provname));
+			}
+
+			/*
+			 * Remote address has to be configured at this point.
+			 */
+			if (curres->hr_remoteaddr[0] == '\0') {
+				pjdlog_error("Remote address not configured for resource %s.",
+				    curres->hr_name);
+				free(curres);
+				curres = NULL;
+				return (1);
+			}
+			/*
+			 * Path to local provider has to be configured at this
+			 * point.
+			 */
+			if (curres->hr_localpath[0] == '\0') {
+				pjdlog_error("Path to local component not configured for resource %s.",
+				    curres->hr_name);
+				free(curres);
+				curres = NULL;
+				return (1);
+			}
+
+			/* Put it onto resource list. */
+			TAILQ_INSERT_TAIL(&lconfig->hc_resources, curres, hr_next);
+			curres = NULL;
+		}
+	}
+	;
+
+resource_start:	STR
+	{
+		struct hast_resource *res;
+
+		/*
+		 * Check if there is no duplicate entry.  A local iterator is
+		 * used so that curres never ends up pointing at a resource
+		 * that already lives on the list.
+		 */
+		TAILQ_FOREACH(res, &lconfig->hc_resources, hr_next) {
+			if (strcmp(res->hr_name, $1) == 0) {
+				pjdlog_error("Resource %s configured more than once.",
+				    res->hr_name);
+				free($1);
+				return (1);
+			}
+		}
+
+		/*
+		 * Clear those, so we can tell if they were set at
+		 * resource-level or not.
+		 */
+		depth1_provname[0] = '\0';
+		depth1_localpath[0] = '\0';
+		depth1_metaflush = -1;
+		hadmynode = false;
+
+		curres = calloc(1, sizeof(*curres));
+		if (curres == NULL) {
+			pjdlog_error("Unable to allocate memory for resource.");
+			free($1);
+			return (1);
+		}
+		if (strlcpy(curres->hr_name, $1,
+		    sizeof(curres->hr_name)) >=
+		    sizeof(curres->hr_name)) {
+			pjdlog_error("Resource name is too long.");
+			free(curres);
+			/* Do not leave a dangling pointer behind. */
+			curres = NULL;
+			free($1);
+			return (1);
+		}
+		free($1);
+		/* -1 and empty strings mean "not configured yet". */
+		curres->hr_role = HAST_ROLE_INIT;
+		curres->hr_previous_role = HAST_ROLE_INIT;
+		curres->hr_replication = -1;
+		curres->hr_original_replication = -1;
+		curres->hr_checksum = -1;
+		curres->hr_compression = -1;
+		curres->hr_version = 1;
+		curres->hr_timeout = -1;
+		curres->hr_exec[0] = '\0';
+		curres->hr_provname[0] = '\0';
+		curres->hr_localpath[0] = '\0';
+		curres->hr_localfd = -1;
+		curres->hr_localflush = true;
+		curres->hr_metaflush = -1;
+		curres->hr_remoteaddr[0] = '\0';
+		curres->hr_sourceaddr[0] = '\0';
+		curres->hr_ggateunit = -1;
+	}
+	;
+
+/* Possibly empty body of a "resource <name> { ... }" section. */
+resource_entries:
+ |
+ resource_entries resource_entry
+ ;
+
+/* Statements that are allowed directly inside a resource section. */
+resource_entry:
+ replication_statement
+ |
+ checksum_statement
+ |
+ compression_statement
+ |
+ timeout_statement
+ |
+ exec_statement
+ |
+ metaflush_statement
+ |
+ name_statement
+ |
+ local_statement
+ |
+ resource_node_statement
+ ;
+
+name_statement:	NAME STR
+	{
+		char *dst = NULL;
+		size_t dstsize = 0;
+
+		/*
+		 * Resource level (1) records a fallback; this host's node
+		 * section (2) sets the provider name of the resource itself.
+		 */
+		if (depth == 1) {
+			dst = depth1_provname;
+			dstsize = sizeof(depth1_provname);
+		} else if (depth == 2) {
+			if (mynode) {
+				PJDLOG_ASSERT(curres != NULL);
+				dst = curres->hr_provname;
+				dstsize = sizeof(curres->hr_provname);
+			}
+		} else {
+			PJDLOG_ABORT("name at wrong depth level");
+		}
+		if (dst != NULL && strlcpy(dst, $2, dstsize) >= dstsize) {
+			pjdlog_error("name argument is too long.");
+			free($2);
+			return (1);
+		}
+		free($2);
+	}
+	;
+
+local_statement:	LOCAL STR
+	{
+		char *dst = NULL;
+		size_t dstsize = 0;
+
+		/*
+		 * Resource level (1) records a fallback; this host's node
+		 * section (2) sets the local path of the resource itself.
+		 */
+		if (depth == 1) {
+			dst = depth1_localpath;
+			dstsize = sizeof(depth1_localpath);
+		} else if (depth == 2) {
+			if (mynode) {
+				PJDLOG_ASSERT(curres != NULL);
+				dst = curres->hr_localpath;
+				dstsize = sizeof(curres->hr_localpath);
+			}
+		} else {
+			PJDLOG_ABORT("local at wrong depth level");
+		}
+		if (dst != NULL && strlcpy(dst, $2, dstsize) >= dstsize) {
+			pjdlog_error("local argument is too long.");
+			free($2);
+			return (1);
+		}
+		free($2);
+	}
+	;
+
+resource_node_statement:ON resource_node_start OB resource_node_entries CB
+	{
+		/* Leaving the node section of the resource. */
+		mynode = false;
+	}
+	;
+
+resource_node_start:	STR
+	{
+		/*
+		 * Note whether this node section is about us; hadmynode keeps
+		 * that fact until the end of the resource.
+		 */
+		if (curres != NULL) {
+			int me;
+
+			me = isitme($1);
+			if (me == -1) {
+				free($1);
+				return (1);
+			}
+			if (me == 1)
+				mynode = hadmynode = true;
+			else if (me != 0)
+				PJDLOG_ABORT("invalid isitme() return value");
+		}
+		free($1);
+	}
+	;
+
+/* Possibly empty body of an "on <node> { ... }" section inside a resource. */
+resource_node_entries:
+ |
+ resource_node_entries resource_node_entry
+ ;
+
+/* Statements that are allowed inside a resource's node section. */
+resource_node_entry:
+ name_statement
+ |
+ local_statement
+ |
+ remote_statement
+ |
+ source_statement
+ |
+ metaflush_statement
+ ;
+
+remote_statement:	REMOTE remote_str
+	{
+		bool toolong = false;
+
+		PJDLOG_ASSERT(depth == 2);
+		/* Only the section for this host configures the resource. */
+		if (mynode) {
+			PJDLOG_ASSERT(curres != NULL);
+			toolong = strlcpy(curres->hr_remoteaddr, $2,
+			    sizeof(curres->hr_remoteaddr)) >=
+			    sizeof(curres->hr_remoteaddr);
+			if (toolong)
+				pjdlog_error("remote argument is too long.");
+		}
+		free($2);
+		if (toolong)
+			return (1);
+	}
+	;
+
+/*
+ * Argument of "remote": the keyword "none" or an address string.  Either way
+ * the value is a heap-allocated string that the user of remote_str free()s.
+ */
+remote_str:
+	NONE
+	{
+		$$ = strdup("none");
+		if ($$ == NULL) {
+			/* Fail here rather than hand NULL to strlcpy(). */
+			pjdlog_error("Unable to allocate memory for remote address.");
+			return (1);
+		}
+	}
+	|
+	STR	{ $$ = $1; }
+	;
+
+source_statement:	SOURCE STR
+	{
+		bool toolong = false;
+
+		PJDLOG_ASSERT(depth == 2);
+		/* Only the section for this host configures the resource. */
+		if (mynode) {
+			PJDLOG_ASSERT(curres != NULL);
+			toolong = strlcpy(curres->hr_sourceaddr, $2,
+			    sizeof(curres->hr_sourceaddr)) >=
+			    sizeof(curres->hr_sourceaddr);
+			if (toolong)
+				pjdlog_error("source argument is too long.");
+		}
+		free($2);
+		if (toolong)
+			return (1);
+	}
+	;
+
+%%
+
+/*
+ * Decide whether the given node name refers to this host.
+ *
+ * The name matches if it equals the full host name, the part of the host
+ * name before the first dot, the kern.hostuuid sysctl (case-insensitive) or
+ * the string "hostid<N>" where N is the kern.hostid sysctl.
+ *
+ * Returns 1 on match, 0 on no match and -1 (after logging) when the host
+ * name or one of the sysctls cannot be obtained.
+ */
+static int
+isitme(const char *name)
+{
+	char buf[MAXHOSTNAMELEN];
+	unsigned long hostid;
+	char *pos;
+	size_t bufsize;
+
+	/*
+	 * First check if the given name matches our full hostname.
+	 */
+	if (gethostname(buf, sizeof(buf)) < 0) {
+		pjdlog_errno(LOG_ERR, "gethostname() failed");
+		return (-1);
+	}
+	/* A truncated host name is not guaranteed to be NUL-terminated. */
+	buf[sizeof(buf) - 1] = '\0';
+	if (strcmp(buf, name) == 0)
+		return (1);
+
+	/*
+	 * Check if it matches first part of the host name.
+	 */
+	pos = strchr(buf, '.');
+	if (pos != NULL && (size_t)(pos - buf) == strlen(name) &&
+	    strncmp(buf, name, pos - buf) == 0) {
+		return (1);
+	}
+
+	/*
+	 * Check if it matches host UUID.
+	 */
+	bufsize = sizeof(buf);
+	if (sysctlbyname("kern.hostuuid", buf, &bufsize, NULL, 0) < 0) {
+		pjdlog_errno(LOG_ERR, "sysctlbyname(kern.hostuuid) failed");
+		return (-1);
+	}
+	if (strcasecmp(buf, name) == 0)
+		return (1);
+
+	/*
+	 * Check if it matches hostid.
+	 */
+	bufsize = sizeof(hostid);
+	if (sysctlbyname("kern.hostid", &hostid, &bufsize, NULL, 0) < 0) {
+		pjdlog_errno(LOG_ERR, "sysctlbyname(kern.hostid) failed");
+		return (-1);
+	}
+	(void)snprintf(buf, sizeof(buf), "hostid%lu", hostid);
+	if (strcmp(buf, name) == 0)
+		return (1);
+
+	/*
+	 * Looks like this isn't about us.
+	 */
+	return (0);
+}
+
+/*
+ * Probe whether stream sockets of the given address family can be created.
+ *
+ * Returns false only when socket(2) reports that the family or protocol is
+ * not supported (EAFNOSUPPORT or EPROTONOSUPPORT); any other failure is
+ * treated as "supported" so that it surfaces later, when the address is
+ * actually used.
+ */
+static bool
+family_supported(int family)
+{
+	int sock;
+
+	sock = socket(family, SOCK_STREAM, 0);
+	if (sock == -1 &&
+	    (errno == EAFNOSUPPORT || errno == EPROTONOSUPPORT))
+		return (false);
+	if (sock >= 0)
+		(void)close(sock);
+	return (true);
+}
+
+/*
+ * Build a comma-separated list of the names under which this host may
+ * appear in the configuration: the first component of the host name (when
+ * the host name contains a dot that is not its first character), the full
+ * host name, the host UUID and "hostid<N>".
+ *
+ * On success stores a pointer to a static buffer in *namesp and returns 0;
+ * the buffer is overwritten by the next call.  Returns -1 (after logging)
+ * when the host name or one of the sysctls cannot be obtained.
+ */
+static int
+node_names(char **namesp)
+{
+	static char names[MAXHOSTNAMELEN * 3];
+	char buf[MAXHOSTNAMELEN];
+	unsigned long hostid;
+	char *pos;
+	size_t bufsize;
+
+	if (gethostname(buf, sizeof(buf)) < 0) {
+		pjdlog_errno(LOG_ERR, "gethostname() failed");
+		return (-1);
+	}
+	/* A truncated host name is not guaranteed to be NUL-terminated. */
+	buf[sizeof(buf) - 1] = '\0';
+
+	/*
+	 * The buffer is static: start from an empty string, otherwise the
+	 * strlcat() calls below would append to the result of a previous
+	 * call whenever the host name has no short form.
+	 */
+	names[0] = '\0';
+
+	/* First component of the host name. */
+	pos = strchr(buf, '.');
+	if (pos != NULL && pos != buf) {
+		(void)strlcpy(names, buf, MIN((size_t)(pos - buf + 1),
+		    sizeof(names)));
+		(void)strlcat(names, ", ", sizeof(names));
+	}
+
+	/* Full host name. */
+	(void)strlcat(names, buf, sizeof(names));
+	(void)strlcat(names, ", ", sizeof(names));
+
+	/* Host UUID. */
+	bufsize = sizeof(buf);
+	if (sysctlbyname("kern.hostuuid", buf, &bufsize, NULL, 0) < 0) {
+		pjdlog_errno(LOG_ERR, "sysctlbyname(kern.hostuuid) failed");
+		return (-1);
+	}
+	(void)strlcat(names, buf, sizeof(names));
+	(void)strlcat(names, ", ", sizeof(names));
+
+	/* Host ID. */
+	bufsize = sizeof(hostid);
+	if (sysctlbyname("kern.hostid", &hostid, &bufsize, NULL, 0) < 0) {
+		pjdlog_errno(LOG_ERR, "sysctlbyname(kern.hostid) failed");
+		return (-1);
+	}
+	(void)snprintf(buf, sizeof(buf), "hostid%lu", hostid);
+	(void)strlcat(names, buf, sizeof(names));
+
+	*namesp = names;
+
+	return (0);
+}
+
+/*
+ * Parser error callback: logs the parser's message together with the
+ * current line number and the text of the token being looked at.
+ */
+void
+yyerror(const char *str)
+{
+	const char *token = yytext;
+	int line = lineno;
+
+	pjdlog_error("Unable to parse configuration file at line %d near '%s': %s",
+	    line, token, str);
+}
+
+/*
+ * Common failure epilogue of yy_config_parse(): release the configuration
+ * built so far, then either exit with the given code or return NULL.
+ */
+static struct hastd_config *
+config_parse_fail(int exitcode, bool exitonerror)
+{
+
+	yy_config_free(lconfig);
+	if (exitonerror)
+		exit(exitcode);
+	return (NULL);
+}
+
+/*
+ * Append a default listen address to the configuration being built.
+ * Returns 0 on success and -1 (after logging) when out of memory.
+ */
+static int
+default_listen_add(const char *addr)
+{
+	struct hastd_listen *lst;
+
+	lst = calloc(1, sizeof(*lst));
+	if (lst == NULL) {
+		pjdlog_error("Unable to allocate memory for listen address.");
+		return (-1);
+	}
+	(void)strlcpy(lst->hl_addr, addr, sizeof(lst->hl_addr));
+	TAILQ_INSERT_TAIL(&lconfig->hc_listen, lst, hl_next);
+	return (0);
+}
+
+/*
+ * Give every setting that was not configured for the resource its global
+ * (or built-in default) value.
+ */
+static void
+resource_defaults_apply(struct hast_resource *res)
+{
+
+	PJDLOG_ASSERT(res->hr_provname[0] != '\0');
+	PJDLOG_ASSERT(res->hr_localpath[0] != '\0');
+	PJDLOG_ASSERT(res->hr_remoteaddr[0] != '\0');
+
+	if (res->hr_replication == -1) {
+		res->hr_replication = depth0_replication;
+		res->hr_original_replication = depth0_replication;
+	}
+	if (res->hr_checksum == -1)
+		res->hr_checksum = depth0_checksum;
+	if (res->hr_compression == -1)
+		res->hr_compression = depth0_compression;
+	if (res->hr_timeout == -1)
+		res->hr_timeout = depth0_timeout;
+	if (res->hr_exec[0] == '\0')
+		strlcpy(res->hr_exec, depth0_exec, sizeof(res->hr_exec));
+	if (res->hr_metaflush == -1)
+		res->hr_metaflush = depth0_metaflush;
+}
+
+/*
+ * Parse the given configuration file.
+ *
+ * Returns a newly allocated configuration that the caller releases with
+ * yy_config_free().  On failure the error is logged and, depending on
+ * exitonerror, the process exits with a sysexits(3) code or NULL is
+ * returned.
+ */
+struct hastd_config *
+yy_config_parse(const char *config, bool exitonerror)
+{
+	int ret;
+
+	/* Reset parser state and seed global-level values with defaults. */
+	curres = NULL;
+	mynode = false;
+	depth = 0;
+	lineno = 0;
+
+	depth0_timeout = HAST_TIMEOUT;
+	depth0_replication = HAST_REPLICATION_MEMSYNC;
+	depth0_checksum = HAST_CHECKSUM_NONE;
+	depth0_compression = HAST_COMPRESSION_HOLE;
+	strlcpy(depth0_control, HAST_CONTROL, sizeof(depth0_control));
+	strlcpy(depth0_pidfile, HASTD_PIDFILE, sizeof(depth0_pidfile));
+	TAILQ_INIT(&depth0_listen);
+	strlcpy(depth0_listen_tcp4, HASTD_LISTEN_TCP4,
+	    sizeof(depth0_listen_tcp4));
+	strlcpy(depth0_listen_tcp6, HASTD_LISTEN_TCP6,
+	    sizeof(depth0_listen_tcp6));
+	depth0_exec[0] = '\0';
+	depth0_metaflush = 1;
+
+	lconfig = calloc(1, sizeof(*lconfig));
+	if (lconfig == NULL) {
+		pjdlog_error("Unable to allocate memory for configuration.");
+		if (exitonerror)
+			exit(EX_TEMPFAIL);
+		return (NULL);
+	}
+
+	TAILQ_INIT(&lconfig->hc_listen);
+	TAILQ_INIT(&lconfig->hc_resources);
+
+	yyin = fopen(config, "r");
+	if (yyin == NULL) {
+		pjdlog_errno(LOG_ERR, "Unable to open configuration file %s",
+		    config);
+		return (config_parse_fail(EX_OSFILE, exitonerror));
+	}
+	yyrestart(yyin);
+	ret = yyparse();
+	fclose(yyin);
+	if (ret != 0)
+		return (config_parse_fail(EX_CONFIG, exitonerror));
+
+	/*
+	 * Let's see if everything is set up.
+	 */
+	if (lconfig->hc_controladdr[0] == '\0') {
+		strlcpy(lconfig->hc_controladdr, depth0_control,
+		    sizeof(lconfig->hc_controladdr));
+	}
+	if (lconfig->hc_pidfile[0] == '\0') {
+		strlcpy(lconfig->hc_pidfile, depth0_pidfile,
+		    sizeof(lconfig->hc_pidfile));
+	}
+	if (!TAILQ_EMPTY(&depth0_listen))
+		TAILQ_CONCAT(&lconfig->hc_listen, &depth0_listen, hl_next);
+	if (TAILQ_EMPTY(&lconfig->hc_listen)) {
+		/*
+		 * No listen address configured: fall back to the default
+		 * address of every address family the kernel supports.
+		 */
+		if (family_supported(AF_INET)) {
+			if (default_listen_add(depth0_listen_tcp4) != 0) {
+				return (config_parse_fail(EX_TEMPFAIL,
+				    exitonerror));
+			}
+		} else {
+			pjdlog_debug(1,
+			    "No IPv4 support in the kernel, not listening on IPv4 address.");
+		}
+		if (family_supported(AF_INET6)) {
+			if (default_listen_add(depth0_listen_tcp6) != 0) {
+				return (config_parse_fail(EX_TEMPFAIL,
+				    exitonerror));
+			}
+		} else {
+			pjdlog_debug(1,
+			    "No IPv6 support in the kernel, not listening on IPv6 address.");
+		}
+		if (TAILQ_EMPTY(&lconfig->hc_listen)) {
+			pjdlog_error("No address to listen on.");
+			return (config_parse_fail(EX_TEMPFAIL, exitonerror));
+		}
+	}
+	/* Settings missing at resource level inherit the global ones. */
+	TAILQ_FOREACH(curres, &lconfig->hc_resources, hr_next)
+		resource_defaults_apply(curres);
+
+	return (lconfig);
+}
+
+/*
+ * Release a configuration: every listen address still queued at global
+ * level, every listen address and resource of the configuration, and the
+ * configuration structure itself.
+ */
+void
+yy_config_free(struct hastd_config *config)
+{
+	struct hastd_listen *entry;
+	struct hast_resource *resource;
+
+	for (;;) {
+		entry = TAILQ_FIRST(&depth0_listen);
+		if (entry == NULL)
+			break;
+		TAILQ_REMOVE(&depth0_listen, entry, hl_next);
+		free(entry);
+	}
+	for (;;) {
+		entry = TAILQ_FIRST(&config->hc_listen);
+		if (entry == NULL)
+			break;
+		TAILQ_REMOVE(&config->hc_listen, entry, hl_next);
+		free(entry);
+	}
+	for (;;) {
+		resource = TAILQ_FIRST(&config->hc_resources);
+		if (resource == NULL)
+			break;
+		TAILQ_REMOVE(&config->hc_resources, resource, hr_next);
+		free(resource);
+	}
+	free(config);
+}
diff --git a/sbin/hastd/pjdlog.c b/sbin/hastd/pjdlog.c
new file mode 100644
index 0000000..bc4018f
--- /dev/null
+++ b/sbin/hastd/pjdlog.c
@@ -0,0 +1,614 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * Copyright (c) 2011 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <libutil.h>
+#include <printf.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syslog.h>
+
+#include "pjdlog.h"
+
+/*
+ * Library state.  NEVER_INITIALIZED is the start-up state in which
+ * pjdlog_init() also registers the custom printf renderers,
+ * NOT_INITIALIZED is the state after pjdlog_fini(), and INITIALIZED is the
+ * state after pjdlog_init().
+ */
+#define PJDLOG_NEVER_INITIALIZED 0
+#define PJDLOG_NOT_INITIALIZED 1
+#define PJDLOG_INITIALIZED 2
+
+static int pjdlog_initialized = PJDLOG_NEVER_INITIALIZED;
+/* Output mode (PJDLOG_MODE_*) and the highest debug level that is logged. */
+static int pjdlog_mode, pjdlog_debug_level;
+/* Text put in front of every message. */
+static char pjdlog_prefix[128];
+
+/*
+ * Argument-info callback of the %N conversion: it consumes one argument
+ * of type intmax_t.
+ */
+static int
+pjdlog_printf_arginfo_humanized_number(const struct printf_info *pi __unused,
+    size_t n, int *argt)
+{
+
+	assert(n >= 1);
+	*argt = PA_INT | PA_FLAG_INTMAX;
+	return (1);
+}
+
+/*
+ * Render callback of the %N conversion: prints an intmax_t argument in
+ * humanize_number(3) form (auto-scaled, decimal, no space before the unit).
+ * If the number cannot be humanized into the small buffer it is printed as
+ * a plain decimal instead of emitting whatever the buffer happens to hold.
+ */
+static int
+pjdlog_printf_render_humanized_number(struct __printf_io *io,
+    const struct printf_info *pi, const void * const *arg)
+{
+	char buf[5];
+	char plain[32];
+	const char *text;
+	intmax_t num;
+	int ret;
+
+	num = *(const intmax_t *)arg[0];
+	text = buf;
+	if (humanize_number(buf, sizeof(buf), (int64_t)num, "", HN_AUTOSCALE,
+	    HN_NOSPACE | HN_DECIMAL) < 0) {
+		/* buf is not usable; fall back to the raw value. */
+		(void)snprintf(plain, sizeof(plain), "%jd", num);
+		text = plain;
+	}
+	ret = __printf_out(io, pi, text, strlen(text));
+	__printf_flush(io);
+	return (ret);
+}
+
+/*
+ * Argument-info callback of the %S conversion: it consumes one pointer
+ * argument.
+ */
+static int
+pjdlog_printf_arginfo_sockaddr(const struct printf_info *pi __unused,
+    size_t n, int *argt)
+{
+
+	assert(n >= 1);
+	*argt = PA_POINTER;
+	return (1);
+}
+
+/*
+ * Render callback of the %S conversion: prints the socket address the
+ * argument points at as "addr:port" (IPv4) or "[addr]:port" (IPv6); any
+ * other family is reported as unsupported.
+ */
+static int
+pjdlog_printf_render_sockaddr(struct __printf_io *io,
+    const struct printf_info *pi, const void * const *arg)
+{
+	const struct sockaddr_storage *ss;
+	char buf[64];
+	int ret;
+
+	ss = *(const struct sockaddr_storage * const *)arg[0];
+	if (ss->ss_family == AF_INET) {
+		const struct sockaddr_in *sin4;
+		char addr[INET_ADDRSTRLEN];
+		unsigned int port;
+
+		sin4 = (const struct sockaddr_in *)ss;
+		port = ntohs(sin4->sin_port);
+		if (inet_ntop(ss->ss_family, &sin4->sin_addr, addr,
+		    sizeof(addr)) == NULL) {
+			PJDLOG_ABORT("inet_ntop(AF_INET) failed: %s.",
+			    strerror(errno));
+		}
+		snprintf(buf, sizeof(buf), "%s:%u", addr, port);
+	} else if (ss->ss_family == AF_INET6) {
+		const struct sockaddr_in6 *sin6;
+		char addr[INET6_ADDRSTRLEN];
+		unsigned int port;
+
+		sin6 = (const struct sockaddr_in6 *)ss;
+		port = ntohs(sin6->sin6_port);
+		if (inet_ntop(ss->ss_family, &sin6->sin6_addr, addr,
+		    sizeof(addr)) == NULL) {
+			PJDLOG_ABORT("inet_ntop(AF_INET6) failed: %s.",
+			    strerror(errno));
+		}
+		snprintf(buf, sizeof(buf), "[%s]:%u", addr, port);
+	} else {
+		snprintf(buf, sizeof(buf), "[unsupported family %hhu]",
+		    ss->ss_family);
+	}
+	ret = __printf_out(io, pi, buf, strlen(buf));
+	__printf_flush(io);
+	return (ret);
+}
+
+/*
+ * Initialize logging in the given mode.  On the very first call the custom
+ * printf conversions (%T, %N and %S) are registered as well.  The debug
+ * level is reset to 0 and the prefix is cleared; errno is preserved.
+ */
+void
+pjdlog_init(int mode)
+{
+	int saved_errno;
+
+	assert(pjdlog_initialized == PJDLOG_NEVER_INITIALIZED ||
+	    pjdlog_initialized == PJDLOG_NOT_INITIALIZED);
+	assert(mode == PJDLOG_MODE_STD || mode == PJDLOG_MODE_SYSLOG);
+
+	saved_errno = errno;
+
+	/* Renderers are registered once per process. */
+	if (pjdlog_initialized == PJDLOG_NEVER_INITIALIZED) {
+		__use_xprintf = 1;
+		register_printf_render_std("T");
+		register_printf_render('N',
+		    pjdlog_printf_render_humanized_number,
+		    pjdlog_printf_arginfo_humanized_number);
+		register_printf_render('S',
+		    pjdlog_printf_render_sockaddr,
+		    pjdlog_printf_arginfo_sockaddr);
+	}
+
+	if (mode == PJDLOG_MODE_SYSLOG)
+		openlog(NULL, LOG_PID | LOG_NDELAY, LOG_DAEMON);
+
+	pjdlog_mode = mode;
+	pjdlog_debug_level = 0;
+	memset(pjdlog_prefix, 0, sizeof(pjdlog_prefix));
+	pjdlog_initialized = PJDLOG_INITIALIZED;
+
+	errno = saved_errno;
+}
+
+/*
+ * Shut logging down: closes the syslog connection when in syslog mode and
+ * marks the library as not initialized.  errno is preserved.
+ */
+void
+pjdlog_fini(void)
+{
+	int saved_errno;
+
+	assert(pjdlog_initialized == PJDLOG_INITIALIZED);
+
+	saved_errno = errno;
+	if (pjdlog_mode == PJDLOG_MODE_SYSLOG)
+		closelog();
+	pjdlog_initialized = PJDLOG_NOT_INITIALIZED;
+	errno = saved_errno;
+}
+
+/*
+ * Configure where the logs should go.
+ * By default they are sent to stdout/stderr, but after going into the
+ * background (e.g. by calling daemon(3)) the application is responsible for
+ * changing the mode to PJDLOG_MODE_SYSLOG, so that logs are sent to syslog.
+ * Setting the mode that is already active does nothing; errno is preserved.
+ */
+void
+pjdlog_mode_set(int mode)
+{
+	int saved_errno;
+
+	assert(pjdlog_initialized == PJDLOG_INITIALIZED);
+	assert(mode == PJDLOG_MODE_STD || mode == PJDLOG_MODE_SYSLOG);
+
+	if (mode == pjdlog_mode)
+		return;
+
+	saved_errno = errno;
+
+	if (mode != PJDLOG_MODE_SYSLOG)
+		closelog();
+	else
+		openlog(NULL, LOG_PID | LOG_NDELAY, LOG_DAEMON);
+	pjdlog_mode = mode;
+
+	errno = saved_errno;
+}
+
+/*
+ * Report the mode the logs are currently sent in.
+ */
+int
+pjdlog_mode_get(void)
+{
+	int mode;
+
+	assert(pjdlog_initialized == PJDLOG_INITIALIZED);
+
+	mode = pjdlog_mode;
+	return (mode);
+}
+
+/*
+ * Set the debug level.  Debug messages with a level higher than the one
+ * given here are dropped.  The level must not be negative.
+ */
+void
+pjdlog_debug_set(int level)
+{
+
+	assert(pjdlog_initialized == PJDLOG_INITIALIZED);
+	assert(level >= 0);
+	pjdlog_debug_level = level;
+}
+
+/*
+ * Report the debug level that is currently in effect.
+ */
+int
+pjdlog_debug_get(void)
+{
+	int level;
+
+	assert(pjdlog_initialized == PJDLOG_INITIALIZED);
+
+	level = pjdlog_debug_level;
+	return (level);
+}
+
+/*
+ * Set the prefix that is put before each log message, using a
+ * printf(3)-style format and arguments.  This is the variadic front end of
+ * pjdlogv_prefix_set().
+ */
+void
+pjdlog_prefix_set(const char *fmt, ...)
+{
+	va_list args;
+
+	assert(pjdlog_initialized == PJDLOG_INITIALIZED);
+
+	va_start(args, fmt);
+	pjdlogv_prefix_set(fmt, args);
+	va_end(args);
+}
+
+/*
+ * Set the prefix that is put before each log message, va_list flavour.
+ * The format must not be NULL; the formatted prefix is truncated to the
+ * size of the internal buffer.  errno is preserved.
+ */
+void
+pjdlogv_prefix_set(const char *fmt, va_list ap)
+{
+	int saved_errno;
+
+	assert(pjdlog_initialized == PJDLOG_INITIALIZED);
+	assert(fmt != NULL);
+
+	saved_errno = errno;
+	vsnprintf(pjdlog_prefix, sizeof(pjdlog_prefix), fmt, ap);
+	errno = saved_errno;
+}
+
+/*
+ * Map a syslog(3) priority onto the tag printed in front of messages that
+ * go to stdout/stderr.  Any value that is not one of the eight priorities
+ * trips an assertion and aborts.
+ */
+static const char *
+pjdlog_level_string(int loglevel)
+{
+	static const char * const tags[] = {
+		[LOG_EMERG] = "EMERG",
+		[LOG_ALERT] = "ALERT",
+		[LOG_CRIT] = "CRIT",
+		[LOG_ERR] = "ERROR",
+		[LOG_WARNING] = "WARNING",
+		[LOG_NOTICE] = "NOTICE",
+		[LOG_INFO] = "INFO",
+		[LOG_DEBUG] = "DEBUG",
+	};
+
+	if (loglevel >= 0 &&
+	    (size_t)loglevel < sizeof(tags) / sizeof(tags[0]) &&
+	    tags[loglevel] != NULL) {
+		return (tags[loglevel]);
+	}
+	assert(!"Invalid log level.");
+	abort();	/* XXX: gcc */
+}
+
+/*
+ * Common log routine, variadic front end of pjdlogv_common().
+ */
+void
+pjdlog_common(int loglevel, int debuglevel, int error, const char *fmt, ...)
+{
+	va_list args;
+
+	assert(pjdlog_initialized == PJDLOG_INITIALIZED);
+
+	va_start(args, fmt);
+	pjdlogv_common(loglevel, debuglevel, error, fmt, args);
+	va_end(args);
+}
+
+/*
+ * Common log routine, which handles regular log levels as well as debug
+ * levels and decides where the message goes (stdout/stderr or syslog).
+ *
+ * Debug messages above the configured debug level are dropped.  When error
+ * is not -1, ": <strerror(error)>." is appended to the message.  errno is
+ * preserved.
+ */
+void
+pjdlogv_common(int loglevel, int debuglevel, int error, const char *fmt,
+    va_list ap)
+{
+	int saved_errno;
+
+	assert(pjdlog_initialized == PJDLOG_INITIALIZED);
+	assert(loglevel == LOG_EMERG || loglevel == LOG_ALERT ||
+	    loglevel == LOG_CRIT || loglevel == LOG_ERR ||
+	    loglevel == LOG_WARNING || loglevel == LOG_NOTICE ||
+	    loglevel == LOG_INFO || loglevel == LOG_DEBUG);
+	assert(loglevel != LOG_DEBUG || debuglevel > 0);
+	assert(error >= -1);
+
+	/* Ignore debug above configured level. */
+	if (loglevel == LOG_DEBUG && debuglevel > pjdlog_debug_level)
+		return;
+
+	saved_errno = errno;
+
+	if (pjdlog_mode == PJDLOG_MODE_STD) {
+		FILE *stream;
+
+		/*
+		 * Errors and warnings go to stderr, the rest to stdout.
+		 */
+		if (loglevel == LOG_EMERG || loglevel == LOG_ALERT ||
+		    loglevel == LOG_CRIT || loglevel == LOG_ERR ||
+		    loglevel == LOG_WARNING) {
+			stream = stderr;
+		} else if (loglevel == LOG_NOTICE || loglevel == LOG_INFO ||
+		    loglevel == LOG_DEBUG) {
+			stream = stdout;
+		} else {
+			assert(!"Invalid loglevel.");
+			abort();	/* XXX: gcc */
+		}
+
+		fprintf(stream, "[%s]", pjdlog_level_string(loglevel));
+		/* Attach debuglevel if this is debug log. */
+		if (loglevel == LOG_DEBUG)
+			fprintf(stream, "[%d]", debuglevel);
+		fprintf(stream, " %s", pjdlog_prefix);
+		vfprintf(stream, fmt, ap);
+		if (error != -1)
+			fprintf(stream, ": %s.", strerror(error));
+		fprintf(stream, "\n");
+		fflush(stream);
+	} else if (pjdlog_mode == PJDLOG_MODE_SYSLOG) {
+		char msg[1024];
+		int used;
+
+		/* Build prefix + message + error text, stopping when full. */
+		used = snprintf(msg, sizeof(msg), "%s", pjdlog_prefix);
+		if ((size_t)used < sizeof(msg)) {
+			used += vsnprintf(msg + used, sizeof(msg) - used,
+			    fmt, ap);
+		}
+		if (error != -1 && (size_t)used < sizeof(msg)) {
+			(void)snprintf(msg + used, sizeof(msg) - used,
+			    ": %s.", strerror(error));
+		}
+		syslog(loglevel, "%s", msg);
+	} else {
+		assert(!"Invalid mode.");
+	}
+
+	errno = saved_errno;
+}
+
+/*
+ * Regular (non-debug) log entry taking a va_list.
+ * The message is forwarded to pjdlogv_common() with debug level 0 and
+ * error -1, so no error string is appended to it.
+ */
+void
+pjdlogv(int loglevel, const char *fmt, va_list ap)
+{
+
+ assert(pjdlog_initialized == PJDLOG_INITIALIZED);
+
+ /* LOG_DEBUG is invalid here, pjdlogv?_debug() should be used. */
+ assert(loglevel == LOG_EMERG || loglevel == LOG_ALERT ||
+ loglevel == LOG_CRIT || loglevel == LOG_ERR ||
+ loglevel == LOG_WARNING || loglevel == LOG_NOTICE ||
+ loglevel == LOG_INFO);
+
+ pjdlogv_common(loglevel, 0, -1, fmt, ap);
+}
+
+/*
+ * Regular (non-debug) log entry, printf-style.  Thin variadic wrapper
+ * around pjdlogv().
+ */
+void
+pjdlog(int loglevel, const char *fmt, ...)
+{
+	va_list args;
+
+	assert(pjdlog_initialized == PJDLOG_INITIALIZED);
+
+	va_start(args, fmt);
+	pjdlogv(loglevel, fmt, args);
+	va_end(args);
+}
+
+/*
+ * Debug log entry taking a va_list.
+ * Logs at LOG_DEBUG with the given debug level and no error string;
+ * pjdlogv_common() drops the message when 'debuglevel' is above the
+ * configured debug level.
+ */
+void
+pjdlogv_debug(int debuglevel, const char *fmt, va_list ap)
+{
+
+ assert(pjdlog_initialized == PJDLOG_INITIALIZED);
+
+ pjdlogv_common(LOG_DEBUG, debuglevel, -1, fmt, ap);
+}
+
+/*
+ * Debug log entry, printf-style.  Thin variadic wrapper around
+ * pjdlogv_debug().
+ */
+void
+pjdlog_debug(int debuglevel, const char *fmt, ...)
+{
+	va_list args;
+
+	assert(pjdlog_initialized == PJDLOG_INITIALIZED);
+
+	va_start(args, fmt);
+	pjdlogv_debug(debuglevel, fmt, args);
+	va_end(args);
+}
+
+/*
+ * Log entry taking a va_list, with errno logging.
+ * The current value of errno is passed to pjdlogv_common() as the error,
+ * which appends the corresponding strerror() text to the message.
+ */
+void
+pjdlogv_errno(int loglevel, const char *fmt, va_list ap)
+{
+
+ assert(pjdlog_initialized == PJDLOG_INITIALIZED);
+
+ pjdlogv_common(loglevel, 0, errno, fmt, ap);
+}
+
+/*
+ * Log entry with errno logging, printf-style.  Thin variadic wrapper
+ * around pjdlogv_errno().
+ */
+void
+pjdlog_errno(int loglevel, const char *fmt, ...)
+{
+	va_list args;
+
+	assert(pjdlog_initialized == PJDLOG_INITIALIZED);
+
+	va_start(args, fmt);
+	pjdlogv_errno(loglevel, fmt, args);
+	va_end(args);
+}
+
+/*
+ * Log error, errno and exit.
+ * The message is logged at LOG_ERR together with the current errno, then
+ * the process terminates with 'exitcode'.  Does not return.
+ */
+void
+pjdlogv_exit(int exitcode, const char *fmt, va_list ap)
+{
+
+ assert(pjdlog_initialized == PJDLOG_INITIALIZED);
+
+ pjdlogv_errno(LOG_ERR, fmt, ap);
+ exit(exitcode);
+ /* NOTREACHED */
+}
+
+/*
+ * Log an error together with errno and terminate, printf-style.  Thin
+ * variadic wrapper around pjdlogv_exit(), which does not return.
+ */
+void
+pjdlog_exit(int exitcode, const char *fmt, ...)
+{
+	va_list args;
+
+	assert(pjdlog_initialized == PJDLOG_INITIALIZED);
+
+	va_start(args, fmt);
+	pjdlogv_exit(exitcode, fmt, args);
+	/* NOTREACHED */
+	va_end(args);
+}
+
+/*
+ * Log error and exit.
+ * Unlike pjdlogv_exit(), errno is not appended: the message is logged at
+ * LOG_ERR through pjdlogv() and the process terminates with 'exitcode'.
+ * Does not return.
+ */
+void
+pjdlogv_exitx(int exitcode, const char *fmt, va_list ap)
+{
+
+ assert(pjdlog_initialized == PJDLOG_INITIALIZED);
+
+ pjdlogv(LOG_ERR, fmt, ap);
+ exit(exitcode);
+ /* NOTREACHED */
+}
+
+/*
+ * Log an error (without errno) and terminate, printf-style.  Thin variadic
+ * wrapper around pjdlogv_exitx(), which does not return.
+ */
+void
+pjdlog_exitx(int exitcode, const char *fmt, ...)
+{
+	va_list args;
+
+	assert(pjdlog_initialized == PJDLOG_INITIALIZED);
+
+	va_start(args, fmt);
+	pjdlogv_exitx(exitcode, fmt, args);
+	/* NOTREACHED */
+	va_end(args);
+}
+
+/*
+ * Log a failure report at critical level and abort the process.
+ *
+ * An optional printf-style message is logged first, followed by a line that
+ * names the location of the failure.  The wording of that line depends on
+ * whether a failed expression ('failedexpr') and a function name ('func')
+ * were supplied.
+ */
+void
+pjdlog_abort(const char *func, const char *file, int line,
+    const char *failedexpr, const char *fmt, ...)
+{
+	va_list args;
+
+	assert(pjdlog_initialized == PJDLOG_INITIALIZED);
+
+	/*
+	 * Callers that have no message pass __func__ as 'fmt' (NULL or ""
+	 * would be cleaner, but gcc warns about both), so a format pointer
+	 * equal to 'func' means there is nothing to print.
+	 */
+	if (fmt != func) {
+		va_start(args, fmt);
+		pjdlogv_critical(fmt, args);
+		va_end(args);
+	}
+
+	if (failedexpr != NULL && func != NULL) {
+		pjdlog_critical("Assertion failed: (%s), function %s, file %s, line %d.",
+		    failedexpr, func, file, line);
+	} else if (failedexpr != NULL) {
+		pjdlog_critical("Assertion failed: (%s), file %s, line %d.",
+		    failedexpr, file, line);
+	} else if (func != NULL) {
+		pjdlog_critical("Aborted at function %s, file %s, line %d.",
+		    func, file, line);
+	} else {
+		pjdlog_critical("Aborted at file %s, line %d.", file,
+		    line);
+	}
+	abort();
+}
diff --git a/sbin/hastd/pjdlog.h b/sbin/hastd/pjdlog.h
new file mode 100644
index 0000000..0f01f79
--- /dev/null
+++ b/sbin/hastd/pjdlog.h
@@ -0,0 +1,117 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * Copyright (c) 2011 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _PJDLOG_H_
+#define _PJDLOG_H_
+
+#include <sys/cdefs.h>
+
+#include <stdarg.h>
+#include <sysexits.h>
+#include <syslog.h>
+
+/*
+ * Output modes: in PJDLOG_MODE_STD messages are written to stdout/stderr,
+ * in PJDLOG_MODE_SYSLOG they are passed to syslog(3).
+ */
+#define PJDLOG_MODE_STD 0
+#define PJDLOG_MODE_SYSLOG 1
+
+/* Library setup and teardown. */
+void pjdlog_init(int mode);
+void pjdlog_fini(void);
+
+/* Accessors for the output mode (one of the PJDLOG_MODE_* values). */
+void pjdlog_mode_set(int mode);
+int pjdlog_mode_get(void);
+
+/*
+ * Accessors for the debug level; presumably this is the level against which
+ * debug messages are filtered - confirm against pjdlog.c.
+ */
+void pjdlog_debug_set(int level);
+int pjdlog_debug_get(void);
+
+/*
+ * Set the printf-style prefix; presumably this is the text emitted in front
+ * of every message - confirm against pjdlog.c.
+ */
+void pjdlog_prefix_set(const char *fmt, ...) __printflike(1, 2);
+void pjdlogv_prefix_set(const char *fmt, va_list ap) __printflike(1, 0);
+
+/*
+ * Common back end: logs at 'loglevel' (with 'debuglevel' for LOG_DEBUG) and,
+ * when 'error' is not -1, appends the matching strerror() text.
+ */
+void pjdlog_common(int loglevel, int debuglevel, int error, const char *fmt,
+ ...) __printflike(4, 5);
+void pjdlogv_common(int loglevel, int debuglevel, int error, const char *fmt,
+ va_list ap) __printflike(4, 0);
+
+/* Regular logs; LOG_DEBUG is not accepted here, use pjdlog_debug(). */
+void pjdlog(int loglevel, const char *fmt, ...) __printflike(2, 3);
+void pjdlogv(int loglevel, const char *fmt, va_list ap) __printflike(2, 0);
+
+/* Per-level shorthands for pjdlog()/pjdlogv(). */
+#define pjdlogv_emergency(fmt, ap) pjdlogv(LOG_EMERG, (fmt), (ap))
+#define pjdlog_emergency(...) pjdlog(LOG_EMERG, __VA_ARGS__)
+#define pjdlogv_alert(fmt, ap) pjdlogv(LOG_ALERT, (fmt), (ap))
+#define pjdlog_alert(...) pjdlog(LOG_ALERT, __VA_ARGS__)
+#define pjdlogv_critical(fmt, ap) pjdlogv(LOG_CRIT, (fmt), (ap))
+#define pjdlog_critical(...) pjdlog(LOG_CRIT, __VA_ARGS__)
+#define pjdlogv_error(fmt, ap) pjdlogv(LOG_ERR, (fmt), (ap))
+#define pjdlog_error(...) pjdlog(LOG_ERR, __VA_ARGS__)
+#define pjdlogv_warning(fmt, ap) pjdlogv(LOG_WARNING, (fmt), (ap))
+#define pjdlog_warning(...) pjdlog(LOG_WARNING, __VA_ARGS__)
+#define pjdlogv_notice(fmt, ap) pjdlogv(LOG_NOTICE, (fmt), (ap))
+#define pjdlog_notice(...) pjdlog(LOG_NOTICE, __VA_ARGS__)
+#define pjdlogv_info(fmt, ap) pjdlogv(LOG_INFO, (fmt), (ap))
+#define pjdlog_info(...) pjdlog(LOG_INFO, __VA_ARGS__)
+
+/* Debug logs, filtered by the configured debug level. */
+void pjdlog_debug(int debuglevel, const char *fmt, ...) __printflike(2, 3);
+void pjdlogv_debug(int debuglevel, const char *fmt, va_list ap) __printflike(2, 0);
+
+/* Logs with the current errno appended. */
+void pjdlog_errno(int loglevel, const char *fmt, ...) __printflike(2, 3);
+void pjdlogv_errno(int loglevel, const char *fmt, va_list ap) __printflike(2, 0);
+
+/* Log at LOG_ERR with errno appended, then exit with 'exitcode'. */
+void pjdlog_exit(int exitcode, const char *fmt, ...) __printflike(2, 3) __dead2;
+void pjdlogv_exit(int exitcode, const char *fmt, va_list ap) __printflike(2, 0) __dead2;
+
+/* Log at LOG_ERR without errno, then exit with 'exitcode'. */
+void pjdlog_exitx(int exitcode, const char *fmt, ...) __printflike(2, 3) __dead2;
+void pjdlogv_exitx(int exitcode, const char *fmt, va_list ap) __printflike(2, 0) __dead2;
+
+/*
+ * Log an optional message and the failure location at critical level, then
+ * abort().  Normally used through the PJDLOG_* macros below.
+ */
+void pjdlog_abort(const char *func, const char *file, int line,
+ const char *failedexpr, const char *fmt, ...) __printflike(5, 6) __dead2;
+
+/*
+ * PJDLOG_VERIFY(expr) - abort through pjdlog_abort() when 'expr' is false.
+ * __func__ is passed as the format argument, which pjdlog_abort() treats as
+ * "no additional message".
+ */
+#define PJDLOG_VERIFY(expr) do { \
+ if (!(expr)) { \
+ pjdlog_abort(__func__, __FILE__, __LINE__, #expr, \
+ __func__); \
+ } \
+} while (0)
+/*
+ * PJDLOG_RVERIFY(expr, fmt, ...) - like PJDLOG_VERIFY(), but also logs the
+ * given printf-style message before aborting.
+ */
+#define PJDLOG_RVERIFY(expr, ...) do { \
+ if (!(expr)) { \
+ pjdlog_abort(__func__, __FILE__, __LINE__, #expr, \
+ __VA_ARGS__); \
+ } \
+} while (0)
+/*
+ * PJDLOG_ABORT(fmt, ...) - unconditionally log the message and abort; no
+ * failed expression is reported.
+ */
+#define PJDLOG_ABORT(...) pjdlog_abort(__func__, __FILE__, \
+ __LINE__, NULL, __VA_ARGS__)
+/*
+ * PJDLOG_ASSERT()/PJDLOG_RASSERT() are the assert-style variants: they map
+ * to the VERIFY macros above, except when NDEBUG is defined, in which case
+ * they expand to nothing and their arguments are not evaluated.
+ */
+#ifdef NDEBUG
+#define PJDLOG_ASSERT(expr) do { } while (0)
+#define PJDLOG_RASSERT(...) do { } while (0)
+#else
+#define PJDLOG_ASSERT(expr) PJDLOG_VERIFY(expr)
+#define PJDLOG_RASSERT(...) PJDLOG_RVERIFY(__VA_ARGS__)
+#endif
+
+#endif /* !_PJDLOG_H_ */
diff --git a/sbin/hastd/primary.c b/sbin/hastd/primary.c
new file mode 100644
index 0000000..09ae17b
--- /dev/null
+++ b/sbin/hastd/primary.c
@@ -0,0 +1,2477 @@
+/*-
+ * Copyright (c) 2009 The FreeBSD Foundation
+ * Copyright (c) 2010-2011 Pawel Jakub Dawidek <pawel@dawidek.net>
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/bio.h>
+#include <sys/disk.h>
+#include <sys/stat.h>
+
+#include <geom/gate/g_gate.h>
+
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libgeom.h>
+#include <pthread.h>
+#include <signal.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include <activemap.h>
+#include <nv.h>
+#include <rangelock.h>
+
+#include "control.h"
+#include "event.h"
+#include "hast.h"
+#include "hast_proto.h"
+#include "hastd.h"
+#include "hooks.h"
+#include "metadata.h"
+#include "proto.h"
+#include "pjdlog.h"
+#include "refcnt.h"
+#include "subr.h"
+#include "synch.h"
+
+/* There is only one remote component for now. */
+#define ISREMOTE(no) ((no) == 1)
+
+/*
+ * A single I/O request as it travels between the GEOM Gate device and the
+ * individual components.
+ */
+struct hio {
+ /*
+ * Number of components we are still waiting for.
+ * When this field goes to 0, we can send the request back to the
+ * kernel. Each component has to decrease this counter by one
+ * even on failure.
+ */
+ refcnt_t hio_countdown;
+ /*
+ * Each component has a place to store its own error.
+ * Once the request is handled by all components we can decide if the
+ * request overall is successful or not.
+ */
+ int *hio_errors;
+ /*
+ * Structure used to communicate with GEOM Gate class.
+ */
+ struct g_gate_ctl_io hio_ggio;
+ /*
+ * Request was already confirmed to GEOM Gate.
+ */
+ bool hio_done;
+ /*
+ * Remember replication from the time the request was initiated,
+ * so we won't get confused when replication changes on reload.
+ */
+ int hio_replication;
+ /*
+ * List linkage: an array with one TAILQ entry per component,
+ * allocated in init_environment().
+ */
+ TAILQ_ENTRY(hio) *hio_next;
+};
+/*
+ * The free and done lists are not per-component; both link requests through
+ * slot 0 of hio_next.
+ */
+#define hio_free_next hio_next[0]
+#define hio_done_next hio_next[0]
+
+/*
+ * Free list holds unused structures. When free list is empty, we have to wait
+ * until some in-progress requests are freed.
+ */
+static TAILQ_HEAD(, hio) hio_free_list;
+static pthread_mutex_t hio_free_list_lock;
+static pthread_cond_t hio_free_list_cond;
+/*
+ * There is one send list for every component. One request is placed on all
+ * send lists - each component gets the same request, but each component is
+ * responsible for managing its own send list.
+ */
+static TAILQ_HEAD(, hio) *hio_send_list;
+static pthread_mutex_t *hio_send_list_lock;
+static pthread_cond_t *hio_send_list_cond;
+/*
+ * There is one recv list for every component, although local components don't
+ * use recv lists as local requests are done synchronously.
+ */
+static TAILQ_HEAD(, hio) *hio_recv_list;
+static pthread_mutex_t *hio_recv_list_lock;
+static pthread_cond_t *hio_recv_list_cond;
+/*
+ * Request is placed on done list by the slowest component (the one that
+ * decreased hio_countdown from 1 to 0).
+ */
+static TAILQ_HEAD(, hio) hio_done_list;
+static pthread_mutex_t hio_done_list_lock;
+static pthread_cond_t hio_done_list_cond;
+/*
+ * Structures below are for interaction with the sync thread.
+ */
+static bool sync_inprogress;
+static pthread_mutex_t sync_lock;
+static pthread_cond_t sync_cond;
+/*
+ * The lock below synchronizes access to remote connections.
+ */
+static pthread_rwlock_t *hio_remote_lock;
+
+/*
+ * Lock to synchronize metadata updates. Also synchronize access to
+ * hr_primary_localcnt and hr_primary_remotecnt fields.
+ */
+static pthread_mutex_t metadata_lock;
+
+/*
+ * Maximum number of outstanding I/O requests.
+ */
+#define HAST_HIO_MAX 256
+/*
+ * Number of components. At this point there are only two components: local
+ * and remote, but in the future it might be possible to use multiple local
+ * and remote components.
+ */
+#define HAST_NCOMPONENTS 2
+
+/*
+ * True when both the incoming and the outgoing remote connection of 'res'
+ * are set.  NOTE: the 'no' (component number) argument is not used by the
+ * expansion.
+ */
+#define ISCONNECTED(res, no) \
+ ((res)->hr_remotein != NULL && (res)->hr_remoteout != NULL)
+
+/*
+ * QUEUE_INSERT1(hio, name, ncomp) - append 'hio' to the tail of the
+ * per-component list hio_<name>_list[ncomp] while holding that list's lock.
+ * Waiters on the list's condition variable are woken up only when the list
+ * was empty before the insertion; the broadcast is issued after the lock
+ * has been dropped.
+ */
+#define QUEUE_INSERT1(hio, name, ncomp) do { \
+ bool _wakeup; \
+ \
+ mtx_lock(&hio_##name##_list_lock[(ncomp)]); \
+ _wakeup = TAILQ_EMPTY(&hio_##name##_list[(ncomp)]); \
+ TAILQ_INSERT_TAIL(&hio_##name##_list[(ncomp)], (hio), \
+ hio_next[(ncomp)]); \
+ mtx_unlock(&hio_##name##_list_lock[ncomp]); \
+ if (_wakeup) \
+ cv_broadcast(&hio_##name##_list_cond[(ncomp)]); \
+} while (0)
+/*
+ * QUEUE_INSERT2(hio, name) - same as QUEUE_INSERT1(), but for the global
+ * (not per-component) list hio_<name>_list, linked through
+ * hio_<name>_next.
+ */
+#define QUEUE_INSERT2(hio, name) do { \
+ bool _wakeup; \
+ \
+ mtx_lock(&hio_##name##_list_lock); \
+ _wakeup = TAILQ_EMPTY(&hio_##name##_list); \
+ TAILQ_INSERT_TAIL(&hio_##name##_list, (hio), hio_##name##_next);\
+ mtx_unlock(&hio_##name##_list_lock); \
+ if (_wakeup) \
+ cv_broadcast(&hio_##name##_list_cond); \
+} while (0)
+/*
+ * QUEUE_TAKE1(hio, name, ncomp, timeout) - remove the first request from the
+ * per-component list hio_<name>_list[ncomp] and store it in 'hio'.
+ * While the list is empty the macro waits on the list's condition variable.
+ * With a non-zero 'timeout' it waits at most once, so 'hio' may be left
+ * NULL; with a 'timeout' of 0 it keeps waiting until a request shows up.
+ */
+#define QUEUE_TAKE1(hio, name, ncomp, timeout) do { \
+ bool _last; \
+ \
+ mtx_lock(&hio_##name##_list_lock[(ncomp)]); \
+ _last = false; \
+ while (((hio) = TAILQ_FIRST(&hio_##name##_list[(ncomp)])) == NULL && !_last) { \
+ cv_timedwait(&hio_##name##_list_cond[(ncomp)], \
+ &hio_##name##_list_lock[(ncomp)], (timeout)); \
+ if ((timeout) != 0) \
+ _last = true; \
+ } \
+ if (hio != NULL) { \
+ TAILQ_REMOVE(&hio_##name##_list[(ncomp)], (hio), \
+ hio_next[(ncomp)]); \
+ } \
+ mtx_unlock(&hio_##name##_list_lock[(ncomp)]); \
+} while (0)
+/*
+ * QUEUE_TAKE2(hio, name) - remove the first request from the global list
+ * hio_<name>_list and store it in 'hio', waiting on the list's condition
+ * variable for as long as the list is empty.  'hio' is never NULL afterwards.
+ */
+#define QUEUE_TAKE2(hio, name) do { \
+ mtx_lock(&hio_##name##_list_lock); \
+ while (((hio) = TAILQ_FIRST(&hio_##name##_list)) == NULL) { \
+ cv_wait(&hio_##name##_list_cond, \
+ &hio_##name##_list_lock); \
+ } \
+ TAILQ_REMOVE(&hio_##name##_list, (hio), hio_##name##_next); \
+ mtx_unlock(&hio_##name##_list_lock); \
+} while (0)
+
+/*
+ * Synchronization requests are told apart from regular ones by special
+ * values stored in the gctl_unit field of the request:
+ *   SYNCREQ()       marks the request as a sync request (unit -1, seq 1),
+ *   ISSYNCREQ()     tests for that mark,
+ *   SYNCREQDONE()   marks the sync request as done (unit -2),
+ *   ISSYNCREQDONE() tests for the "done" mark.
+ */
+#define SYNCREQ(hio) do { \
+ (hio)->hio_ggio.gctl_unit = -1; \
+ (hio)->hio_ggio.gctl_seq = 1; \
+} while (0)
+#define ISSYNCREQ(hio) ((hio)->hio_ggio.gctl_unit == -1)
+#define SYNCREQDONE(hio) do { (hio)->hio_ggio.gctl_unit = -2; } while (0)
+#define ISSYNCREQDONE(hio) ((hio)->hio_ggio.gctl_unit == -2)
+
+static struct hast_resource *gres;
+
+static pthread_mutex_t range_lock;
+static struct rangelocks *range_regular;
+static bool range_regular_wait;
+static pthread_cond_t range_regular_cond;
+static struct rangelocks *range_sync;
+static bool range_sync_wait;
+static pthread_cond_t range_sync_cond;
+static bool fullystarted;
+
+static void *ggate_recv_thread(void *arg);
+static void *local_send_thread(void *arg);
+static void *remote_send_thread(void *arg);
+static void *remote_recv_thread(void *arg);
+static void *ggate_send_thread(void *arg);
+static void *sync_thread(void *arg);
+static void *guard_thread(void *arg);
+
+/*
+ * Destroy the GEOM Gate provider of 'res', if one was created, and mark the
+ * resource as having none.  A failure to destroy the provider is only
+ * logged.  The value of errno seen by the caller is preserved.
+ */
+static void
+cleanup(struct hast_resource *res)
+{
+	struct g_gate_ctl_destroy destroy_req;
+	int saved_errno;
+
+	/* No provider was created, so there is nothing to undo. */
+	if (res->hr_ggateunit < 0)
+		return;
+
+	/* The ioctl and the logging below may clobber errno. */
+	saved_errno = errno;
+
+	bzero(&destroy_req, sizeof(destroy_req));
+	destroy_req.gctl_version = G_GATE_VERSION;
+	destroy_req.gctl_unit = res->hr_ggateunit;
+	destroy_req.gctl_force = 1;
+	if (ioctl(res->hr_ggatefd, G_GATE_CMD_DESTROY, &destroy_req) == -1) {
+		pjdlog_errno(LOG_WARNING,
+		    "Unable to destroy hast/%s device",
+		    res->hr_provname);
+	}
+	res->hr_ggateunit = -1;
+
+	errno = saved_errno;
+}
+
+/*
+ * Log the printf-style message at LOG_ERR together with the current errno,
+ * release the resources of the global resource and terminate the process
+ * with 'exitcode', which must not be EX_OK.
+ */
+static __dead2 void
+primary_exit(int exitcode, const char *fmt, ...)
+{
+	va_list args;
+
+	PJDLOG_ASSERT(exitcode != EX_OK);
+
+	va_start(args, fmt);
+	pjdlogv_errno(LOG_ERR, fmt, args);
+	va_end(args);
+
+	cleanup(gres);
+	exit(exitcode);
+}
+
+/*
+ * Log the printf-style message (without errno), release the resources of
+ * the global resource and terminate the process with 'exitcode'.  The
+ * message is logged at LOG_INFO for a clean exit (EX_OK) and at LOG_ERR
+ * otherwise.
+ */
+static __dead2 void
+primary_exitx(int exitcode, const char *fmt, ...)
+{
+	va_list args;
+	int level;
+
+	if (exitcode == EX_OK)
+		level = LOG_INFO;
+	else
+		level = LOG_ERR;
+
+	va_start(args, fmt);
+	pjdlogv(level, fmt, args);
+	va_end(args);
+
+	cleanup(gres);
+	exit(exitcode);
+}
+
+/*
+ * Write the activemap bitmap to the local provider, right behind the
+ * metadata, and optionally flush the provider's write cache.
+ *
+ * Expects res->hr_amp_lock locked, returns with it unlocked.  The on-disk
+ * map lock is taken before hr_amp_lock is released and is held for the
+ * duration of the write.
+ *
+ * Returns 0 on success and -1 when the write or the cache flush failed.
+ * A provider that does not support cache flushing is not treated as an
+ * error; flushing is simply turned off for it.
+ */
+static int
+hast_activemap_flush(struct hast_resource *res)
+{
+	const unsigned char *buf;
+	size_t size;
+	int status;
+
+	mtx_lock(&res->hr_amp_diskmap_lock);
+	buf = activemap_bitmap(res->hr_amp, &size);
+	mtx_unlock(&res->hr_amp_lock);
+	PJDLOG_ASSERT(buf != NULL);
+	PJDLOG_ASSERT((size % res->hr_local_sectorsize) == 0);
+
+	status = 0;
+	if (pwrite(res->hr_localfd, buf, size, METADATA_SIZE) !=
+	    (ssize_t)size) {
+		pjdlog_errno(LOG_ERR, "Unable to flush activemap to disk");
+		res->hr_stat_activemap_write_error++;
+		status = -1;
+	} else if (res->hr_metaflush == 1 &&
+	    g_flush(res->hr_localfd) == -1) {
+		/* The flush is attempted only after a successful write. */
+		if (errno != EOPNOTSUPP) {
+			pjdlog_errno(LOG_ERR,
+			    "Unable to flush disk cache on activemap update");
+			res->hr_stat_activemap_flush_error++;
+			status = -1;
+		} else {
+			pjdlog_warning("The %s provider doesn't support flushing write cache. Disabling it.",
+			    res->hr_localpath);
+			res->hr_metaflush = 0;
+		}
+	}
+	mtx_unlock(&res->hr_amp_diskmap_lock);
+
+	return (status);
+}
+
+/*
+ * Tell whether the resource has an actual remote node configured, i.e. its
+ * remote address is anything other than the literal "none".
+ */
+static bool
+real_remote(const struct hast_resource *res)
+{
+
+	if (strcmp(res->hr_remoteaddr, "none") == 0)
+		return (false);
+	return (true);
+}
+
+/*
+ * Set up the global state of the primary worker: the per-component send and
+ * recv lists with their locks and condition variables, the remote connection
+ * locks, the free and done lists, the metadata lock, and a pool of
+ * HAST_HIO_MAX preallocated requests, which are all placed on the free list.
+ *
+ * Any allocation failure is fatal: the process exits with EX_TEMPFAIL.
+ */
+static void
+init_environment(struct hast_resource *res __unused)
+{
+	struct hio *hio;
+	unsigned int ii, ncomps;
+
+	/*
+	 * In the future it might be per-resource value.
+	 */
+	ncomps = HAST_NCOMPONENTS;
+
+	/*
+	 * Allocate memory needed by lists.
+	 */
+	hio_send_list = malloc(sizeof(hio_send_list[0]) * ncomps);
+	if (hio_send_list == NULL) {
+		primary_exitx(EX_TEMPFAIL,
+		    "Unable to allocate %zu bytes of memory for send lists.",
+		    sizeof(hio_send_list[0]) * ncomps);
+	}
+	hio_send_list_lock = malloc(sizeof(hio_send_list_lock[0]) * ncomps);
+	if (hio_send_list_lock == NULL) {
+		primary_exitx(EX_TEMPFAIL,
+		    "Unable to allocate %zu bytes of memory for send list locks.",
+		    sizeof(hio_send_list_lock[0]) * ncomps);
+	}
+	hio_send_list_cond = malloc(sizeof(hio_send_list_cond[0]) * ncomps);
+	if (hio_send_list_cond == NULL) {
+		primary_exitx(EX_TEMPFAIL,
+		    "Unable to allocate %zu bytes of memory for send list condition variables.",
+		    sizeof(hio_send_list_cond[0]) * ncomps);
+	}
+	hio_recv_list = malloc(sizeof(hio_recv_list[0]) * ncomps);
+	if (hio_recv_list == NULL) {
+		primary_exitx(EX_TEMPFAIL,
+		    "Unable to allocate %zu bytes of memory for recv lists.",
+		    sizeof(hio_recv_list[0]) * ncomps);
+	}
+	hio_recv_list_lock = malloc(sizeof(hio_recv_list_lock[0]) * ncomps);
+	if (hio_recv_list_lock == NULL) {
+		primary_exitx(EX_TEMPFAIL,
+		    "Unable to allocate %zu bytes of memory for recv list locks.",
+		    sizeof(hio_recv_list_lock[0]) * ncomps);
+	}
+	hio_recv_list_cond = malloc(sizeof(hio_recv_list_cond[0]) * ncomps);
+	if (hio_recv_list_cond == NULL) {
+		primary_exitx(EX_TEMPFAIL,
+		    "Unable to allocate %zu bytes of memory for recv list condition variables.",
+		    sizeof(hio_recv_list_cond[0]) * ncomps);
+	}
+	hio_remote_lock = malloc(sizeof(hio_remote_lock[0]) * ncomps);
+	if (hio_remote_lock == NULL) {
+		primary_exitx(EX_TEMPFAIL,
+		    "Unable to allocate %zu bytes of memory for remote connections locks.",
+		    sizeof(hio_remote_lock[0]) * ncomps);
+	}
+
+	/*
+	 * Initialize lists, their locks and their condition variables.
+	 * The per-component arrays were sized with ncomps above, so the
+	 * same bound is used to walk them.
+	 */
+	TAILQ_INIT(&hio_free_list);
+	mtx_init(&hio_free_list_lock);
+	cv_init(&hio_free_list_cond);
+	for (ii = 0; ii < ncomps; ii++) {
+		TAILQ_INIT(&hio_send_list[ii]);
+		mtx_init(&hio_send_list_lock[ii]);
+		cv_init(&hio_send_list_cond[ii]);
+		TAILQ_INIT(&hio_recv_list[ii]);
+		mtx_init(&hio_recv_list_lock[ii]);
+		cv_init(&hio_recv_list_cond[ii]);
+		rw_init(&hio_remote_lock[ii]);
+	}
+	TAILQ_INIT(&hio_done_list);
+	mtx_init(&hio_done_list_lock);
+	cv_init(&hio_done_list_cond);
+	mtx_init(&metadata_lock);
+
+	/*
+	 * Allocate requests pool and initialize requests.
+	 */
+	for (ii = 0; ii < HAST_HIO_MAX; ii++) {
+		hio = malloc(sizeof(*hio));
+		if (hio == NULL) {
+			primary_exitx(EX_TEMPFAIL,
+			    "Unable to allocate %zu bytes of memory for hio request.",
+			    sizeof(*hio));
+		}
+		refcnt_init(&hio->hio_countdown, 0);
+		hio->hio_errors = malloc(sizeof(hio->hio_errors[0]) * ncomps);
+		if (hio->hio_errors == NULL) {
+			primary_exitx(EX_TEMPFAIL,
+			    "Unable to allocate %zu bytes of memory for hio errors.",
+			    sizeof(hio->hio_errors[0]) * ncomps);
+		}
+		hio->hio_next = malloc(sizeof(hio->hio_next[0]) * ncomps);
+		if (hio->hio_next == NULL) {
+			primary_exitx(EX_TEMPFAIL,
+			    "Unable to allocate %zu bytes of memory for hio_next field.",
+			    sizeof(hio->hio_next[0]) * ncomps);
+		}
+		hio->hio_ggio.gctl_version = G_GATE_VERSION;
+		hio->hio_ggio.gctl_data = malloc(MAXPHYS);
+		if (hio->hio_ggio.gctl_data == NULL) {
+			/*
+			 * The cast matters: the argument travels through a
+			 * variadic call, so it has to have exactly the type
+			 * that the %zu conversion expects.
+			 */
+			primary_exitx(EX_TEMPFAIL,
+			    "Unable to allocate %zu bytes of memory for gctl_data.",
+			    (size_t)MAXPHYS);
+		}
+		hio->hio_ggio.gctl_length = MAXPHYS;
+		hio->hio_ggio.gctl_error = 0;
+		TAILQ_INSERT_HEAD(&hio_free_list, hio, hio_free_next);
+	}
+}
+
+/*
+ * Generate the unique resource identifier if the resource does not have one
+ * yet and store it in the metadata.
+ *
+ * Returns true when this call generated the identifier and false when one
+ * was already set.  The check and the assignment are done under
+ * metadata_lock; the metadata write is done after the lock is dropped, and
+ * its failure terminates the process with EX_NOINPUT.
+ */
+static bool
+init_resuid(struct hast_resource *res)
+{
+
+	mtx_lock(&metadata_lock);
+	if (res->hr_resuid != 0) {
+		/* Somebody has initialized it already. */
+		mtx_unlock(&metadata_lock);
+		return (false);
+	}
+	arc4random_buf(&res->hr_resuid, sizeof(res->hr_resuid));
+	mtx_unlock(&metadata_lock);
+
+	if (metadata_write(res) == -1)
+		exit(EX_NOINPUT);
+	return (true);
+}
+
+/*
+ * Prepare the local component: read the metadata, create the activemap and
+ * both range locks, and load the on-disk activemap stored right behind the
+ * metadata.  When the provider carries no resource identifier yet, the
+ * primary counters are reset and written back.
+ *
+ * Every failure is fatal and terminates the process.
+ */
+static void
+init_local(struct hast_resource *res)
+{
+	unsigned char *diskmap;
+	size_t diskmap_size;
+
+	if (metadata_read(res, true) == -1)
+		exit(EX_NOINPUT);
+
+	mtx_init(&res->hr_amp_lock);
+	if (activemap_init(&res->hr_amp, res->hr_datasize, res->hr_extentsize,
+	    res->hr_local_sectorsize, res->hr_keepdirty) == -1) {
+		primary_exit(EX_TEMPFAIL, "Unable to create activemap");
+	}
+
+	mtx_init(&range_lock);
+	cv_init(&range_regular_cond);
+	if (rangelock_init(&range_regular) == -1)
+		primary_exit(EX_TEMPFAIL, "Unable to create regular range lock");
+	cv_init(&range_sync_cond);
+	if (rangelock_init(&range_sync) == -1)
+		primary_exit(EX_TEMPFAIL, "Unable to create sync range lock");
+
+	/* Load the activemap as it was last stored on disk. */
+	diskmap_size = activemap_ondisk_size(res->hr_amp);
+	diskmap = calloc(1, diskmap_size);
+	if (diskmap == NULL) {
+		primary_exitx(EX_TEMPFAIL,
+		    "Unable to allocate buffer for activemap.");
+	}
+	if (pread(res->hr_localfd, diskmap, diskmap_size, METADATA_SIZE) !=
+	    (ssize_t)diskmap_size) {
+		primary_exit(EX_NOINPUT, "Unable to read activemap");
+	}
+	activemap_copyin(res->hr_amp, diskmap, diskmap_size);
+	free(diskmap);
+
+	if (res->hr_resuid == 0) {
+		/*
+		 * The provider is used for the first time, so the local and
+		 * remote counters are initialized here.  The resuid is
+		 * deliberately left alone: it is generated just in time, so
+		 * that the secondary can be told that there were no writes
+		 * yet and nothing has to be synchronized.
+		 */
+		res->hr_primary_localcnt = 0;
+		res->hr_primary_remotecnt = 0;
+		if (metadata_write(res) == -1)
+			exit(EX_NOINPUT);
+	}
+}
+
+/*
+ * Obtain a connection to the remote node with the help of the parent
+ * process: a request is sent over res->hr_conn, the parent answers with a
+ * status (0 or an errno value) and, on success, passes the connection
+ * itself, which is then waited on until it is established.
+ *
+ * Returns 0 and stores the connection in *connp on success.  Returns -1
+ * when the remote node could not be reached.  Failures in talking to the
+ * parent are fatal and terminate the process.
+ */
+static int
+primary_connect(struct hast_resource *res, struct proto_conn **connp)
+{
+	struct proto_conn *newconn;
+	int16_t request, reply;
+
+	request = 1;
+	if (proto_send(res->hr_conn, &request, sizeof(request)) == -1) {
+		primary_exit(EX_TEMPFAIL,
+		    "Unable to send connection request to parent");
+	}
+	if (proto_recv(res->hr_conn, &reply, sizeof(reply)) == -1) {
+		primary_exit(EX_TEMPFAIL,
+		    "Unable to receive reply to connection request from parent");
+	}
+	if (reply != 0) {
+		/* The parent reports its failure as an errno value. */
+		errno = reply;
+		pjdlog_errno(LOG_WARNING, "Unable to connect to %s",
+		    res->hr_remoteaddr);
+		return (-1);
+	}
+	if (proto_connection_recv(res->hr_conn, true, &newconn) == -1) {
+		primary_exit(EX_TEMPFAIL,
+		    "Unable to receive connection from parent");
+	}
+	if (proto_connect_wait(newconn, res->hr_timeout) == -1) {
+		pjdlog_errno(LOG_WARNING, "Unable to connect to %s",
+		    res->hr_remoteaddr);
+		proto_close(newconn);
+		return (-1);
+	}
+	/* Failing to set the timeout is unexpected, but not critical. */
+	if (proto_timeout(newconn, res->hr_timeout) == -1)
+		pjdlog_errno(LOG_WARNING, "Unable to set connection timeout");
+
+	*connp = newconn;
+
+	return (0);
+}
+
+/*
+ * Ask GEOM_GATE to serve reads directly from within the kernel, using the
+ * local provider path and the local data offset of the resource.  A failure
+ * is only logged as a warning.
+ */
+static void
+enable_direct_reads(struct hast_resource *res)
+{
+	struct g_gate_ctl_modify req;
+
+	bzero(&req, sizeof(req));
+	req.gctl_version = G_GATE_VERSION;
+	req.gctl_unit = res->hr_ggateunit;
+	req.gctl_modify = GG_MODIFY_READPROV | GG_MODIFY_READOFFSET;
+	strlcpy(req.gctl_readprov, res->hr_localpath,
+	    sizeof(req.gctl_readprov));
+	req.gctl_readoffset = res->hr_localoff;
+
+	if (ioctl(res->hr_ggatefd, G_GATE_CMD_MODIFY, &req) != 0) {
+		pjdlog_errno(LOG_WARNING, "Failed to enable direct reads");
+		return;
+	}
+	pjdlog_debug(1, "Direct reads enabled.");
+}
+
+static int
+init_remote(struct hast_resource *res, struct proto_conn **inp,
+ struct proto_conn **outp)
+{
+ struct proto_conn *in, *out;
+ struct nv *nvout, *nvin;
+ const unsigned char *token;
+ unsigned char *map;
+ const char *errmsg;
+ int32_t extentsize;
+ int64_t datasize;
+ uint32_t mapsize;
+ uint8_t version;
+ size_t size;
+ int error;
+
+ PJDLOG_ASSERT((inp == NULL && outp == NULL) || (inp != NULL && outp != NULL));
+ PJDLOG_ASSERT(real_remote(res));
+
+ in = out = NULL;
+ errmsg = NULL;
+
+ if (primary_connect(res, &out) == -1)
+ return (ECONNREFUSED);
+
+ error = ECONNABORTED;
+
+ /*
+ * First handshake step.
+ * Setup outgoing connection with remote node.
+ */
+ nvout = nv_alloc();
+ nv_add_string(nvout, res->hr_name, "resource");
+ nv_add_uint8(nvout, HAST_PROTO_VERSION, "version");
+ if (nv_error(nvout) != 0) {
+ pjdlog_common(LOG_WARNING, 0, nv_error(nvout),
+ "Unable to allocate header for connection with %s",
+ res->hr_remoteaddr);
+ nv_free(nvout);
+ goto close;
+ }
+ if (hast_proto_send(res, out, nvout, NULL, 0) == -1) {
+ pjdlog_errno(LOG_WARNING,
+ "Unable to send handshake header to %s",
+ res->hr_remoteaddr);
+ nv_free(nvout);
+ goto close;
+ }
+ nv_free(nvout);
+ if (hast_proto_recv_hdr(out, &nvin) == -1) {
+ pjdlog_errno(LOG_WARNING,
+ "Unable to receive handshake header from %s",
+ res->hr_remoteaddr);
+ goto close;
+ }
+ errmsg = nv_get_string(nvin, "errmsg");
+ if (errmsg != NULL) {
+ pjdlog_warning("%s", errmsg);
+ if (nv_exists(nvin, "wait"))
+ error = EBUSY;
+ nv_free(nvin);
+ goto close;
+ }
+ version = nv_get_uint8(nvin, "version");
+ if (version == 0) {
+ /*
+ * If no version is sent, it means this is protocol version 1.
+ */
+ version = 1;
+ }
+ if (version > HAST_PROTO_VERSION) {
+ pjdlog_warning("Invalid version received (%hhu).", version);
+ nv_free(nvin);
+ goto close;
+ }
+ res->hr_version = version;
+ pjdlog_debug(1, "Negotiated protocol version %d.", res->hr_version);
+ token = nv_get_uint8_array(nvin, &size, "token");
+ if (token == NULL) {
+ pjdlog_warning("Handshake header from %s has no 'token' field.",
+ res->hr_remoteaddr);
+ nv_free(nvin);
+ goto close;
+ }
+ if (size != sizeof(res->hr_token)) {
+ pjdlog_warning("Handshake header from %s contains 'token' of wrong size (got %zu, expected %zu).",
+ res->hr_remoteaddr, size, sizeof(res->hr_token));
+ nv_free(nvin);
+ goto close;
+ }
+ bcopy(token, res->hr_token, sizeof(res->hr_token));
+ nv_free(nvin);
+
+ /*
+ * Second handshake step.
+ * Setup incoming connection with remote node.
+ */
+ if (primary_connect(res, &in) == -1)
+ goto close;
+
+ nvout = nv_alloc();
+ nv_add_string(nvout, res->hr_name, "resource");
+ nv_add_uint8_array(nvout, res->hr_token, sizeof(res->hr_token),
+ "token");
+ if (res->hr_resuid == 0) {
+ /*
+ * The resuid field was not yet initialized.
+ * Because we do synchronization inside init_resuid(), it is
+ * possible that someone already initialized it, the function
+ * will return false then, but if we successfully initialized
+ * it, we will get true. True means that there were no writes
+ * to this resource yet and we want to inform secondary that
+ * synchronization is not needed by sending "virgin" argument.
+ */
+ if (init_resuid(res))
+ nv_add_int8(nvout, 1, "virgin");
+ }
+ nv_add_uint64(nvout, res->hr_resuid, "resuid");
+ nv_add_uint64(nvout, res->hr_primary_localcnt, "localcnt");
+ nv_add_uint64(nvout, res->hr_primary_remotecnt, "remotecnt");
+ if (nv_error(nvout) != 0) {
+ pjdlog_common(LOG_WARNING, 0, nv_error(nvout),
+ "Unable to allocate header for connection with %s",
+ res->hr_remoteaddr);
+ nv_free(nvout);
+ goto close;
+ }
+ if (hast_proto_send(res, in, nvout, NULL, 0) == -1) {
+ pjdlog_errno(LOG_WARNING,
+ "Unable to send handshake header to %s",
+ res->hr_remoteaddr);
+ nv_free(nvout);
+ goto close;
+ }
+ nv_free(nvout);
+ if (hast_proto_recv_hdr(out, &nvin) == -1) {
+ pjdlog_errno(LOG_WARNING,
+ "Unable to receive handshake header from %s",
+ res->hr_remoteaddr);
+ goto close;
+ }
+ errmsg = nv_get_string(nvin, "errmsg");
+ if (errmsg != NULL) {
+ pjdlog_warning("%s", errmsg);
+ nv_free(nvin);
+ goto close;
+ }
+ datasize = nv_get_int64(nvin, "datasize");
+ if (datasize != res->hr_datasize) {
+ pjdlog_warning("Data size differs between nodes (local=%jd, remote=%jd).",
+ (intmax_t)res->hr_datasize, (intmax_t)datasize);
+ nv_free(nvin);
+ goto close;
+ }
+ extentsize = nv_get_int32(nvin, "extentsize");
+ if (extentsize != res->hr_extentsize) {
+ pjdlog_warning("Extent size differs between nodes (local=%zd, remote=%zd).",
+ (ssize_t)res->hr_extentsize, (ssize_t)extentsize);
+ nv_free(nvin);
+ goto close;
+ }
+ res->hr_secondary_localcnt = nv_get_uint64(nvin, "localcnt");
+ res->hr_secondary_remotecnt = nv_get_uint64(nvin, "remotecnt");
+ res->hr_syncsrc = nv_get_uint8(nvin, "syncsrc");
+ if (res->hr_syncsrc == HAST_SYNCSRC_PRIMARY)
+ enable_direct_reads(res);
+ if (nv_exists(nvin, "virgin")) {
+ /*
+ * Secondary was reinitialized, bump localcnt if it is 0 as
+ * only we have the data.
+ */
+ PJDLOG_ASSERT(res->hr_syncsrc == HAST_SYNCSRC_PRIMARY);
+ PJDLOG_ASSERT(res->hr_secondary_localcnt == 0);
+
+ if (res->hr_primary_localcnt == 0) {
+ PJDLOG_ASSERT(res->hr_secondary_remotecnt == 0);
+
+ mtx_lock(&metadata_lock);
+ res->hr_primary_localcnt++;
+ pjdlog_debug(1, "Increasing localcnt to %ju.",
+ (uintmax_t)res->hr_primary_localcnt);
+ (void)metadata_write(res);
+ mtx_unlock(&metadata_lock);
+ }
+ }
+ map = NULL;
+ mapsize = nv_get_uint32(nvin, "mapsize");
+ if (mapsize > 0) {
+ map = malloc(mapsize);
+ if (map == NULL) {
+ pjdlog_error("Unable to allocate memory for remote activemap (mapsize=%ju).",
+ (uintmax_t)mapsize);
+ nv_free(nvin);
+ goto close;
+ }
+ /*
+ * Remote node have some dirty extents on its own, lets
+ * download its activemap.
+ */
+ if (hast_proto_recv_data(res, out, nvin, map,
+ mapsize) == -1) {
+ pjdlog_errno(LOG_ERR,
+ "Unable to receive remote activemap");
+ nv_free(nvin);
+ free(map);
+ goto close;
+ }
+ /*
+ * Merge local and remote bitmaps.
+ */
+ activemap_merge(res->hr_amp, map, mapsize);
+ free(map);
+ /*
+ * Now that we merged bitmaps from both nodes, flush it to the
+ * disk before we start to synchronize.
+ */
+ mtx_lock(&res->hr_amp_lock);
+ (void)hast_activemap_flush(res);
+ }
+ nv_free(nvin);
+#ifdef notyet
+ /* Setup directions. */
+ if (proto_send(out, NULL, 0) == -1)
+ pjdlog_errno(LOG_WARNING, "Unable to set connection direction");
+ if (proto_recv(in, NULL, 0) == -1)
+ pjdlog_errno(LOG_WARNING, "Unable to set connection direction");
+#endif
+ pjdlog_info("Connected to %s.", res->hr_remoteaddr);
+ if (res->hr_original_replication == HAST_REPLICATION_MEMSYNC &&
+ res->hr_version < 2) {
+ pjdlog_warning("The 'memsync' replication mode is not supported by the remote node, falling back to 'fullsync' mode.");
+ res->hr_replication = HAST_REPLICATION_FULLSYNC;
+ } else if (res->hr_replication != res->hr_original_replication) {
+ /*
+ * This is in case hastd disconnected and was upgraded.
+ */
+ res->hr_replication = res->hr_original_replication;
+ }
+ if (inp != NULL && outp != NULL) {
+ *inp = in;
+ *outp = out;
+ } else {
+ res->hr_remotein = in;
+ res->hr_remoteout = out;
+ }
+ event_send(res, EVENT_CONNECT);
+ return (0);
+close:
+ if (errmsg != NULL && strcmp(errmsg, "Split-brain condition!") == 0)
+ event_send(res, EVENT_SPLITBRAIN);
+ proto_close(out);
+ if (in != NULL)
+ proto_close(in);
+ return (error);
+}
+
+ /*
+  * Mark synchronization as in progress and signal sync_cond.
+  */
+ static void
+ sync_start(void)
+ {
+
+ 	/* Flip the flag with sync_lock held. */
+ 	mtx_lock(&sync_lock);
+ 	sync_inprogress = true;
+ 	mtx_unlock(&sync_lock);
+ 	/* The signal is sent after the lock has been dropped. */
+ 	cv_signal(&sync_cond);
+ }
+
+ /*
+  * Clear the "synchronization in progress" flag.
+  *
+  * Nobody is signalled here; the flag is simply reset with sync_lock held.
+  */
+ static void
+ sync_stop(void)
+ {
+
+ 	mtx_lock(&sync_lock);
+ 	/* Storing false is a no-op when the flag is already clear. */
+ 	sync_inprogress = false;
+ 	mtx_unlock(&sync_lock);
+ }
+
+ /*
+  * Attach this worker to the ggate provider hast/<hr_provname>.
+  *
+  * Opens the ggate control device, stores the descriptor in res->hr_ggatefd
+  * and tries to create the provider.  When the provider already exists
+  * (EEXIST) it is taken over with G_GATE_CMD_CANCEL instead.  On success the
+  * unit number is stored in res->hr_ggateunit; every other failure ends in
+  * primary_exit().
+  */
+ static void
+ init_ggate(struct hast_resource *res)
+ {
+ 	struct g_gate_ctl_cancel ggiocancel;
+ 	struct g_gate_ctl_create ggiocreate;
+
+ 	/*
+ 	 * All communication with ggate goes through /dev/ggctl, so open it
+ 	 * first.
+ 	 */
+ 	res->hr_ggatefd = open("/dev/" G_GATE_CTL_NAME, O_RDWR);
+ 	if (res->hr_ggatefd == -1)
+ 		primary_exit(EX_OSFILE, "Unable to open /dev/" G_GATE_CTL_NAME);
+
+ 	/*
+ 	 * The provider is created before any connection attempt: a failed
+ 	 * connection is not critical, but it may take some time.
+ 	 */
+ 	bzero(&ggiocreate, sizeof(ggiocreate));
+ 	ggiocreate.gctl_version = G_GATE_VERSION;
+ 	ggiocreate.gctl_unit = G_GATE_NAME_GIVEN;
+ 	ggiocreate.gctl_mediasize = res->hr_datasize;
+ 	ggiocreate.gctl_sectorsize = res->hr_local_sectorsize;
+ 	ggiocreate.gctl_flags = 0;
+ 	ggiocreate.gctl_maxcount = 0;
+ 	ggiocreate.gctl_timeout = 0;
+ 	snprintf(ggiocreate.gctl_name, sizeof(ggiocreate.gctl_name), "hast/%s",
+ 	    res->hr_provname);
+
+ 	if (ioctl(res->hr_ggatefd, G_GATE_CMD_CREATE, &ggiocreate) != 0) {
+ 		if (errno != EEXIST) {
+ 			primary_exit(EX_OSERR, "Unable to create hast/%s device",
+ 			    res->hr_provname);
+ 		}
+ 		pjdlog_debug(1,
+ 		    "Device hast/%s already exists, we will try to take it over.",
+ 		    res->hr_provname);
+ 		/*
+ 		 * EEXIST is taken to mean that the process which created the
+ 		 * provider died without cleaning up.  In that case we carry
+ 		 * on from where it left off.
+ 		 */
+ 		bzero(&ggiocancel, sizeof(ggiocancel));
+ 		ggiocancel.gctl_version = G_GATE_VERSION;
+ 		ggiocancel.gctl_unit = G_GATE_NAME_GIVEN;
+ 		snprintf(ggiocancel.gctl_name, sizeof(ggiocancel.gctl_name),
+ 		    "hast/%s", res->hr_provname);
+ 		if (ioctl(res->hr_ggatefd, G_GATE_CMD_CANCEL,
+ 		    &ggiocancel) != 0) {
+ 			primary_exit(EX_OSERR,
+ 			    "Unable to take over hast/%s device",
+ 			    res->hr_provname);
+ 		} else {
+ 			pjdlog_info("Device hast/%s recovered.",
+ 			    res->hr_provname);
+ 			res->hr_ggateunit = ggiocancel.gctl_unit;
+ 		}
+ 	} else {
+ 		pjdlog_info("Device hast/%s created.", res->hr_provname);
+ 		res->hr_ggateunit = ggiocreate.gctl_unit;
+ 	}
+ }
+
+ /*
+  * Launch the worker process for a resource acting as primary.
+  *
+  * Three socketpair channels are created first (control: parent -> child,
+  * events: child -> parent, connection requests: child -> parent).  Failing
+  * to create any of them, or failing to fork, terminates the process through
+  * pjdlog_exit().
+  *
+  * Parent: declares its direction on each channel, records the child's PID
+  * in res->hr_workerpid and returns.
+  *
+  * Child: calls init_local(), init_ggate() and init_environment(), drops
+  * privileges, starts the guard and control threads, tries to connect to the
+  * remote node (waiting up to res->hr_timeout seconds while it reports
+  * EBUSY), starts the I/O threads and finally runs sync_thread() on the
+  * calling thread.
+  */
+ void
+ hastd_primary(struct hast_resource *res)
+ {
+ 	pthread_t td;
+ 	pid_t pid;
+ 	int error, mode, debuglevel;
+
+ 	/*
+ 	 * Create communication channel for sending control commands from
+ 	 * parent to child.
+ 	 */
+ 	if (proto_client(NULL, "socketpair://", &res->hr_ctrl) == -1) {
+ 		/* TODO: There's no need for this to be fatal error. */
+ 		KEEP_ERRNO((void)pidfile_remove(pfh));
+ 		pjdlog_exit(EX_OSERR,
+ 		    "Unable to create control sockets between parent and child");
+ 	}
+ 	/*
+ 	 * Create communication channel for sending events from child to parent.
+ 	 */
+ 	if (proto_client(NULL, "socketpair://", &res->hr_event) == -1) {
+ 		/* TODO: There's no need for this to be fatal error. */
+ 		KEEP_ERRNO((void)pidfile_remove(pfh));
+ 		pjdlog_exit(EX_OSERR,
+ 		    "Unable to create event sockets between child and parent");
+ 	}
+ 	/*
+ 	 * Create communication channel for sending connection requests from
+ 	 * child to parent.
+ 	 */
+ 	if (proto_client(NULL, "socketpair://", &res->hr_conn) == -1) {
+ 		/* TODO: There's no need for this to be fatal error. */
+ 		KEEP_ERRNO((void)pidfile_remove(pfh));
+ 		pjdlog_exit(EX_OSERR,
+ 		    "Unable to create connection sockets between child and parent");
+ 	}
+
+ 	pid = fork();
+ 	if (pid == -1) {
+ 		/* TODO: There's no need for this to be fatal error. */
+ 		KEEP_ERRNO((void)pidfile_remove(pfh));
+ 		pjdlog_exit(EX_TEMPFAIL, "Unable to fork");
+ 	}
+
+ 	if (pid > 0) {
+ 		/* This is parent. */
+ 		/* Declare that we are receiver. */
+ 		proto_recv(res->hr_event, NULL, 0);
+ 		proto_recv(res->hr_conn, NULL, 0);
+ 		/* Declare that we are sender. */
+ 		proto_send(res->hr_ctrl, NULL, 0);
+ 		res->hr_workerpid = pid;
+ 		return;
+ 	}
+
+ 	/* Everything below runs in the child (worker) process. */
+ 	gres = res;
+ 	/* Remember the logging setup so it can be re-applied below. */
+ 	mode = pjdlog_mode_get();
+ 	debuglevel = pjdlog_debug_get();
+
+ 	/* Declare that we are sender. */
+ 	proto_send(res->hr_event, NULL, 0);
+ 	proto_send(res->hr_conn, NULL, 0);
+ 	/* Declare that we are receiver. */
+ 	proto_recv(res->hr_ctrl, NULL, 0);
+ 	descriptors_cleanup(res);
+
+ 	descriptors_assert(res, mode);
+
+ 	pjdlog_init(mode);
+ 	pjdlog_debug_set(debuglevel);
+ 	pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role));
+ 	setproctitle("%s (%s)", res->hr_name, role2str(res->hr_role));
+
+ 	init_local(res);
+ 	init_ggate(res);
+ 	init_environment(res);
+
+ 	if (drop_privs(res) != 0) {
+ 		cleanup(res);
+ 		exit(EX_CONFIG);
+ 	}
+ 	pjdlog_info("Privileges successfully dropped.");
+
+ 	/*
+ 	 * Create the guard thread first, so we can handle signals from the
+ 	 * very beginning.
+ 	 */
+ 	error = pthread_create(&td, NULL, guard_thread, res);
+ 	PJDLOG_ASSERT(error == 0);
+ 	/*
+ 	 * Create the control thread before sending any event to the parent,
+ 	 * as we can deadlock when parent sends control request to worker,
+ 	 * but worker has no control thread started yet, so parent waits.
+ 	 * In the meantime worker sends an event to the parent, but parent
+ 	 * is unable to handle the event, because it waits for control
+ 	 * request response.
+ 	 */
+ 	error = pthread_create(&td, NULL, ctrl_thread, res);
+ 	PJDLOG_ASSERT(error == 0);
+ 	if (real_remote(res)) {
+ 		error = init_remote(res, NULL, NULL);
+ 		if (error == 0) {
+ 			sync_start();
+ 		} else if (error == EBUSY) {
+ 			time_t start = time(NULL);
+
+ 			pjdlog_warning("Waiting for remote node to become %s for %ds.",
+ 			    role2str(HAST_ROLE_SECONDARY),
+ 			    res->hr_timeout);
+ 			/*
+ 			 * Retry once per second until the remote node stops
+ 			 * answering EBUSY or the timeout expires.
+ 			 */
+ 			for (;;) {
+ 				sleep(1);
+ 				error = init_remote(res, NULL, NULL);
+ 				if (error != EBUSY)
+ 					break;
+ 				if (time(NULL) > start + res->hr_timeout)
+ 					break;
+ 			}
+ 			if (error == 0) {
+ 				/*
+ 				 * The connection was established while
+ 				 * waiting.  Start synchronization exactly as
+ 				 * on the immediate-success path above;
+ 				 * otherwise the connected node would never be
+ 				 * synchronized from here.
+ 				 */
+ 				sync_start();
+ 			} else if (error == EBUSY) {
+ 				pjdlog_warning("Remote node is still %s, starting anyway.",
+ 				    role2str(HAST_ROLE_PRIMARY));
+ 			}
+ 		}
+ 	}
+ 	error = pthread_create(&td, NULL, ggate_recv_thread, res);
+ 	PJDLOG_ASSERT(error == 0);
+ 	error = pthread_create(&td, NULL, local_send_thread, res);
+ 	PJDLOG_ASSERT(error == 0);
+ 	error = pthread_create(&td, NULL, remote_send_thread, res);
+ 	PJDLOG_ASSERT(error == 0);
+ 	error = pthread_create(&td, NULL, remote_recv_thread, res);
+ 	PJDLOG_ASSERT(error == 0);
+ 	error = pthread_create(&td, NULL, ggate_send_thread, res);
+ 	PJDLOG_ASSERT(error == 0);
+ 	fullystarted = true;
+ 	/* The calling thread becomes the synchronization thread. */
+ 	(void)sync_thread(res);
+ }
+
+ /*
+  * Log a message about a ggate request.
+  *
+  * The caller's printf-style prefix (fmt and its arguments) is formatted
+  * first; a description of the request (command name plus offset and length
+  * where applicable) is then appended, and the result is handed to
+  * pjdlog_common() with the given loglevel and debuglevel.
+  */
+ static void
+ reqlog(int loglevel, int debuglevel, struct g_gate_ctl_io *ggio,
+     const char *fmt, ...)
+ {
+ 	char msg[1024];
+ 	uintmax_t off, len;
+ 	va_list ap;
+
+ 	/* Caller-supplied prefix. */
+ 	va_start(ap, fmt);
+ 	(void)vsnprintf(msg, sizeof(msg), fmt, ap);
+ 	va_end(ap);
+
+ 	/* Request description. */
+ 	off = (uintmax_t)ggio->gctl_offset;
+ 	len = (uintmax_t)ggio->gctl_length;
+ 	switch (ggio->gctl_cmd) {
+ 	case BIO_READ:
+ 		(void)snprlcat(msg, sizeof(msg), "READ(%ju, %ju).", off, len);
+ 		break;
+ 	case BIO_WRITE:
+ 		(void)snprlcat(msg, sizeof(msg), "WRITE(%ju, %ju).", off, len);
+ 		break;
+ 	case BIO_DELETE:
+ 		(void)snprlcat(msg, sizeof(msg), "DELETE(%ju, %ju).", off, len);
+ 		break;
+ 	case BIO_FLUSH:
+ 		(void)snprlcat(msg, sizeof(msg), "FLUSH.");
+ 		break;
+ 	default:
+ 		(void)snprlcat(msg, sizeof(msg), "UNKNOWN(%u).",
+ 		    (unsigned int)ggio->gctl_cmd);
+ 		break;
+ 	}
+ 	pjdlog_common(loglevel, debuglevel, -1, "%s", msg);
+ }
+
+ /*
+  * Close both connections to the remote node, if they are still open.
+  *
+  * The write lock on hio_remote_lock[ncomp] is taken for the duration of the
+  * close.  When a connection was actually closed, the disconnect is logged,
+  * synchronization is stopped and EVENT_DISCONNECT is sent.
+  */
+ static void
+ remote_close(struct hast_resource *res, int ncomp)
+ {
+ 	bool connected;
+
+ 	rw_wlock(&hio_remote_lock[ncomp]);
+ 	/*
+ 	 * Between dropping the read lock and acquiring the write lock
+ 	 * another thread may already have closed the connection.
+ 	 */
+ 	connected = ISCONNECTED(res, ncomp);
+ 	if (!connected) {
+ 		PJDLOG_ASSERT(res->hr_remotein == NULL);
+ 		PJDLOG_ASSERT(res->hr_remoteout == NULL);
+ 	} else {
+ 		PJDLOG_ASSERT(res->hr_remotein != NULL);
+ 		PJDLOG_ASSERT(res->hr_remoteout != NULL);
+
+ 		pjdlog_debug(2, "Closing incoming connection to %s.",
+ 		    res->hr_remoteaddr);
+ 		proto_close(res->hr_remotein);
+ 		res->hr_remotein = NULL;
+ 		pjdlog_debug(2, "Closing outgoing connection to %s.",
+ 		    res->hr_remoteaddr);
+ 		proto_close(res->hr_remoteout);
+ 		res->hr_remoteout = NULL;
+ 	}
+ 	rw_unlock(&hio_remote_lock[ncomp]);
+
+ 	if (!connected)
+ 		return;
+
+ 	pjdlog_warning("Disconnected from %s.", res->hr_remoteaddr);
+
+ 	/* An ongoing synchronization cannot continue without the remote. */
+ 	sync_stop();
+
+ 	event_send(res, EVENT_DISCONNECT);
+ }
+
+/*
+ * Acknowledge write completion to the kernel, but don't update activemap yet.
+ */
+static void
+write_complete(struct hast_resource *res, struct hio *hio)
+{
+ struct g_gate_ctl_io *ggio;
+ unsigned int ncomp;
+
+ PJDLOG_ASSERT(!hio->hio_done);
+
+ ggio = &hio->hio_ggio;
+ PJDLOG_ASSERT(ggio->gctl_cmd == BIO_WRITE);
+
+ /*
+ * Bump local count if this is first write after
+ * connection failure with remote node.
+ */
+ ncomp = 1;
+ rw_rlock(&hio_remote_lock[ncomp]);
+ if (!ISCONNECTED(res, ncomp)) {
+ mtx_lock(&metadata_lock);
+ if (res->hr_primary_localcnt == res->hr_secondary_remotecnt) {
+ res->hr_primary_localcnt++;
+ pjdlog_debug(1, "Increasing localcnt to %ju.",
+ (uintmax_t)res->hr_primary_localcnt);
+ (void)metadata_write(res);
+ }
+ mtx_unlock(&metadata_lock);
+ }
+ rw_unlock(&hio_remote_lock[ncomp]);
+ if (ioctl(res->hr_ggatefd, G_GATE_CMD_DONE, ggio) == -1)
+ primary_exit(EX_OSERR, "G_GATE_CMD_DONE failed");
+ hio->hio_done = true;
+}
+
+/*
+ * Thread receives ggate I/O requests from the kernel and passes them to
+ * appropriate threads:
+ * WRITE - always goes to both local_send and remote_send threads
+ * READ (when the block is up-to-date on local component) -
+ * only local_send thread
+ * READ (when the block isn't up-to-date on local component) -
+ * only remote_send thread
+ * DELETE - always goes to both local_send and remote_send threads
+ * FLUSH - always goes to both local_send and remote_send threads
+ */
+static void *
+ggate_recv_thread(void *arg)
+{
+ struct hast_resource *res = arg;
+ struct g_gate_ctl_io *ggio;
+ struct hio *hio;
+ unsigned int ii, ncomp, ncomps;
+ int error;
+
+ for (;;) {
+ pjdlog_debug(2, "ggate_recv: Taking free request.");
+ QUEUE_TAKE2(hio, free);
+ pjdlog_debug(2, "ggate_recv: (%p) Got free request.", hio);
+ ggio = &hio->hio_ggio;
+ ggio->gctl_unit = res->hr_ggateunit;
+ ggio->gctl_length = MAXPHYS;
+ ggio->gctl_error = 0;
+ hio->hio_done = false;
+ hio->hio_replication = res->hr_replication;
+ pjdlog_debug(2,
+ "ggate_recv: (%p) Waiting for request from the kernel.",
+ hio);
+ if (ioctl(res->hr_ggatefd, G_GATE_CMD_START, ggio) == -1) {
+ if (sigexit_received)
+ pthread_exit(NULL);
+ primary_exit(EX_OSERR, "G_GATE_CMD_START failed");
+ }
+ error = ggio->gctl_error;
+ switch (error) {
+ case 0:
+ break;
+ case ECANCELED:
+ /* Exit gracefully. */
+ if (!sigexit_received) {
+ pjdlog_debug(2,
+ "ggate_recv: (%p) Received cancel from the kernel.",
+ hio);
+ pjdlog_info("Received cancel from the kernel, exiting.");
+ }
+ pthread_exit(NULL);
+ case ENOMEM:
+ /*
+ * Buffer too small? Impossible, we allocate MAXPHYS
+ * bytes - request can't be bigger than that.
+ */
+ /* FALLTHROUGH */
+ case ENXIO:
+ default:
+ primary_exitx(EX_OSERR, "G_GATE_CMD_START failed: %s.",
+ strerror(error));
+ }
+
+ ncomp = 0;
+ ncomps = HAST_NCOMPONENTS;
+
+ for (ii = 0; ii < ncomps; ii++)
+ hio->hio_errors[ii] = EINVAL;
+ reqlog(LOG_DEBUG, 2, ggio,
+ "ggate_recv: (%p) Request received from the kernel: ",
+ hio);
+
+ /*
+ * Inform all components about new write request.
+ * For read request prefer local component unless the given
+ * range is out-of-date, then use remote component.
+ */
+ switch (ggio->gctl_cmd) {
+ case BIO_READ:
+ res->hr_stat_read++;
+ ncomps = 1;
+ mtx_lock(&metadata_lock);
+ if (res->hr_syncsrc == HAST_SYNCSRC_UNDEF ||
+ res->hr_syncsrc == HAST_SYNCSRC_PRIMARY) {
+ /*
+ * This range is up-to-date on local component,
+ * so handle request locally.
+ */
+ /* Local component is 0 for now. */
+ ncomp = 0;
+ } else /* if (res->hr_syncsrc ==
+ HAST_SYNCSRC_SECONDARY) */ {
+ PJDLOG_ASSERT(res->hr_syncsrc ==
+ HAST_SYNCSRC_SECONDARY);
+ /*
+ * This range is out-of-date on local component,
+ * so send request to the remote node.
+ */
+ /* Remote component is 1 for now. */
+ ncomp = 1;
+ }
+ mtx_unlock(&metadata_lock);
+ break;
+ case BIO_WRITE:
+ res->hr_stat_write++;
+ if (res->hr_resuid == 0 &&
+ res->hr_primary_localcnt == 0) {
+ /* This is first write. */
+ res->hr_primary_localcnt = 1;
+ }
+ for (;;) {
+ mtx_lock(&range_lock);
+ if (rangelock_islocked(range_sync,
+ ggio->gctl_offset, ggio->gctl_length)) {
+ pjdlog_debug(2,
+ "regular: Range offset=%jd length=%zu locked.",
+ (intmax_t)ggio->gctl_offset,
+ (size_t)ggio->gctl_length);
+ range_regular_wait = true;
+ cv_wait(&range_regular_cond, &range_lock);
+ range_regular_wait = false;
+ mtx_unlock(&range_lock);
+ continue;
+ }
+ if (rangelock_add(range_regular,
+ ggio->gctl_offset, ggio->gctl_length) == -1) {
+ mtx_unlock(&range_lock);
+ pjdlog_debug(2,
+ "regular: Range offset=%jd length=%zu is already locked, waiting.",
+ (intmax_t)ggio->gctl_offset,
+ (size_t)ggio->gctl_length);
+ sleep(1);
+ continue;
+ }
+ mtx_unlock(&range_lock);
+ break;
+ }
+ mtx_lock(&res->hr_amp_lock);
+ if (activemap_write_start(res->hr_amp,
+ ggio->gctl_offset, ggio->gctl_length)) {
+ res->hr_stat_activemap_update++;
+ (void)hast_activemap_flush(res);
+ } else {
+ mtx_unlock(&res->hr_amp_lock);
+ }
+ break;
+ case BIO_DELETE:
+ res->hr_stat_delete++;
+ break;
+ case BIO_FLUSH:
+ res->hr_stat_flush++;
+ break;
+ }
+ pjdlog_debug(2,
+ "ggate_recv: (%p) Moving request to the send queues.", hio);
+ if (hio->hio_replication == HAST_REPLICATION_MEMSYNC &&
+ ggio->gctl_cmd == BIO_WRITE) {
+ /* Each remote request needs two responses in memsync. */
+ refcnt_init(&hio->hio_countdown, ncomps + 1);
+ } else {
+ refcnt_init(&hio->hio_countdown, ncomps);
+ }
+ for (ii = ncomp; ii < ncomps; ii++)
+ QUEUE_INSERT1(hio, send, ii);
+ }
+ /* NOTREACHED */
+ return (NULL);
+}
+
+/*
+ * Thread reads from or writes to local component.
+ * If local read fails, it redirects it to remote_send thread.
+ */
+static void *
+local_send_thread(void *arg)
+{
+ struct hast_resource *res = arg;
+ struct g_gate_ctl_io *ggio;
+ struct hio *hio;
+ unsigned int ncomp, rncomp;
+ ssize_t ret;
+
+ /* Local component is 0 for now. */
+ ncomp = 0;
+ /* Remote component is 1 for now. */
+ rncomp = 1;
+
+ for (;;) {
+ pjdlog_debug(2, "local_send: Taking request.");
+ QUEUE_TAKE1(hio, send, ncomp, 0);
+ pjdlog_debug(2, "local_send: (%p) Got request.", hio);
+ ggio = &hio->hio_ggio;
+ switch (ggio->gctl_cmd) {
+ case BIO_READ:
+ ret = pread(res->hr_localfd, ggio->gctl_data,
+ ggio->gctl_length,
+ ggio->gctl_offset + res->hr_localoff);
+ if (ret == ggio->gctl_length)
+ hio->hio_errors[ncomp] = 0;
+ else if (!ISSYNCREQ(hio)) {
+ /*
+ * If READ failed, try to read from remote node.
+ */
+ if (ret == -1) {
+ reqlog(LOG_WARNING, 0, ggio,
+ "Local request failed (%s), trying remote node. ",
+ strerror(errno));
+ } else if (ret != ggio->gctl_length) {
+ reqlog(LOG_WARNING, 0, ggio,
+ "Local request failed (%zd != %jd), trying remote node. ",
+ ret, (intmax_t)ggio->gctl_length);
+ }
+ QUEUE_INSERT1(hio, send, rncomp);
+ continue;
+ }
+ break;
+ case BIO_WRITE:
+ ret = pwrite(res->hr_localfd, ggio->gctl_data,
+ ggio->gctl_length,
+ ggio->gctl_offset + res->hr_localoff);
+ if (ret == -1) {
+ hio->hio_errors[ncomp] = errno;
+ reqlog(LOG_WARNING, 0, ggio,
+ "Local request failed (%s): ",
+ strerror(errno));
+ } else if (ret != ggio->gctl_length) {
+ hio->hio_errors[ncomp] = EIO;
+ reqlog(LOG_WARNING, 0, ggio,
+ "Local request failed (%zd != %jd): ",
+ ret, (intmax_t)ggio->gctl_length);
+ } else {
+ hio->hio_errors[ncomp] = 0;
+ if (hio->hio_replication ==
+ HAST_REPLICATION_ASYNC) {
+ ggio->gctl_error = 0;
+ write_complete(res, hio);
+ }
+ }
+ break;
+ case BIO_DELETE:
+ ret = g_delete(res->hr_localfd,
+ ggio->gctl_offset + res->hr_localoff,
+ ggio->gctl_length);
+ if (ret == -1) {
+ hio->hio_errors[ncomp] = errno;
+ reqlog(LOG_WARNING, 0, ggio,
+ "Local request failed (%s): ",
+ strerror(errno));
+ } else {
+ hio->hio_errors[ncomp] = 0;
+ }
+ break;
+ case BIO_FLUSH:
+ if (!res->hr_localflush) {
+ ret = -1;
+ errno = EOPNOTSUPP;
+ break;
+ }
+ ret = g_flush(res->hr_localfd);
+ if (ret == -1) {
+ if (errno == EOPNOTSUPP)
+ res->hr_localflush = false;
+ hio->hio_errors[ncomp] = errno;
+ reqlog(LOG_WARNING, 0, ggio,
+ "Local request failed (%s): ",
+ strerror(errno));
+ } else {
+ hio->hio_errors[ncomp] = 0;
+ }
+ break;
+ }
+
+ if (hio->hio_replication != HAST_REPLICATION_MEMSYNC ||
+ ggio->gctl_cmd != BIO_WRITE || ISSYNCREQ(hio)) {
+ if (refcnt_release(&hio->hio_countdown) > 0)
+ continue;
+ } else {
+ /*
+ * Depending on hio_countdown value, requests finished
+ * in the following order:
+ * 0: remote memsync, remote final, local write
+ * 1: remote memsync, local write, (remote final)
+ * 2: local write, (remote memsync), (remote final)
+ */
+ switch (refcnt_release(&hio->hio_countdown)) {
+ case 0:
+ /*
+ * Local write finished as last.
+ */
+ break;
+ case 1:
+ /*
+ * Local write finished after remote memsync
+ * reply arrvied. We can complete the write now.
+ */
+ if (hio->hio_errors[0] == 0)
+ write_complete(res, hio);
+ continue;
+ case 2:
+ /*
+ * Local write finished as first.
+ */
+ continue;
+ default:
+ PJDLOG_ABORT("Invalid hio_countdown.");
+ }
+ }
+ if (ISSYNCREQ(hio)) {
+ mtx_lock(&sync_lock);
+ SYNCREQDONE(hio);
+ mtx_unlock(&sync_lock);
+ cv_signal(&sync_cond);
+ } else {
+ pjdlog_debug(2,
+ "local_send: (%p) Moving request to the done queue.",
+ hio);
+ QUEUE_INSERT2(hio, done);
+ }
+ }
+ /* NOTREACHED */
+ return (NULL);
+}
+
+ /*
+  * Send a keepalive (HIO_KEEPALIVE) header to the remote node.
+  *
+  * Nothing is sent when the remote component is not connected.  A failure to
+  * send the header closes the remote connection.
+  */
+ static void
+ keepalive_send(struct hast_resource *res, unsigned int ncomp)
+ {
+ 	struct nv *nv;
+
+ 	/* Keep the connection from disappearing while it is in use. */
+ 	rw_rlock(&hio_remote_lock[ncomp]);
+
+ 	if (!ISCONNECTED(res, ncomp)) {
+ 		rw_unlock(&hio_remote_lock[ncomp]);
+ 		return;
+ 	}
+
+ 	PJDLOG_ASSERT(res->hr_remotein != NULL);
+ 	PJDLOG_ASSERT(res->hr_remoteout != NULL);
+
+ 	nv = nv_alloc();
+ 	nv_add_uint8(nv, HIO_KEEPALIVE, "cmd");
+ 	if (nv_error(nv) != 0) {
+ 		rw_unlock(&hio_remote_lock[ncomp]);
+ 		nv_free(nv);
+ 		pjdlog_debug(1,
+ 		    "keepalive_send: Unable to prepare header to send.");
+ 		return;
+ 	}
+
+ 	if (hast_proto_send(res, res->hr_remoteout, nv, NULL, 0) != -1) {
+ 		/* Header is on its way. */
+ 		rw_unlock(&hio_remote_lock[ncomp]);
+ 		nv_free(nv);
+ 		pjdlog_debug(2, "keepalive_send: Request sent.");
+ 		return;
+ 	}
+
+ 	/* Sending failed: log it and drop the connection. */
+ 	rw_unlock(&hio_remote_lock[ncomp]);
+ 	pjdlog_common(LOG_DEBUG, 1, errno,
+ 	    "keepalive_send: Unable to send request");
+ 	nv_free(nv);
+ 	remote_close(res, ncomp);
+ }
+
+/*
+ * Thread sends request to secondary node.
+ */
+static void *
+remote_send_thread(void *arg)
+{
+ struct hast_resource *res = arg;
+ struct g_gate_ctl_io *ggio;
+ time_t lastcheck, now;
+ struct hio *hio;
+ struct nv *nv;
+ unsigned int ncomp;
+ bool wakeup;
+ uint64_t offset, length;
+ uint8_t cmd;
+ void *data;
+
+ /* Remote component is 1 for now. */
+ ncomp = 1;
+ lastcheck = time(NULL);
+
+ for (;;) {
+ pjdlog_debug(2, "remote_send: Taking request.");
+ QUEUE_TAKE1(hio, send, ncomp, HAST_KEEPALIVE);
+ if (hio == NULL) {
+ now = time(NULL);
+ if (lastcheck + HAST_KEEPALIVE <= now) {
+ keepalive_send(res, ncomp);
+ lastcheck = now;
+ }
+ continue;
+ }
+ pjdlog_debug(2, "remote_send: (%p) Got request.", hio);
+ ggio = &hio->hio_ggio;
+ switch (ggio->gctl_cmd) {
+ case BIO_READ:
+ cmd = HIO_READ;
+ data = NULL;
+ offset = ggio->gctl_offset;
+ length = ggio->gctl_length;
+ break;
+ case BIO_WRITE:
+ cmd = HIO_WRITE;
+ data = ggio->gctl_data;
+ offset = ggio->gctl_offset;
+ length = ggio->gctl_length;
+ break;
+ case BIO_DELETE:
+ cmd = HIO_DELETE;
+ data = NULL;
+ offset = ggio->gctl_offset;
+ length = ggio->gctl_length;
+ break;
+ case BIO_FLUSH:
+ cmd = HIO_FLUSH;
+ data = NULL;
+ offset = 0;
+ length = 0;
+ break;
+ default:
+ PJDLOG_ABORT("invalid condition");
+ }
+ nv = nv_alloc();
+ nv_add_uint8(nv, cmd, "cmd");
+ nv_add_uint64(nv, (uint64_t)ggio->gctl_seq, "seq");
+ nv_add_uint64(nv, offset, "offset");
+ nv_add_uint64(nv, length, "length");
+ if (hio->hio_replication == HAST_REPLICATION_MEMSYNC &&
+ ggio->gctl_cmd == BIO_WRITE && !ISSYNCREQ(hio)) {
+ nv_add_uint8(nv, 1, "memsync");
+ }
+ if (nv_error(nv) != 0) {
+ hio->hio_errors[ncomp] = nv_error(nv);
+ pjdlog_debug(2,
+ "remote_send: (%p) Unable to prepare header to send.",
+ hio);
+ reqlog(LOG_ERR, 0, ggio,
+ "Unable to prepare header to send (%s): ",
+ strerror(nv_error(nv)));
+ /* Move failed request immediately to the done queue. */
+ goto done_queue;
+ }
+ /*
+ * Protect connection from disappearing.
+ */
+ rw_rlock(&hio_remote_lock[ncomp]);
+ if (!ISCONNECTED(res, ncomp)) {
+ rw_unlock(&hio_remote_lock[ncomp]);
+ hio->hio_errors[ncomp] = ENOTCONN;
+ goto done_queue;
+ }
+ /*
+ * Move the request to recv queue before sending it, because
+ * in different order we can get reply before we move request
+ * to recv queue.
+ */
+ pjdlog_debug(2,
+ "remote_send: (%p) Moving request to the recv queue.",
+ hio);
+ mtx_lock(&hio_recv_list_lock[ncomp]);
+ wakeup = TAILQ_EMPTY(&hio_recv_list[ncomp]);
+ TAILQ_INSERT_TAIL(&hio_recv_list[ncomp], hio, hio_next[ncomp]);
+ mtx_unlock(&hio_recv_list_lock[ncomp]);
+ if (hast_proto_send(res, res->hr_remoteout, nv, data,
+ data != NULL ? length : 0) == -1) {
+ hio->hio_errors[ncomp] = errno;
+ rw_unlock(&hio_remote_lock[ncomp]);
+ pjdlog_debug(2,
+ "remote_send: (%p) Unable to send request.", hio);
+ reqlog(LOG_ERR, 0, ggio,
+ "Unable to send request (%s): ",
+ strerror(hio->hio_errors[ncomp]));
+ remote_close(res, ncomp);
+ /*
+ * Take request back from the receive queue and move
+ * it immediately to the done queue.
+ */
+ mtx_lock(&hio_recv_list_lock[ncomp]);
+ TAILQ_REMOVE(&hio_recv_list[ncomp], hio,
+ hio_next[ncomp]);
+ mtx_unlock(&hio_recv_list_lock[ncomp]);
+ goto done_queue;
+ }
+ rw_unlock(&hio_remote_lock[ncomp]);
+ nv_free(nv);
+ if (wakeup)
+ cv_signal(&hio_recv_list_cond[ncomp]);
+ continue;
+done_queue:
+ nv_free(nv);
+ if (ISSYNCREQ(hio)) {
+ if (refcnt_release(&hio->hio_countdown) > 0)
+ continue;
+ mtx_lock(&sync_lock);
+ SYNCREQDONE(hio);
+ mtx_unlock(&sync_lock);
+ cv_signal(&sync_cond);
+ continue;
+ }
+ if (ggio->gctl_cmd == BIO_WRITE) {
+ mtx_lock(&res->hr_amp_lock);
+ if (activemap_need_sync(res->hr_amp, ggio->gctl_offset,
+ ggio->gctl_length)) {
+ (void)hast_activemap_flush(res);
+ } else {
+ mtx_unlock(&res->hr_amp_lock);
+ }
+ if (hio->hio_replication == HAST_REPLICATION_MEMSYNC)
+ (void)refcnt_release(&hio->hio_countdown);
+ }
+ if (refcnt_release(&hio->hio_countdown) > 0)
+ continue;
+ pjdlog_debug(2,
+ "remote_send: (%p) Moving request to the done queue.",
+ hio);
+ QUEUE_INSERT2(hio, done);
+ }
+ /* NOTREACHED */
+ return (NULL);
+}
+
+/*
+ * Thread receives answer from secondary node and passes it to ggate_send
+ * thread.
+ */
+static void *
+remote_recv_thread(void *arg)
+{
+ struct hast_resource *res = arg;
+ struct g_gate_ctl_io *ggio;
+ struct hio *hio;
+ struct nv *nv;
+ unsigned int ncomp;
+ uint64_t seq;
+ bool memsyncack;
+ int error;
+
+ /* Remote component is 1 for now. */
+ ncomp = 1;
+
+ for (;;) {
+ /* Wait until there is anything to receive. */
+ mtx_lock(&hio_recv_list_lock[ncomp]);
+ while (TAILQ_EMPTY(&hio_recv_list[ncomp])) {
+ pjdlog_debug(2, "remote_recv: No requests, waiting.");
+ cv_wait(&hio_recv_list_cond[ncomp],
+ &hio_recv_list_lock[ncomp]);
+ }
+ mtx_unlock(&hio_recv_list_lock[ncomp]);
+
+ memsyncack = false;
+
+ rw_rlock(&hio_remote_lock[ncomp]);
+ if (!ISCONNECTED(res, ncomp)) {
+ rw_unlock(&hio_remote_lock[ncomp]);
+ /*
+ * Connection is dead, so move all pending requests to
+ * the done queue (one-by-one).
+ */
+ mtx_lock(&hio_recv_list_lock[ncomp]);
+ hio = TAILQ_FIRST(&hio_recv_list[ncomp]);
+ PJDLOG_ASSERT(hio != NULL);
+ TAILQ_REMOVE(&hio_recv_list[ncomp], hio,
+ hio_next[ncomp]);
+ mtx_unlock(&hio_recv_list_lock[ncomp]);
+ goto done_queue;
+ }
+ if (hast_proto_recv_hdr(res->hr_remotein, &nv) == -1) {
+ pjdlog_errno(LOG_ERR,
+ "Unable to receive reply header");
+ rw_unlock(&hio_remote_lock[ncomp]);
+ remote_close(res, ncomp);
+ continue;
+ }
+ rw_unlock(&hio_remote_lock[ncomp]);
+ seq = nv_get_uint64(nv, "seq");
+ if (seq == 0) {
+ pjdlog_error("Header contains no 'seq' field.");
+ nv_free(nv);
+ continue;
+ }
+ memsyncack = nv_exists(nv, "received");
+ mtx_lock(&hio_recv_list_lock[ncomp]);
+ TAILQ_FOREACH(hio, &hio_recv_list[ncomp], hio_next[ncomp]) {
+ if (hio->hio_ggio.gctl_seq == seq) {
+ TAILQ_REMOVE(&hio_recv_list[ncomp], hio,
+ hio_next[ncomp]);
+ break;
+ }
+ }
+ mtx_unlock(&hio_recv_list_lock[ncomp]);
+ if (hio == NULL) {
+ pjdlog_error("Found no request matching received 'seq' field (%ju).",
+ (uintmax_t)seq);
+ nv_free(nv);
+ continue;
+ }
+ ggio = &hio->hio_ggio;
+ error = nv_get_int16(nv, "error");
+ if (error != 0) {
+ /* Request failed on remote side. */
+ hio->hio_errors[ncomp] = error;
+ reqlog(LOG_WARNING, 0, ggio,
+ "Remote request failed (%s): ", strerror(error));
+ nv_free(nv);
+ goto done_queue;
+ }
+ switch (ggio->gctl_cmd) {
+ case BIO_READ:
+ rw_rlock(&hio_remote_lock[ncomp]);
+ if (!ISCONNECTED(res, ncomp)) {
+ rw_unlock(&hio_remote_lock[ncomp]);
+ nv_free(nv);
+ goto done_queue;
+ }
+ if (hast_proto_recv_data(res, res->hr_remotein, nv,
+ ggio->gctl_data, ggio->gctl_length) == -1) {
+ hio->hio_errors[ncomp] = errno;
+ pjdlog_errno(LOG_ERR,
+ "Unable to receive reply data");
+ rw_unlock(&hio_remote_lock[ncomp]);
+ nv_free(nv);
+ remote_close(res, ncomp);
+ goto done_queue;
+ }
+ rw_unlock(&hio_remote_lock[ncomp]);
+ break;
+ case BIO_WRITE:
+ case BIO_DELETE:
+ case BIO_FLUSH:
+ break;
+ default:
+ PJDLOG_ABORT("invalid condition");
+ }
+ hio->hio_errors[ncomp] = 0;
+ nv_free(nv);
+done_queue:
+ if (hio->hio_replication != HAST_REPLICATION_MEMSYNC ||
+ hio->hio_ggio.gctl_cmd != BIO_WRITE || ISSYNCREQ(hio)) {
+ if (refcnt_release(&hio->hio_countdown) > 0)
+ continue;
+ } else {
+ /*
+ * Depending on hio_countdown value, requests finished
+ * in the following order:
+ *
+ * 0: local write, remote memsync, remote final
+ * or
+ * 0: remote memsync, local write, remote final
+ *
+ * 1: local write, remote memsync, (remote final)
+ * or
+ * 1: remote memsync, remote final, (local write)
+ *
+ * 2: remote memsync, (local write), (remote final)
+ * or
+ * 2: remote memsync, (remote final), (local write)
+ */
+ switch (refcnt_release(&hio->hio_countdown)) {
+ case 0:
+ /*
+ * Remote final reply arrived.
+ */
+ PJDLOG_ASSERT(!memsyncack);
+ break;
+ case 1:
+ if (memsyncack) {
+ /*
+ * Local request already finished, so we
+ * can complete the write.
+ */
+ if (hio->hio_errors[0] == 0)
+ write_complete(res, hio);
+ /*
+ * We still need to wait for final
+ * remote reply.
+ */
+ pjdlog_debug(2,
+ "remote_recv: (%p) Moving request back to the recv queue.",
+ hio);
+ mtx_lock(&hio_recv_list_lock[ncomp]);
+ TAILQ_INSERT_TAIL(&hio_recv_list[ncomp],
+ hio, hio_next[ncomp]);
+ mtx_unlock(&hio_recv_list_lock[ncomp]);
+ } else {
+ /*
+ * Remote final reply arrived before
+ * local write finished.
+ * Nothing to do in such case.
+ */
+ }
+ continue;
+ case 2:
+ /*
+ * We received remote memsync reply even before
+ * local write finished.
+ */
+ PJDLOG_ASSERT(memsyncack);
+
+ pjdlog_debug(2,
+ "remote_recv: (%p) Moving request back to the recv queue.",
+ hio);
+ mtx_lock(&hio_recv_list_lock[ncomp]);
+ TAILQ_INSERT_TAIL(&hio_recv_list[ncomp], hio,
+ hio_next[ncomp]);
+ mtx_unlock(&hio_recv_list_lock[ncomp]);
+ continue;
+ default:
+ PJDLOG_ABORT("Invalid hio_countdown.");
+ }
+ }
+ if (ISSYNCREQ(hio)) {
+ mtx_lock(&sync_lock);
+ SYNCREQDONE(hio);
+ mtx_unlock(&sync_lock);
+ cv_signal(&sync_cond);
+ } else {
+ pjdlog_debug(2,
+ "remote_recv: (%p) Moving request to the done queue.",
+ hio);
+ QUEUE_INSERT2(hio, done);
+ }
+ }
+ /* NOTREACHED */
+ return (NULL);
+}
+
+/*
+ * Thread sends answer to the kernel.
+ */
+static void *
+ggate_send_thread(void *arg)
+{
+ struct hast_resource *res = arg;
+ struct g_gate_ctl_io *ggio;
+ struct hio *hio;
+ unsigned int ii, ncomps;
+
+ ncomps = HAST_NCOMPONENTS;
+
+ for (;;) {
+ pjdlog_debug(2, "ggate_send: Taking request.");
+ QUEUE_TAKE2(hio, done);
+ pjdlog_debug(2, "ggate_send: (%p) Got request.", hio);
+ ggio = &hio->hio_ggio;
+ for (ii = 0; ii < ncomps; ii++) {
+ if (hio->hio_errors[ii] == 0) {
+ /*
+ * One successful request is enough to declare
+ * success.
+ */
+ ggio->gctl_error = 0;
+ break;
+ }
+ }
+ if (ii == ncomps) {
+ /*
+ * None of the requests were successful.
+ * Use the error from local component except the
+ * case when we did only remote request.
+ */
+ if (ggio->gctl_cmd == BIO_READ &&
+ res->hr_syncsrc == HAST_SYNCSRC_SECONDARY)
+ ggio->gctl_error = hio->hio_errors[1];
+ else
+ ggio->gctl_error = hio->hio_errors[0];
+ }
+ if (ggio->gctl_error == 0 && ggio->gctl_cmd == BIO_WRITE) {
+ mtx_lock(&res->hr_amp_lock);
+ if (activemap_write_complete(res->hr_amp,
+ ggio->gctl_offset, ggio->gctl_length)) {
+ res->hr_stat_activemap_update++;
+ (void)hast_activemap_flush(res);
+ } else {
+ mtx_unlock(&res->hr_amp_lock);
+ }
+ }
+ if (ggio->gctl_cmd == BIO_WRITE) {
+ /*
+ * Unlock range we locked.
+ */
+ mtx_lock(&range_lock);
+ rangelock_del(range_regular, ggio->gctl_offset,
+ ggio->gctl_length);
+ if (range_sync_wait)
+ cv_signal(&range_sync_cond);
+ mtx_unlock(&range_lock);
+ if (!hio->hio_done)
+ write_complete(res, hio);
+ } else {
+ if (ioctl(res->hr_ggatefd, G_GATE_CMD_DONE, ggio) == -1) {
+ primary_exit(EX_OSERR,
+ "G_GATE_CMD_DONE failed");
+ }
+ }
+ if (hio->hio_errors[0]) {
+ switch (ggio->gctl_cmd) {
+ case BIO_READ:
+ res->hr_stat_read_error++;
+ break;
+ case BIO_WRITE:
+ res->hr_stat_write_error++;
+ break;
+ case BIO_DELETE:
+ res->hr_stat_delete_error++;
+ break;
+ case BIO_FLUSH:
+ res->hr_stat_flush_error++;
+ break;
+ }
+ }
+ pjdlog_debug(2,
+ "ggate_send: (%p) Moving request to the free queue.", hio);
+ QUEUE_INSERT2(hio, free);
+ }
+ /* NOTREACHED */
+ return (NULL);
+}
+
+/*
+ * Thread synchronize local and remote components.
+ */
+static void *
+sync_thread(void *arg __unused)
+{
+ struct hast_resource *res = arg;
+ struct hio *hio;
+ struct g_gate_ctl_io *ggio;
+ struct timeval tstart, tend, tdiff;
+ unsigned int ii, ncomp, ncomps;
+ off_t offset, length, synced;
+ bool dorewind, directreads;
+ int syncext;
+
+ ncomps = HAST_NCOMPONENTS;
+ dorewind = true;
+ synced = 0;
+ offset = -1;
+ directreads = false;
+
+ for (;;) {
+ mtx_lock(&sync_lock);
+ if (offset >= 0 && !sync_inprogress) {
+ gettimeofday(&tend, NULL);
+ timersub(&tend, &tstart, &tdiff);
+ pjdlog_info("Synchronization interrupted after %#.0T. "
+ "%NB synchronized so far.", &tdiff,
+ (intmax_t)synced);
+ event_send(res, EVENT_SYNCINTR);
+ }
+ while (!sync_inprogress) {
+ dorewind = true;
+ synced = 0;
+ cv_wait(&sync_cond, &sync_lock);
+ }
+ mtx_unlock(&sync_lock);
+ /*
+ * Obtain offset at which we should synchronize.
+ * Rewind synchronization if needed.
+ */
+ mtx_lock(&res->hr_amp_lock);
+ if (dorewind)
+ activemap_sync_rewind(res->hr_amp);
+ offset = activemap_sync_offset(res->hr_amp, &length, &syncext);
+ if (syncext != -1) {
+ /*
+ * We synchronized entire syncext extent, we can mark
+ * it as clean now.
+ */
+ if (activemap_extent_complete(res->hr_amp, syncext))
+ (void)hast_activemap_flush(res);
+ else
+ mtx_unlock(&res->hr_amp_lock);
+ } else {
+ mtx_unlock(&res->hr_amp_lock);
+ }
+ if (dorewind) {
+ dorewind = false;
+ if (offset == -1)
+ pjdlog_info("Nodes are in sync.");
+ else {
+ pjdlog_info("Synchronization started. %NB to go.",
+ (intmax_t)(res->hr_extentsize *
+ activemap_ndirty(res->hr_amp)));
+ event_send(res, EVENT_SYNCSTART);
+ gettimeofday(&tstart, NULL);
+ }
+ }
+ if (offset == -1) {
+ sync_stop();
+ pjdlog_debug(1, "Nothing to synchronize.");
+ /*
+ * Synchronization complete, make both localcnt and
+ * remotecnt equal.
+ */
+ ncomp = 1;
+ rw_rlock(&hio_remote_lock[ncomp]);
+ if (ISCONNECTED(res, ncomp)) {
+ if (synced > 0) {
+ int64_t bps;
+
+ gettimeofday(&tend, NULL);
+ timersub(&tend, &tstart, &tdiff);
+ bps = (int64_t)((double)synced /
+ ((double)tdiff.tv_sec +
+ (double)tdiff.tv_usec / 1000000));
+ pjdlog_info("Synchronization complete. "
+ "%NB synchronized in %#.0lT (%NB/sec).",
+ (intmax_t)synced, &tdiff,
+ (intmax_t)bps);
+ event_send(res, EVENT_SYNCDONE);
+ }
+ mtx_lock(&metadata_lock);
+ if (res->hr_syncsrc == HAST_SYNCSRC_SECONDARY)
+ directreads = true;
+ res->hr_syncsrc = HAST_SYNCSRC_UNDEF;
+ res->hr_primary_localcnt =
+ res->hr_secondary_remotecnt;
+ res->hr_primary_remotecnt =
+ res->hr_secondary_localcnt;
+ pjdlog_debug(1,
+ "Setting localcnt to %ju and remotecnt to %ju.",
+ (uintmax_t)res->hr_primary_localcnt,
+ (uintmax_t)res->hr_primary_remotecnt);
+ (void)metadata_write(res);
+ mtx_unlock(&metadata_lock);
+ }
+ rw_unlock(&hio_remote_lock[ncomp]);
+ if (directreads) {
+ directreads = false;
+ enable_direct_reads(res);
+ }
+ continue;
+ }
+ pjdlog_debug(2, "sync: Taking free request.");
+ QUEUE_TAKE2(hio, free);
+ pjdlog_debug(2, "sync: (%p) Got free request.", hio);
+ /*
+ * Lock the range we are going to synchronize. We don't want
+ * race where someone writes between our read and write.
+ */
+ for (;;) {
+ mtx_lock(&range_lock);
+ if (rangelock_islocked(range_regular, offset, length)) {
+ pjdlog_debug(2,
+ "sync: Range offset=%jd length=%jd locked.",
+ (intmax_t)offset, (intmax_t)length);
+ range_sync_wait = true;
+ cv_wait(&range_sync_cond, &range_lock);
+ range_sync_wait = false;
+ mtx_unlock(&range_lock);
+ continue;
+ }
+ if (rangelock_add(range_sync, offset, length) == -1) {
+ mtx_unlock(&range_lock);
+ pjdlog_debug(2,
+ "sync: Range offset=%jd length=%jd is already locked, waiting.",
+ (intmax_t)offset, (intmax_t)length);
+ sleep(1);
+ continue;
+ }
+ mtx_unlock(&range_lock);
+ break;
+ }
+ /*
+ * First read the data from synchronization source.
+ */
+ SYNCREQ(hio);
+ ggio = &hio->hio_ggio;
+ ggio->gctl_cmd = BIO_READ;
+ ggio->gctl_offset = offset;
+ ggio->gctl_length = length;
+ ggio->gctl_error = 0;
+ hio->hio_done = false;
+ hio->hio_replication = res->hr_replication;
+ for (ii = 0; ii < ncomps; ii++)
+ hio->hio_errors[ii] = EINVAL;
+ reqlog(LOG_DEBUG, 2, ggio, "sync: (%p) Sending sync request: ",
+ hio);
+ pjdlog_debug(2, "sync: (%p) Moving request to the send queue.",
+ hio);
+ mtx_lock(&metadata_lock);
+ if (res->hr_syncsrc == HAST_SYNCSRC_PRIMARY) {
+ /*
+ * This range is up-to-date on local component,
+ * so handle request locally.
+ */
+ /* Local component is 0 for now. */
+ ncomp = 0;
+ } else /* if (res->hr_syncsrc == HAST_SYNCSRC_SECONDARY) */ {
+ PJDLOG_ASSERT(res->hr_syncsrc == HAST_SYNCSRC_SECONDARY);
+ /*
+ * This range is out-of-date on local component,
+ * so send request to the remote node.
+ */
+ /* Remote component is 1 for now. */
+ ncomp = 1;
+ }
+ mtx_unlock(&metadata_lock);
+ refcnt_init(&hio->hio_countdown, 1);
+ QUEUE_INSERT1(hio, send, ncomp);
+
+ /*
+ * Let's wait for READ to finish.
+ */
+ mtx_lock(&sync_lock);
+ while (!ISSYNCREQDONE(hio))
+ cv_wait(&sync_cond, &sync_lock);
+ mtx_unlock(&sync_lock);
+
+ if (hio->hio_errors[ncomp] != 0) {
+ pjdlog_error("Unable to read synchronization data: %s.",
+ strerror(hio->hio_errors[ncomp]));
+ goto free_queue;
+ }
+
+ /*
+ * We read the data from synchronization source, now write it
+ * to synchronization target.
+ */
+ SYNCREQ(hio);
+ ggio->gctl_cmd = BIO_WRITE;
+ for (ii = 0; ii < ncomps; ii++)
+ hio->hio_errors[ii] = EINVAL;
+ reqlog(LOG_DEBUG, 2, ggio, "sync: (%p) Sending sync request: ",
+ hio);
+ pjdlog_debug(2, "sync: (%p) Moving request to the send queue.",
+ hio);
+ mtx_lock(&metadata_lock);
+ if (res->hr_syncsrc == HAST_SYNCSRC_PRIMARY) {
+ /*
+ * This range is up-to-date on local component,
+ * so we update remote component.
+ */
+ /* Remote component is 1 for now. */
+ ncomp = 1;
+ } else /* if (res->hr_syncsrc == HAST_SYNCSRC_SECONDARY) */ {
+ PJDLOG_ASSERT(res->hr_syncsrc == HAST_SYNCSRC_SECONDARY);
+ /*
+ * This range is out-of-date on local component,
+ * so we update it.
+ */
+ /* Local component is 0 for now. */
+ ncomp = 0;
+ }
+ mtx_unlock(&metadata_lock);
+
+ pjdlog_debug(2, "sync: (%p) Moving request to the send queue.",
+ hio);
+ refcnt_init(&hio->hio_countdown, 1);
+ QUEUE_INSERT1(hio, send, ncomp);
+
+ /*
+ * Let's wait for WRITE to finish.
+ */
+ mtx_lock(&sync_lock);
+ while (!ISSYNCREQDONE(hio))
+ cv_wait(&sync_cond, &sync_lock);
+ mtx_unlock(&sync_lock);
+
+ if (hio->hio_errors[ncomp] != 0) {
+ pjdlog_error("Unable to write synchronization data: %s.",
+ strerror(hio->hio_errors[ncomp]));
+ goto free_queue;
+ }
+
+ synced += length;
+free_queue:
+ mtx_lock(&range_lock);
+ rangelock_del(range_sync, offset, length);
+ if (range_regular_wait)
+ cv_signal(&range_regular_cond);
+ mtx_unlock(&range_lock);
+ pjdlog_debug(2, "sync: (%p) Moving request to the free queue.",
+ hio);
+ QUEUE_INSERT2(hio, free);
+ }
+ /* NOTREACHED */
+ return (NULL);
+}
+
+ /*
+  * Apply a reloaded configuration to the running primary worker.
+  *
+  * nv must carry the "remoteaddr", "sourceaddr", "replication", "checksum",
+  * "compression", "timeout", "exec" and "metaflush" values.  Every value that
+  * differs from the one stored in the global resource (gres) is copied there.
+  * If only the timeout changed, it is applied to the connected sockets; if
+  * the remote or source address changed, remote connections are closed.
+  */
+ void
+ primary_config_reload(struct hast_resource *res, struct nv *nv)
+ {
+ 	unsigned int ii, ncomps;
+ 	int modified, vint;
+ 	const char *vstr;
+
+ 	pjdlog_info("Reloading configuration...");
+
+ 	PJDLOG_ASSERT(res->hr_role == HAST_ROLE_PRIMARY);
+ 	PJDLOG_ASSERT(gres == res);
+ 	nv_assert(nv, "remoteaddr");
+ 	nv_assert(nv, "sourceaddr");
+ 	nv_assert(nv, "replication");
+ 	nv_assert(nv, "checksum");
+ 	nv_assert(nv, "compression");
+ 	nv_assert(nv, "timeout");
+ 	nv_assert(nv, "exec");
+ 	nv_assert(nv, "metaflush");
+
+ 	ncomps = HAST_NCOMPONENTS;
+
+ #define MODIFIED_REMOTEADDR 0x01
+ #define MODIFIED_SOURCEADDR 0x02
+ #define MODIFIED_REPLICATION 0x04
+ #define MODIFIED_CHECKSUM 0x08
+ #define MODIFIED_COMPRESSION 0x10
+ #define MODIFIED_TIMEOUT 0x20
+ #define MODIFIED_EXEC 0x40
+ #define MODIFIED_METAFLUSH 0x80
+ 	modified = 0;
+
+ 	vstr = nv_get_string(nv, "remoteaddr");
+ 	if (strcmp(gres->hr_remoteaddr, vstr) != 0) {
+ 		/*
+ 		 * Don't copy the new remote address to gres just yet.
+ 		 * We want remote_close() to log disconnect from the old
+ 		 * addresses, not from the new ones.
+ 		 */
+ 		modified |= MODIFIED_REMOTEADDR;
+ 	}
+ 	vstr = nv_get_string(nv, "sourceaddr");
+ 	if (strcmp(gres->hr_sourceaddr, vstr) != 0) {
+ 		strlcpy(gres->hr_sourceaddr, vstr, sizeof(gres->hr_sourceaddr));
+ 		modified |= MODIFIED_SOURCEADDR;
+ 	}
+ 	vint = nv_get_int32(nv, "replication");
+ 	if (gres->hr_replication != vint) {
+ 		gres->hr_replication = vint;
+ 		modified |= MODIFIED_REPLICATION;
+ 	}
+ 	vint = nv_get_int32(nv, "checksum");
+ 	if (gres->hr_checksum != vint) {
+ 		gres->hr_checksum = vint;
+ 		modified |= MODIFIED_CHECKSUM;
+ 	}
+ 	vint = nv_get_int32(nv, "compression");
+ 	if (gres->hr_compression != vint) {
+ 		gres->hr_compression = vint;
+ 		modified |= MODIFIED_COMPRESSION;
+ 	}
+ 	vint = nv_get_int32(nv, "timeout");
+ 	if (gres->hr_timeout != vint) {
+ 		gres->hr_timeout = vint;
+ 		modified |= MODIFIED_TIMEOUT;
+ 	}
+ 	vstr = nv_get_string(nv, "exec");
+ 	if (strcmp(gres->hr_exec, vstr) != 0) {
+ 		strlcpy(gres->hr_exec, vstr, sizeof(gres->hr_exec));
+ 		modified |= MODIFIED_EXEC;
+ 	}
+ 	vint = nv_get_int32(nv, "metaflush");
+ 	if (gres->hr_metaflush != vint) {
+ 		gres->hr_metaflush = vint;
+ 		modified |= MODIFIED_METAFLUSH;
+ 	}
+
+ 	/*
+ 	 * Change timeout for connected sockets.
+ 	 * Don't bother if we need to reconnect.
+ 	 */
+ 	if ((modified & MODIFIED_TIMEOUT) != 0 &&
+ 	    (modified & (MODIFIED_REMOTEADDR | MODIFIED_SOURCEADDR)) == 0) {
+ 		for (ii = 0; ii < ncomps; ii++) {
+ 			if (!ISREMOTE(ii))
+ 				continue;
+ 			/*
+ 			 * Keep the read lock held while the connections are
+ 			 * used, so they cannot be torn down between the
+ 			 * ISCONNECTED() check and the proto_timeout() calls.
+ 			 */
+ 			rw_rlock(&hio_remote_lock[ii]);
+ 			if (!ISCONNECTED(gres, ii)) {
+ 				rw_unlock(&hio_remote_lock[ii]);
+ 				continue;
+ 			}
+ 			if (proto_timeout(gres->hr_remotein,
+ 			    gres->hr_timeout) == -1) {
+ 				pjdlog_errno(LOG_WARNING,
+ 				    "Unable to set connection timeout");
+ 			}
+ 			if (proto_timeout(gres->hr_remoteout,
+ 			    gres->hr_timeout) == -1) {
+ 				pjdlog_errno(LOG_WARNING,
+ 				    "Unable to set connection timeout");
+ 			}
+ 			rw_unlock(&hio_remote_lock[ii]);
+ 		}
+ 	}
+ 	if ((modified & (MODIFIED_REMOTEADDR | MODIFIED_SOURCEADDR)) != 0) {
+ 		for (ii = 0; ii < ncomps; ii++) {
+ 			if (!ISREMOTE(ii))
+ 				continue;
+ 			remote_close(gres, ii);
+ 		}
+ 		/* Old connections are closed, now switch to the new address. */
+ 		if (modified & MODIFIED_REMOTEADDR) {
+ 			vstr = nv_get_string(nv, "remoteaddr");
+ 			strlcpy(gres->hr_remoteaddr, vstr,
+ 			    sizeof(gres->hr_remoteaddr));
+ 		}
+ 	}
+ #undef MODIFIED_REMOTEADDR
+ #undef MODIFIED_SOURCEADDR
+ #undef MODIFIED_REPLICATION
+ #undef MODIFIED_CHECKSUM
+ #undef MODIFIED_COMPRESSION
+ #undef MODIFIED_TIMEOUT
+ #undef MODIFIED_EXEC
+ #undef MODIFIED_METAFLUSH
+
+ 	pjdlog_info("Configuration reloaded successfully.");
+ }
+
+ /*
+  * Check the connection of a single component and try to reconnect if it
+  * is down.
+  *
+  * Nothing is done for non-remote components or when real_remote(res) is
+  * false.  On a successful reconnect the new connections are installed
+  * under the write lock and sync_start() is called.
+  */
+ static void
+ guard_one(struct hast_resource *res, unsigned int ncomp)
+ {
+ struct proto_conn *in, *out;
+
+ /* Only remote components have connections to guard. */
+ if (!ISREMOTE(ncomp))
+ return;
+
+ rw_rlock(&hio_remote_lock[ncomp]);
+
+ if (!real_remote(res)) {
+ rw_unlock(&hio_remote_lock[ncomp]);
+ return;
+ }
+
+ /* Still connected: nothing to do apart from logging it. */
+ if (ISCONNECTED(res, ncomp)) {
+ PJDLOG_ASSERT(res->hr_remotein != NULL);
+ PJDLOG_ASSERT(res->hr_remoteout != NULL);
+ rw_unlock(&hio_remote_lock[ncomp]);
+ pjdlog_debug(2, "remote_guard: Connection to %s is ok.",
+ res->hr_remoteaddr);
+ return;
+ }
+
+ PJDLOG_ASSERT(res->hr_remotein == NULL);
+ PJDLOG_ASSERT(res->hr_remoteout == NULL);
+ /*
+ * Upgrade the lock. It doesn't have to be atomic as no other thread
+ * can change connection status from disconnected to connected.
+ */
+ rw_unlock(&hio_remote_lock[ncomp]);
+ pjdlog_debug(2, "remote_guard: Reconnecting to %s.",
+ res->hr_remoteaddr);
+ in = out = NULL;
+ /* The connection attempt itself is made without holding the lock. */
+ if (init_remote(res, &in, &out) == 0) {
+ /* Install the new connections under the write lock. */
+ rw_wlock(&hio_remote_lock[ncomp]);
+ PJDLOG_ASSERT(res->hr_remotein == NULL);
+ PJDLOG_ASSERT(res->hr_remoteout == NULL);
+ PJDLOG_ASSERT(in != NULL && out != NULL);
+ res->hr_remotein = in;
+ res->hr_remoteout = out;
+ rw_unlock(&hio_remote_lock[ncomp]);
+ pjdlog_info("Successfully reconnected to %s.",
+ res->hr_remoteaddr);
+ sync_start();
+ } else {
+ /* Both connections should be NULL. */
+ PJDLOG_ASSERT(res->hr_remotein == NULL);
+ PJDLOG_ASSERT(res->hr_remoteout == NULL);
+ PJDLOG_ASSERT(in == NULL && out == NULL);
+ pjdlog_debug(2, "remote_guard: Reconnect to %s failed.",
+ res->hr_remoteaddr);
+ }
+ }
+
+/*
+ * Thread guards remote connections and reconnects when needed, handles
+ * signals, etc.
+ */
+static void *
+guard_thread(void *arg)
+{
+ struct hast_resource *res = arg;
+ unsigned int ii, ncomps;
+ struct timespec timeout;
+ time_t lastcheck, now;
+ sigset_t mask;
+ int signo;
+
+ ncomps = HAST_NCOMPONENTS;
+ lastcheck = time(NULL);
+
+ PJDLOG_VERIFY(sigemptyset(&mask) == 0);
+ PJDLOG_VERIFY(sigaddset(&mask, SIGINT) == 0);
+ PJDLOG_VERIFY(sigaddset(&mask, SIGTERM) == 0);
+
+ timeout.tv_sec = HAST_KEEPALIVE;
+ timeout.tv_nsec = 0;
+ signo = -1;
+
+ for (;;) {
+ switch (signo) {
+ case SIGINT:
+ case SIGTERM:
+ sigexit_received = true;
+ primary_exitx(EX_OK,
+ "Termination signal received, exiting.");
+ break;
+ default:
+ break;
+ }
+
+ /*
+ * Don't check connections until we fully started,
+ * as we may still be looping, waiting for remote node
+ * to switch from primary to secondary.
+ */
+ if (fullystarted) {
+ pjdlog_debug(2, "remote_guard: Checking connections.");
+ now = time(NULL);
+ if (lastcheck + HAST_KEEPALIVE <= now) {
+ for (ii = 0; ii < ncomps; ii++)
+ guard_one(res, ii);
+ lastcheck = now;
+ }
+ }
+ signo = sigtimedwait(&mask, NULL, &timeout);
+ }
+ /* NOTREACHED */
+ return (NULL);
+}
diff --git a/sbin/hastd/proto.c b/sbin/hastd/proto.c
new file mode 100644
index 0000000..73487c0
--- /dev/null
+++ b/sbin/hastd/proto.c
@@ -0,0 +1,446 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+ #include <sys/types.h>
+ #include <sys/queue.h>
+ #include <sys/socket.h>
+
+ #include <errno.h>
+ #include <stdint.h>
+ #include <string.h>
+ #include <strings.h>
+ #include <unistd.h>
+
+#include "pjdlog.h"
+#include "proto.h"
+#include "proto_impl.h"
+
+ /* Value proto_alloc() stores in pc_magic; the proto_*() functions assert it. */
+ #define PROTO_CONN_MAGIC 0x907041c
+ /*
+  * Connection handle returned to the callers of this API.
+  *
+  * pc_magic - PROTO_CONN_MAGIC while the handle is valid (zeroed on free).
+  * pc_proto - protocol implementation whose callbacks serve this connection.
+  * pc_ctx   - context pointer passed to those callbacks.
+  * pc_side  - one of the PROTO_SIDE_* values below.
+  */
+ struct proto_conn {
+ int pc_magic;
+ struct proto *pc_proto;
+ void *pc_ctx;
+ int pc_side;
+ #define PROTO_SIDE_CLIENT 0
+ #define PROTO_SIDE_SERVER_LISTEN 1
+ #define PROTO_SIDE_SERVER_WORK 2
+ };
+
+ /* Registered protocols, filled in by proto_register(). */
+ static TAILQ_HEAD(, proto) protos = TAILQ_HEAD_INITIALIZER(protos);
+
+ /*
+  * Add a protocol implementation to the list of known protocols.
+  *
+  * At most one protocol may be registered as the default.  The default one
+  * goes to the tail of the list, all the others to the head, so the default
+  * is the last one tried when the list is walked.
+  */
+ void
+ proto_register(struct proto *proto, bool isdefault)
+ {
+ 	static bool seen_default = false;
+
+ 	if (isdefault) {
+ 		PJDLOG_ASSERT(!seen_default);
+ 		seen_default = true;
+ 		TAILQ_INSERT_TAIL(&protos, proto, prt_next);
+ 		return;
+ 	}
+ 	TAILQ_INSERT_HEAD(&protos, proto, prt_next);
+ }
+
+ /*
+  * Allocate a connection handle bound to the given protocol and side.
+  *
+  * Returns the new handle, or NULL when malloc() fails.  pc_ctx is left for
+  * the caller to fill in.
+  */
+ static struct proto_conn *
+ proto_alloc(struct proto *proto, int side)
+ {
+ 	struct proto_conn *conn;
+
+ 	PJDLOG_ASSERT(proto != NULL);
+ 	PJDLOG_ASSERT(side == PROTO_SIDE_CLIENT ||
+ 	    side == PROTO_SIDE_SERVER_LISTEN ||
+ 	    side == PROTO_SIDE_SERVER_WORK);
+
+ 	conn = malloc(sizeof(*conn));
+ 	if (conn == NULL)
+ 		return (NULL);
+
+ 	conn->pc_magic = PROTO_CONN_MAGIC;
+ 	conn->pc_side = side;
+ 	conn->pc_proto = proto;
+ 	return (conn);
+ }
+
+ /*
+  * Release a connection handle obtained from proto_alloc().
+  *
+  * The structure is zeroed before being freed, which also clears pc_magic.
+  * The protocol context (pc_ctx) is not touched here.
+  */
+ static void
+ proto_free(struct proto_conn *conn)
+ {
+
+ 	PJDLOG_ASSERT(conn != NULL);
+ 	PJDLOG_ASSERT(conn->pc_magic == PROTO_CONN_MAGIC);
+ 	PJDLOG_ASSERT(conn->pc_side == PROTO_SIDE_CLIENT ||
+ 	    conn->pc_side == PROTO_SIDE_SERVER_LISTEN ||
+ 	    conn->pc_side == PROTO_SIDE_SERVER_WORK);
+ 	PJDLOG_ASSERT(conn->pc_proto != NULL);
+
+ 	memset(conn, 0, sizeof(*conn));
+ 	free(conn);
+ }
+
+ /*
+  * Common part of proto_client() and proto_server().
+  *
+  * Walks the registered protocols and asks each one to set up a client
+  * (srcaddr, dstaddr) or a listening server (dstaddr) context.  The first
+  * protocol that recognizes the address decides the outcome.
+  *
+  * Returns 0 and stores the new handle in *connp on success; returns -1 with
+  * errno set otherwise (EINVAL when no protocol recognizes the address,
+  * ENOMEM when the handle cannot be allocated, or the protocol's own error).
+  */
+ static int
+ proto_common_setup(const char *srcaddr, const char *dstaddr,
+     struct proto_conn **connp, int side)
+ {
+ 	struct proto_conn *conn;
+ 	struct proto *proto;
+ 	void *ctx;
+ 	int ret;
+
+ 	PJDLOG_ASSERT(side == PROTO_SIDE_CLIENT ||
+ 	    side == PROTO_SIDE_SERVER_LISTEN);
+
+ 	TAILQ_FOREACH(proto, &protos, prt_next) {
+ 		/* A protocol without the needed callback is simply skipped. */
+ 		ret = -1;
+ 		if (side == PROTO_SIDE_CLIENT) {
+ 			if (proto->prt_client != NULL)
+ 				ret = proto->prt_client(srcaddr, dstaddr, &ctx);
+ 		} else /* if (side == PROTO_SIDE_SERVER_LISTEN) */ {
+ 			if (proto->prt_server != NULL)
+ 				ret = proto->prt_server(dstaddr, &ctx);
+ 		}
+ 		/*
+ 		 * ret == 0  - success
+ 		 * ret == -1 - dstaddr is not for this protocol
+ 		 * ret > 0   - right protocol, but an error occurred
+ 		 */
+ 		if (ret >= 0)
+ 			break;
+ 	}
+ 	if (proto == NULL) {
+ 		/* Unrecognized address. */
+ 		errno = EINVAL;
+ 		return (-1);
+ 	}
+ 	if (ret > 0) {
+ 		/* An error occurred. */
+ 		errno = ret;
+ 		return (-1);
+ 	}
+
+ 	conn = proto_alloc(proto, side);
+ 	if (conn != NULL) {
+ 		conn->pc_ctx = ctx;
+ 		*connp = conn;
+ 		return (0);
+ 	}
+
+ 	/* No handle to own the context, so give it back to the protocol. */
+ 	if (proto->prt_close != NULL)
+ 		proto->prt_close(ctx);
+ 	errno = ENOMEM;
+ 	return (-1);
+ }
+
+ /*
+  * Create a client-side connection handle for dstaddr (optionally bound to
+  * srcaddr).  Returns 0 and fills *connp on success, -1 with errno set on
+  * failure; see proto_common_setup().
+  */
+ int
+ proto_client(const char *srcaddr, const char *dstaddr,
+ struct proto_conn **connp)
+ {
+
+ return (proto_common_setup(srcaddr, dstaddr, connp, PROTO_SIDE_CLIENT));
+ }
+
+ /*
+  * Connect a client-side handle through its protocol's prt_connect callback.
+  *
+  * timeout must be >= -1.  Returns 0 on success; on failure returns -1 and
+  * sets errno to the value reported by the callback.
+  */
+ int
+ proto_connect(struct proto_conn *conn, int timeout)
+ {
+ 	int error;
+
+ 	PJDLOG_ASSERT(conn != NULL);
+ 	PJDLOG_ASSERT(conn->pc_magic == PROTO_CONN_MAGIC);
+ 	PJDLOG_ASSERT(conn->pc_side == PROTO_SIDE_CLIENT);
+ 	PJDLOG_ASSERT(conn->pc_proto != NULL);
+ 	PJDLOG_ASSERT(conn->pc_proto->prt_connect != NULL);
+ 	PJDLOG_ASSERT(timeout >= -1);
+
+ 	error = conn->pc_proto->prt_connect(conn->pc_ctx, timeout);
+ 	if (error == 0)
+ 		return (0);
+
+ 	errno = error;
+ 	return (-1);
+ }
+
+ /*
+  * Invoke the protocol's prt_connect_wait callback for a client-side handle.
+  *
+  * timeout must be >= 0.  Returns 0 on success; on failure returns -1 and
+  * sets errno to the value reported by the callback.
+  */
+ int
+ proto_connect_wait(struct proto_conn *conn, int timeout)
+ {
+ 	int error;
+
+ 	PJDLOG_ASSERT(conn != NULL);
+ 	PJDLOG_ASSERT(conn->pc_magic == PROTO_CONN_MAGIC);
+ 	PJDLOG_ASSERT(conn->pc_side == PROTO_SIDE_CLIENT);
+ 	PJDLOG_ASSERT(conn->pc_proto != NULL);
+ 	PJDLOG_ASSERT(conn->pc_proto->prt_connect_wait != NULL);
+ 	PJDLOG_ASSERT(timeout >= 0);
+
+ 	error = conn->pc_proto->prt_connect_wait(conn->pc_ctx, timeout);
+ 	if (error == 0)
+ 		return (0);
+
+ 	errno = error;
+ 	return (-1);
+ }
+
+ /*
+  * Create a listening server-side connection handle for addr.  Returns 0
+  * and fills *connp on success, -1 with errno set on failure; see
+  * proto_common_setup().
+  */
+ int
+ proto_server(const char *addr, struct proto_conn **connp)
+ {
+
+ return (proto_common_setup(NULL, addr, connp, PROTO_SIDE_SERVER_LISTEN));
+ }
+
+ /*
+  * Accept a connection on a listening handle.
+  *
+  * A new PROTO_SIDE_SERVER_WORK handle is allocated and its context is
+  * filled in by the protocol's prt_accept callback.  Returns 0 and stores
+  * the handle in *newconnp on success.  Returns -1 when the handle cannot be
+  * allocated or when the callback fails (errno is then set to the value
+  * the callback reported).
+  */
+ int
+ proto_accept(struct proto_conn *conn, struct proto_conn **newconnp)
+ {
+ 	struct proto_conn *newconn;
+ 	int error;
+
+ 	PJDLOG_ASSERT(conn != NULL);
+ 	PJDLOG_ASSERT(conn->pc_magic == PROTO_CONN_MAGIC);
+ 	PJDLOG_ASSERT(conn->pc_side == PROTO_SIDE_SERVER_LISTEN);
+ 	PJDLOG_ASSERT(conn->pc_proto != NULL);
+ 	PJDLOG_ASSERT(conn->pc_proto->prt_accept != NULL);
+
+ 	newconn = proto_alloc(conn->pc_proto, PROTO_SIDE_SERVER_WORK);
+ 	if (newconn == NULL)
+ 		return (-1);
+
+ 	error = conn->pc_proto->prt_accept(conn->pc_ctx, &newconn->pc_ctx);
+ 	if (error == 0) {
+ 		*newconnp = newconn;
+ 		return (0);
+ 	}
+
+ 	/* Nothing was accepted; drop the unused handle. */
+ 	proto_free(newconn);
+ 	errno = error;
+ 	return (-1);
+ }
+
+ /*
+  * Send size bytes of data over the connection.
+  *
+  * The protocol's prt_send callback is called with -1 as the descriptor
+  * argument.  Returns 0 on success; on failure returns -1 and sets errno to
+  * the value reported by the callback.
+  */
+ int
+ proto_send(const struct proto_conn *conn, const void *data, size_t size)
+ {
+ 	int error;
+
+ 	PJDLOG_ASSERT(conn != NULL);
+ 	PJDLOG_ASSERT(conn->pc_magic == PROTO_CONN_MAGIC);
+ 	PJDLOG_ASSERT(conn->pc_proto != NULL);
+ 	PJDLOG_ASSERT(conn->pc_proto->prt_send != NULL);
+
+ 	error = conn->pc_proto->prt_send(conn->pc_ctx, data, size, -1);
+ 	if (error == 0)
+ 		return (0);
+
+ 	errno = error;
+ 	return (-1);
+ }
+
+ /*
+  * Receive size bytes of data from the connection.
+  *
+  * The protocol's prt_recv callback is called with NULL as the descriptor
+  * argument.  Returns 0 on success; on failure returns -1 and sets errno to
+  * the value reported by the callback.
+  */
+ int
+ proto_recv(const struct proto_conn *conn, void *data, size_t size)
+ {
+ 	int error;
+
+ 	PJDLOG_ASSERT(conn != NULL);
+ 	PJDLOG_ASSERT(conn->pc_magic == PROTO_CONN_MAGIC);
+ 	PJDLOG_ASSERT(conn->pc_proto != NULL);
+ 	PJDLOG_ASSERT(conn->pc_proto->prt_recv != NULL);
+
+ 	error = conn->pc_proto->prt_recv(conn->pc_ctx, data, size, NULL);
+ 	if (error == 0)
+ 		return (0);
+
+ 	errno = error;
+ 	return (-1);
+ }
+
+ /*
+  * Pass the connection mconn to the peer of conn.
+  *
+  * The protocol name of mconn (including its terminating NUL) is sent over
+  * conn together with mconn's descriptor.  mconn is always closed by this
+  * function, whether or not the send succeeded.
+  *
+  * Returns 0 on success; on failure returns -1 and sets errno to the value
+  * reported by the prt_send callback.
+  */
+ int
+ proto_connection_send(const struct proto_conn *conn, struct proto_conn *mconn)
+ {
+ 	const char *protoname;
+ 	int error, fd;
+
+ 	PJDLOG_ASSERT(conn != NULL);
+ 	PJDLOG_ASSERT(conn->pc_magic == PROTO_CONN_MAGIC);
+ 	PJDLOG_ASSERT(conn->pc_proto != NULL);
+ 	PJDLOG_ASSERT(conn->pc_proto->prt_send != NULL);
+ 	PJDLOG_ASSERT(mconn != NULL);
+ 	PJDLOG_ASSERT(mconn->pc_magic == PROTO_CONN_MAGIC);
+ 	PJDLOG_ASSERT(mconn->pc_proto != NULL);
+ 	fd = proto_descriptor(mconn);
+ 	PJDLOG_ASSERT(fd >= 0);
+ 	protoname = mconn->pc_proto->prt_name;
+ 	PJDLOG_ASSERT(protoname != NULL);
+
+ 	error = conn->pc_proto->prt_send(conn->pc_ctx, protoname,
+ 	    strlen(protoname) + 1, fd);
+ 	/* Our copy of the connection is not needed any more. */
+ 	proto_close(mconn);
+ 	if (error == 0)
+ 		return (0);
+
+ 	errno = error;
+ 	return (-1);
+ }
+
+ /*
+  * Receive a connection sent by the peer with proto_connection_send().
+  *
+  * Reads the protocol name and the accompanying descriptor from conn, looks
+  * the protocol up in the list of registered protocols and wraps the
+  * descriptor in a new handle (client side when client is true, server work
+  * side otherwise).
+  *
+  * Returns 0 and stores the handle in *newconnp on success.  On failure
+  * returns -1 with errno set (EINVAL when the protocol name is unknown).
+  * The received descriptor is closed when the protocol is unknown or the
+  * handle cannot be allocated, so it does not leak.
+  */
+ int
+ proto_connection_recv(const struct proto_conn *conn, bool client,
+     struct proto_conn **newconnp)
+ {
+ 	char protoname[128];
+ 	struct proto *proto;
+ 	struct proto_conn *newconn;
+ 	int error, ret, fd;
+
+ 	PJDLOG_ASSERT(conn != NULL);
+ 	PJDLOG_ASSERT(conn->pc_magic == PROTO_CONN_MAGIC);
+ 	PJDLOG_ASSERT(conn->pc_proto != NULL);
+ 	PJDLOG_ASSERT(conn->pc_proto->prt_recv != NULL);
+ 	PJDLOG_ASSERT(newconnp != NULL);
+
+ 	/* Zero the buffer so the received name is always NUL-terminated. */
+ 	bzero(protoname, sizeof(protoname));
+
+ 	ret = conn->pc_proto->prt_recv(conn->pc_ctx, protoname,
+ 	    sizeof(protoname) - 1, &fd);
+ 	if (ret != 0) {
+ 		errno = ret;
+ 		return (-1);
+ 	}
+
+ 	PJDLOG_ASSERT(fd >= 0);
+
+ 	TAILQ_FOREACH(proto, &protos, prt_next) {
+ 		if (strcmp(proto->prt_name, protoname) == 0)
+ 			break;
+ 	}
+ 	if (proto == NULL) {
+ 		/* Nobody is going to own the descriptor. */
+ 		(void)close(fd);
+ 		errno = EINVAL;
+ 		return (-1);
+ 	}
+
+ 	newconn = proto_alloc(proto,
+ 	    client ? PROTO_SIDE_CLIENT : PROTO_SIDE_SERVER_WORK);
+ 	if (newconn == NULL) {
+ 		/* Keep the allocation error across close(). */
+ 		error = errno;
+ 		(void)close(fd);
+ 		errno = error;
+ 		return (-1);
+ 	}
+ 	PJDLOG_ASSERT(newconn->pc_proto->prt_wrap != NULL);
+ 	ret = newconn->pc_proto->prt_wrap(fd, client, &newconn->pc_ctx);
+ 	if (ret != 0) {
+ 		/*
+ 		 * NOTE(review): fd is left alone here because it is not
+ 		 * visible whether prt_wrap releases it on failure - confirm
+ 		 * against the protocol implementations.
+ 		 */
+ 		proto_free(newconn);
+ 		errno = ret;
+ 		return (-1);
+ 	}
+
+ 	*newconnp = newconn;
+
+ 	return (0);
+ }
+
+ /*
+  * Return whatever the protocol's prt_descriptor callback reports as the
+  * descriptor of this connection.
+  */
+ int
+ proto_descriptor(const struct proto_conn *conn)
+ {
+ 	int fd;
+
+ 	PJDLOG_ASSERT(conn != NULL);
+ 	PJDLOG_ASSERT(conn->pc_magic == PROTO_CONN_MAGIC);
+ 	PJDLOG_ASSERT(conn->pc_proto != NULL);
+ 	PJDLOG_ASSERT(conn->pc_proto->prt_descriptor != NULL);
+
+ 	fd = conn->pc_proto->prt_descriptor(conn->pc_ctx);
+ 	return (fd);
+ }
+
+ /*
+  * Ask the protocol's prt_address_match callback whether addr matches this
+  * connection and return its answer.
+  */
+ bool
+ proto_address_match(const struct proto_conn *conn, const char *addr)
+ {
+ 	bool match;
+
+ 	PJDLOG_ASSERT(conn != NULL);
+ 	PJDLOG_ASSERT(conn->pc_magic == PROTO_CONN_MAGIC);
+ 	PJDLOG_ASSERT(conn->pc_proto != NULL);
+ 	PJDLOG_ASSERT(conn->pc_proto->prt_address_match != NULL);
+
+ 	match = conn->pc_proto->prt_address_match(conn->pc_ctx, addr);
+ 	return (match);
+ }
+
+ /*
+  * Let the protocol's prt_local_address callback write the local address of
+  * the connection into addr, a buffer of size bytes.
+  */
+ void
+ proto_local_address(const struct proto_conn *conn, char *addr, size_t size)
+ {
+ 	void *ctx;
+
+ 	PJDLOG_ASSERT(conn != NULL);
+ 	PJDLOG_ASSERT(conn->pc_magic == PROTO_CONN_MAGIC);
+ 	PJDLOG_ASSERT(conn->pc_proto != NULL);
+ 	PJDLOG_ASSERT(conn->pc_proto->prt_local_address != NULL);
+
+ 	ctx = conn->pc_ctx;
+ 	conn->pc_proto->prt_local_address(ctx, addr, size);
+ }
+
+ /*
+  * Let the protocol's prt_remote_address callback write the remote address
+  * of the connection into addr, a buffer of size bytes.
+  */
+ void
+ proto_remote_address(const struct proto_conn *conn, char *addr, size_t size)
+ {
+ 	void *ctx;
+
+ 	PJDLOG_ASSERT(conn != NULL);
+ 	PJDLOG_ASSERT(conn->pc_magic == PROTO_CONN_MAGIC);
+ 	PJDLOG_ASSERT(conn->pc_proto != NULL);
+ 	PJDLOG_ASSERT(conn->pc_proto->prt_remote_address != NULL);
+
+ 	ctx = conn->pc_ctx;
+ 	conn->pc_proto->prt_remote_address(ctx, addr, size);
+ }
+
+ /*
+  * Set both the send and the receive timeout of the connection's socket to
+  * timeout seconds.
+  *
+  * Returns 0 on success, -1 when the descriptor cannot be obtained or when
+  * setsockopt() fails (errno is then left as set by setsockopt()).
+  */
+ int
+ proto_timeout(const struct proto_conn *conn, int timeout)
+ {
+ 	static const int options[] = { SO_SNDTIMEO, SO_RCVTIMEO };
+ 	struct timeval tv;
+ 	size_t i;
+ 	int fd;
+
+ 	PJDLOG_ASSERT(conn != NULL);
+ 	PJDLOG_ASSERT(conn->pc_magic == PROTO_CONN_MAGIC);
+ 	PJDLOG_ASSERT(conn->pc_proto != NULL);
+
+ 	fd = proto_descriptor(conn);
+ 	if (fd == -1)
+ 		return (-1);
+
+ 	tv.tv_usec = 0;
+ 	tv.tv_sec = timeout;
+ 	/* Send timeout first, then receive timeout; stop at first failure. */
+ 	for (i = 0; i < sizeof(options) / sizeof(options[0]); i++) {
+ 		if (setsockopt(fd, SOL_SOCKET, options[i], &tv,
+ 		    sizeof(tv)) == -1) {
+ 			return (-1);
+ 		}
+ 	}
+
+ 	return (0);
+ }
+
+ /*
+  * Close the connection: the protocol's prt_close callback releases the
+  * context, then the handle itself is freed.  conn must not be used
+  * afterwards.
+  */
+ void
+ proto_close(struct proto_conn *conn)
+ {
+ 	void *ctx;
+
+ 	PJDLOG_ASSERT(conn != NULL);
+ 	PJDLOG_ASSERT(conn->pc_magic == PROTO_CONN_MAGIC);
+ 	PJDLOG_ASSERT(conn->pc_proto != NULL);
+ 	PJDLOG_ASSERT(conn->pc_proto->prt_close != NULL);
+
+ 	ctx = conn->pc_ctx;
+ 	conn->pc_proto->prt_close(ctx);
+ 	proto_free(conn);
+ }
diff --git a/sbin/hastd/proto.h b/sbin/hastd/proto.h
new file mode 100644
index 0000000..1a60e5b
--- /dev/null
+++ b/sbin/hastd/proto.h
@@ -0,0 +1,61 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+ #ifndef _PROTO_H_
+ #define _PROTO_H_
+
+ #include <stdbool.h> /* bool */
+ #include <stdlib.h> /* size_t */
+
+ /* Opaque connection handle; the structure is defined in proto.c. */
+ struct proto_conn;
+
+ /*
+  * Functions returning int return 0 on success and -1 on failure, with
+  * errno describing the error (proto_descriptor() returns a descriptor).
+  */
+
+ /* Client side: create a handle for dstaddr, then connect it. */
+ int proto_client(const char *srcaddr, const char *dstaddr,
+ struct proto_conn **connp);
+ int proto_connect(struct proto_conn *conn, int timeout);
+ int proto_connect_wait(struct proto_conn *conn, int timeout);
+ /* Server side: create a listening handle, then accept connections on it. */
+ int proto_server(const char *addr, struct proto_conn **connp);
+ int proto_accept(struct proto_conn *conn, struct proto_conn **newconnp);
+ /* Transfer size bytes of data over the connection. */
+ int proto_send(const struct proto_conn *conn, const void *data, size_t size);
+ int proto_recv(const struct proto_conn *conn, void *data, size_t size);
+ /*
+  * Pass the connection mconn over conn to the peer / receive such a
+  * connection.  proto_connection_send() always closes mconn.
+  */
+ int proto_connection_send(const struct proto_conn *conn,
+ struct proto_conn *mconn);
+ int proto_connection_recv(const struct proto_conn *conn, bool client,
+ struct proto_conn **newconnp);
+ /* Accessors forwarding to the protocol implementation. */
+ int proto_descriptor(const struct proto_conn *conn);
+ bool proto_address_match(const struct proto_conn *conn, const char *addr);
+ void proto_local_address(const struct proto_conn *conn, char *addr,
+ size_t size);
+ void proto_remote_address(const struct proto_conn *conn, char *addr,
+ size_t size);
+ /* Set send and receive timeouts (in seconds) on the connection's socket. */
+ int proto_timeout(const struct proto_conn *conn, int timeout);
+ /* Close the connection and free the handle. */
+ void proto_close(struct proto_conn *conn);
+
+ #endif /* !_PROTO_H_ */
diff --git a/sbin/hastd/proto_common.c b/sbin/hastd/proto_common.c
new file mode 100644
index 0000000..843366b
--- /dev/null
+++ b/sbin/hastd/proto_common.c
@@ -0,0 +1,232 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/socket.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include "pjdlog.h"
+#include "proto_impl.h"
+
+/* Maximum size of packet we want to use when sending data. */
+#ifndef MAX_SEND_SIZE
+#define MAX_SEND_SIZE 32768
+#endif
+
+ /*
+  * Tell whether the socket is in blocking mode, i.e. O_NONBLOCK is not set
+  * in its file status flags.
+  */
+ static bool
+ blocking_socket(int sock)
+ {
+ 	int flags;
+
+ 	flags = fcntl(sock, F_GETFL);
+ 	PJDLOG_ASSERT(flags >= 0);
+ 	if ((flags & O_NONBLOCK) != 0)
+ 		return (false);
+ 	return (true);
+ }
+
+ /*
+  * Send the descriptor fd over the socket sock as SCM_RIGHTS ancillary
+  * data.  No regular data accompanies the message.
+  *
+  * Returns 0 on success or the errno value of the failed sendmsg().
+  */
+ static int
+ proto_descriptor_send(int sock, int fd)
+ {
+ 	unsigned char ctrl[CMSG_SPACE(sizeof(fd))];
+ 	struct cmsghdr *cmsg;
+ 	struct msghdr msg;
+
+ 	PJDLOG_ASSERT(sock >= 0);
+ 	PJDLOG_ASSERT(fd >= 0);
+
+ 	bzero(&ctrl, sizeof(ctrl));
+ 	bzero(&msg, sizeof(msg));
+
+ 	/* Control data only, no payload. */
+ 	msg.msg_control = ctrl;
+ 	msg.msg_controllen = sizeof(ctrl);
+ 	msg.msg_iov = NULL;
+ 	msg.msg_iovlen = 0;
+
+ 	cmsg = CMSG_FIRSTHDR(&msg);
+ 	cmsg->cmsg_len = CMSG_LEN(sizeof(fd));
+ 	cmsg->cmsg_level = SOL_SOCKET;
+ 	cmsg->cmsg_type = SCM_RIGHTS;
+ 	bcopy(&fd, CMSG_DATA(cmsg), sizeof(fd));
+
+ 	if (sendmsg(sock, &msg, 0) != -1)
+ 		return (0);
+
+ 	return (errno);
+ }
+
+ /*
+  * Send size bytes of data over the socket and, when fd is not -1, pass the
+  * descriptor fd afterwards.
+  *
+  * When data is NULL (size must be 0 then) nothing is sent; the receive
+  * direction of the socket is shut down instead.
+  *
+  * Data is sent in chunks of at most MAX_SEND_SIZE bytes.  EINTR is retried
+  * immediately and ENOBUFS is retried with a growing delay for a limited
+  * number of times.
+  *
+  * Returns 0 on success or an errno value on failure: ENOTCONN when send()
+  * returns 0 and ETIMEDOUT in place of EAGAIN on a blocking socket.
+  */
+ int
+ proto_common_send(int sock, const unsigned char *data, size_t size, int fd)
+ {
+ 	ssize_t done;
+ 	size_t sendsize;
+ 	int errcount = 0;
+ 	int error;
+
+ 	PJDLOG_ASSERT(sock >= 0);
+
+ 	if (data == NULL) {
+ 		/* The caller is just trying to decide about direction. */
+
+ 		PJDLOG_ASSERT(size == 0);
+
+ 		if (shutdown(sock, SHUT_RD) == -1)
+ 			return (errno);
+ 		return (0);
+ 	}
+
+ 	PJDLOG_ASSERT(data != NULL);
+ 	PJDLOG_ASSERT(size > 0);
+
+ 	do {
+ 		sendsize = size < MAX_SEND_SIZE ? size : MAX_SEND_SIZE;
+ 		done = send(sock, data, sendsize, MSG_NOSIGNAL);
+ 		if (done == 0) {
+ 			return (ENOTCONN);
+ 		} else if (done == -1) {
+ 			/*
+ 			 * Remember the error right away: the logging and
+ 			 * sleeping below may change errno.
+ 			 */
+ 			error = errno;
+ 			if (error == EINTR)
+ 				continue;
+ 			if (error == ENOBUFS) {
+ 				/*
+ 				 * If there are no buffers we retry.
+ 				 * After each try we increase delay before the
+ 				 * next one and we give up after fifteen times.
+ 				 */
+ 				if (errcount == 15) {
+ 					pjdlog_warning("Getting ENOBUFS errors for 11s on send(), giving up.");
+ 				} else {
+ 					if (errcount == 0)
+ 						pjdlog_warning("Got ENOBUFS error on send(), retrying for a bit.");
+ 					errcount++;
+ 					usleep(100000 * errcount);
+ 					continue;
+ 				}
+ 			}
+ 			/*
+ 			 * If this is blocking socket and we got EAGAIN, this
+ 			 * means the request timed out. Translate the error to
+ 			 * ETIMEDOUT, to give administrator a hint to
+ 			 * eventually increase timeout.
+ 			 */
+ 			if (error == EAGAIN && blocking_socket(sock))
+ 				error = ETIMEDOUT;
+ 			return (error);
+ 		}
+ 		data += done;
+ 		size -= done;
+ 	} while (size > 0);
+ 	if (errcount > 0) {
+ 		pjdlog_info("Data sent successfully after %d ENOBUFS error%s.",
+ 		    errcount, errcount == 1 ? "" : "s");
+ 	}
+
+ 	if (fd == -1)
+ 		return (0);
+ 	return (proto_descriptor_send(sock, fd));
+ }
+
+ /*
+  * Receive a descriptor passed over the socket sock as SCM_RIGHTS ancillary
+  * data and store it in *fdp.
+  *
+  * Returns 0 on success, the errno value of the failed recvmsg(), or EINVAL
+  * when the message carries no SOL_SOCKET/SCM_RIGHTS control header.
+  */
+ static int
+ proto_descriptor_recv(int sock, int *fdp)
+ {
+ 	unsigned char ctrl[CMSG_SPACE(sizeof(*fdp))];
+ 	struct cmsghdr *cmsg;
+ 	struct msghdr msg;
+
+ 	PJDLOG_ASSERT(sock >= 0);
+ 	PJDLOG_ASSERT(fdp != NULL);
+
+ 	bzero(&ctrl, sizeof(ctrl));
+ 	bzero(&msg, sizeof(msg));
+
+ 	/* Control data only, no payload. */
+ 	msg.msg_control = ctrl;
+ 	msg.msg_controllen = sizeof(ctrl);
+ 	msg.msg_iov = NULL;
+ 	msg.msg_iovlen = 0;
+
+ 	if (recvmsg(sock, &msg, 0) == -1)
+ 		return (errno);
+
+ 	cmsg = CMSG_FIRSTHDR(&msg);
+ 	if (cmsg == NULL)
+ 		return (EINVAL);
+ 	if (cmsg->cmsg_level != SOL_SOCKET)
+ 		return (EINVAL);
+ 	if (cmsg->cmsg_type != SCM_RIGHTS)
+ 		return (EINVAL);
+
+ 	bcopy(CMSG_DATA(cmsg), fdp, sizeof(*fdp));
+ 	return (0);
+ }
+
+ /*
+  * Receive exactly size bytes of data from the socket and, when fdp is not
+  * NULL, receive a passed descriptor afterwards.
+  *
+  * When data is NULL (size must be 0 then) nothing is received; the send
+  * direction of the socket is shut down instead.
+  *
+  * Returns 0 on success or an errno value on failure: ENOTCONN when recv()
+  * returns 0 and ETIMEDOUT in place of EAGAIN on a blocking socket.
+  */
+ int
+ proto_common_recv(int sock, unsigned char *data, size_t size, int *fdp)
+ {
+ 	ssize_t done;
+
+ 	PJDLOG_ASSERT(sock >= 0);
+
+ 	if (data == NULL) {
+ 		/* The caller is just trying to decide about direction. */
+
+ 		PJDLOG_ASSERT(size == 0);
+
+ 		if (shutdown(sock, SHUT_WR) == -1)
+ 			return (errno);
+ 		return (0);
+ 	}
+
+ 	PJDLOG_ASSERT(data != NULL);
+ 	PJDLOG_ASSERT(size > 0);
+
+ 	/* Restart the call whenever it is interrupted by a signal. */
+ 	for (;;) {
+ 		done = recv(sock, data, size, MSG_WAITALL);
+ 		if (done != -1 || errno != EINTR)
+ 			break;
+ 	}
+ 	if (done == 0)
+ 		return (ENOTCONN);
+ 	if (done == -1) {
+ 		/*
+ 		 * If this is blocking socket and we got EAGAIN, this
+ 		 * means the request timed out. Translate errno to
+ 		 * ETIMEDOUT, to give administrator a hint to
+ 		 * eventually increase timeout.
+ 		 */
+ 		if (errno == EAGAIN && blocking_socket(sock))
+ 			errno = ETIMEDOUT;
+ 		return (errno);
+ 	}
+
+ 	if (fdp != NULL)
+ 		return (proto_descriptor_recv(sock, fdp));
+ 	return (0);
+ }
diff --git a/sbin/hastd/proto_impl.h b/sbin/hastd/proto_impl.h
new file mode 100644
index 0000000..d62f26f
--- /dev/null
+++ b/sbin/hastd/proto_impl.h
@@ -0,0 +1,79 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _PROTO_IMPL_H_
+#define _PROTO_IMPL_H_
+
+#include <sys/queue.h>
+
+#include <stdbool.h> /* bool */
+#include <stdlib.h> /* size_t */
+
+#define __constructor __attribute__((constructor))
+
+/*
+ * Signatures of the operations a transport implementation may provide.
+ * The "void *" arguments carry the implementation's private context; the
+ * parameter names noted below are the ones used by the implementations
+ * added in this change (proto_tcp.c, proto_uds.c, proto_socketpair.c).
+ */
+typedef int prt_client_t(const char *, const char *, void **); /* (srcaddr, dstaddr, ctxp) */
+typedef int prt_connect_t(void *, int); /* (ctx, timeout) */
+typedef int prt_connect_wait_t(void *, int); /* (ctx, timeout) */
+typedef int prt_server_t(const char *, void **); /* (addr, ctxp) */
+typedef int prt_accept_t(void *, void **); /* (ctx, newctxp) */
+typedef int prt_wrap_t(int, bool, void **); /* (fd, client, ctxp) */
+typedef int prt_send_t(void *, const unsigned char *, size_t, int); /* (ctx, data, size, fd) */
+typedef int prt_recv_t(void *, unsigned char *, size_t, int *); /* (ctx, data, size, fdp) */
+typedef int prt_descriptor_t(const void *); /* (ctx) */
+typedef bool prt_address_match_t(const void *, const char *); /* (ctx, addr) */
+typedef void prt_local_address_t(const void *, char *, size_t); /* (ctx, addr, size) */
+typedef void prt_remote_address_t(const void *, char *, size_t); /* (ctx, addr, size) */
+typedef void prt_close_t(void *); /* (ctx) */
+
+/*
+ * Method table describing one transport.  Implementations fill in only the
+ * operations they support (sp_proto in proto_socketpair.c, for instance,
+ * sets just client, send, recv, descriptor and close) and hand the table to
+ * proto_register().
+ */
+struct proto {
+ const char *prt_name; /* Transport name, e.g. "tcp". */
+ prt_client_t *prt_client;
+ prt_connect_t *prt_connect;
+ prt_connect_wait_t *prt_connect_wait;
+ prt_server_t *prt_server;
+ prt_accept_t *prt_accept;
+ prt_wrap_t *prt_wrap;
+ prt_send_t *prt_send;
+ prt_recv_t *prt_recv;
+ prt_descriptor_t *prt_descriptor;
+ prt_address_match_t *prt_address_match;
+ prt_local_address_t *prt_local_address;
+ prt_remote_address_t *prt_remote_address;
+ prt_close_t *prt_close;
+ TAILQ_ENTRY(proto) prt_next; /* Tail queue linkage. */
+};
+
+/*
+ * Register a transport method table.  Called from the constructors of the
+ * individual implementations; only the TCP transport passes
+ * isdefault == true.
+ * NOTE(review): defined outside this header (presumably proto.c) - confirm
+ * what "default" implies there.
+ */
+void proto_register(struct proto *proto, bool isdefault);
+
+/*
+ * Socket helpers shared by the implementations.  Both optionally transfer a
+ * file descriptor after the data: proto_common_send() does so when
+ * fd != -1, proto_common_recv() when fdp != NULL.
+ */
+int proto_common_send(int sock, const unsigned char *data, size_t size, int fd);
+int proto_common_recv(int sock, unsigned char *data, size_t size, int *fdp);
+
+#endif /* !_PROTO_IMPL_H_ */
diff --git a/sbin/hastd/proto_socketpair.c b/sbin/hastd/proto_socketpair.c
new file mode 100644
index 0000000..d13caa9
--- /dev/null
+++ b/sbin/hastd/proto_socketpair.c
@@ -0,0 +1,237 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/socket.h>
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "pjdlog.h"
+#include "proto_impl.h"
+
+/* Value kept in sp_magic while a socketpair context is valid. */
+#define SP_CTX_MAGIC 0x50c3741
+/*
+ * Private context of the socketpair transport.  Both ends of the pair are
+ * kept until the first send or receive decides which side this process is;
+ * the other end is closed at that point.
+ */
+struct sp_ctx {
+ int sp_magic; /* SP_CTX_MAGIC; zeroed by sp_close(). */
+ int sp_fd[2]; /* [0] is used by the client, [1] by the server. */
+ int sp_side; /* One of the SP_SIDE_* values below. */
+#define SP_SIDE_UNDEF 0
+#define SP_SIDE_CLIENT 1
+#define SP_SIDE_SERVER 2
+};
+
+static void sp_close(void *ctx);
+
+/*
+ * Create a socketpair context.  Only the exact destination address
+ * "socketpair://" is accepted; anything else yields -1.  No source address
+ * may be given.
+ *
+ * On success stores the new context (side still undefined) in *ctxp and
+ * returns 0; otherwise returns the errno value of the failing call.
+ */
+static int
+sp_client(const char *srcaddr, const char *dstaddr, void **ctxp)
+{
+	struct sp_ctx *sp;
+	int error;
+
+	if (strcmp(dstaddr, "socketpair://") != 0)
+		return (-1);
+
+	PJDLOG_ASSERT(srcaddr == NULL);
+
+	sp = malloc(sizeof(*sp));
+	if (sp == NULL)
+		return (errno);
+
+	if (socketpair(PF_UNIX, SOCK_STREAM, 0, sp->sp_fd) != -1) {
+		sp->sp_magic = SP_CTX_MAGIC;
+		sp->sp_side = SP_SIDE_UNDEF;
+		*ctxp = sp;
+		return (0);
+	}
+
+	/* Remember errno before free(3) gets a chance to change it. */
+	error = errno;
+	free(sp);
+	return (error);
+}
+
+/*
+ * Send 'size' bytes of 'data' (and optionally descriptor 'fd') to the peer.
+ *
+ * The first call on a context with an undefined side turns it into the
+ * client side and closes the server end of the pair.  A NULL 'data' only
+ * performs that side selection and returns 0.  Otherwise the result of
+ * proto_common_send() is returned.
+ */
+static int
+sp_send(void *ctx, const unsigned char *data, size_t size, int fd)
+{
+ struct sp_ctx *spctx = ctx;
+ int sock;
+
+ PJDLOG_ASSERT(spctx != NULL);
+ PJDLOG_ASSERT(spctx->sp_magic == SP_CTX_MAGIC);
+
+ switch (spctx->sp_side) {
+ case SP_SIDE_UNDEF:
+ /*
+ * If the first operation done by the caller is proto_send(),
+ * we assume this is the client.
+ */
+ spctx->sp_side = SP_SIDE_CLIENT;
+ /* Close other end. */
+ close(spctx->sp_fd[1]);
+ spctx->sp_fd[1] = -1;
+ /* FALLTHROUGH */
+ case SP_SIDE_CLIENT:
+ PJDLOG_ASSERT(spctx->sp_fd[0] >= 0);
+ sock = spctx->sp_fd[0];
+ break;
+ case SP_SIDE_SERVER:
+ PJDLOG_ASSERT(spctx->sp_fd[1] >= 0);
+ sock = spctx->sp_fd[1];
+ break;
+ default:
+ PJDLOG_ABORT("Invalid socket side (%d).", spctx->sp_side);
+ }
+
+ /* Someone is just trying to decide about side. */
+ if (data == NULL)
+ return (0);
+
+ return (proto_common_send(sock, data, size, fd));
+}
+
+/*
+ * Receive 'size' bytes into 'data' (and optionally a descriptor into *fdp).
+ *
+ * The first call on a context with an undefined side turns it into the
+ * server side and closes the client end of the pair.  A NULL 'data' only
+ * performs that side selection and returns 0.  Otherwise the result of
+ * proto_common_recv() is returned.
+ */
+static int
+sp_recv(void *ctx, unsigned char *data, size_t size, int *fdp)
+{
+ struct sp_ctx *spctx = ctx;
+ int fd; /* Socket to read from, not a descriptor being passed. */
+
+ PJDLOG_ASSERT(spctx != NULL);
+ PJDLOG_ASSERT(spctx->sp_magic == SP_CTX_MAGIC);
+
+ switch (spctx->sp_side) {
+ case SP_SIDE_UNDEF:
+ /*
+ * If the first operation done by the caller is proto_recv(),
+ * we assume this is the server.
+ */
+ spctx->sp_side = SP_SIDE_SERVER;
+ /* Close other end. */
+ close(spctx->sp_fd[0]);
+ spctx->sp_fd[0] = -1;
+ /* FALLTHROUGH */
+ case SP_SIDE_SERVER:
+ PJDLOG_ASSERT(spctx->sp_fd[1] >= 0);
+ fd = spctx->sp_fd[1];
+ break;
+ case SP_SIDE_CLIENT:
+ PJDLOG_ASSERT(spctx->sp_fd[0] >= 0);
+ fd = spctx->sp_fd[0];
+ break;
+ default:
+ PJDLOG_ABORT("Invalid socket side (%d).", spctx->sp_side);
+ }
+
+ /* Someone is just trying to decide about side. */
+ if (data == NULL)
+ return (0);
+
+ return (proto_common_recv(fd, data, size, fdp));
+}
+
+/*
+ * Return the socket descriptor this side of the pair uses: sp_fd[0] for
+ * the client, sp_fd[1] for the server.  The side must already be decided.
+ */
+static int
+sp_descriptor(const void *ctx)
+{
+	const struct sp_ctx *sp = ctx;
+
+	PJDLOG_ASSERT(sp != NULL);
+	PJDLOG_ASSERT(sp->sp_magic == SP_CTX_MAGIC);
+	PJDLOG_ASSERT(sp->sp_side == SP_SIDE_CLIENT ||
+	    sp->sp_side == SP_SIDE_SERVER);
+
+	if (sp->sp_side == SP_SIDE_CLIENT) {
+		PJDLOG_ASSERT(sp->sp_fd[0] >= 0);
+		return (sp->sp_fd[0]);
+	}
+	if (sp->sp_side == SP_SIDE_SERVER) {
+		PJDLOG_ASSERT(sp->sp_fd[1] >= 0);
+		return (sp->sp_fd[1]);
+	}
+
+	PJDLOG_ABORT("Invalid socket side (%d).", sp->sp_side);
+}
+
+/*
+ * Close whichever ends of the pair are still open for the current side and
+ * free the context.  With an undefined side both ends are closed; otherwise
+ * only this side's end is, and the other end must already be closed.
+ */
+static void
+sp_close(void *ctx)
+{
+	struct sp_ctx *sp = ctx;
+
+	PJDLOG_ASSERT(sp != NULL);
+	PJDLOG_ASSERT(sp->sp_magic == SP_CTX_MAGIC);
+
+	if (sp->sp_side == SP_SIDE_UNDEF) {
+		PJDLOG_ASSERT(sp->sp_fd[0] >= 0);
+		close(sp->sp_fd[0]);
+		sp->sp_fd[0] = -1;
+		PJDLOG_ASSERT(sp->sp_fd[1] >= 0);
+		close(sp->sp_fd[1]);
+		sp->sp_fd[1] = -1;
+	} else if (sp->sp_side == SP_SIDE_CLIENT) {
+		PJDLOG_ASSERT(sp->sp_fd[0] >= 0);
+		close(sp->sp_fd[0]);
+		sp->sp_fd[0] = -1;
+		PJDLOG_ASSERT(sp->sp_fd[1] == -1);
+	} else if (sp->sp_side == SP_SIDE_SERVER) {
+		PJDLOG_ASSERT(sp->sp_fd[1] >= 0);
+		close(sp->sp_fd[1]);
+		sp->sp_fd[1] = -1;
+		PJDLOG_ASSERT(sp->sp_fd[0] == -1);
+	} else {
+		PJDLOG_ABORT("Invalid socket side (%d).", sp->sp_side);
+	}
+
+	/* Invalidate the magic so a stale pointer trips the assertions. */
+	sp->sp_magic = 0;
+	free(sp);
+}
+
+/*
+ * Method table of the socketpair transport.  Operations that are not
+ * listed (connect, server, accept, wrap, address queries) are left unset.
+ */
+static struct proto sp_proto = {
+ .prt_name = "socketpair",
+ .prt_client = sp_client,
+ .prt_send = sp_send,
+ .prt_recv = sp_recv,
+ .prt_descriptor = sp_descriptor,
+ .prt_close = sp_close
+};
+
+/* Runs as a constructor and registers the transport as non-default. */
+static __constructor void
+sp_ctor(void)
+{
+
+ proto_register(&sp_proto, false);
+}
diff --git a/sbin/hastd/proto_tcp.c b/sbin/hastd/proto_tcp.c
new file mode 100644
index 0000000..6dc0661
--- /dev/null
+++ b/sbin/hastd/proto_tcp.c
@@ -0,0 +1,637 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h> /* MAXHOSTNAMELEN */
+#include <sys/socket.h>
+
+#include <arpa/inet.h>
+
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <netdb.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "pjdlog.h"
+#include "proto_impl.h"
+#include "subr.h"
+
+/* Value kept in tc_magic while a TCP context is valid. */
+#define TCP_CTX_MAGIC 0x7c41c
+/* Private context of the TCP transport. */
+struct tcp_ctx {
+ int tc_magic; /* TCP_CTX_MAGIC; zeroed by tcp_close(). */
+ struct sockaddr_storage tc_sa; /* Parsed address; family is AF_UNSPEC for wrapped descriptors. */
+ int tc_fd; /* Socket descriptor. */
+ int tc_side; /* One of the TCP_SIDE_* values below. */
+#define TCP_SIDE_CLIENT 0
+#define TCP_SIDE_SERVER_LISTEN 1
+#define TCP_SIDE_SERVER_WORK 2
+};
+
+static int tcp_connect_wait(void *ctx, int timeout);
+static void tcp_close(void *ctx);
+
+/*
+ * Convert a string consisting solely of decimal digits to a number.
+ *
+ * The string must be non-empty, contain only the characters '0'-'9' and
+ * the resulting value must lie within [minnum, maxnum].  On success the
+ * value is stored in *nump and 0 is returned; otherwise errno is set to
+ * EINVAL, -1 is returned and *nump is left untouched.
+ */
+static int
+numfromstr(const char *str, intmax_t minnum, intmax_t maxnum, intmax_t *nump)
+{
+	intmax_t digit, num;
+
+	if (str[0] == '\0')
+		goto invalid;	/* Empty string. */
+	num = 0;
+	for (; *str != '\0'; str++) {
+		if (*str < '0' || *str > '9')
+			goto invalid;	/* Non-digit character. */
+		digit = *str - '0';
+		/*
+		 * Reject the digit before it can overflow: signed overflow
+		 * is undefined behaviour, so it must not be detected after
+		 * the fact.
+		 */
+		if (num > (INTMAX_MAX - digit) / 10)
+			goto invalid;	/* Overflow. */
+		num = num * 10 + digit;
+		if (num > maxnum)
+			goto invalid;	/* Too big. */
+	}
+	if (num < minnum)
+		goto invalid;	/* Too small. */
+	*nump = num;
+	return (0);
+invalid:
+	errno = EINVAL;
+	return (-1);
+}
+
+/*
+ * Parse a TCP address string and resolve it into *sap.
+ *
+ * Accepted forms are an optional "tcp://", "tcp4://" or "tcp6://" prefix
+ * (the latter two restrict the address family) followed by a host name,
+ * IPv4 address or IPv6 address, optionally followed by ":port".  An IPv6
+ * address that carries a port has to be enclosed in brackets.  When no
+ * port is given, 'defport' is used.
+ *
+ * Returns 0 on success, -1 if 'addr' is NULL, EINVAL for an invalid port,
+ * a failed lookup or a result that does not fit in *sap, ENAMETOOLONG if
+ * the host part is too long and ENOENT if the lookup yields no result.
+ */
+static int
+tcp_addr(const char *addr, int defport, struct sockaddr_storage *sap)
+{
+	char iporhost[MAXHOSTNAMELEN], portstr[6];
+	struct addrinfo hints;
+	struct addrinfo *res;
+	const char *pp;
+	intmax_t port;
+	size_t size;
+	int error;
+
+	if (addr == NULL)
+		return (-1);
+
+	bzero(&hints, sizeof(hints));
+	hints.ai_flags = AI_ADDRCONFIG | AI_NUMERICSERV;
+	hints.ai_family = PF_UNSPEC;
+	hints.ai_socktype = SOCK_STREAM;
+	hints.ai_protocol = IPPROTO_TCP;
+
+	if (strncasecmp(addr, "tcp4://", 7) == 0) {
+		addr += 7;
+		hints.ai_family = PF_INET;
+	} else if (strncasecmp(addr, "tcp6://", 7) == 0) {
+		addr += 7;
+		hints.ai_family = PF_INET6;
+	} else if (strncasecmp(addr, "tcp://", 6) == 0) {
+		addr += 6;
+	} else {
+		/*
+		 * Because TCP is the default assume IP or host is given without
+		 * prefix.
+		 */
+	}
+
+	/*
+	 * Extract optional port.
+	 * There are three cases to consider.
+	 * 1. hostname with port, eg. freefall.freebsd.org:8457
+	 * 2. IPv4 address with port, eg. 192.168.0.101:8457
+	 * 3. IPv6 address with port, eg. [fe80::1]:8457
+	 * We discover IPv6 address by checking for two colons and if port is
+	 * given, the address has to start with [.
+	 */
+	pp = NULL;
+	if (strchr(addr, ':') != strrchr(addr, ':')) {
+		if (addr[0] == '[')
+			pp = strrchr(addr, ':');
+	} else {
+		pp = strrchr(addr, ':');
+	}
+	if (pp == NULL) {
+		/* Port not given, use the default. */
+		port = defport;
+	} else {
+		if (numfromstr(pp + 1, 1, 65535, &port) == -1)
+			return (errno);
+	}
+	(void)snprintf(portstr, sizeof(portstr), "%jd", (intmax_t)port);
+	/* Extract host name or IP address. */
+	if (pp == NULL) {
+		size = sizeof(iporhost);
+		if (strlcpy(iporhost, addr, size) >= size)
+			return (ENAMETOOLONG);
+	} else if (addr[0] == '[' && pp[-1] == ']') {
+		/* Skip both brackets; +1 leaves room for the terminator. */
+		size = (size_t)(pp - addr - 2 + 1);
+		if (size > sizeof(iporhost))
+			return (ENAMETOOLONG);
+		(void)strlcpy(iporhost, addr + 1, size);
+	} else {
+		size = (size_t)(pp - addr + 1);
+		if (size > sizeof(iporhost))
+			return (ENAMETOOLONG);
+		(void)strlcpy(iporhost, addr, size);
+	}
+
+	error = getaddrinfo(iporhost, portstr, &hints, &res);
+	if (error != 0) {
+		pjdlog_debug(1, "getaddrinfo(%s, %s) failed: %s.", iporhost,
+		    portstr, gai_strerror(error));
+		return (EINVAL);
+	}
+	if (res == NULL)
+		return (ENOENT);
+	/* Never copy more than the caller's sockaddr_storage can hold. */
+	if (res->ai_addrlen > sizeof(*sap)) {
+		freeaddrinfo(res);
+		return (EINVAL);
+	}
+
+	memcpy(sap, res->ai_addr, res->ai_addrlen);
+
+	freeaddrinfo(res);
+
+	return (0);
+}
+
+/*
+ * Allocate a TCP context for 'addr', create its socket and enable
+ * TCP_NODELAY on it (failure to do so is only logged as a warning).
+ *
+ * 'side' is TCP_SIDE_CLIENT or TCP_SIDE_SERVER_LISTEN.  On success the
+ * context is stored in *ctxp and 0 is returned; otherwise nothing is
+ * allocated and the error from malloc(3), tcp_addr() or socket(2) is
+ * returned.
+ */
+static int
+tcp_setup_new(const char *addr, int side, void **ctxp)
+{
+	struct tcp_ctx *tctx;
+	int ret, nodelay;
+
+	PJDLOG_ASSERT(addr != NULL);
+	PJDLOG_ASSERT(side == TCP_SIDE_CLIENT ||
+	    side == TCP_SIDE_SERVER_LISTEN);
+	PJDLOG_ASSERT(ctxp != NULL);
+
+	tctx = malloc(sizeof(*tctx));
+	if (tctx == NULL)
+		return (errno);
+
+	/* Parse given address. */
+	if ((ret = tcp_addr(addr, PROTO_TCP_DEFAULT_PORT, &tctx->tc_sa)) != 0) {
+		free(tctx);
+		return (ret);
+	}
+
+	PJDLOG_ASSERT(tctx->tc_sa.ss_family != AF_UNSPEC);
+
+	tctx->tc_fd = socket(tctx->tc_sa.ss_family, SOCK_STREAM, 0);
+	if (tctx->tc_fd == -1) {
+		ret = errno;
+		free(tctx);
+		return (ret);
+	}
+
+	/* Socket settings. */
+	nodelay = 1;
+	if (setsockopt(tctx->tc_fd, IPPROTO_TCP, TCP_NODELAY, &nodelay,
+	    sizeof(nodelay)) == -1) {
+		pjdlog_errno(LOG_WARNING, "Unable to set TCP_NODELAY");
+	}
+
+	tctx->tc_side = side;
+	tctx->tc_magic = TCP_CTX_MAGIC;
+	*ctxp = tctx;
+
+	return (0);
+}
+
+/*
+ * Build a TCP context around an already existing descriptor.  No address
+ * is recorded for such a context (its family is set to AF_UNSPEC).
+ *
+ * 'side' is TCP_SIDE_CLIENT or TCP_SIDE_SERVER_WORK.  Returns 0 and stores
+ * the context in *ctxp, or the errno value left by malloc(3).
+ */
+static int
+tcp_setup_wrap(int fd, int side, void **ctxp)
+{
+	struct tcp_ctx *tc;
+
+	PJDLOG_ASSERT(fd >= 0);
+	PJDLOG_ASSERT(side == TCP_SIDE_CLIENT ||
+	    side == TCP_SIDE_SERVER_WORK);
+	PJDLOG_ASSERT(ctxp != NULL);
+
+	if ((tc = malloc(sizeof(*tc))) == NULL)
+		return (errno);
+
+	tc->tc_magic = TCP_CTX_MAGIC;
+	tc->tc_side = side;
+	tc->tc_sa.ss_family = AF_UNSPEC;
+	tc->tc_fd = fd;
+
+	*ctxp = tc;
+	return (0);
+}
+
+/*
+ * Create a client context for 'dstaddr'.  When 'srcaddr' is given, the
+ * socket is additionally bound to it (resolved with a default port of 0).
+ *
+ * Returns 0 on success.  If resolving or binding the source address fails,
+ * the freshly created context is closed and the error is returned.
+ */
+static int
+tcp_client(const char *srcaddr, const char *dstaddr, void **ctxp)
+{
+	struct sockaddr_storage src;
+	struct tcp_ctx *tc;
+	int error;
+
+	error = tcp_setup_new(dstaddr, TCP_SIDE_CLIENT, ctxp);
+	if (error != 0)
+		return (error);
+	if (srcaddr == NULL)
+		return (0);
+
+	tc = *ctxp;
+	error = tcp_addr(srcaddr, 0, &src);
+	if (error == 0 &&
+	    bind(tc->tc_fd, (struct sockaddr *)&src, src.ss_len) == -1) {
+		error = errno;
+	}
+	if (error != 0)
+		tcp_close(tc);
+	return (error);
+}
+
+/*
+ * Start (and optionally finish) connecting a client context.
+ *
+ * The socket is switched to non-blocking mode before connect(2) is issued.
+ * With timeout == -1 the function returns 0 as soon as the connection is
+ * established or in progress and the socket is left non-blocking.
+ * With timeout >= 0 an in-progress connection is handed to
+ * tcp_connect_wait(); on immediate success or failure blocking mode is
+ * restored here.
+ *
+ * Returns 0 on success or an errno value.
+ */
+static int
+tcp_connect(void *ctx, int timeout)
+{
+ struct tcp_ctx *tctx = ctx;
+ int error, flags;
+
+ PJDLOG_ASSERT(tctx != NULL);
+ PJDLOG_ASSERT(tctx->tc_magic == TCP_CTX_MAGIC);
+ PJDLOG_ASSERT(tctx->tc_side == TCP_SIDE_CLIENT);
+ PJDLOG_ASSERT(tctx->tc_fd >= 0);
+ PJDLOG_ASSERT(tctx->tc_sa.ss_family != AF_UNSPEC);
+ PJDLOG_ASSERT(timeout >= -1);
+
+ flags = fcntl(tctx->tc_fd, F_GETFL);
+ if (flags == -1) {
+ /* NOTE(review): errno is returned after logging; assumes pjdlog_common() preserves it - confirm. */
+ pjdlog_common(LOG_DEBUG, 1, errno, "fcntl(F_GETFL) failed");
+ return (errno);
+ }
+ /*
+ * We make socket non-blocking so we can handle connection timeout
+ * manually.
+ */
+ flags |= O_NONBLOCK;
+ if (fcntl(tctx->tc_fd, F_SETFL, flags) == -1) {
+ pjdlog_common(LOG_DEBUG, 1, errno,
+ "fcntl(F_SETFL, O_NONBLOCK) failed");
+ return (errno);
+ }
+
+ if (connect(tctx->tc_fd, (struct sockaddr *)&tctx->tc_sa,
+ tctx->tc_sa.ss_len) == 0) {
+ /* Connected immediately. */
+ if (timeout == -1)
+ return (0);
+ error = 0;
+ goto done;
+ }
+ if (errno != EINPROGRESS) {
+ /* Hard failure; the error is saved before logging. */
+ error = errno;
+ pjdlog_common(LOG_DEBUG, 1, errno, "connect() failed");
+ goto done;
+ }
+ /* Connection is in progress. */
+ if (timeout == -1)
+ return (0);
+ return (tcp_connect_wait(ctx, timeout));
+done:
+ /* Restore blocking mode; keep the first error encountered. */
+ flags &= ~O_NONBLOCK;
+ if (fcntl(tctx->tc_fd, F_SETFL, flags) == -1) {
+ if (error == 0)
+ error = errno;
+ pjdlog_common(LOG_DEBUG, 1, errno,
+ "fcntl(F_SETFL, ~O_NONBLOCK) failed");
+ }
+ return (error);
+}
+
+/*
+ * Wait up to 'timeout' seconds for a non-blocking connect to complete.
+ *
+ * The socket is polled for writability with select(2) (restarted on
+ * EINTR); the outcome of the connection attempt is then read with
+ * SO_ERROR.  On every path that reaches 'done' the O_NONBLOCK flag is
+ * cleared again.
+ *
+ * Returns 0 on success, ETIMEDOUT if select(2) timed out, or an errno
+ * value from select(2), getsockopt(2), the connection itself or fcntl(2).
+ */
+static int
+tcp_connect_wait(void *ctx, int timeout)
+{
+ struct tcp_ctx *tctx = ctx;
+ struct timeval tv;
+ fd_set fdset;
+ socklen_t esize;
+ int error, flags, ret;
+
+ PJDLOG_ASSERT(tctx != NULL);
+ PJDLOG_ASSERT(tctx->tc_magic == TCP_CTX_MAGIC);
+ PJDLOG_ASSERT(tctx->tc_side == TCP_SIDE_CLIENT);
+ PJDLOG_ASSERT(tctx->tc_fd >= 0);
+ PJDLOG_ASSERT(timeout >= 0);
+
+ tv.tv_sec = timeout;
+ tv.tv_usec = 0;
+again:
+ /* The set is rebuilt on every attempt; tv is not reinitialized. */
+ FD_ZERO(&fdset);
+ FD_SET(tctx->tc_fd, &fdset);
+ ret = select(tctx->tc_fd + 1, NULL, &fdset, NULL, &tv);
+ if (ret == 0) {
+ error = ETIMEDOUT;
+ goto done;
+ } else if (ret == -1) {
+ if (errno == EINTR)
+ goto again;
+ error = errno;
+ pjdlog_common(LOG_DEBUG, 1, errno, "select() failed");
+ goto done;
+ }
+ PJDLOG_ASSERT(ret > 0);
+ PJDLOG_ASSERT(FD_ISSET(tctx->tc_fd, &fdset));
+ /* Writable: fetch the result of the connection attempt. */
+ esize = sizeof(error);
+ if (getsockopt(tctx->tc_fd, SOL_SOCKET, SO_ERROR, &error,
+ &esize) == -1) {
+ error = errno;
+ pjdlog_common(LOG_DEBUG, 1, errno,
+ "getsockopt(SO_ERROR) failed");
+ goto done;
+ }
+ if (error != 0) {
+ pjdlog_common(LOG_DEBUG, 1, error,
+ "getsockopt(SO_ERROR) returned error");
+ goto done;
+ }
+ error = 0;
+done:
+ /* Switch back to blocking mode; keep the first error encountered. */
+ flags = fcntl(tctx->tc_fd, F_GETFL);
+ if (flags == -1) {
+ if (error == 0)
+ error = errno;
+ pjdlog_common(LOG_DEBUG, 1, errno, "fcntl(F_GETFL) failed");
+ return (error);
+ }
+ flags &= ~O_NONBLOCK;
+ if (fcntl(tctx->tc_fd, F_SETFL, flags) == -1) {
+ if (error == 0)
+ error = errno;
+ pjdlog_common(LOG_DEBUG, 1, errno,
+ "fcntl(F_SETFL, ~O_NONBLOCK) failed");
+ }
+ return (error);
+}
+
+/*
+ * Create a listening context for 'addr': the socket gets SO_REUSEADDR
+ * (best effort), is bound to the parsed address and put into listening
+ * state with a backlog of 8.
+ *
+ * Returns 0 on success.  If bind(2) or listen(2) fails, the context is
+ * closed and the errno value is returned.
+ */
+static int
+tcp_server(const char *addr, void **ctxp)
+{
+	struct tcp_ctx *tc;
+	int error, on;
+
+	error = tcp_setup_new(addr, TCP_SIDE_SERVER_LISTEN, ctxp);
+	if (error != 0)
+		return (error);
+	tc = *ctxp;
+
+	/* Failure to set the option is deliberately ignored. */
+	on = 1;
+	(void)setsockopt(tc->tc_fd, SOL_SOCKET, SO_REUSEADDR, &on,
+	    sizeof(on));
+
+	PJDLOG_ASSERT(tc->tc_sa.ss_family != AF_UNSPEC);
+
+	if (bind(tc->tc_fd, (struct sockaddr *)&tc->tc_sa,
+	    tc->tc_sa.ss_len) == -1 ||
+	    listen(tc->tc_fd, 8) == -1) {
+		error = errno;
+		tcp_close(tc);
+		return (error);
+	}
+
+	return (0);
+}
+
+/*
+ * Accept a connection on a listening context and create a new
+ * TCP_SIDE_SERVER_WORK context for it.
+ *
+ * The peer address is stored in the NEW context; the listening context's
+ * own address is left untouched.  Returns 0 and stores the new context in
+ * *newctxp, or returns the errno value of malloc(3) or accept(2).
+ */
+static int
+tcp_accept(void *ctx, void **newctxp)
+{
+	struct tcp_ctx *tctx = ctx;
+	struct tcp_ctx *newtctx;
+	socklen_t fromlen;
+	int ret;
+
+	PJDLOG_ASSERT(tctx != NULL);
+	PJDLOG_ASSERT(tctx->tc_magic == TCP_CTX_MAGIC);
+	PJDLOG_ASSERT(tctx->tc_side == TCP_SIDE_SERVER_LISTEN);
+	PJDLOG_ASSERT(tctx->tc_fd >= 0);
+	PJDLOG_ASSERT(tctx->tc_sa.ss_family != AF_UNSPEC);
+
+	newtctx = malloc(sizeof(*newtctx));
+	if (newtctx == NULL)
+		return (errno);
+
+	/*
+	 * Let accept(2) fill in the new context's address.  Writing into
+	 * the listening context would clobber the address it is bound to
+	 * and leave the new context's tc_sa uninitialized.
+	 */
+	fromlen = sizeof(newtctx->tc_sa);
+	newtctx->tc_fd = accept(tctx->tc_fd,
+	    (struct sockaddr *)&newtctx->tc_sa, &fromlen);
+	if (newtctx->tc_fd == -1) {
+		ret = errno;
+		free(newtctx);
+		return (ret);
+	}
+
+	newtctx->tc_side = TCP_SIDE_SERVER_WORK;
+	newtctx->tc_magic = TCP_CTX_MAGIC;
+	*newctxp = newtctx;
+
+	return (0);
+}
+
+/*
+ * Wrap an existing descriptor in a TCP context, as the client side when
+ * 'client' is true and as a server work connection otherwise.
+ */
+static int
+tcp_wrap(int fd, bool client, void **ctxp)
+{
+	int side;
+
+	side = client ? TCP_SIDE_CLIENT : TCP_SIDE_SERVER_WORK;
+	return (tcp_setup_wrap(fd, side, ctxp));
+}
+
+/*
+ * Send 'size' bytes of 'data' over the context's socket.  Descriptor
+ * passing is not available on this transport, so 'fd' must be -1.
+ */
+static int
+tcp_send(void *ctx, const unsigned char *data, size_t size, int fd)
+{
+	struct tcp_ctx *tc;
+
+	tc = ctx;
+	PJDLOG_ASSERT(tc != NULL);
+	PJDLOG_ASSERT(tc->tc_magic == TCP_CTX_MAGIC);
+	PJDLOG_ASSERT(tc->tc_fd >= 0);
+	PJDLOG_ASSERT(fd == -1);
+
+	return (proto_common_send(tc->tc_fd, data, size, -1));
+}
+
+/*
+ * Receive 'size' bytes into 'data' from the context's socket.  Descriptor
+ * passing is not available on this transport, so 'fdp' must be NULL.
+ */
+static int
+tcp_recv(void *ctx, unsigned char *data, size_t size, int *fdp)
+{
+	struct tcp_ctx *tc;
+
+	tc = ctx;
+	PJDLOG_ASSERT(tc != NULL);
+	PJDLOG_ASSERT(tc->tc_magic == TCP_CTX_MAGIC);
+	PJDLOG_ASSERT(tc->tc_fd >= 0);
+	PJDLOG_ASSERT(fdp == NULL);
+
+	return (proto_common_recv(tc->tc_fd, data, size, NULL));
+}
+
+/* Return the socket descriptor held by the context. */
+static int
+tcp_descriptor(const void *ctx)
+{
+	const struct tcp_ctx *tc;
+
+	tc = ctx;
+	PJDLOG_ASSERT(tc != NULL);
+	PJDLOG_ASSERT(tc->tc_magic == TCP_CTX_MAGIC);
+
+	return (tc->tc_fd);
+}
+
+/*
+ * Tell whether the peer of the context's socket has the IP address that
+ * 'addr' resolves to.  Only the address is compared, never the port.
+ * Returns false if 'addr' cannot be resolved, the peer address cannot be
+ * obtained, or the address family is neither AF_INET nor AF_INET6.
+ */
+static bool
+tcp_address_match(const void *ctx, const char *addr)
+{
+	const struct tcp_ctx *tc = ctx;
+	struct sockaddr_storage want, peer;
+	socklen_t peerlen;
+
+	PJDLOG_ASSERT(tc != NULL);
+	PJDLOG_ASSERT(tc->tc_magic == TCP_CTX_MAGIC);
+
+	if (tcp_addr(addr, PROTO_TCP_DEFAULT_PORT, &want) != 0)
+		return (false);
+
+	peerlen = sizeof(peer);
+	if (getpeername(tc->tc_fd, (struct sockaddr *)&peer, &peerlen) == -1)
+		return (false);
+
+	if (want.ss_family != peer.ss_family)
+		return (false);
+	if (want.ss_len != peer.ss_len)
+		return (false);
+
+	if (want.ss_family == AF_INET) {
+		struct sockaddr_in *a, *b;
+
+		a = (struct sockaddr_in *)&want;
+		b = (struct sockaddr_in *)&peer;
+		return (memcmp(&a->sin_addr, &b->sin_addr,
+		    sizeof(a->sin_addr)) == 0);
+	}
+	if (want.ss_family == AF_INET6) {
+		struct sockaddr_in6 *a, *b;
+
+		a = (struct sockaddr_in6 *)&want;
+		b = (struct sockaddr_in6 *)&peer;
+		return (memcmp(&a->sin6_addr, &b->sin6_addr,
+		    sizeof(a->sin6_addr)) == 0);
+	}
+
+	return (false);
+}
+
+/*
+ * Format the local address of the context's socket as "tcp://..." into
+ * 'addr' ('size' bytes).  "N/A" is stored when getsockname(2) fails.  The
+ * result must fit in the buffer.
+ * NOTE(review): "%S" with a sockaddr pointer is not a standard printf
+ * conversion; presumably a custom renderer is registered elsewhere -
+ * confirm.
+ */
+static void
+tcp_local_address(const void *ctx, char *addr, size_t size)
+{
+	const struct tcp_ctx *tc = ctx;
+	struct sockaddr_storage local;
+	socklen_t locallen;
+
+	PJDLOG_ASSERT(tc != NULL);
+	PJDLOG_ASSERT(tc->tc_magic == TCP_CTX_MAGIC);
+
+	locallen = sizeof(local);
+	if (getsockname(tc->tc_fd, (struct sockaddr *)&local,
+	    &locallen) != -1) {
+		PJDLOG_VERIFY(snprintf(addr, size, "tcp://%S", &local) < (ssize_t)size);
+		return;
+	}
+	PJDLOG_VERIFY(strlcpy(addr, "N/A", size) < size);
+}
+
+/*
+ * Format the peer address of the context's socket as "tcp://..." into
+ * 'addr' ('size' bytes).  "N/A" is stored when getpeername(2) fails.  The
+ * result must fit in the buffer.
+ * NOTE(review): "%S" with a sockaddr pointer is not a standard printf
+ * conversion; presumably a custom renderer is registered elsewhere -
+ * confirm.
+ */
+static void
+tcp_remote_address(const void *ctx, char *addr, size_t size)
+{
+	const struct tcp_ctx *tc = ctx;
+	struct sockaddr_storage peer;
+	socklen_t peerlen;
+
+	PJDLOG_ASSERT(tc != NULL);
+	PJDLOG_ASSERT(tc->tc_magic == TCP_CTX_MAGIC);
+
+	peerlen = sizeof(peer);
+	if (getpeername(tc->tc_fd, (struct sockaddr *)&peer,
+	    &peerlen) != -1) {
+		PJDLOG_VERIFY(snprintf(addr, size, "tcp://%S", &peer) < (ssize_t)size);
+		return;
+	}
+	PJDLOG_VERIFY(strlcpy(addr, "N/A", size) < size);
+}
+
+/*
+ * Close the context's socket (if it holds one) and free the context.  The
+ * magic is cleared first so that a stale pointer trips the assertions.
+ */
+static void
+tcp_close(void *ctx)
+{
+	struct tcp_ctx *tc;
+
+	tc = ctx;
+	PJDLOG_ASSERT(tc != NULL);
+	PJDLOG_ASSERT(tc->tc_magic == TCP_CTX_MAGIC);
+
+	if (tc->tc_fd >= 0) {
+		close(tc->tc_fd);
+	}
+	tc->tc_magic = 0;
+	free(tc);
+}
+
+/* Method table of the TCP transport; every operation is provided. */
+static struct proto tcp_proto = {
+ .prt_name = "tcp",
+ .prt_client = tcp_client,
+ .prt_connect = tcp_connect,
+ .prt_connect_wait = tcp_connect_wait,
+ .prt_server = tcp_server,
+ .prt_accept = tcp_accept,
+ .prt_wrap = tcp_wrap,
+ .prt_send = tcp_send,
+ .prt_recv = tcp_recv,
+ .prt_descriptor = tcp_descriptor,
+ .prt_address_match = tcp_address_match,
+ .prt_local_address = tcp_local_address,
+ .prt_remote_address = tcp_remote_address,
+ .prt_close = tcp_close
+};
+
+/* Runs as a constructor and registers TCP as the default transport. */
+static __constructor void
+tcp_ctor(void)
+{
+
+ proto_register(&tcp_proto, true);
+}
diff --git a/sbin/hastd/proto_uds.c b/sbin/hastd/proto_uds.c
new file mode 100644
index 0000000..087b788
--- /dev/null
+++ b/sbin/hastd/proto_uds.c
@@ -0,0 +1,361 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/* UDS - UNIX Domain Socket */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "pjdlog.h"
+#include "proto_impl.h"
+
+/* Value kept in uc_magic while a UDS context is valid. */
+#define UDS_CTX_MAGIC 0xd541c
+/* Private context of the UNIX domain socket transport. */
+struct uds_ctx {
+ int uc_magic; /* UDS_CTX_MAGIC; zeroed by uds_close(). */
+ struct sockaddr_un uc_sun; /* Socket address (path). */
+ int uc_fd; /* Socket descriptor. */
+ int uc_side; /* One of the UDS_SIDE_* values below. */
+#define UDS_SIDE_CLIENT 0
+#define UDS_SIDE_SERVER_LISTEN 1
+#define UDS_SIDE_SERVER_WORK 2
+ pid_t uc_owner; /* PID that bound the socket, 0 if none; gates unlink on close. */
+};
+
+static void uds_close(void *ctx);
+
+/*
+ * Turn an address string into a sockaddr_un.
+ *
+ * Accepted forms are "uds://<path>", "unix://<path>" (prefixes are matched
+ * case-insensitively) and a bare absolute path that contains no "://".
+ *
+ * Returns 0 on success, -1 if 'addr' is NULL or not in one of the accepted
+ * forms, and ENAMETOOLONG if the path does not fit in sun_path.
+ */
+static int
+uds_addr(const char *addr, struct sockaddr_un *sunp)
+{
+	size_t len;
+
+	if (addr == NULL)
+		return (-1);
+
+	if (strncasecmp(addr, "uds://", 6) == 0) {
+		addr += 6;
+	} else if (strncasecmp(addr, "unix://", 7) == 0) {
+		addr += 7;
+	} else if (addr[0] != '/' || strstr(addr, "://") != NULL) {
+		/* Neither one of our prefixes nor a bare absolute path. */
+		return (-1);
+	}
+
+	sunp->sun_family = AF_UNIX;
+	len = strlcpy(sunp->sun_path, addr, sizeof(sunp->sun_path));
+	if (len >= sizeof(sunp->sun_path))
+		return (ENAMETOOLONG);
+	sunp->sun_len = SUN_LEN(sunp);
+
+	return (0);
+}
+
+/*
+ * Allocate a UDS context for 'addr' and create its socket.  The context
+ * starts without an owner.
+ *
+ * Returns 0 and stores the context in *ctxp, or returns the error from
+ * malloc(3), uds_addr() or socket(2) with nothing left allocated.
+ */
+static int
+uds_common_setup(const char *addr, void **ctxp, int side)
+{
+	struct uds_ctx *uc;
+	int error;
+
+	uc = malloc(sizeof(*uc));
+	if (uc == NULL)
+		return (errno);
+
+	/* Parse the address first, then create the socket. */
+	error = uds_addr(addr, &uc->uc_sun);
+	if (error == 0) {
+		uc->uc_fd = socket(AF_UNIX, SOCK_STREAM, 0);
+		if (uc->uc_fd == -1)
+			error = errno;
+	}
+	if (error != 0) {
+		free(uc);
+		return (error);
+	}
+
+	uc->uc_magic = UDS_CTX_MAGIC;
+	uc->uc_owner = 0;
+	uc->uc_side = side;
+	*ctxp = uc;
+
+	return (0);
+}
+
+/*
+ * Create a client context for 'dstaddr'.  A source address is not
+ * supported and must be NULL.
+ */
+static int
+uds_client(const char *srcaddr, const char *dstaddr, void **ctxp)
+{
+	int error;
+
+	error = uds_common_setup(dstaddr, ctxp, UDS_SIDE_CLIENT);
+	if (error == 0) {
+		PJDLOG_ASSERT(srcaddr == NULL);
+	}
+	return (error);
+}
+
+/*
+ * Connect the client socket to the address stored in the context.  The
+ * 'timeout' argument is only validated, not used.
+ *
+ * Returns 0 on success or the errno value left by connect(2).
+ */
+static int
+uds_connect(void *ctx, int timeout)
+{
+	struct uds_ctx *uc = ctx;
+	int rv;
+
+	PJDLOG_ASSERT(uc != NULL);
+	PJDLOG_ASSERT(uc->uc_magic == UDS_CTX_MAGIC);
+	PJDLOG_ASSERT(uc->uc_side == UDS_SIDE_CLIENT);
+	PJDLOG_ASSERT(uc->uc_fd >= 0);
+	PJDLOG_ASSERT(timeout >= -1);
+
+	rv = connect(uc->uc_fd, (struct sockaddr *)&uc->uc_sun,
+	    sizeof(uc->uc_sun));
+	return (rv == -1 ? errno : 0);
+}
+
+/*
+ * Nothing to wait for on this transport: the arguments are validated and 0
+ * is returned.
+ */
+static int
+uds_connect_wait(void *ctx, int timeout)
+{
+	struct uds_ctx *uc;
+
+	uc = ctx;
+	PJDLOG_ASSERT(uc != NULL);
+	PJDLOG_ASSERT(uc->uc_magic == UDS_CTX_MAGIC);
+	PJDLOG_ASSERT(uc->uc_side == UDS_SIDE_CLIENT);
+	PJDLOG_ASSERT(uc->uc_fd >= 0);
+	PJDLOG_ASSERT(timeout >= 0);
+
+	return (0);
+}
+
+/*
+ * Create a listening context for 'addr'.  Any existing file at the socket
+ * path is removed (best effort), the socket is bound, the calling process
+ * is recorded as owner, and listening starts with a backlog of 8.
+ *
+ * Returns 0 on success.  If bind(2) or listen(2) fails, the context is
+ * closed and the errno value is returned.
+ */
+static int
+uds_server(const char *addr, void **ctxp)
+{
+	struct uds_ctx *uc;
+	int error;
+
+	error = uds_common_setup(addr, ctxp, UDS_SIDE_SERVER_LISTEN);
+	if (error != 0)
+		return (error);
+	uc = *ctxp;
+
+	(void)unlink(uc->uc_sun.sun_path);
+	if (bind(uc->uc_fd, (struct sockaddr *)&uc->uc_sun,
+	    sizeof(uc->uc_sun)) == -1) {
+		goto fail;
+	}
+	/* From here on uds_close() will unlink the socket file. */
+	uc->uc_owner = getpid();
+	if (listen(uc->uc_fd, 8) == -1)
+		goto fail;
+
+	return (0);
+fail:
+	error = errno;
+	uds_close(uc);
+	return (error);
+}
+
+/*
+ * Accept a connection on a listening context and create a new
+ * UDS_SIDE_SERVER_WORK context for it, storing the peer address there.
+ *
+ * Returns 0 and stores the new context in *newctxp, or returns the errno
+ * value of malloc(3) or accept(2).
+ */
+static int
+uds_accept(void *ctx, void **newctxp)
+{
+	struct uds_ctx *uctx = ctx;
+	struct uds_ctx *newuctx;
+	socklen_t fromlen;
+	int ret;
+
+	PJDLOG_ASSERT(uctx != NULL);
+	PJDLOG_ASSERT(uctx->uc_magic == UDS_CTX_MAGIC);
+	PJDLOG_ASSERT(uctx->uc_side == UDS_SIDE_SERVER_LISTEN);
+	PJDLOG_ASSERT(uctx->uc_fd >= 0);
+
+	newuctx = malloc(sizeof(*newuctx));
+	if (newuctx == NULL)
+		return (errno);
+
+	fromlen = sizeof(newuctx->uc_sun);
+	newuctx->uc_fd = accept(uctx->uc_fd,
+	    (struct sockaddr *)&newuctx->uc_sun, &fromlen);
+	if (newuctx->uc_fd == -1) {
+		ret = errno;
+		free(newuctx);
+		return (ret);
+	}
+
+	newuctx->uc_side = UDS_SIDE_SERVER_WORK;
+	/* Accepted connections never own the socket file. */
+	newuctx->uc_owner = 0;
+	newuctx->uc_magic = UDS_CTX_MAGIC;
+	*newctxp = newuctx;
+
+	return (0);
+}
+
+/*
+ * Send 'size' bytes of 'data' over the context's socket; 'fd' is handed
+ * through to proto_common_send() for optional descriptor passing.
+ */
+static int
+uds_send(void *ctx, const unsigned char *data, size_t size, int fd)
+{
+	struct uds_ctx *uc;
+
+	uc = ctx;
+	PJDLOG_ASSERT(uc != NULL);
+	PJDLOG_ASSERT(uc->uc_magic == UDS_CTX_MAGIC);
+	PJDLOG_ASSERT(uc->uc_fd >= 0);
+
+	return (proto_common_send(uc->uc_fd, data, size, fd));
+}
+
+/*
+ * Receive 'size' bytes into 'data' from the context's socket; 'fdp' is
+ * handed through to proto_common_recv() for optional descriptor passing.
+ */
+static int
+uds_recv(void *ctx, unsigned char *data, size_t size, int *fdp)
+{
+	struct uds_ctx *uc;
+
+	uc = ctx;
+	PJDLOG_ASSERT(uc != NULL);
+	PJDLOG_ASSERT(uc->uc_magic == UDS_CTX_MAGIC);
+	PJDLOG_ASSERT(uc->uc_fd >= 0);
+
+	return (proto_common_recv(uc->uc_fd, data, size, fdp));
+}
+
+/* Return the socket descriptor held by the context. */
+static int
+uds_descriptor(const void *ctx)
+{
+	const struct uds_ctx *uc;
+
+	uc = ctx;
+	PJDLOG_ASSERT(uc != NULL);
+	PJDLOG_ASSERT(uc->uc_magic == UDS_CTX_MAGIC);
+
+	return (uc->uc_fd);
+}
+
+/*
+ * Format the local address of the context's socket as "uds://<path>" into
+ * 'addr' ('size' bytes).  "N/A" is stored when getsockname(2) fails or the
+ * socket has no path.  The result must fit in the buffer.
+ */
+static void
+uds_local_address(const void *ctx, char *addr, size_t size)
+{
+	const struct uds_ctx *uc = ctx;
+	struct sockaddr_un local;
+	socklen_t locallen;
+
+	PJDLOG_ASSERT(uc != NULL);
+	PJDLOG_ASSERT(uc->uc_magic == UDS_CTX_MAGIC);
+	PJDLOG_ASSERT(addr != NULL);
+
+	locallen = sizeof(local);
+	if (getsockname(uc->uc_fd, (struct sockaddr *)&local,
+	    &locallen) != -1) {
+		PJDLOG_ASSERT(local.sun_family == AF_UNIX);
+		if (local.sun_path[0] != '\0') {
+			PJDLOG_VERIFY(snprintf(addr, size, "uds://%s", local.sun_path) < (ssize_t)size);
+			return;
+		}
+	}
+	/* No usable path: either the lookup failed or the path is empty. */
+	PJDLOG_VERIFY(strlcpy(addr, "N/A", size) < size);
+}
+
+/*
+ * Format the peer address of the context's socket as "uds://<path>" into
+ * 'addr' ('size' bytes).  "N/A" is stored when getpeername(2) fails or the
+ * peer has no path.  As in uds_local_address(), the result must fit in the
+ * buffer; truncation is treated as a fatal inconsistency instead of being
+ * silently ignored.
+ */
+static void
+uds_remote_address(const void *ctx, char *addr, size_t size)
+{
+	const struct uds_ctx *uctx = ctx;
+	struct sockaddr_un sun;
+	socklen_t sunlen;
+
+	PJDLOG_ASSERT(uctx != NULL);
+	PJDLOG_ASSERT(uctx->uc_magic == UDS_CTX_MAGIC);
+	PJDLOG_ASSERT(addr != NULL);
+
+	sunlen = sizeof(sun);
+	if (getpeername(uctx->uc_fd, (struct sockaddr *)&sun, &sunlen) == -1) {
+		PJDLOG_VERIFY(strlcpy(addr, "N/A", size) < size);
+		return;
+	}
+	PJDLOG_ASSERT(sun.sun_family == AF_UNIX);
+	if (sun.sun_path[0] == '\0') {
+		PJDLOG_VERIFY(strlcpy(addr, "N/A", size) < size);
+		return;
+	}
+	PJDLOG_VERIFY(snprintf(addr, size, "uds://%s", sun.sun_path) < (ssize_t)size);
+}
+
+/*
+ * Close the context's socket (if it holds one) and free the context.  The
+ * socket file is removed only for a listening context whose recorded
+ * owner is the calling process; failure to remove it is logged as a
+ * warning.
+ */
+static void
+uds_close(void *ctx)
+{
+	struct uds_ctx *uc = ctx;
+
+	PJDLOG_ASSERT(uc != NULL);
+	PJDLOG_ASSERT(uc->uc_magic == UDS_CTX_MAGIC);
+
+	if (uc->uc_fd >= 0) {
+		close(uc->uc_fd);
+	}
+
+	if (uc->uc_side == UDS_SIDE_SERVER_LISTEN) {
+		if (uc->uc_owner == getpid()) {
+			PJDLOG_ASSERT(uc->uc_sun.sun_path[0] != '\0');
+			if (unlink(uc->uc_sun.sun_path) == -1) {
+				pjdlog_errno(LOG_WARNING,
+				    "Unable to unlink socket file %s",
+				    uc->uc_sun.sun_path);
+			}
+		}
+	}
+
+	uc->uc_owner = 0;
+	uc->uc_magic = 0;
+	free(uc);
+}
+
+/*
+ * Method table of the UNIX domain socket transport.  The wrap and
+ * address_match operations are not provided.
+ */
+static struct proto uds_proto = {
+ .prt_name = "uds",
+ .prt_client = uds_client,
+ .prt_connect = uds_connect,
+ .prt_connect_wait = uds_connect_wait,
+ .prt_server = uds_server,
+ .prt_accept = uds_accept,
+ .prt_send = uds_send,
+ .prt_recv = uds_recv,
+ .prt_descriptor = uds_descriptor,
+ .prt_local_address = uds_local_address,
+ .prt_remote_address = uds_remote_address,
+ .prt_close = uds_close
+};
+
+/* Runs as a constructor and registers the transport as non-default. */
+static __constructor void
+uds_ctor(void)
+{
+
+ proto_register(&uds_proto, false);
+}
diff --git a/sbin/hastd/rangelock.c b/sbin/hastd/rangelock.c
new file mode 100644
index 0000000..e14c5b8
--- /dev/null
+++ b/sbin/hastd/rangelock.c
@@ -0,0 +1,141 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/queue.h>
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <pjdlog.h>
+
+#include "rangelock.h"
+
+#ifndef PJDLOG_ASSERT
+#include <assert.h>
+#define PJDLOG_ASSERT(...) assert(__VA_ARGS__)
+#endif
+
+/*
+ * Container for a set of locked ranges.  rls_magic holds RANGELOCKS_MAGIC
+ * while the structure is live and is zeroed by rangelock_free(), so the
+ * assertions in the rangelock_*() functions catch invalid pointers.
+ */
+#define RANGELOCKS_MAGIC 0x94310c
+struct rangelocks {
+ int rls_magic; /* Magic value. */
+ TAILQ_HEAD(, rlock) rls_locks; /* List of locked ranges. */
+};
+
+/*
+ * One locked range, covering the half-open interval [rl_start, rl_end).
+ */
+struct rlock {
+ off_t rl_start; /* First offset covered by the lock. */
+ off_t rl_end; /* offset + length, i.e. first offset past the lock. */
+ TAILQ_ENTRY(rlock) rl_next; /* Linkage on rangelocks.rls_locks. */
+};
+
+/*
+ * Allocate an empty set of range locks and return it through rlsp.
+ * Returns 0 on success, or -1 if memory cannot be allocated (in which case
+ * *rlsp is left untouched).
+ */
+int
+rangelock_init(struct rangelocks **rlsp)
+{
+	struct rangelocks *set;
+
+	PJDLOG_ASSERT(rlsp != NULL);
+
+	if ((set = malloc(sizeof(*set))) == NULL)
+		return (-1);
+
+	set->rls_magic = RANGELOCKS_MAGIC;
+	TAILQ_INIT(&set->rls_locks);
+
+	*rlsp = set;
+	return (0);
+}
+
+/*
+ * Release a set of range locks together with every range still recorded
+ * in it.  The magic value is cleared first so later use of the stale
+ * pointer is caught by the assertions in the other functions.
+ */
+void
+rangelock_free(struct rangelocks *rls)
+{
+	struct rlock *cur, *following;
+
+	PJDLOG_ASSERT(rls->rls_magic == RANGELOCKS_MAGIC);
+
+	rls->rls_magic = 0;
+
+	for (cur = TAILQ_FIRST(&rls->rls_locks); cur != NULL;
+	    cur = following) {
+		following = TAILQ_NEXT(cur, rl_next);
+		TAILQ_REMOVE(&rls->rls_locks, cur, rl_next);
+		free(cur);
+	}
+	free(rls);
+}
+
+/*
+ * Record [offset, offset + length) as locked.  No overlap check is done
+ * here; callers use rangelock_islocked() for that.
+ * Returns 0 on success or -1 if memory cannot be allocated.
+ */
+int
+rangelock_add(struct rangelocks *rls, off_t offset, off_t length)
+{
+	struct rlock *entry;
+
+	PJDLOG_ASSERT(rls->rls_magic == RANGELOCKS_MAGIC);
+
+	entry = malloc(sizeof(*entry));
+	if (entry != NULL) {
+		entry->rl_start = offset;
+		entry->rl_end = offset + length;
+		TAILQ_INSERT_TAIL(&rls->rls_locks, entry, rl_next);
+		return (0);
+	}
+	return (-1);
+}
+
+/*
+ * Remove the lock that was added with exactly this offset and length.
+ * The lock must exist; a missing entry trips the assertion.
+ */
+void
+rangelock_del(struct rangelocks *rls, off_t offset, off_t length)
+{
+	struct rlock *rl;
+	off_t end;
+
+	PJDLOG_ASSERT(rls->rls_magic == RANGELOCKS_MAGIC);
+
+	end = offset + length;
+	for (rl = TAILQ_FIRST(&rls->rls_locks); rl != NULL;
+	    rl = TAILQ_NEXT(rl, rl_next)) {
+		if (rl->rl_start != offset)
+			continue;
+		if (rl->rl_end == end)
+			break;
+	}
+	PJDLOG_ASSERT(rl != NULL);
+	TAILQ_REMOVE(&rls->rls_locks, rl, rl_next);
+	free(rl);
+}
+
+/*
+ * Report whether [offset, offset + length) overlaps any recorded lock.
+ * Ranges that merely touch (one ends where the other starts) do not
+ * count as overlapping.
+ */
+bool
+rangelock_islocked(struct rangelocks *rls, off_t offset, off_t length)
+{
+	struct rlock *cur;
+	off_t end;
+
+	PJDLOG_ASSERT(rls->rls_magic == RANGELOCKS_MAGIC);
+
+	end = offset + length;
+	TAILQ_FOREACH(cur, &rls->rls_locks, rl_next) {
+		if (offset < cur->rl_end && end > cur->rl_start)
+			return (true);
+	}
+	return (false);
+}
diff --git a/sbin/hastd/rangelock.h b/sbin/hastd/rangelock.h
new file mode 100644
index 0000000..2ad9895
--- /dev/null
+++ b/sbin/hastd/rangelock.h
@@ -0,0 +1,46 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _RANGELOCK_H_
+#define _RANGELOCK_H_
+
+#include <stdbool.h>
+#include <unistd.h>
+
+struct rangelocks;
+
+int rangelock_init(struct rangelocks **rlsp);
+void rangelock_free(struct rangelocks *rls);
+int rangelock_add(struct rangelocks *rls, off_t offset, off_t length);
+void rangelock_del(struct rangelocks *rls, off_t offset, off_t length);
+bool rangelock_islocked(struct rangelocks *rls, off_t offset, off_t length);
+
+#endif /* !_RANGELOCK_H_ */
diff --git a/sbin/hastd/refcnt.h b/sbin/hastd/refcnt.h
new file mode 100644
index 0000000..1246043
--- /dev/null
+++ b/sbin/hastd/refcnt.h
@@ -0,0 +1,66 @@
+/*-
+ * Copyright (c) 2005 John Baldwin <jhb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef __REFCNT_H__
+#define __REFCNT_H__
+
+#include <machine/atomic.h>
+
+#include "pjdlog.h"
+
+typedef unsigned int refcnt_t;
+
+/*
+ * Set the reference counter to its initial value v using a plain store.
+ */
+static __inline void
+refcnt_init(refcnt_t *count, unsigned int v)
+{
+
+ *count = v;
+}
+
+/*
+ * Take one more reference by atomically adding 1 to the counter
+ * (atomic_add_acq_int()).
+ */
+static __inline void
+refcnt_acquire(refcnt_t *count)
+{
+
+ atomic_add_acq_int(count, 1);
+}
+
+/*
+ * Drop one reference and return how many remain.
+ *
+ * atomic_fetchadd_int() yields the value before the decrement, so the
+ * result is old - 1; a return value of 0 means the last reference was
+ * just released.  Releasing a counter that is already 0 trips the assertion.
+ */
+static __inline unsigned int
+refcnt_release(refcnt_t *count)
+{
+ unsigned int old;
+
+ /* XXX: Should this have a rel membar? */
+ old = atomic_fetchadd_int(count, -1);
+ PJDLOG_ASSERT(old > 0);
+ return (old - 1);
+}
+
+#endif /* ! __REFCNT_H__ */
diff --git a/sbin/hastd/secondary.c b/sbin/hastd/secondary.c
new file mode 100644
index 0000000..067c5d9
--- /dev/null
+++ b/sbin/hastd/secondary.c
@@ -0,0 +1,915 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/bio.h>
+#include <sys/disk.h>
+#include <sys/stat.h>
+
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libgeom.h>
+#include <pthread.h>
+#include <signal.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include <activemap.h>
+#include <nv.h>
+#include <pjdlog.h>
+
+#include "control.h"
+#include "event.h"
+#include "hast.h"
+#include "hast_proto.h"
+#include "hastd.h"
+#include "hooks.h"
+#include "metadata.h"
+#include "proto.h"
+#include "subr.h"
+#include "synch.h"
+
+/*
+ * One in-flight request on the secondary.  Structures are preallocated by
+ * init_environment() and move between the free, disk and send queues.
+ */
+struct hio {
+ uint64_t hio_seq; /* "seq" header field; echoed back in the reply. */
+ int hio_error; /* 0 on success, otherwise an errno value. */
+ void *hio_data; /* MAXPHYS-byte data buffer. */
+ uint8_t hio_cmd; /* HIO_* command from the "cmd" header field. */
+ uint64_t hio_offset; /* "offset" header field. */
+ uint64_t hio_length; /* "length" header field. */
+ bool hio_memsync; /* WRITE header carried "memsync". */
+ TAILQ_ENTRY(hio) hio_next; /* Linkage on the current queue. */
+};
+
+static struct hast_resource *gres;
+
+/*
+ * Free list holds unused structures. When free list is empty, we have to wait
+ * until some in-progress requests are freed.
+ */
+static TAILQ_HEAD(, hio) hio_free_list;
+static pthread_mutex_t hio_free_list_lock;
+static pthread_cond_t hio_free_list_cond;
+/*
+ * Disk thread (the one that does I/O requests) takes requests from this list.
+ */
+static TAILQ_HEAD(, hio) hio_disk_list;
+static pthread_mutex_t hio_disk_list_lock;
+static pthread_cond_t hio_disk_list_cond;
+/*
+ * Thread that sends requests back to primary takes requests from this list.
+ */
+static TAILQ_HEAD(, hio) hio_send_list;
+static pthread_mutex_t hio_send_list_lock;
+static pthread_cond_t hio_send_list_cond;
+
+/*
+ * Maximum number of outstanding I/O requests.
+ */
+#define HAST_HIO_MAX 256
+
+static void *recv_thread(void *arg);
+static void *disk_thread(void *arg);
+static void *send_thread(void *arg);
+
+/*
+ * Append hio to the tail of the hio_<name>_list queue under its lock.
+ * The condition variable is broadcast (after the lock is dropped) only when
+ * the queue was empty before the insert.
+ */
+#define QUEUE_INSERT(name, hio) do { \
+ bool _wakeup; \
+ \
+ mtx_lock(&hio_##name##_list_lock); \
+ _wakeup = TAILQ_EMPTY(&hio_##name##_list); \
+ TAILQ_INSERT_TAIL(&hio_##name##_list, (hio), hio_next); \
+ mtx_unlock(&hio_##name##_list_lock); \
+ if (_wakeup) \
+ cv_broadcast(&hio_##name##_list_cond); \
+} while (0)
+/*
+ * Remove the first element of the hio_<name>_list queue and store it in hio,
+ * waiting on the queue's condition variable while the queue is empty.
+ */
+#define QUEUE_TAKE(name, hio) do { \
+ mtx_lock(&hio_##name##_list_lock); \
+ while (((hio) = TAILQ_FIRST(&hio_##name##_list)) == NULL) { \
+ cv_wait(&hio_##name##_list_cond, \
+ &hio_##name##_list_lock); \
+ } \
+ TAILQ_REMOVE(&hio_##name##_list, (hio), hio_next); \
+ mtx_unlock(&hio_##name##_list_lock); \
+} while (0)
+
+/*
+ * Reset the per-request fields of a hio so the structure can be reused.
+ * The data buffer (hio_data) and queue linkage (hio_next) are not touched.
+ */
+static void
+hio_clear(struct hio *hio)
+{
+
+	/* What was requested. */
+	hio->hio_cmd = HIO_UNDEF;
+	hio->hio_seq = 0;
+	hio->hio_offset = 0;
+	hio->hio_length = 0;
+	hio->hio_memsync = false;
+	/* How it ended. */
+	hio->hio_error = 0;
+}
+
+/*
+ * Duplicate the request description (command, sequence number, range and
+ * memsync flag) from srchio into dsthio.  The result code, the data buffer
+ * and the queue linkage of dsthio are deliberately left as they are.
+ */
+static void
+hio_copy(const struct hio *srchio, struct hio *dsthio)
+{
+
+	dsthio->hio_cmd = srchio->hio_cmd;
+	dsthio->hio_seq = srchio->hio_seq;
+	dsthio->hio_memsync = srchio->hio_memsync;
+	dsthio->hio_offset = srchio->hio_offset;
+	dsthio->hio_length = srchio->hio_length;
+}
+
+/*
+ * Prepare the worker's request machinery: initialize the free, disk and
+ * send queues with their locks and condition variables, then preallocate
+ * HAST_HIO_MAX requests, each with a MAXPHYS-byte data buffer, and put them
+ * on the free queue.  The process exits with EX_TEMPFAIL if any allocation
+ * fails.
+ */
+static void
+init_environment(void)
+{
+	struct hio *req;
+	unsigned int remaining;
+
+	/* Queues first, then their locks and condition variables. */
+	TAILQ_INIT(&hio_free_list);
+	TAILQ_INIT(&hio_disk_list);
+	TAILQ_INIT(&hio_send_list);
+	mtx_init(&hio_free_list_lock);
+	mtx_init(&hio_disk_list_lock);
+	mtx_init(&hio_send_list_lock);
+	cv_init(&hio_free_list_cond);
+	cv_init(&hio_disk_list_cond);
+	cv_init(&hio_send_list_cond);
+
+	/* Build the pool of reusable requests. */
+	for (remaining = HAST_HIO_MAX; remaining > 0; remaining--) {
+		if ((req = malloc(sizeof(*req))) == NULL) {
+			pjdlog_exitx(EX_TEMPFAIL,
+			    "Unable to allocate memory (%zu bytes) for hio request.",
+			    sizeof(*req));
+		}
+		if ((req->hio_data = malloc(MAXPHYS)) == NULL) {
+			pjdlog_exitx(EX_TEMPFAIL,
+			    "Unable to allocate memory (%zu bytes) for gctl_data.",
+			    (size_t)MAXPHYS);
+		}
+		hio_clear(req);
+		TAILQ_INSERT_HEAD(&hio_free_list, req, hio_next);
+	}
+}
+
+/*
+ * Load the resource's metadata through metadata_read(); the process exits
+ * with EX_NOINPUT when that fails.
+ * NOTE(review): the meaning of the second argument (true) is defined by
+ * metadata_read() and is not visible here -- confirm.
+ */
+static void
+init_local(struct hast_resource *res)
+{
+
+ if (metadata_read(res, true) == -1)
+ exit(EX_NOINPUT);
+}
+
+/*
+ * Perform the initial handshake with the primary node.
+ *
+ * The primary's resource UID and counters are read from nvin and compared
+ * with our own metadata to decide who is the synchronization source.  The
+ * reply sent over res->hr_remotein carries our datasize, extentsize,
+ * counters, the "syncsrc" decision, "mapsize" and the activemap itself
+ * (read from disk for a regular sync, all-ones for a full sync, empty when
+ * both sides are unused).
+ *
+ * On resource UID mismatch or split-brain an "errmsg" reply is sent and the
+ * process exits with EX_CONFIG.  Failure to send exits with EX_TEMPFAIL,
+ * failure to read or write local data with EX_NOINPUT.
+ */
+static void
+init_remote(struct hast_resource *res, struct nv *nvin)
+{
+	uint64_t resuid;
+	struct nv *nvout;
+	unsigned char *map;
+	size_t mapsize;
+
+#ifdef notyet
+	/* Setup direction. */
+	if (proto_send(res->hr_remoteout, NULL, 0) == -1)
+		pjdlog_errno(LOG_WARNING, "Unable to set connection direction");
+#endif
+
+	nvout = nv_alloc();
+	nv_add_int64(nvout, (int64_t)res->hr_datasize, "datasize");
+	nv_add_int32(nvout, (int32_t)res->hr_extentsize, "extentsize");
+	resuid = nv_get_uint64(nvin, "resuid");
+	res->hr_primary_localcnt = nv_get_uint64(nvin, "localcnt");
+	res->hr_primary_remotecnt = nv_get_uint64(nvin, "remotecnt");
+	nv_add_uint64(nvout, res->hr_secondary_localcnt, "localcnt");
+	nv_add_uint64(nvout, res->hr_secondary_remotecnt, "remotecnt");
+	mapsize = activemap_calc_ondisk_size(res->hr_local_mediasize -
+	    METADATA_SIZE, res->hr_extentsize, res->hr_local_sectorsize);
+	map = malloc(mapsize);
+	if (map == NULL) {
+		pjdlog_exitx(EX_TEMPFAIL,
+		    "Unable to allocate memory (%zu bytes) for activemap.",
+		    mapsize);
+	}
+	/*
+	 * When we work as primary and secondary is missing we will increase
+	 * localcnt in our metadata. When secondary is connected and synced
+	 * we make localcnt be equal to remotecnt, which means nodes are more
+	 * or less in sync.
+	 * Split-brain condition is when both nodes are not able to communicate
+	 * and are both configured as primary nodes. In turn, they can both
+	 * make incompatible changes to the data and we have to detect that.
+	 * Under split-brain condition we will increase our localcnt on first
+	 * write and remote node will increase its localcnt on first write.
+	 * When we connect we can see that primary's localcnt is greater than
+	 * our remotecnt (primary was modified while we weren't watching) and
+	 * our localcnt is greater than primary's remotecnt (we were modified
+	 * while primary wasn't watching).
+	 * There are many possible combinations which are all gathered below.
+	 * Don't pay too much attention to exact numbers, the more important
+	 * is to compare them. We compare secondary's local with primary's
+	 * remote and secondary's remote with primary's local.
+	 * Note that every case where primary's localcnt is smaller than
+	 * secondary's remotecnt and where secondary's localcnt is smaller than
+	 * primary's remotecnt should be impossible in practice. We will perform
+	 * full synchronization then. Those cases are marked with an asterisk.
+	 * Regular synchronization means that only extents marked as dirty are
+	 * synchronized.
+	 *
+	 * SECONDARY METADATA PRIMARY METADATA
+	 * local=3 remote=3 local=2 remote=2* ?! Full sync from secondary.
+	 * local=3 remote=3 local=2 remote=3* ?! Full sync from primary.
+	 * local=3 remote=3 local=2 remote=4* ?! Full sync from primary.
+	 * local=3 remote=3 local=3 remote=2 Primary is out-of-date,
+	 * regular sync from secondary.
+	 * local=3 remote=3 local=3 remote=3 Regular sync just in case.
+	 * local=3 remote=3 local=3 remote=4* ?! Full sync from primary.
+	 * local=3 remote=3 local=4 remote=2 Split-brain condition.
+	 * local=3 remote=3 local=4 remote=3 Secondary out-of-date,
+	 * regular sync from primary.
+	 * local=3 remote=3 local=4 remote=4* ?! Full sync from primary.
+	 */
+	if (res->hr_resuid == 0) {
+		/*
+		 * Provider is used for the first time. If primary node done no
+		 * writes yet as well (we will find "virgin" argument) then
+		 * there is no need to synchronize anything. If primary node
+		 * done any writes already we have to synchronize everything.
+		 */
+		PJDLOG_ASSERT(res->hr_secondary_localcnt == 0);
+		res->hr_resuid = resuid;
+		if (metadata_write(res) == -1)
+			exit(EX_NOINPUT);
+		if (nv_exists(nvin, "virgin")) {
+			free(map);
+			map = NULL;
+			mapsize = 0;
+		} else {
+			memset(map, 0xff, mapsize);
+		}
+		nv_add_int8(nvout, 1, "virgin");
+		nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc");
+	} else if (res->hr_resuid != resuid) {
+		char errmsg[256];
+
+		free(map);
+		(void)snprintf(errmsg, sizeof(errmsg),
+		    "Resource unique ID mismatch (primary=%ju, secondary=%ju).",
+		    (uintmax_t)resuid, (uintmax_t)res->hr_resuid);
+		pjdlog_error("%s", errmsg);
+		nv_add_string(nvout, errmsg, "errmsg");
+		if (hast_proto_send(res, res->hr_remotein, nvout,
+		    NULL, 0) == -1) {
+			pjdlog_exit(EX_TEMPFAIL,
+			    "Unable to send response to %s",
+			    res->hr_remoteaddr);
+		}
+		nv_free(nvout);
+		exit(EX_CONFIG);
+	} else if (
+	    /* Is primary out-of-date? */
+	    (res->hr_secondary_localcnt > res->hr_primary_remotecnt &&
+	     res->hr_secondary_remotecnt == res->hr_primary_localcnt) ||
+	    /* Are the nodes more or less in sync? */
+	    (res->hr_secondary_localcnt == res->hr_primary_remotecnt &&
+	     res->hr_secondary_remotecnt == res->hr_primary_localcnt) ||
+	    /* Is secondary out-of-date? */
+	    (res->hr_secondary_localcnt == res->hr_primary_remotecnt &&
+	     res->hr_secondary_remotecnt < res->hr_primary_localcnt)) {
+		/*
+		 * Nodes are more or less in sync or one of the nodes is
+		 * out-of-date.
+		 * It doesn't matter at this point which one, we just have to
+		 * send out local bitmap to the remote node.
+		 */
+		if (pread(res->hr_localfd, map, mapsize, METADATA_SIZE) !=
+		    (ssize_t)mapsize) {
+			/*
+			 * The first argument of pjdlog_exit() is the exit
+			 * code, not a syslog priority.
+			 */
+			pjdlog_exit(EX_NOINPUT, "Unable to read activemap");
+		}
+		if (res->hr_secondary_localcnt > res->hr_primary_remotecnt &&
+		    res->hr_secondary_remotecnt == res->hr_primary_localcnt) {
+			/* Primary is out-of-date, sync from secondary. */
+			nv_add_uint8(nvout, HAST_SYNCSRC_SECONDARY, "syncsrc");
+		} else {
+			/*
+			 * Secondary is out-of-date or counts match.
+			 * Sync from primary.
+			 */
+			nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc");
+		}
+	} else if (res->hr_secondary_localcnt > res->hr_primary_remotecnt &&
+	    res->hr_primary_localcnt > res->hr_secondary_remotecnt) {
+		/*
+		 * Not good, we have split-brain condition.
+		 */
+		free(map);
+		pjdlog_error("Split-brain detected, exiting.");
+		nv_add_string(nvout, "Split-brain condition!", "errmsg");
+		if (hast_proto_send(res, res->hr_remotein, nvout,
+		    NULL, 0) == -1) {
+			pjdlog_exit(EX_TEMPFAIL,
+			    "Unable to send response to %s",
+			    res->hr_remoteaddr);
+		}
+		nv_free(nvout);
+		/* Exit on split-brain. */
+		event_send(res, EVENT_SPLITBRAIN);
+		exit(EX_CONFIG);
+	} else /* if (res->hr_secondary_localcnt < res->hr_primary_remotecnt ||
+	    res->hr_primary_localcnt < res->hr_secondary_remotecnt) */ {
+		/*
+		 * This should never happen in practice, but we will perform
+		 * full synchronization.
+		 */
+		PJDLOG_ASSERT(res->hr_secondary_localcnt < res->hr_primary_remotecnt ||
+		    res->hr_primary_localcnt < res->hr_secondary_remotecnt);
+		mapsize = activemap_calc_ondisk_size(res->hr_local_mediasize -
+		    METADATA_SIZE, res->hr_extentsize,
+		    res->hr_local_sectorsize);
+		memset(map, 0xff, mapsize);
+		if (res->hr_secondary_localcnt > res->hr_primary_remotecnt) {
+			/* In this one of five cases sync from secondary. */
+			nv_add_uint8(nvout, HAST_SYNCSRC_SECONDARY, "syncsrc");
+		} else {
+			/* For the rest four cases sync from primary. */
+			nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc");
+		}
+		pjdlog_warning("This should never happen, asking for full synchronization (primary(local=%ju, remote=%ju), secondary(local=%ju, remote=%ju)).",
+		    (uintmax_t)res->hr_primary_localcnt,
+		    (uintmax_t)res->hr_primary_remotecnt,
+		    (uintmax_t)res->hr_secondary_localcnt,
+		    (uintmax_t)res->hr_secondary_remotecnt);
+	}
+	nv_add_uint32(nvout, (uint32_t)mapsize, "mapsize");
+	if (hast_proto_send(res, res->hr_remotein, nvout, map, mapsize) == -1) {
+		pjdlog_exit(EX_TEMPFAIL, "Unable to send activemap to %s",
+		    res->hr_remoteaddr);
+	}
+	/* map is NULL in the "virgin" case; free(NULL) is a no-op. */
+	free(map);
+	nv_free(nvout);
+#ifdef notyet
+	/* Setup direction. */
+	if (proto_recv(res->hr_remotein, NULL, 0) == -1)
+		pjdlog_errno(LOG_WARNING, "Unable to set connection direction");
+#endif
+}
+
+/*
+ * Start a worker process for a resource acting in the secondary role.
+ *
+ * Two socketpair channels (res->hr_ctrl and res->hr_event) are created and
+ * the process forks.  The parent closes its copies of the remote
+ * connections, records the worker's PID in res->hr_workerpid and returns.
+ * The child sets up logging, reads local metadata, allocates the request
+ * pool, drops privileges, starts the control thread, performs the handshake
+ * with the primary (init_remote(), using nvin) and then starts the recv and
+ * disk threads; the calling thread itself runs send_thread(), which loops
+ * forever.
+ */
+void
+hastd_secondary(struct hast_resource *res, struct nv *nvin)
+{
+ sigset_t mask;
+ pthread_t td;
+ pid_t pid;
+ int error, mode, debuglevel;
+
+ /*
+ * Create communication channel between parent and child.
+ */
+ if (proto_client(NULL, "socketpair://", &res->hr_ctrl) == -1) {
+ KEEP_ERRNO((void)pidfile_remove(pfh));
+ pjdlog_exit(EX_OSERR,
+ "Unable to create control sockets between parent and child");
+ }
+ /*
+ * Create communication channel between child and parent.
+ */
+ if (proto_client(NULL, "socketpair://", &res->hr_event) == -1) {
+ KEEP_ERRNO((void)pidfile_remove(pfh));
+ pjdlog_exit(EX_OSERR,
+ "Unable to create event sockets between child and parent");
+ }
+
+ pid = fork();
+ if (pid == -1) {
+ KEEP_ERRNO((void)pidfile_remove(pfh));
+ pjdlog_exit(EX_OSERR, "Unable to fork");
+ }
+
+ if (pid > 0) {
+ /* This is parent. */
+ proto_close(res->hr_remotein);
+ res->hr_remotein = NULL;
+ proto_close(res->hr_remoteout);
+ res->hr_remoteout = NULL;
+ /* Declare that we are receiver. */
+ proto_recv(res->hr_event, NULL, 0);
+ /* Declare that we are sender. */
+ proto_send(res->hr_ctrl, NULL, 0);
+ res->hr_workerpid = pid;
+ return;
+ }
+
+ /* From here on we are the child (worker) process. */
+ gres = res;
+ /* Remember the logging settings; they are re-applied further down. */
+ mode = pjdlog_mode_get();
+ debuglevel = pjdlog_debug_get();
+
+ /* Declare that we are sender. */
+ proto_send(res->hr_event, NULL, 0);
+ /* Declare that we are receiver. */
+ proto_recv(res->hr_ctrl, NULL, 0);
+ descriptors_cleanup(res);
+
+ descriptors_assert(res, mode);
+
+ pjdlog_init(mode);
+ pjdlog_debug_set(debuglevel);
+ pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role));
+ setproctitle("%s (%s)", res->hr_name, role2str(res->hr_role));
+
+ /* Install an empty signal mask, i.e. unblock all signals. */
+ PJDLOG_VERIFY(sigemptyset(&mask) == 0);
+ PJDLOG_VERIFY(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
+
+ /* Error in setting timeout is not critical, but why should it fail? */
+ if (proto_timeout(res->hr_remotein, 2 * HAST_KEEPALIVE) == -1)
+ pjdlog_errno(LOG_WARNING, "Unable to set connection timeout");
+ if (proto_timeout(res->hr_remoteout, res->hr_timeout) == -1)
+ pjdlog_errno(LOG_WARNING, "Unable to set connection timeout");
+
+ init_local(res);
+ init_environment();
+
+ if (drop_privs(res) != 0)
+ exit(EX_CONFIG);
+ pjdlog_info("Privileges successfully dropped.");
+
+ /*
+ * Create the control thread before sending any event to the parent,
+ * as we can deadlock when parent sends control request to worker,
+ * but worker has no control thread started yet, so parent waits.
+ * In the meantime worker sends an event to the parent, but parent
+ * is unable to handle the event, because it waits for control
+ * request response.
+ */
+ error = pthread_create(&td, NULL, ctrl_thread, res);
+ PJDLOG_ASSERT(error == 0);
+
+ init_remote(res, nvin);
+ event_send(res, EVENT_CONNECT);
+
+ error = pthread_create(&td, NULL, recv_thread, res);
+ PJDLOG_ASSERT(error == 0);
+ error = pthread_create(&td, NULL, disk_thread, res);
+ PJDLOG_ASSERT(error == 0);
+ /* The calling thread becomes the send thread. */
+ (void)send_thread(res);
+}
+
+/*
+ * Log a message about a request.
+ *
+ * The caller's printf-style text is formatted first and, if it fits in the
+ * buffer, a short description of the request ("READ(offset, length).",
+ * "FLUSH.", ...) is appended.  The result is passed to pjdlog_common()
+ * together with loglevel, debuglevel and error.
+ */
+static void
+reqlog(int loglevel, int debuglevel, int error, struct hio *hio,
+    const char *fmt, ...)
+{
+	char msg[1024];
+	char *tail;
+	size_t room;
+	va_list ap;
+	int len;
+
+	va_start(ap, fmt);
+	len = vsnprintf(msg, sizeof(msg), fmt, ap);
+	va_end(ap);
+
+	/* Append the request description only if the prefix was not cut. */
+	if ((size_t)len < sizeof(msg)) {
+		tail = msg + len;
+		room = sizeof(msg) - len;
+
+		if (hio->hio_cmd == HIO_READ) {
+			(void)snprintf(tail, room, "READ(%ju, %ju).",
+			    (uintmax_t)hio->hio_offset,
+			    (uintmax_t)hio->hio_length);
+		} else if (hio->hio_cmd == HIO_DELETE) {
+			(void)snprintf(tail, room, "DELETE(%ju, %ju).",
+			    (uintmax_t)hio->hio_offset,
+			    (uintmax_t)hio->hio_length);
+		} else if (hio->hio_cmd == HIO_FLUSH) {
+			(void)snprintf(tail, room, "FLUSH.");
+		} else if (hio->hio_cmd == HIO_WRITE) {
+			(void)snprintf(tail, room, "WRITE(%ju, %ju).",
+			    (uintmax_t)hio->hio_offset,
+			    (uintmax_t)hio->hio_length);
+		} else if (hio->hio_cmd == HIO_KEEPALIVE) {
+			(void)snprintf(tail, room, "KEEPALIVE.");
+		} else {
+			(void)snprintf(tail, room, "UNKNOWN(%u).",
+			    (unsigned int)hio->hio_cmd);
+		}
+	}
+	pjdlog_common(loglevel, debuglevel, error, "%s", msg);
+}
+
+/*
+ * Decode and validate a request header received from the primary.
+ *
+ * Fills hio_cmd, hio_seq and, for READ/WRITE/DELETE, hio_offset and
+ * hio_length (plus hio_memsync for WRITE) from nv.  The range must be
+ * non-empty, sector-aligned, within res->hr_datasize and, except for
+ * DELETE, no longer than MAXPHYS.
+ *
+ * Returns 0 on success.  On any problem an error is logged, hio_error is
+ * set to EINVAL and that value is returned.
+ */
+static int
+requnpack(struct hast_resource *res, struct hio *hio, struct nv *nv)
+{
+
+	hio->hio_cmd = nv_get_uint8(nv, "cmd");
+	if (hio->hio_cmd == 0) {
+		pjdlog_error("Header contains no 'cmd' field.");
+		hio->hio_error = EINVAL;
+		goto end;
+	}
+	if (hio->hio_cmd != HIO_KEEPALIVE) {
+		hio->hio_seq = nv_get_uint64(nv, "seq");
+		if (hio->hio_seq == 0) {
+			pjdlog_error("Header contains no 'seq' field.");
+			hio->hio_error = EINVAL;
+			goto end;
+		}
+	}
+	switch (hio->hio_cmd) {
+	case HIO_FLUSH:
+	case HIO_KEEPALIVE:
+		break;
+	case HIO_WRITE:
+		hio->hio_memsync = nv_exists(nv, "memsync");
+		/* FALLTHROUGH */
+	case HIO_READ:
+	case HIO_DELETE:
+		hio->hio_offset = nv_get_uint64(nv, "offset");
+		if (nv_error(nv) != 0) {
+			pjdlog_error("Header is missing 'offset' field.");
+			hio->hio_error = EINVAL;
+			goto end;
+		}
+		hio->hio_length = nv_get_uint64(nv, "length");
+		if (nv_error(nv) != 0) {
+			pjdlog_error("Header is missing 'length' field.");
+			hio->hio_error = EINVAL;
+			goto end;
+		}
+		if (hio->hio_length == 0) {
+			pjdlog_error("Data length is zero.");
+			hio->hio_error = EINVAL;
+			goto end;
+		}
+		if (hio->hio_cmd != HIO_DELETE && hio->hio_length > MAXPHYS) {
+			pjdlog_error("Data length is too large (%ju > %ju).",
+			    (uintmax_t)hio->hio_length, (uintmax_t)MAXPHYS);
+			hio->hio_error = EINVAL;
+			goto end;
+		}
+		if ((hio->hio_offset % res->hr_local_sectorsize) != 0) {
+			pjdlog_error("Offset %ju is not multiple of sector size.",
+			    (uintmax_t)hio->hio_offset);
+			hio->hio_error = EINVAL;
+			goto end;
+		}
+		if ((hio->hio_length % res->hr_local_sectorsize) != 0) {
+			pjdlog_error("Length %ju is not multiple of sector size.",
+			    (uintmax_t)hio->hio_length);
+			hio->hio_error = EINVAL;
+			goto end;
+		}
+		/*
+		 * Both values come from the network, so offset + length may
+		 * wrap around and slip past a naive "sum > datasize" test.
+		 * Compare against the remaining space instead.
+		 */
+		if (hio->hio_offset > (uint64_t)res->hr_datasize ||
+		    hio->hio_length >
+		    (uint64_t)res->hr_datasize - hio->hio_offset) {
+			pjdlog_error("Data offset is too large (%ju > %ju).",
+			    (uintmax_t)(hio->hio_offset + hio->hio_length),
+			    (uintmax_t)res->hr_datasize);
+			hio->hio_error = EINVAL;
+			goto end;
+		}
+		break;
+	default:
+		pjdlog_error("Header contains invalid 'cmd' (%hhu).",
+		    hio->hio_cmd);
+		hio->hio_error = EINVAL;
+		goto end;
+	}
+	hio->hio_error = 0;
+end:
+	return (hio->hio_error);
+}
+
+/*
+ * Log a fatal error, notify the parent and terminate the worker.
+ *
+ * The printf-style message is logged at LOG_ERR through pjdlogv_errno(),
+ * an EVENT_DISCONNECT event is sent for the global resource (gres) and the
+ * process exits with exitcode, which must not be EX_OK.
+ */
+static __dead2 void
+secondary_exit(int exitcode, const char *fmt, ...)
+{
+ va_list ap;
+
+ PJDLOG_ASSERT(exitcode != EX_OK);
+ va_start(ap, fmt);
+ pjdlogv_errno(LOG_ERR, fmt, ap);
+ va_end(ap);
+ event_send(gres, EVENT_DISCONNECT);
+ exit(exitcode);
+}
+
+/*
+ * Thread receives requests from the primary node.
+ *
+ * For every request a free hio is taken and filled from the received
+ * header.  Headers that fail validation are moved straight to the send
+ * queue so that an error reply is produced.  KEEPALIVE requests go back to
+ * the free queue, WRITE requests additionally receive their payload into
+ * hio_data, and every other valid request is handed to the disk thread.
+ * For memsync WRITEs a second hio is cloned and queued for sending right
+ * away, so two replies are produced for such a request.
+ * A receive failure terminates the worker through secondary_exit().
+ */
+static void *
+recv_thread(void *arg)
+{
+ struct hast_resource *res = arg;
+ struct hio *hio, *mshio;
+ struct nv *nv;
+
+ for (;;) {
+ pjdlog_debug(2, "recv: Taking free request.");
+ QUEUE_TAKE(free, hio);
+ pjdlog_debug(2, "recv: (%p) Got request.", hio);
+ if (hast_proto_recv_hdr(res->hr_remotein, &nv) == -1) {
+ secondary_exit(EX_TEMPFAIL,
+ "Unable to receive request header");
+ }
+ if (requnpack(res, hio, nv) != 0) {
+ nv_free(nv);
+ pjdlog_debug(2,
+ "recv: (%p) Moving request to the send queue.",
+ hio);
+ QUEUE_INSERT(send, hio);
+ continue;
+ }
+ switch (hio->hio_cmd) {
+ case HIO_READ:
+ res->hr_stat_read++;
+ break;
+ case HIO_WRITE:
+ res->hr_stat_write++;
+ break;
+ case HIO_DELETE:
+ res->hr_stat_delete++;
+ break;
+ case HIO_FLUSH:
+ res->hr_stat_flush++;
+ break;
+ case HIO_KEEPALIVE:
+ break;
+ default:
+ PJDLOG_ABORT("Unexpected command (cmd=%hhu).",
+ hio->hio_cmd);
+ }
+ reqlog(LOG_DEBUG, 2, -1, hio,
+ "recv: (%p) Got request header: ", hio);
+ if (hio->hio_cmd == HIO_KEEPALIVE) {
+ nv_free(nv);
+ pjdlog_debug(2,
+ "recv: (%p) Moving request to the free queue.",
+ hio);
+ hio_clear(hio);
+ QUEUE_INSERT(free, hio);
+ continue;
+ } else if (hio->hio_cmd == HIO_WRITE) {
+ if (hast_proto_recv_data(res, res->hr_remotein, nv,
+ hio->hio_data, MAXPHYS) == -1) {
+ secondary_exit(EX_TEMPFAIL,
+ "Unable to receive request data");
+ }
+ if (hio->hio_memsync) {
+ /*
+ * For memsync requests we expect two replies.
+ * Clone the hio so we can handle both of them.
+ */
+ pjdlog_debug(2, "recv: Taking free request.");
+ QUEUE_TAKE(free, mshio);
+ pjdlog_debug(2, "recv: (%p) Got request.",
+ mshio);
+ hio_copy(hio, mshio);
+ mshio->hio_error = 0;
+ /*
+ * We want to keep 'memsync' tag only on the
+ * request going onto send queue (mshio).
+ */
+ hio->hio_memsync = false;
+ pjdlog_debug(2,
+ "recv: (%p) Moving memsync request to the send queue.",
+ mshio);
+ QUEUE_INSERT(send, mshio);
+ }
+ }
+ nv_free(nv);
+ pjdlog_debug(2, "recv: (%p) Moving request to the disk queue.",
+ hio);
+ QUEUE_INSERT(disk, hio);
+ }
+ /* NOTREACHED */
+ return (NULL);
+}
+
+/*
+ * Thread reads from or writes to local component and also handles DELETE and
+ * FLUSH requests.
+ *
+ * Requests are taken from the disk queue, executed against res->hr_localfd
+ * (data offsets are shifted by res->hr_localoff) and then moved to the send
+ * queue with hio_error holding 0 or an errno value.  Before the first
+ * request is executed the on-disk activemap is overwritten with zeros; if
+ * that fails it is attempted again when the next request arrives.
+ */
+static void *
+disk_thread(void *arg)
+{
+ struct hast_resource *res = arg;
+ struct hio *hio;
+ ssize_t ret;
+ bool clear_activemap, logerror;
+
+ clear_activemap = true;
+
+ for (;;) {
+ pjdlog_debug(2, "disk: Taking request.");
+ QUEUE_TAKE(disk, hio);
+ /*
+ * Not a real loop: every path through the body ends in break,
+ * so this runs at most once per request.
+ */
+ while (clear_activemap) {
+ unsigned char *map;
+ size_t mapsize;
+
+ /*
+ * When first request is received, it means that primary
+ * already received our activemap, merged it and stored
+ * locally. We can now safely clear our activemap.
+ */
+ mapsize =
+ activemap_calc_ondisk_size(res->hr_local_mediasize -
+ METADATA_SIZE, res->hr_extentsize,
+ res->hr_local_sectorsize);
+ map = calloc(1, mapsize);
+ if (map == NULL) {
+ pjdlog_warning("Unable to allocate memory to clear local activemap.");
+ break;
+ }
+ if (pwrite(res->hr_localfd, map, mapsize,
+ METADATA_SIZE) != (ssize_t)mapsize) {
+ pjdlog_errno(LOG_WARNING,
+ "Unable to store cleared activemap");
+ free(map);
+ res->hr_stat_activemap_write_error++;
+ break;
+ }
+ free(map);
+ clear_activemap = false;
+ pjdlog_debug(1, "Local activemap cleared.");
+ break;
+ }
+ reqlog(LOG_DEBUG, 2, -1, hio, "disk: (%p) Got request: ", hio);
+ logerror = true;
+ /* Handle the actual request. */
+ switch (hio->hio_cmd) {
+ case HIO_READ:
+ ret = pread(res->hr_localfd, hio->hio_data,
+ hio->hio_length,
+ hio->hio_offset + res->hr_localoff);
+ if (ret == -1)
+ hio->hio_error = errno;
+ else if (ret != (int64_t)hio->hio_length)
+ hio->hio_error = EIO;
+ else
+ hio->hio_error = 0;
+ break;
+ case HIO_WRITE:
+ ret = pwrite(res->hr_localfd, hio->hio_data,
+ hio->hio_length,
+ hio->hio_offset + res->hr_localoff);
+ if (ret == -1)
+ hio->hio_error = errno;
+ else if (ret != (int64_t)hio->hio_length)
+ hio->hio_error = EIO;
+ else
+ hio->hio_error = 0;
+ break;
+ case HIO_DELETE:
+ ret = g_delete(res->hr_localfd,
+ hio->hio_offset + res->hr_localoff,
+ hio->hio_length);
+ if (ret == -1)
+ hio->hio_error = errno;
+ else
+ hio->hio_error = 0;
+ break;
+ case HIO_FLUSH:
+ /*
+ * Once a flush has failed with EOPNOTSUPP, later flushes are
+ * answered with that error without calling g_flush() and
+ * without logging.
+ */
+ if (!res->hr_localflush) {
+ ret = -1;
+ hio->hio_error = EOPNOTSUPP;
+ logerror = false;
+ break;
+ }
+ ret = g_flush(res->hr_localfd);
+ if (ret == -1) {
+ if (errno == EOPNOTSUPP)
+ res->hr_localflush = false;
+ hio->hio_error = errno;
+ } else {
+ hio->hio_error = 0;
+ }
+ break;
+ default:
+ PJDLOG_ABORT("Unexpected command (cmd=%hhu).",
+ hio->hio_cmd);
+ }
+ if (logerror && hio->hio_error != 0) {
+ reqlog(LOG_ERR, 0, hio->hio_error, hio,
+ "Request failed: ");
+ }
+ pjdlog_debug(2, "disk: (%p) Moving request to the send queue.",
+ hio);
+ QUEUE_INSERT(send, hio);
+ }
+ /* NOTREACHED */
+ return (NULL);
+}
+
+/*
+ * Thread sends requests back to primary node.
+ *
+ * Each request taken from the send queue is answered with a header that
+ * carries its sequence number, a "received" marker for memsync requests
+ * and, when hio_error is set, an "error" field.  Data is attached only to
+ * successful READ replies.  Afterwards the hio is cleared and returned to
+ * the free queue.  A send failure terminates the worker through
+ * secondary_exit().
+ */
+static void *
+send_thread(void *arg)
+{
+	struct hast_resource *res = arg;
+	struct nv *nvout;
+	struct hio *hio;
+	void *data;
+	size_t length;
+
+	for (;;) {
+		pjdlog_debug(2, "send: Taking request.");
+		QUEUE_TAKE(send, hio);
+		reqlog(LOG_DEBUG, 2, -1, hio, "send: (%p) Got request: ", hio);
+		nvout = nv_alloc();
+		/* Copy sequence number. */
+		nv_add_uint64(nvout, hio->hio_seq, "seq");
+		if (hio->hio_memsync) {
+			PJDLOG_ASSERT(hio->hio_cmd == HIO_WRITE);
+			nv_add_int8(nvout, 1, "received");
+		}
+		switch (hio->hio_cmd) {
+		case HIO_READ:
+			if (hio->hio_error == 0) {
+				data = hio->hio_data;
+				length = hio->hio_length;
+				break;
+			}
+			/*
+			 * We send no data in case of an error.
+			 */
+			/* FALLTHROUGH */
+		case HIO_DELETE:
+		case HIO_FLUSH:
+		case HIO_WRITE:
+			data = NULL;
+			length = 0;
+			break;
+		default:
+			/*
+			 * recv_thread queues requests whose header failed
+			 * validation (unknown commands included) directly
+			 * here with hio_error set.  Answer those with an
+			 * error-only reply instead of aborting the worker.
+			 */
+			if (hio->hio_error == 0) {
+				PJDLOG_ABORT("Unexpected command (cmd=%hhu).",
+				    hio->hio_cmd);
+			}
+			data = NULL;
+			length = 0;
+			break;
+		}
+		if (hio->hio_error != 0) {
+			switch (hio->hio_cmd) {
+			case HIO_READ:
+				res->hr_stat_read_error++;
+				break;
+			case HIO_WRITE:
+				res->hr_stat_write_error++;
+				break;
+			case HIO_DELETE:
+				res->hr_stat_delete_error++;
+				break;
+			case HIO_FLUSH:
+				res->hr_stat_flush_error++;
+				break;
+			default:
+				/* No per-command counter for other commands. */
+				break;
+			}
+			nv_add_int16(nvout, hio->hio_error, "error");
+		}
+		if (hast_proto_send(res, res->hr_remoteout, nvout, data,
+		    length) == -1) {
+			secondary_exit(EX_TEMPFAIL, "Unable to send reply");
+		}
+		nv_free(nvout);
+		pjdlog_debug(2, "send: (%p) Moving request to the free queue.",
+		    hio);
+		hio_clear(hio);
+		QUEUE_INSERT(free, hio);
+	}
+	/* NOTREACHED */
+	return (NULL);
+}
diff --git a/sbin/hastd/subr.c b/sbin/hastd/subr.c
new file mode 100644
index 0000000..0e9930b
--- /dev/null
+++ b/sbin/hastd/subr.c
@@ -0,0 +1,299 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/disk.h>
+#include <sys/ioctl.h>
+#include <sys/jail.h>
+#include <sys/stat.h>
+#ifdef HAVE_CAPSICUM
+#include <sys/capability.h>
+#include <geom/gate/g_gate.h>
+#endif
+
+#include <errno.h>
+#include <fcntl.h>
+#include <pwd.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <pjdlog.h>
+
+#include "hast.h"
+#include "subr.h"
+
+/*
+ * Append formatted output to the NUL-terminated string already stored in
+ * 'str', a buffer of 'size' bytes in total.
+ *
+ * Returns the value of vsnprintf(3) for the appended part, or -1 when no
+ * terminating NUL is found within the first 'size' bytes of 'str' (which
+ * includes size == 0); nothing is written in that case.
+ */
+int
+vsnprlcat(char *str, size_t size, const char *fmt, va_list ap)
+{
+	size_t len;
+
+	/*
+	 * Bound the scan to the buffer.  Without a terminator inside it
+	 * there is no room to append, and 'size - len' below must never
+	 * wrap around into a huge limit for vsnprintf(3).
+	 */
+	len = strnlen(str, size);
+	if (len >= size) {
+		return (-1);
+	}
+	return (vsnprintf(str + len, size - len, fmt, ap));
+}
+
+/*
+ * Variadic front end for vsnprlcat(): packs the arguments following 'fmt'
+ * into a va_list, forwards the call and hands back its return value.
+ */
+int
+snprlcat(char *str, size_t size, const char *fmt, ...)
+{
+	va_list args;
+	int written;
+
+	va_start(args, fmt);
+	written = vsnprlcat(str, size, fmt, args);
+	va_end(args);
+
+	return (written);
+}
+
+/*
+ * Fill in res->hr_local_mediasize and res->hr_local_sectorsize for the
+ * local component res->hr_localpath.
+ *
+ * The path is opened (read-write if 'dowrite', read-only otherwise) only
+ * when res->hr_localfd is still -1; the descriptor stays in 'res'.
+ * Character devices are queried with the DIOCGMEDIASIZE/DIOCGSECTORSIZE
+ * ioctls, regular files use their size and a fixed 512-byte sector.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+int
+provinfo(struct hast_resource *res, bool dowrite)
+{
+	struct stat st;
+	int oflags;
+
+	PJDLOG_ASSERT(res->hr_localpath != NULL &&
+	    res->hr_localpath[0] != '\0');
+
+	if (res->hr_localfd == -1) {
+		oflags = dowrite ? O_RDWR : O_RDONLY;
+		res->hr_localfd = open(res->hr_localpath, oflags);
+		if (res->hr_localfd == -1) {
+			pjdlog_errno(LOG_ERR, "Unable to open %s",
+			    res->hr_localpath);
+			return (-1);
+		}
+	}
+
+	if (fstat(res->hr_localfd, &st) == -1) {
+		pjdlog_errno(LOG_ERR, "Unable to stat %s", res->hr_localpath);
+		return (-1);
+	}
+
+	if (S_ISREG(st.st_mode)) {
+		/* Regular file: sector size is hardcoded to 512 bytes. */
+		res->hr_local_mediasize = st.st_size;
+		res->hr_local_sectorsize = 512;
+		return (0);
+	}
+
+	if (!S_ISCHR(st.st_mode)) {
+		/* Anything but a character device or a file is refused. */
+		pjdlog_error("%s is neither GEOM provider nor regular file.",
+		    res->hr_localpath);
+		errno = EFTYPE;
+		return (-1);
+	}
+
+	/* Character device, most likely a GEOM provider: ask the kernel. */
+	if (ioctl(res->hr_localfd, DIOCGMEDIASIZE,
+	    &res->hr_local_mediasize) == -1) {
+		pjdlog_errno(LOG_ERR,
+		    "Unable obtain provider %s mediasize",
+		    res->hr_localpath);
+		return (-1);
+	}
+	if (ioctl(res->hr_localfd, DIOCGSECTORSIZE,
+	    &res->hr_local_sectorsize) == -1) {
+		pjdlog_errno(LOG_ERR,
+		    "Unable obtain provider %s sectorsize",
+		    res->hr_localpath);
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Map a HAST_ROLE_* value to its lowercase name.  Values other than the
+ * three handled below yield "unknown".  The result is a string literal.
+ */
+const char *
+role2str(int role)
+{
+	const char *name;
+
+	switch (role) {
+	case HAST_ROLE_INIT:
+		name = "init";
+		break;
+	case HAST_ROLE_PRIMARY:
+		name = "primary";
+		break;
+	case HAST_ROLE_SECONDARY:
+		name = "secondary";
+		break;
+	default:
+		name = "unknown";
+		break;
+	}
+	return (name);
+}
+
+/*
+ * Drop the privileges of the calling process.
+ *
+ * The process is confined to the home directory of HAST_USER with jail(2),
+ * falling back to chroot(2) when jailing fails, and then switches its
+ * groups, gid and uid to those of HAST_USER.  When built with
+ * HAVE_CAPSICUM it additionally tries to enter capability mode and, for a
+ * non-NULL 'res', to limit the rights and ioctls allowed on the resource
+ * descriptors; failures in that part are only logged.
+ *
+ * 'res' may be NULL, in which case the jail hostname is "hastctl".
+ *
+ * Returns 0 on success and -1 on failure.  The final credential checks
+ * use PJDLOG_VERIFY().
+ */
+int
+drop_privs(const struct hast_resource *res)
+{
+	char hostname[sizeof(res->hr_name) * 2];
+	struct jail jparams;
+	struct passwd *pw;
+	uid_t ruid, euid, suid;
+	gid_t rgid, egid, sgid;
+	gid_t gidset[1];
+	bool sandboxed, jailed;
+
+	/*
+	 * According to getpwnam(3) we have to clear errno before calling the
+	 * function to be able to distinguish between an error and a missing
+	 * entry (which is not treated as an error by getpwnam(3)).
+	 */
+	errno = 0;
+	pw = getpwnam(HAST_USER);
+	if (pw == NULL && errno != 0) {
+		pjdlog_errno(LOG_ERR,
+		    "Unable to find info about '%s' user", HAST_USER);
+		return (-1);
+	}
+	if (pw == NULL) {
+		pjdlog_error("'%s' user doesn't exist.", HAST_USER);
+		errno = ENOENT;
+		return (-1);
+	}
+
+	/* Hostname shown for the jail identifies the resource and role. */
+	if (res != NULL) {
+		(void)snprintf(hostname, sizeof(hostname), "hastd: %s (%s)",
+		    res->hr_name, role2str(res->hr_role));
+	} else {
+		(void)snprintf(hostname, sizeof(hostname), "hastctl");
+	}
+
+	memset(&jparams, 0, sizeof(jparams));
+	jparams.version = JAIL_API_VERSION;
+	jparams.path = pw->pw_dir;
+	jparams.hostname = hostname;
+	jparams.jailname = NULL;
+	jparams.ip4s = 0;
+	jparams.ip4 = NULL;
+	jparams.ip6s = 0;
+	jparams.ip6 = NULL;
+
+	jailed = (jail(&jparams) >= 0);
+	if (!jailed) {
+		/* No jail available: settle for a plain chroot. */
+		pjdlog_errno(LOG_WARNING,
+		    "Unable to jail to directory to %s", pw->pw_dir);
+		if (chroot(pw->pw_dir) == -1) {
+			pjdlog_errno(LOG_ERR,
+			    "Unable to change root directory to %s",
+			    pw->pw_dir);
+			return (-1);
+		}
+	}
+	PJDLOG_VERIFY(chdir("/") == 0);
+
+	/* Supplementary groups, then gid, then uid. */
+	gidset[0] = pw->pw_gid;
+	if (setgroups(1, gidset) == -1) {
+		pjdlog_errno(LOG_ERR, "Unable to set groups to gid %u",
+		    (unsigned int)pw->pw_gid);
+		return (-1);
+	}
+	if (setgid(pw->pw_gid) == -1) {
+		pjdlog_errno(LOG_ERR, "Unable to set gid to %u",
+		    (unsigned int)pw->pw_gid);
+		return (-1);
+	}
+	if (setuid(pw->pw_uid) == -1) {
+		pjdlog_errno(LOG_ERR, "Unable to set uid to %u",
+		    (unsigned int)pw->pw_uid);
+		return (-1);
+	}
+
+#ifdef HAVE_CAPSICUM
+	sandboxed = (cap_enter() == 0);
+	if (!sandboxed) {
+		pjdlog_common(LOG_DEBUG, 1, errno,
+		    "Unable to sandbox using capsicum");
+	} else if (res != NULL) {
+		static const unsigned long geomcmds[] = {
+			DIOCGDELETE,
+			DIOCGFLUSH
+		};
+		cap_rights_t rights;
+
+		PJDLOG_ASSERT(res->hr_role == HAST_ROLE_PRIMARY ||
+		    res->hr_role == HAST_ROLE_SECONDARY);
+
+		/* Local component: I/O, locking and two GEOM ioctls. */
+		cap_rights_init(&rights, CAP_FLOCK, CAP_IOCTL, CAP_PREAD,
+		    CAP_PWRITE);
+		if (cap_rights_limit(res->hr_localfd, &rights) == -1) {
+			pjdlog_errno(LOG_ERR,
+			    "Unable to limit capability rights on local descriptor");
+		}
+		if (cap_ioctls_limit(res->hr_localfd, geomcmds,
+		    sizeof(geomcmds) / sizeof(geomcmds[0])) == -1) {
+			pjdlog_errno(LOG_ERR,
+			    "Unable to limit allowed GEOM ioctls");
+		}
+
+		if (res->hr_role == HAST_ROLE_PRIMARY) {
+			static const unsigned long ggatecmds[] = {
+				G_GATE_CMD_MODIFY,
+				G_GATE_CMD_START,
+				G_GATE_CMD_DONE,
+				G_GATE_CMD_DESTROY
+			};
+
+			/* ggate descriptor: ioctls only, four commands. */
+			cap_rights_init(&rights, CAP_IOCTL);
+			if (cap_rights_limit(res->hr_ggatefd, &rights) == -1) {
+				pjdlog_errno(LOG_ERR,
+				    "Unable to limit capability rights to CAP_IOCTL on ggate descriptor");
+			}
+			if (cap_ioctls_limit(res->hr_ggatefd, ggatecmds,
+			    sizeof(ggatecmds) / sizeof(ggatecmds[0])) == -1) {
+				pjdlog_errno(LOG_ERR,
+				    "Unable to limit allowed ggate ioctls");
+			}
+		}
+	}
+#else
+	sandboxed = false;
+#endif
+
+	/*
+	 * Better be sure that everything succeeded.
+	 */
+	PJDLOG_VERIFY(getresuid(&ruid, &euid, &suid) == 0);
+	PJDLOG_VERIFY(ruid == pw->pw_uid);
+	PJDLOG_VERIFY(euid == pw->pw_uid);
+	PJDLOG_VERIFY(suid == pw->pw_uid);
+	PJDLOG_VERIFY(getresgid(&rgid, &egid, &sgid) == 0);
+	PJDLOG_VERIFY(rgid == pw->pw_gid);
+	PJDLOG_VERIFY(egid == pw->pw_gid);
+	PJDLOG_VERIFY(sgid == pw->pw_gid);
+	PJDLOG_VERIFY(getgroups(0, NULL) == 1);
+	PJDLOG_VERIFY(getgroups(1, gidset) == 1);
+	PJDLOG_VERIFY(gidset[0] == pw->pw_gid);
+
+	pjdlog_debug(1,
+	    "Privileges successfully dropped using %s%s+setgid+setuid.",
+	    sandboxed ? "capsicum+" : "", jailed ? "jail" : "chroot");
+
+	return (0);
+}
diff --git a/sbin/hastd/subr.h b/sbin/hastd/subr.h
new file mode 100644
index 0000000..c765754
--- /dev/null
+++ b/sbin/hastd/subr.h
@@ -0,0 +1,56 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SUBR_H_
+#define _SUBR_H_
+
+#include <sys/types.h>
+#include <stdbool.h>
+
+#include "hast.h"
+
+#define KEEP_ERRNO(work) do { /* run 'work', then put back the errno value seen on entry */ \
+ int _rerrno; \
+ \
+ _rerrno = errno; /* NOTE(review): needs <errno.h> at the point of use; this header does not visibly include it */ \
+ work; \
+ errno = _rerrno; \
+} while (0)
+
+int vsnprlcat(char *str, size_t size, const char *fmt, va_list ap); /* append formatted text after the current contents of str */
+int snprlcat(char *str, size_t size, const char *fmt, ...); /* variadic form of vsnprlcat() */
+
+int provinfo(struct hast_resource *res, bool dowrite); /* 0 on success, -1 on failure */
+const char *role2str(int role); /* HAST_ROLE_* value to its name, "unknown" otherwise */
+int drop_privs(const struct hast_resource *res); /* res may be NULL; 0 on success, -1 on failure */
+
+#endif /* !_SUBR_H_ */
diff --git a/sbin/hastd/synch.h b/sbin/hastd/synch.h
new file mode 100644
index 0000000..65360fd
--- /dev/null
+++ b/sbin/hastd/synch.h
@@ -0,0 +1,194 @@
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SYNCH_H_
+#define _SYNCH_H_
+
+#include <errno.h>
+#include <pthread.h>
+#include <pthread_np.h>
+#include <stdbool.h>
+#include <time.h>
+
+#include <pjdlog.h>
+
+#ifndef PJDLOG_ASSERT
+#include <assert.h>
+#define PJDLOG_ASSERT(...) assert(__VA_ARGS__)
+#endif
+
+/* Initialize 'lock' with default attributes; failure trips the assertion. */
+static __inline void
+mtx_init(pthread_mutex_t *lock)
+{
+	int error = pthread_mutex_init(lock, NULL);
+
+	PJDLOG_ASSERT(error == 0);
+}
+/* Destroy 'lock'; failure trips the assertion. */
+static __inline void
+mtx_destroy(pthread_mutex_t *lock)
+{
+	int error = pthread_mutex_destroy(lock);
+
+	PJDLOG_ASSERT(error == 0);
+}
+/* Acquire 'lock', blocking if needed; failure trips the assertion. */
+static __inline void
+mtx_lock(pthread_mutex_t *lock)
+{
+	int error = pthread_mutex_lock(lock);
+
+	PJDLOG_ASSERT(error == 0);
+}
+/*
+ * Try to acquire 'lock' without blocking.  Returns true when the lock was
+ * taken and false when it was busy (EBUSY); any other error trips the
+ * assertion.
+ */
+static __inline bool
+mtx_trylock(pthread_mutex_t *lock)
+{
+	int error = pthread_mutex_trylock(lock);
+
+	PJDLOG_ASSERT(error == 0 || error == EBUSY);
+	if (error != 0) {
+		return (false);
+	}
+	return (true);
+}
+/* Release 'lock'; failure trips the assertion. */
+static __inline void
+mtx_unlock(pthread_mutex_t *lock)
+{
+	int error = pthread_mutex_unlock(lock);
+
+	PJDLOG_ASSERT(error == 0);
+}
+/* Report, as a bool, whether pthread_mutex_isowned_np() is non-zero for 'lock'. */
+static __inline bool
+mtx_owned(pthread_mutex_t *lock)
+{
+	bool owned;
+
+	owned = (pthread_mutex_isowned_np(lock) != 0);
+	return (owned);
+}
+
+/* Initialize 'lock' with default attributes; failure trips the assertion. */
+static __inline void
+rw_init(pthread_rwlock_t *lock)
+{
+	int error = pthread_rwlock_init(lock, NULL);
+
+	PJDLOG_ASSERT(error == 0);
+}
+/* Destroy 'lock'; failure trips the assertion. */
+static __inline void
+rw_destroy(pthread_rwlock_t *lock)
+{
+	int error = pthread_rwlock_destroy(lock);
+
+	PJDLOG_ASSERT(error == 0);
+}
+/* Acquire 'lock' for reading; failure trips the assertion. */
+static __inline void
+rw_rlock(pthread_rwlock_t *lock)
+{
+	int error = pthread_rwlock_rdlock(lock);
+
+	PJDLOG_ASSERT(error == 0);
+}
+/* Acquire 'lock' for writing; failure trips the assertion. */
+static __inline void
+rw_wlock(pthread_rwlock_t *lock)
+{
+	int error = pthread_rwlock_wrlock(lock);
+
+	PJDLOG_ASSERT(error == 0);
+}
+/* Release 'lock' (read or write hold); failure trips the assertion. */
+static __inline void
+rw_unlock(pthread_rwlock_t *lock)
+{
+	int error = pthread_rwlock_unlock(lock);
+
+	PJDLOG_ASSERT(error == 0);
+}
+
+/*
+ * Initialize 'cv' so that its timed waits are measured against
+ * CLOCK_MONOTONIC.  Every step is asserted to succeed.
+ */
+static __inline void
+cv_init(pthread_cond_t *cv)
+{
+	pthread_condattr_t attr;
+	int error;
+
+	/* Build a temporary attribute object selecting the clock. */
+	error = pthread_condattr_init(&attr);
+	PJDLOG_ASSERT(error == 0);
+	error = pthread_condattr_setclock(&attr, CLOCK_MONOTONIC);
+	PJDLOG_ASSERT(error == 0);
+
+	error = pthread_cond_init(cv, &attr);
+	PJDLOG_ASSERT(error == 0);
+
+	/* The attribute object is no longer needed once cv exists. */
+	error = pthread_condattr_destroy(&attr);
+	PJDLOG_ASSERT(error == 0);
+}
+/* Wait on 'cv' with 'lock' and no deadline; failure trips the assertion. */
+static __inline void
+cv_wait(pthread_cond_t *cv, pthread_mutex_t *lock)
+{
+	int error = pthread_cond_wait(cv, lock);
+
+	PJDLOG_ASSERT(error == 0);
+}
+/*
+ * Wait on 'cv' for at most 'timeout' seconds, measured on CLOCK_MONOTONIC.
+ * A timeout of 0 means an unbounded wait.
+ *
+ * Returns true if the wait timed out and false otherwise (always false
+ * for an unbounded wait).  Any other error trips the assertion.
+ */
+static __inline bool
+cv_timedwait(pthread_cond_t *cv, pthread_mutex_t *lock, int timeout)
+{
+	struct timespec ts;
+	int error;
+
+	if (timeout != 0) {
+		/* Absolute deadline: now plus 'timeout' seconds. */
+		error = clock_gettime(CLOCK_MONOTONIC, &ts);
+		PJDLOG_ASSERT(error == 0);
+		ts.tv_sec += timeout;
+
+		error = pthread_cond_timedwait(cv, lock, &ts);
+		PJDLOG_ASSERT(error == 0 || error == ETIMEDOUT);
+		return (error == ETIMEDOUT);
+	}
+
+	cv_wait(cv, lock);
+	return (false);
+}
+/* Call pthread_cond_signal() on 'cv'; failure trips the assertion. */
+static __inline void
+cv_signal(pthread_cond_t *cv)
+{
+	int error = pthread_cond_signal(cv);
+
+	PJDLOG_ASSERT(error == 0);
+}
+/* Call pthread_cond_broadcast() on 'cv'; failure trips the assertion. */
+static __inline void
+cv_broadcast(pthread_cond_t *cv)
+{
+	int error = pthread_cond_broadcast(cv);
+
+	PJDLOG_ASSERT(error == 0);
+}
+#endif /* !_SYNCH_H_ */
diff --git a/sbin/hastd/token.l b/sbin/hastd/token.l
new file mode 100644
index 0000000..e8f6760
--- /dev/null
+++ b/sbin/hastd/token.l
@@ -0,0 +1,86 @@
+%{
+/*-
+ * Copyright (c) 2009-2010 The FreeBSD Foundation
+ * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include "hast.h"
+
+#include "y.tab.h"
+
+int depth; /* brace nesting level: incremented on '{', decremented on '}' by the rules */
+int lineno; /* incremented by the rules for every newline scanned */
+
+#define DP do { } while (0) /* no-op invoked at the start of the token actions */
+#define YY_DECL int yylex(void) /* presumably picked up by flex as the scanner prototype - confirm */
+
+extern int yylex(void);
+%}
+
+%option noinput
+%option nounput
+%option noyywrap
+
+%%
+control { DP; return CONTROL; }
+pidfile { DP; return PIDFILE; }
+listen { DP; return LISTEN; }
+replication { DP; return REPLICATION; }
+checksum { DP; return CHECKSUM; }
+compression { DP; return COMPRESSION; }
+timeout { DP; return TIMEOUT; }
+exec { DP; return EXEC; }
+metaflush { DP; return METAFLUSH; }
+resource { DP; return RESOURCE; }
+name { DP; return NAME; }
+local { DP; return LOCAL; }
+remote { DP; return REMOTE; }
+source { DP; return SOURCE; }
+on { DP; return ON; }
+off { DP; return OFF; }
+fullsync { DP; return FULLSYNC; }
+memsync { DP; return MEMSYNC; }
+async { DP; return ASYNC; }
+none { DP; return NONE; }
+crc32 { DP; return CRC32; }
+sha256 { DP; return SHA256; }
+hole { DP; return HOLE; }
+lzf { DP; return LZF; }
+[0-9]+ { DP; /* digits only; atoi() reports neither overflow nor errors */ yylval.num = atoi(yytext); return NUM; }
+[a-zA-Z0-9\.\-_/\:\[\]]+ { DP; /* strdup() copy of the token text; the result is not checked for NULL here */ yylval.str = strdup(yytext); return STR; }
+\{ { DP; /* one nesting level deeper */ depth++; return OB; }
+\} { DP; /* one nesting level up */ depth--; return CB; }
+#.*$ /* ignore comments: '#' up to the end of the line */;
+\n { /* no token is returned for a newline, only the line counter moves */ lineno++; }
+[ \t]+ /* ignore whitespace (spaces and tabs) */;
+%%
OpenPOWER on IntegriCloud